1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * nbtsearch.c |
4 | * Search code for postgres btrees. |
5 | * |
6 | * |
7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
8 | * Portions Copyright (c) 1994, Regents of the University of California |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/access/nbtree/nbtsearch.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | |
16 | #include "postgres.h" |
17 | |
18 | #include "access/nbtree.h" |
19 | #include "access/relscan.h" |
20 | #include "miscadmin.h" |
21 | #include "pgstat.h" |
22 | #include "storage/predicate.h" |
23 | #include "utils/lsyscache.h" |
24 | #include "utils/rel.h" |
25 | |
26 | |
27 | static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); |
28 | static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); |
29 | static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, |
30 | OffsetNumber offnum); |
31 | static void _bt_saveitem(BTScanOpaque so, int itemIndex, |
32 | OffsetNumber offnum, IndexTuple itup); |
33 | static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); |
34 | static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); |
35 | static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, |
36 | ScanDirection dir); |
37 | static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); |
38 | static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); |
39 | static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); |
40 | |
41 | |
42 | /* |
43 | * _bt_drop_lock_and_maybe_pin() |
44 | * |
 * Unlock the buffer, and if it is safe to release the pin, do that too.  It
46 | * is safe if the scan is using an MVCC snapshot and the index is WAL-logged. |
47 | * This will prevent vacuum from stalling in a blocked state trying to read a |
48 | * page when a cursor is sitting on it -- at least in many important cases. |
49 | * |
50 | * Set the buffer to invalid if the pin is released, since the buffer may be |
51 | * re-used. If we need to go back to this block (for example, to apply |
52 | * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it |
53 | * will remain in shared memory for as long as it takes to scan the index |
54 | * buffer page. |
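 *
 * (Typical call site: _bt_first, below, calls this with &so->currPos once
 * it has finished loading matching items from the current page.)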
55 | */ |
56 | static void |
57 | _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) |
58 | { |
59 | LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK); |
60 | |
61 | if (IsMVCCSnapshot(scan->xs_snapshot) && |
62 | RelationNeedsWAL(scan->indexRelation) && |
63 | !scan->xs_want_itup) |
64 | { |
65 | ReleaseBuffer(sp->buf); |
66 | sp->buf = InvalidBuffer; |
67 | } |
68 | } |
69 | |
70 | /* |
71 | * _bt_search() -- Search the tree for a particular scankey, |
72 | * or more precisely for the first leaf page it could be on. |
73 | * |
74 | * The passed scankey is an insertion-type scankey (see nbtree/README), |
75 | * but it can omit the rightmost column(s) of the index. |
76 | * |
77 | * Return value is a stack of parent-page pointers. *bufP is set to the |
78 | * address of the leaf-page buffer, which is read-locked and pinned. |
79 | * No locks are held on the parent pages, however! |
80 | * |
81 | * If the snapshot parameter is not NULL, "old snapshot" checking will take |
82 | * place during the descent through the tree. This is not needed when |
83 | * positioning for an insert or delete, so NULL is used for those cases. |
84 | * |
85 | * The returned buffer is locked according to access parameter. Additionally, |
86 | * access = BT_WRITE will allow an empty root page to be created and returned. |
87 | * When access = BT_READ, an empty index will result in *bufP being set to |
88 | * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered |
89 | * during the search will be finished. |
90 | */ |
91 | BTStack |
92 | _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, |
93 | Snapshot snapshot) |
94 | { |
95 | BTStack stack_in = NULL; |
96 | int page_access = BT_READ; |
97 | |
98 | /* Get the root page to start with */ |
99 | *bufP = _bt_getroot(rel, access); |
100 | |
101 | /* If index is empty and access = BT_READ, no root page is created. */ |
102 | if (!BufferIsValid(*bufP)) |
103 | return (BTStack) NULL; |
104 | |
105 | /* Loop iterates once per level descended in the tree */ |
106 | for (;;) |
107 | { |
108 | Page page; |
109 | BTPageOpaque opaque; |
110 | OffsetNumber offnum; |
111 | ItemId itemid; |
112 | IndexTuple itup; |
113 | BlockNumber blkno; |
114 | BlockNumber par_blkno; |
115 | BTStack new_stack; |
116 | |
117 | /* |
118 | * Race -- the page we just grabbed may have split since we read its |
119 | * pointer in the parent (or metapage). If it has, we may need to |
120 | * move right to its new sibling. Do that. |
121 | * |
122 | * In write-mode, allow _bt_moveright to finish any incomplete splits |
123 | * along the way. Strictly speaking, we'd only need to finish an |
124 | * incomplete split on the leaf page we're about to insert to, not on |
125 | * any of the upper levels (they are taken care of in _bt_getstackbuf, |
126 | * if the leaf page is split and we insert to the parent page). But |
127 | * this is a good opportunity to finish splits of internal pages too. |
128 | */ |
129 | *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in, |
130 | page_access, snapshot); |
131 | |
132 | /* if this is a leaf page, we're done */ |
133 | page = BufferGetPage(*bufP); |
134 | opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
135 | if (P_ISLEAF(opaque)) |
136 | break; |
137 | |
138 | /* |
139 | * Find the appropriate item on the internal page, and get the child |
140 | * page that it points to. |
141 | */ |
142 | offnum = _bt_binsrch(rel, key, *bufP); |
143 | itemid = PageGetItemId(page, offnum); |
144 | itup = (IndexTuple) PageGetItem(page, itemid); |
145 | blkno = BTreeInnerTupleGetDownLink(itup); |
146 | par_blkno = BufferGetBlockNumber(*bufP); |
147 | |
148 | /* |
149 | * We need to save the location of the index entry we chose in the |
150 | * parent page on a stack. In case we split the tree, we'll use the |
151 | * stack to work back up to the parent page. We also save the actual |
152 | * downlink (block) to uniquely identify the index entry, in case it |
153 | * moves right while we're working lower in the tree. See the paper |
154 | * by Lehman and Yao for how this is detected and handled. (We use the |
155 | * child link during the second half of a page split -- if caller ends |
156 | * up splitting the child it usually ends up inserting a new pivot |
157 | * tuple for child's new right sibling immediately after the original |
158 | * bts_offset offset recorded here. The downlink block will be needed |
159 | * to check if bts_offset remains the position of this same pivot |
160 | * tuple.) |
161 | */ |
162 | new_stack = (BTStack) palloc(sizeof(BTStackData)); |
163 | new_stack->bts_blkno = par_blkno; |
164 | new_stack->bts_offset = offnum; |
165 | new_stack->bts_btentry = blkno; |
166 | new_stack->bts_parent = stack_in; |
167 | |
168 | /* |
		 * Page level 1 is the lowest non-leaf level, just above the leaves.
		 * So, if we're on level 1 and were asked to lock the leaf page in
		 * write mode, lock the next page in write mode, since it must be a
		 * leaf.
172 | */ |
173 | if (opaque->btpo.level == 1 && access == BT_WRITE) |
174 | page_access = BT_WRITE; |
175 | |
176 | /* drop the read lock on the parent page, acquire one on the child */ |
177 | *bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access); |
178 | |
179 | /* okay, all set to move down a level */ |
180 | stack_in = new_stack; |
181 | } |
182 | |
183 | /* |
184 | * If we're asked to lock leaf in write mode, but didn't manage to, then |
185 | * relock. This should only happen when the root page is a leaf page (and |
186 | * the only page in the index other than the metapage). |
187 | */ |
188 | if (access == BT_WRITE && page_access == BT_READ) |
189 | { |
190 | /* trade in our read lock for a write lock */ |
191 | LockBuffer(*bufP, BUFFER_LOCK_UNLOCK); |
192 | LockBuffer(*bufP, BT_WRITE); |
193 | |
194 | /* |
195 | * If the page was split between the time that we surrendered our read |
196 | * lock and acquired our write lock, then this page may no longer be |
197 | * the right place for the key we want to insert. In this case, we |
198 | * need to move right in the tree. See Lehman and Yao for an |
199 | * excruciatingly precise description. |
200 | */ |
201 | *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE, |
202 | snapshot); |
203 | } |
204 | |
205 | return stack_in; |
206 | } |
207 | |
208 | /* |
209 | * _bt_moveright() -- move right in the btree if necessary. |
210 | * |
211 | * When we follow a pointer to reach a page, it is possible that |
212 | * the page has changed in the meanwhile. If this happens, we're |
213 | * guaranteed that the page has "split right" -- that is, that any |
214 | * data that appeared on the page originally is either on the page |
215 | * or strictly to the right of it. |
216 | * |
217 | * This routine decides whether or not we need to move right in the |
218 | * tree by examining the high key entry on the page. If that entry is |
219 | * strictly less than the scankey, or <= the scankey in the |
220 | * key.nextkey=true case, then we followed the wrong link and we need |
221 | * to move right. |
222 | * |
223 | * The passed insertion-type scankey can omit the rightmost column(s) of the |
224 | * index. (see nbtree/README) |
225 | * |
226 | * When key.nextkey is false (the usual case), we are looking for the first |
227 | * item >= key. When key.nextkey is true, we are looking for the first item |
228 | * strictly greater than key. |
229 | * |
230 | * If forupdate is true, we will attempt to finish any incomplete splits |
231 | * that we encounter. This is required when locking a target page for an |
232 | * insertion, because we don't allow inserting on a page before the split |
233 | * is completed. 'stack' is only used if forupdate is true. |
234 | * |
235 | * On entry, we have the buffer pinned and a lock of the type specified by |
236 | * 'access'. If we move right, we release the buffer and lock and acquire |
237 | * the same on the right sibling. Return value is the buffer we stop at. |
238 | * |
239 | * If the snapshot parameter is not NULL, "old snapshot" checking will take |
240 | * place during the descent through the tree. This is not needed when |
241 | * positioning for an insert or delete, so NULL is used for those cases. |
242 | */ |
243 | Buffer |
244 | _bt_moveright(Relation rel, |
245 | BTScanInsert key, |
246 | Buffer buf, |
247 | bool forupdate, |
248 | BTStack stack, |
249 | int access, |
250 | Snapshot snapshot) |
251 | { |
252 | Page page; |
253 | BTPageOpaque opaque; |
254 | int32 cmpval; |
255 | |
256 | /* |
257 | * When nextkey = false (normal case): if the scan key that brought us to |
258 | * this page is > the high key stored on the page, then the page has split |
259 | * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could |
260 | * have some duplicates to the right as well as the left, but that's |
261 | * something that's only ever dealt with on the leaf level, after |
262 | * _bt_search has found an initial leaf page.) |
263 | * |
264 | * When nextkey = true: move right if the scan key is >= page's high key. |
265 | * (Note that key.scantid cannot be set in this case.) |
266 | * |
267 | * The page could even have split more than once, so scan as far as |
268 | * needed. |
269 | * |
270 | * We also have to move right if we followed a link that brought us to a |
271 | * dead page. |
272 | */ |
273 | cmpval = key->nextkey ? 0 : 1; |
274 | |
275 | for (;;) |
276 | { |
277 | page = BufferGetPage(buf); |
278 | TestForOldSnapshot(snapshot, rel, page); |
279 | opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
280 | |
281 | if (P_RIGHTMOST(opaque)) |
282 | break; |
283 | |
284 | /* |
285 | * Finish any incomplete splits we encounter along the way. |
286 | */ |
287 | if (forupdate && P_INCOMPLETE_SPLIT(opaque)) |
288 | { |
289 | BlockNumber blkno = BufferGetBlockNumber(buf); |
290 | |
291 | /* upgrade our lock if necessary */ |
292 | if (access == BT_READ) |
293 | { |
294 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
295 | LockBuffer(buf, BT_WRITE); |
296 | } |
297 | |
298 | if (P_INCOMPLETE_SPLIT(opaque)) |
299 | _bt_finish_split(rel, buf, stack); |
300 | else |
301 | _bt_relbuf(rel, buf); |
302 | |
303 | /* re-acquire the lock in the right mode, and re-check */ |
304 | buf = _bt_getbuf(rel, blkno, access); |
305 | continue; |
306 | } |
307 | |
308 | if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval) |
309 | { |
310 | /* step right one page */ |
311 | buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); |
312 | continue; |
313 | } |
314 | else |
315 | break; |
316 | } |
317 | |
318 | if (P_IGNORE(opaque)) |
319 | elog(ERROR, "fell off the end of index \"%s\"" , |
320 | RelationGetRelationName(rel)); |
321 | |
322 | return buf; |
323 | } |
324 | |
325 | /* |
326 | * _bt_binsrch() -- Do a binary search for a key on a particular page. |
327 | * |
328 | * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first |
329 | * key >= given scankey, or > scankey if nextkey is true. (NOTE: in |
330 | * particular, this means it is possible to return a value 1 greater than the |
331 | * number of keys on the page, if the scankey is > all keys on the page.) |
332 | * |
333 | * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber |
334 | * of the last key < given scankey, or last key <= given scankey if nextkey |
335 | * is true. (Since _bt_compare treats the first data key of such a page as |
336 | * minus infinity, there will be at least one key < scankey, so the result |
337 | * always points at one of the keys on the page.) This key indicates the |
338 | * right place to descend to be sure we find all leaf keys >= given scankey |
339 | * (or leaf keys > given scankey when nextkey is true). |
340 | * |
341 | * This procedure is not responsible for walking right, it just examines |
342 | * the given page. _bt_binsrch() has no lock or refcount side effects |
343 | * on the buffer. |
344 | */ |
345 | static OffsetNumber |
346 | _bt_binsrch(Relation rel, |
347 | BTScanInsert key, |
348 | Buffer buf) |
349 | { |
350 | Page page; |
351 | BTPageOpaque opaque; |
352 | OffsetNumber low, |
353 | high; |
354 | int32 result, |
355 | cmpval; |
356 | |
357 | /* Requesting nextkey semantics while using scantid seems nonsensical */ |
358 | Assert(!key->nextkey || key->scantid == NULL); |
359 | |
360 | page = BufferGetPage(buf); |
361 | opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
362 | |
363 | low = P_FIRSTDATAKEY(opaque); |
364 | high = PageGetMaxOffsetNumber(page); |
365 | |
366 | /* |
367 | * If there are no keys on the page, return the first available slot. Note |
368 | * this covers two cases: the page is really empty (no keys), or it |
369 | * contains only a high key. The latter case is possible after vacuuming. |
370 | * This can never happen on an internal page, however, since they are |
371 | * never empty (an internal page must have children). |
372 | */ |
373 | if (unlikely(high < low)) |
374 | return low; |
375 | |
376 | /* |
377 | * Binary search to find the first key on the page >= scan key, or first |
378 | * key > scankey when nextkey is true. |
379 | * |
380 | * For nextkey=false (cmpval=1), the loop invariant is: all slots before |
381 | * 'low' are < scan key, all slots at or after 'high' are >= scan key. |
382 | * |
383 | * For nextkey=true (cmpval=0), the loop invariant is: all slots before |
384 | * 'low' are <= scan key, all slots at or after 'high' are > scan key. |
385 | * |
386 | * We can fall out when high == low. |
387 | */ |
388 | high++; /* establish the loop invariant for high */ |
389 | |
390 | cmpval = key->nextkey ? 0 : 1; /* select comparison value */ |
391 | |
392 | while (high > low) |
393 | { |
394 | OffsetNumber mid = low + ((high - low) / 2); |
395 | |
396 | /* We have low <= mid < high, so mid points at a real slot */ |
397 | |
398 | result = _bt_compare(rel, key, page, mid); |
399 | |
400 | if (result >= cmpval) |
401 | low = mid + 1; |
402 | else |
403 | high = mid; |
404 | } |
405 | |
406 | /* |
407 | * At this point we have high == low, but be careful: they could point |
408 | * past the last slot on the page. |
409 | * |
410 | * On a leaf page, we always return the first key >= scan key (resp. > |
411 | * scan key), which could be the last slot + 1. |
412 | */ |
413 | if (P_ISLEAF(opaque)) |
414 | return low; |
415 | |
416 | /* |
417 | * On a non-leaf page, return the last key < scan key (resp. <= scan key). |
418 | * There must be one if _bt_compare() is playing by the rules. |
419 | */ |
420 | Assert(low > P_FIRSTDATAKEY(opaque)); |
421 | |
422 | return OffsetNumberPrev(low); |
423 | } |
424 | |
425 | /* |
426 | * |
427 | * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search. |
428 | * |
429 | * Like _bt_binsrch(), but with support for caching the binary search |
430 | * bounds. Only used during insertion, and only on the leaf page that it |
431 | * looks like caller will insert tuple on. Exclusive-locked and pinned |
432 | * leaf page is contained within insertstate. |
433 | * |
434 | * Caches the bounds fields in insertstate so that a subsequent call can |
435 | * reuse the low and strict high bounds of original binary search. Callers |
436 | * that use these fields directly must be prepared for the case where low |
437 | * and/or stricthigh are not on the same page (one or both exceed maxoff |
438 | * for the page). The case where there are no items on the page (high < |
439 | * low) makes bounds invalid. |
440 | * |
441 | * Caller is responsible for invalidating bounds when it modifies the page |
442 | * before calling here a second time. |
443 | */ |
444 | OffsetNumber |
445 | _bt_binsrch_insert(Relation rel, BTInsertState insertstate) |
446 | { |
447 | BTScanInsert key = insertstate->itup_key; |
448 | Page page; |
449 | BTPageOpaque opaque; |
450 | OffsetNumber low, |
451 | high, |
452 | stricthigh; |
453 | int32 result, |
454 | cmpval; |
455 | |
456 | page = BufferGetPage(insertstate->buf); |
457 | opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
458 | |
459 | Assert(P_ISLEAF(opaque)); |
460 | Assert(!key->nextkey); |
461 | |
462 | if (!insertstate->bounds_valid) |
463 | { |
464 | /* Start new binary search */ |
465 | low = P_FIRSTDATAKEY(opaque); |
466 | high = PageGetMaxOffsetNumber(page); |
467 | } |
468 | else |
469 | { |
470 | /* Restore result of previous binary search against same page */ |
471 | low = insertstate->low; |
472 | high = insertstate->stricthigh; |
473 | } |
474 | |
475 | /* If there are no keys on the page, return the first available slot */ |
476 | if (unlikely(high < low)) |
477 | { |
478 | /* Caller can't reuse bounds */ |
479 | insertstate->low = InvalidOffsetNumber; |
480 | insertstate->stricthigh = InvalidOffsetNumber; |
481 | insertstate->bounds_valid = false; |
482 | return low; |
483 | } |
484 | |
485 | /* |
486 | * Binary search to find the first key on the page >= scan key. (nextkey |
487 | * is always false when inserting). |
488 | * |
489 | * The loop invariant is: all slots before 'low' are < scan key, all slots |
490 | * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is |
491 | * maintained to save additional search effort for caller. |
492 | * |
493 | * We can fall out when high == low. |
494 | */ |
495 | if (!insertstate->bounds_valid) |
496 | high++; /* establish the loop invariant for high */ |
497 | stricthigh = high; /* high initially strictly higher */ |
498 | |
499 | cmpval = 1; /* !nextkey comparison value */ |
500 | |
501 | while (high > low) |
502 | { |
503 | OffsetNumber mid = low + ((high - low) / 2); |
504 | |
505 | /* We have low <= mid < high, so mid points at a real slot */ |
506 | |
507 | result = _bt_compare(rel, key, page, mid); |
508 | |
509 | if (result >= cmpval) |
510 | low = mid + 1; |
511 | else |
512 | { |
513 | high = mid; |
514 | if (result != 0) |
515 | stricthigh = high; |
516 | } |
517 | } |
518 | |
519 | /* |
520 | * On a leaf page, a binary search always returns the first key >= scan |
521 | * key (at least in !nextkey case), which could be the last slot + 1. This |
522 | * is also the lower bound of cached search. |
523 | * |
524 | * stricthigh may also be the last slot + 1, which prevents caller from |
525 | * using bounds directly, but is still useful to us if we're called a |
526 | * second time with cached bounds (cached low will be < stricthigh when |
527 | * that happens). |
528 | */ |
529 | insertstate->low = low; |
530 | insertstate->stricthigh = stricthigh; |
531 | insertstate->bounds_valid = true; |
532 | |
533 | return low; |
534 | } |
535 | |
536 | /*---------- |
537 | * _bt_compare() -- Compare insertion-type scankey to tuple on a page. |
538 | * |
539 | * page/offnum: location of btree item to be compared to. |
540 | * |
541 | * This routine returns: |
542 | * <0 if scankey < tuple at offnum; |
543 | * 0 if scankey == tuple at offnum; |
544 | * >0 if scankey > tuple at offnum. |
545 | * NULLs in the keys are treated as sortable values. Therefore |
546 | * "equality" does not necessarily mean that the item should be |
547 | * returned to the caller as a matching key! |
548 | * |
549 | * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be |
550 | * "minus infinity": this routine will always claim it is less than the |
551 | * scankey. The actual key value stored is explicitly truncated to 0 |
552 | * attributes (explicitly minus infinity) with version 3+ indexes, but |
553 | * that isn't relied upon. This allows us to implement the Lehman and |
554 | * Yao convention that the first down-link pointer is before the first |
555 | * key. See backend/access/nbtree/README for details. |
556 | *---------- |
557 | */ |
558 | int32 |
559 | _bt_compare(Relation rel, |
560 | BTScanInsert key, |
561 | Page page, |
562 | OffsetNumber offnum) |
563 | { |
564 | TupleDesc itupdesc = RelationGetDescr(rel); |
565 | BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
566 | IndexTuple itup; |
567 | ItemPointer heapTid; |
568 | ScanKey scankey; |
569 | int ncmpkey; |
570 | int ntupatts; |
571 | |
572 | Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); |
573 | Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); |
574 | Assert(key->heapkeyspace || key->scantid == NULL); |
575 | |
576 | /* |
577 | * Force result ">" if target item is first data item on an internal page |
578 | * --- see NOTE above. |
579 | */ |
580 | if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) |
581 | return 1; |
582 | |
583 | itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); |
584 | ntupatts = BTreeTupleGetNAtts(itup, rel); |
585 | |
586 | /* |
587 | * The scan key is set up with the attribute number associated with each |
588 | * term in the key. It is important that, if the index is multi-key, the |
589 | * scan contain the first k key attributes, and that they be in order. If |
590 | * you think about how multi-key ordering works, you'll understand why |
591 | * this is. |
592 | * |
593 | * We don't test for violation of this condition here, however. The |
594 | * initial setup for the index scan had better have gotten it right (see |
595 | * _bt_first). |
596 | */ |
597 | |
598 | ncmpkey = Min(ntupatts, key->keysz); |
599 | Assert(key->heapkeyspace || ncmpkey == key->keysz); |
600 | scankey = key->scankeys; |
601 | for (int i = 1; i <= ncmpkey; i++) |
602 | { |
603 | Datum datum; |
604 | bool isNull; |
605 | int32 result; |
606 | |
607 | datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); |
608 | |
609 | /* see comments about NULLs handling in btbuild */ |
610 | if (scankey->sk_flags & SK_ISNULL) /* key is NULL */ |
611 | { |
612 | if (isNull) |
613 | result = 0; /* NULL "=" NULL */ |
614 | else if (scankey->sk_flags & SK_BT_NULLS_FIRST) |
615 | result = -1; /* NULL "<" NOT_NULL */ |
616 | else |
617 | result = 1; /* NULL ">" NOT_NULL */ |
618 | } |
619 | else if (isNull) /* key is NOT_NULL and item is NULL */ |
620 | { |
621 | if (scankey->sk_flags & SK_BT_NULLS_FIRST) |
622 | result = 1; /* NOT_NULL ">" NULL */ |
623 | else |
624 | result = -1; /* NOT_NULL "<" NULL */ |
625 | } |
626 | else |
627 | { |
628 | /* |
629 | * The sk_func needs to be passed the index value as left arg and |
630 | * the sk_argument as right arg (they might be of different |
631 | * types). Since it is convenient for callers to think of |
632 | * _bt_compare as comparing the scankey to the index item, we have |
633 | * to flip the sign of the comparison result. (Unless it's a DESC |
634 | * column, in which case we *don't* flip the sign.) |
635 | */ |
636 | result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, |
637 | scankey->sk_collation, |
638 | datum, |
639 | scankey->sk_argument)); |
640 | |
641 | if (!(scankey->sk_flags & SK_BT_DESC)) |
642 | INVERT_COMPARE_RESULT(result); |
643 | } |
644 | |
645 | /* if the keys are unequal, return the difference */ |
646 | if (result != 0) |
647 | return result; |
648 | |
649 | scankey++; |
650 | } |
651 | |
652 | /* |
653 | * All non-truncated attributes (other than heap TID) were found to be |
654 | * equal. Treat truncated attributes as minus infinity when scankey has a |
655 | * key attribute value that would otherwise be compared directly. |
656 | * |
657 | * Note: it doesn't matter if ntupatts includes non-key attributes; |
658 | * scankey won't, so explicitly excluding non-key attributes isn't |
659 | * necessary. |
660 | */ |
661 | if (key->keysz > ntupatts) |
662 | return 1; |
663 | |
664 | /* |
665 | * Use the heap TID attribute and scantid to try to break the tie. The |
666 | * rules are the same as any other key attribute -- only the |
667 | * representation differs. |
668 | */ |
669 | heapTid = BTreeTupleGetHeapTID(itup); |
670 | if (key->scantid == NULL) |
671 | { |
672 | /* |
673 | * Most searches have a scankey that is considered greater than a |
674 | * truncated pivot tuple if and when the scankey has equal values for |
675 | * attributes up to and including the least significant untruncated |
676 | * attribute in tuple. |
677 | * |
678 | * For example, if an index has the minimum two attributes (single |
679 | * user key attribute, plus heap TID attribute), and a page's high key |
680 | * is ('foo', -inf), and scankey is ('foo', <omitted>), the search |
681 | * will not descend to the page to the left. The search will descend |
682 | * right instead. The truncated attribute in pivot tuple means that |
683 | * all non-pivot tuples on the page to the left are strictly < 'foo', |
684 | * so it isn't necessary to descend left. In other words, search |
685 | * doesn't have to descend left because it isn't interested in a match |
686 | * that has a heap TID value of -inf. |
687 | * |
688 | * However, some searches (pivotsearch searches) actually require that |
689 | * we descend left when this happens. -inf is treated as a possible |
690 | * match for omitted scankey attribute(s). This is needed by page |
691 | * deletion, which must re-find leaf pages that are targets for |
692 | * deletion using their high keys. |
693 | * |
694 | * Note: the heap TID part of the test ensures that scankey is being |
695 | * compared to a pivot tuple with one or more truncated key |
696 | * attributes. |
697 | * |
698 | * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the |
699 | * left here, since they have no heap TID attribute (and cannot have |
700 | * any -inf key values in any case, since truncation can only remove |
701 | * non-key attributes). !heapkeyspace searches must always be |
702 | * prepared to deal with matches on both sides of the pivot once the |
703 | * leaf level is reached. |
704 | */ |
705 | if (key->heapkeyspace && !key->pivotsearch && |
706 | key->keysz == ntupatts && heapTid == NULL) |
707 | return 1; |
708 | |
709 | /* All provided scankey arguments found to be equal */ |
710 | return 0; |
711 | } |
712 | |
713 | /* |
714 | * Treat truncated heap TID as minus infinity, since scankey has a key |
715 | * attribute value (scantid) that would otherwise be compared directly |
716 | */ |
717 | Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel)); |
718 | if (heapTid == NULL) |
719 | return 1; |
720 | |
721 | Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); |
722 | return ItemPointerCompare(key->scantid, heapTid); |
723 | } |
724 | |
725 | /* |
726 | * _bt_first() -- Find the first item in a scan. |
727 | * |
728 | * We need to be clever about the direction of scan, the search |
729 | * conditions, and the tree ordering. We find the first item (or, |
730 | * if backwards scan, the last item) in the tree that satisfies the |
731 | * qualifications in the scan key. On success exit, the page containing |
732 | * the current index tuple is pinned but not locked, and data about |
733 | * the matching tuple(s) on the page has been loaded into so->currPos. |
734 | * scan->xs_ctup.t_self is set to the heap TID of the current tuple, |
735 | * and if requested, scan->xs_itup points to a copy of the index tuple. |
736 | * |
737 | * If there are no matching items in the index, we return false, with no |
738 | * pins or locks held. |
739 | * |
740 | * Note that scan->keyData[], and the so->keyData[] scankey built from it, |
741 | * are both search-type scankeys (see nbtree/README for more about this). |
742 | * Within this routine, we build a temporary insertion-type scankey to use |
743 | * in locating the scan start position. |
744 | */ |
745 | bool |
746 | _bt_first(IndexScanDesc scan, ScanDirection dir) |
747 | { |
748 | Relation rel = scan->indexRelation; |
749 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
750 | Buffer buf; |
751 | BTStack stack; |
752 | OffsetNumber offnum; |
753 | StrategyNumber strat; |
754 | bool nextkey; |
755 | bool goback; |
756 | BTScanInsertData inskey; |
757 | ScanKey startKeys[INDEX_MAX_KEYS]; |
758 | ScanKeyData notnullkeys[INDEX_MAX_KEYS]; |
759 | int keysCount = 0; |
760 | int i; |
761 | bool status = true; |
762 | StrategyNumber strat_total; |
763 | BTScanPosItem *currItem; |
764 | BlockNumber blkno; |
765 | |
766 | Assert(!BTScanPosIsValid(so->currPos)); |
767 | |
768 | pgstat_count_index_scan(rel); |
769 | |
770 | /* |
771 | * Examine the scan keys and eliminate any redundant keys; also mark the |
772 | * keys that must be matched to continue the scan. |
773 | */ |
774 | _bt_preprocess_keys(scan); |
775 | |
776 | /* |
777 | * Quit now if _bt_preprocess_keys() discovered that the scan keys can |
778 | * never be satisfied (eg, x == 1 AND x > 2). |
779 | */ |
780 | if (!so->qual_ok) |
781 | return false; |
782 | |
783 | /* |
784 | * For parallel scans, get the starting page from shared state. If the |
785 | * scan has not started, proceed to find out first leaf page in the usual |
786 | * way while keeping other participating processes waiting. If the scan |
787 | * has already begun, use the page number from the shared structure. |
788 | */ |
789 | if (scan->parallel_scan != NULL) |
790 | { |
791 | status = _bt_parallel_seize(scan, &blkno); |
792 | if (!status) |
793 | return false; |
794 | else if (blkno == P_NONE) |
795 | { |
796 | _bt_parallel_done(scan); |
797 | return false; |
798 | } |
799 | else if (blkno != InvalidBlockNumber) |
800 | { |
801 | if (!_bt_parallel_readpage(scan, blkno, dir)) |
802 | return false; |
803 | goto readcomplete; |
804 | } |
805 | } |
806 | |
807 | /*---------- |
808 | * Examine the scan keys to discover where we need to start the scan. |
809 | * |
810 | * We want to identify the keys that can be used as starting boundaries; |
811 | * these are =, >, or >= keys for a forward scan or =, <, <= keys for |
812 | * a backwards scan. We can use keys for multiple attributes so long as |
813 | * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept |
814 | * a > or < boundary or find an attribute with no boundary (which can be |
815 | * thought of as the same as "> -infinity"), we can't use keys for any |
816 | * attributes to its right, because it would break our simplistic notion |
817 | * of what initial positioning strategy to use. |
818 | * |
819 | * When the scan keys include cross-type operators, _bt_preprocess_keys |
820 | * may not be able to eliminate redundant keys; in such cases we will |
821 | * arbitrarily pick a usable one for each attribute. This is correct |
822 | * but possibly not optimal behavior. (For example, with keys like |
823 | * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when |
824 | * x=5 would be more efficient.) Since the situation only arises given |
825 | * a poorly-worded query plus an incomplete opfamily, live with it. |
826 | * |
827 | * When both equality and inequality keys appear for a single attribute |
828 | * (again, only possible when cross-type operators appear), we *must* |
829 | * select one of the equality keys for the starting point, because |
830 | * _bt_checkkeys() will stop the scan as soon as an equality qual fails. |
831 | * For example, if we have keys like "x >= 4 AND x = 10" and we elect to |
832 | * start at x=4, we will fail and stop before reaching x=10. If multiple |
833 | * equality quals survive preprocessing, however, it doesn't matter which |
834 | * one we use --- by definition, they are either redundant or |
835 | * contradictory. |
836 | * |
837 | * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier. |
838 | * If the index stores nulls at the end of the index we'll be starting |
839 | * from, and we have no boundary key for the column (which means the key |
840 | * we deduced NOT NULL from is an inequality key that constrains the other |
841 | * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to |
842 | * use as a boundary key. If we didn't do this, we might find ourselves |
843 | * traversing a lot of null entries at the start of the scan. |
844 | * |
845 | * In this loop, row-comparison keys are treated the same as keys on their |
846 | * first (leftmost) columns. We'll add on lower-order columns of the row |
847 | * comparison below, if possible. |
848 | * |
849 | * The selected scan keys (at most one per index column) are remembered by |
850 | * storing their addresses into the local startKeys[] array. |
851 | *---------- |
852 | */ |
853 | strat_total = BTEqualStrategyNumber; |
854 | if (so->numberOfKeys > 0) |
855 | { |
856 | AttrNumber curattr; |
857 | ScanKey chosen; |
858 | ScanKey impliesNN; |
859 | ScanKey cur; |
860 | |
861 | /* |
862 | * chosen is the so-far-chosen key for the current attribute, if any. |
863 | * We don't cast the decision in stone until we reach keys for the |
864 | * next attribute. |
865 | */ |
866 | curattr = 1; |
867 | chosen = NULL; |
868 | /* Also remember any scankey that implies a NOT NULL constraint */ |
869 | impliesNN = NULL; |
870 | |
871 | /* |
872 | * Loop iterates from 0 to numberOfKeys inclusive; we use the last |
873 | * pass to handle after-last-key processing. Actual exit from the |
874 | * loop is at one of the "break" statements below. |
875 | */ |
876 | for (cur = so->keyData, i = 0;; cur++, i++) |
877 | { |
878 | if (i >= so->numberOfKeys || cur->sk_attno != curattr) |
879 | { |
880 | /* |
881 | * Done looking at keys for curattr. If we didn't find a |
882 | * usable boundary key, see if we can deduce a NOT NULL key. |
883 | */ |
884 | if (chosen == NULL && impliesNN != NULL && |
885 | ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? |
886 | ScanDirectionIsForward(dir) : |
887 | ScanDirectionIsBackward(dir))) |
888 | { |
889 | /* Yes, so build the key in notnullkeys[keysCount] */ |
890 | chosen = ¬nullkeys[keysCount]; |
891 | ScanKeyEntryInitialize(chosen, |
892 | (SK_SEARCHNOTNULL | SK_ISNULL | |
893 | (impliesNN->sk_flags & |
894 | (SK_BT_DESC | SK_BT_NULLS_FIRST))), |
895 | curattr, |
896 | ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? |
897 | BTGreaterStrategyNumber : |
898 | BTLessStrategyNumber), |
899 | InvalidOid, |
900 | InvalidOid, |
901 | InvalidOid, |
902 | (Datum) 0); |
903 | } |
904 | |
905 | /* |
906 | * If we still didn't find a usable boundary key, quit; else |
907 | * save the boundary key pointer in startKeys. |
908 | */ |
909 | if (chosen == NULL) |
910 | break; |
911 | startKeys[keysCount++] = chosen; |
912 | |
913 | /* |
914 | * Adjust strat_total, and quit if we have stored a > or < |
915 | * key. |
916 | */ |
917 | strat = chosen->sk_strategy; |
918 | if (strat != BTEqualStrategyNumber) |
919 | { |
920 | strat_total = strat; |
921 | if (strat == BTGreaterStrategyNumber || |
922 | strat == BTLessStrategyNumber) |
923 | break; |
924 | } |
925 | |
926 | /* |
927 | * Done if that was the last attribute, or if next key is not |
928 | * in sequence (implying no boundary key is available for the |
929 | * next attribute). |
930 | */ |
931 | if (i >= so->numberOfKeys || |
932 | cur->sk_attno != curattr + 1) |
933 | break; |
934 | |
935 | /* |
936 | * Reset for next attr. |
937 | */ |
938 | curattr = cur->sk_attno; |
939 | chosen = NULL; |
940 | impliesNN = NULL; |
941 | } |
942 | |
943 | /* |
944 | * Can we use this key as a starting boundary for this attr? |
945 | * |
946 | * If not, does it imply a NOT NULL constraint? (Because |
947 | * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, |
948 | * *any* inequality key works for that; we need not test.) |
949 | */ |
950 | switch (cur->sk_strategy) |
951 | { |
952 | case BTLessStrategyNumber: |
953 | case BTLessEqualStrategyNumber: |
954 | if (chosen == NULL) |
955 | { |
956 | if (ScanDirectionIsBackward(dir)) |
957 | chosen = cur; |
958 | else |
959 | impliesNN = cur; |
960 | } |
961 | break; |
962 | case BTEqualStrategyNumber: |
963 | /* override any non-equality choice */ |
964 | chosen = cur; |
965 | break; |
966 | case BTGreaterEqualStrategyNumber: |
967 | case BTGreaterStrategyNumber: |
968 | if (chosen == NULL) |
969 | { |
970 | if (ScanDirectionIsForward(dir)) |
971 | chosen = cur; |
972 | else |
973 | impliesNN = cur; |
974 | } |
975 | break; |
976 | } |
977 | } |
978 | } |
979 | |
980 | /* |
981 | * If we found no usable boundary keys, we have to start from one end of |
982 | * the tree. Walk down that edge to the first or last key, and scan from |
983 | * there. |
984 | */ |
985 | if (keysCount == 0) |
986 | { |
987 | bool match; |
988 | |
989 | match = _bt_endpoint(scan, dir); |
990 | |
991 | if (!match) |
992 | { |
993 | /* No match, so mark (parallel) scan finished */ |
994 | _bt_parallel_done(scan); |
995 | } |
996 | |
997 | return match; |
998 | } |
999 | |
1000 | /* |
1001 | * We want to start the scan somewhere within the index. Set up an |
1002 | * insertion scankey we can use to search for the boundary point we |
1003 | * identified above. The insertion scankey is built using the keys |
1004 | * identified by startKeys[]. (Remaining insertion scankey fields are |
1005 | * initialized after initial-positioning strategy is finalized.) |
1006 | */ |
1007 | Assert(keysCount <= INDEX_MAX_KEYS); |
1008 | for (i = 0; i < keysCount; i++) |
1009 | { |
1010 | ScanKey cur = startKeys[i]; |
1011 | |
1012 | Assert(cur->sk_attno == i + 1); |
1013 | |
1014 | if (cur->sk_flags & SK_ROW_HEADER) |
1015 | { |
1016 | /* |
1017 | * Row comparison header: look to the first row member instead. |
1018 | * |
1019 | * The member scankeys are already in insertion format (ie, they |
1020 | * have sk_func = 3-way-comparison function), but we have to watch |
1021 | * out for nulls, which _bt_preprocess_keys didn't check. A null |
1022 | * in the first row member makes the condition unmatchable, just |
1023 | * like qual_ok = false. |
1024 | */ |
1025 | ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); |
1026 | |
1027 | Assert(subkey->sk_flags & SK_ROW_MEMBER); |
1028 | if (subkey->sk_flags & SK_ISNULL) |
1029 | { |
1030 | _bt_parallel_done(scan); |
1031 | return false; |
1032 | } |
1033 | memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); |
1034 | |
1035 | /* |
1036 | * If the row comparison is the last positioning key we accepted, |
1037 | * try to add additional keys from the lower-order row members. |
1038 | * (If we accepted independent conditions on additional index |
1039 | * columns, we use those instead --- doesn't seem worth trying to |
1040 | * determine which is more restrictive.) Note that this is OK |
1041 | * even if the row comparison is of ">" or "<" type, because the |
1042 | * condition applied to all but the last row member is effectively |
1043 | * ">=" or "<=", and so the extra keys don't break the positioning |
1044 | * scheme. But, by the same token, if we aren't able to use all |
1045 | * the row members, then the part of the row comparison that we |
1046 | * did use has to be treated as just a ">=" or "<=" condition, and |
1047 | * so we'd better adjust strat_total accordingly. |
1048 | */ |
1049 | if (i == keysCount - 1) |
1050 | { |
1051 | bool used_all_subkeys = false; |
1052 | |
1053 | Assert(!(subkey->sk_flags & SK_ROW_END)); |
1054 | for (;;) |
1055 | { |
1056 | subkey++; |
1057 | Assert(subkey->sk_flags & SK_ROW_MEMBER); |
1058 | if (subkey->sk_attno != keysCount + 1) |
1059 | break; /* out-of-sequence, can't use it */ |
1060 | if (subkey->sk_strategy != cur->sk_strategy) |
1061 | break; /* wrong direction, can't use it */ |
1062 | if (subkey->sk_flags & SK_ISNULL) |
1063 | break; /* can't use null keys */ |
1064 | Assert(keysCount < INDEX_MAX_KEYS); |
1065 | memcpy(inskey.scankeys + keysCount, subkey, |
1066 | sizeof(ScanKeyData)); |
1067 | keysCount++; |
1068 | if (subkey->sk_flags & SK_ROW_END) |
1069 | { |
1070 | used_all_subkeys = true; |
1071 | break; |
1072 | } |
1073 | } |
1074 | if (!used_all_subkeys) |
1075 | { |
1076 | switch (strat_total) |
1077 | { |
1078 | case BTLessStrategyNumber: |
1079 | strat_total = BTLessEqualStrategyNumber; |
1080 | break; |
1081 | case BTGreaterStrategyNumber: |
1082 | strat_total = BTGreaterEqualStrategyNumber; |
1083 | break; |
1084 | } |
1085 | } |
1086 | break; /* done with outer loop */ |
1087 | } |
1088 | } |
1089 | else |
1090 | { |
1091 | /* |
1092 | * Ordinary comparison key. Transform the search-style scan key |
1093 | * to an insertion scan key by replacing the sk_func with the |
1094 | * appropriate btree comparison function. |
1095 | * |
1096 | * If scankey operator is not a cross-type comparison, we can use |
1097 | * the cached comparison function; otherwise gotta look it up in |
1098 | * the catalogs. (That can't lead to infinite recursion, since no |
1099 | * indexscan initiated by syscache lookup will use cross-data-type |
1100 | * operators.) |
1101 | * |
1102 | * We support the convention that sk_subtype == InvalidOid means |
1103 | * the opclass input type; this is a hack to simplify life for |
1104 | * ScanKeyInit(). |
1105 | */ |
1106 | if (cur->sk_subtype == rel->rd_opcintype[i] || |
1107 | cur->sk_subtype == InvalidOid) |
1108 | { |
1109 | FmgrInfo *procinfo; |
1110 | |
1111 | procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); |
1112 | ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, |
1113 | cur->sk_flags, |
1114 | cur->sk_attno, |
1115 | InvalidStrategy, |
1116 | cur->sk_subtype, |
1117 | cur->sk_collation, |
1118 | procinfo, |
1119 | cur->sk_argument); |
1120 | } |
1121 | else |
1122 | { |
1123 | RegProcedure cmp_proc; |
1124 | |
1125 | cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], |
1126 | rel->rd_opcintype[i], |
1127 | cur->sk_subtype, |
1128 | BTORDER_PROC); |
1129 | if (!RegProcedureIsValid(cmp_proc)) |
1130 | elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"" , |
1131 | BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, |
1132 | cur->sk_attno, RelationGetRelationName(rel)); |
1133 | ScanKeyEntryInitialize(inskey.scankeys + i, |
1134 | cur->sk_flags, |
1135 | cur->sk_attno, |
1136 | InvalidStrategy, |
1137 | cur->sk_subtype, |
1138 | cur->sk_collation, |
1139 | cmp_proc, |
1140 | cur->sk_argument); |
1141 | } |
1142 | } |
1143 | } |
1144 | |
1145 | /*---------- |
1146 | * Examine the selected initial-positioning strategy to determine exactly |
1147 | * where we need to start the scan, and set flag variables to control the |
1148 | * code below. |
1149 | * |
1150 | * If nextkey = false, _bt_search and _bt_binsrch will locate the first |
1151 | * item >= scan key. If nextkey = true, they will locate the first |
1152 | * item > scan key. |
1153 | * |
1154 | * If goback = true, we will then step back one item, while if |
1155 | * goback = false, we will start the scan on the located item. |
1156 | *---------- |
1157 | */ |
1158 | switch (strat_total) |
1159 | { |
1160 | case BTLessStrategyNumber: |
1161 | |
1162 | /* |
1163 | * Find first item >= scankey, then back up one to arrive at last |
1164 | * item < scankey. (Note: this positioning strategy is only used |
1165 | * for a backward scan, so that is always the correct starting |
1166 | * position.) |
1167 | */ |
1168 | nextkey = false; |
1169 | goback = true; |
1170 | break; |
1171 | |
1172 | case BTLessEqualStrategyNumber: |
1173 | |
1174 | /* |
1175 | * Find first item > scankey, then back up one to arrive at last |
1176 | * item <= scankey. (Note: this positioning strategy is only used |
1177 | * for a backward scan, so that is always the correct starting |
1178 | * position.) |
1179 | */ |
1180 | nextkey = true; |
1181 | goback = true; |
1182 | break; |
1183 | |
1184 | case BTEqualStrategyNumber: |
1185 | |
1186 | /* |
1187 | * If a backward scan was specified, need to start with last equal |
1188 | * item not first one. |
1189 | */ |
1190 | if (ScanDirectionIsBackward(dir)) |
1191 | { |
1192 | /* |
1193 | * This is the same as the <= strategy. We will check at the |
1194 | * end whether the found item is actually =. |
1195 | */ |
1196 | nextkey = true; |
1197 | goback = true; |
1198 | } |
1199 | else |
1200 | { |
1201 | /* |
1202 | * This is the same as the >= strategy. We will check at the |
1203 | * end whether the found item is actually =. |
1204 | */ |
1205 | nextkey = false; |
1206 | goback = false; |
1207 | } |
1208 | break; |
1209 | |
1210 | case BTGreaterEqualStrategyNumber: |
1211 | |
1212 | /* |
1213 | * Find first item >= scankey. (This is only used for forward |
1214 | * scans.) |
1215 | */ |
1216 | nextkey = false; |
1217 | goback = false; |
1218 | break; |
1219 | |
1220 | case BTGreaterStrategyNumber: |
1221 | |
1222 | /* |
1223 | * Find first item > scankey. (This is only used for forward |
1224 | * scans.) |
1225 | */ |
1226 | nextkey = true; |
1227 | goback = false; |
1228 | break; |
1229 | |
1230 | default: |
1231 | /* can't get here, but keep compiler quiet */ |
1232 | elog(ERROR, "unrecognized strat_total: %d" , (int) strat_total); |
1233 | return false; |
1234 | } |
1235 | |
1236 | /* Initialize remaining insertion scan key fields */ |
1237 | inskey.heapkeyspace = _bt_heapkeyspace(rel); |
1238 | inskey.anynullkeys = false; /* unused */ |
1239 | inskey.nextkey = nextkey; |
1240 | inskey.pivotsearch = false; |
1241 | inskey.scantid = NULL; |
1242 | inskey.keysz = keysCount; |
1243 | |
1244 | /* |
1245 | * Use the manufactured insertion scan key to descend the tree and |
1246 | * position ourselves on the target leaf page. |
1247 | */ |
1248 | stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); |
1249 | |
1250 | /* don't need to keep the stack around... */ |
1251 | _bt_freestack(stack); |
1252 | |
1253 | if (!BufferIsValid(buf)) |
1254 | { |
1255 | /* |
1256 | * We only get here if the index is completely empty. Lock relation |
1257 | * because nothing finer to lock exists. |
1258 | */ |
1259 | PredicateLockRelation(rel, scan->xs_snapshot); |
1260 | |
1261 | /* |
1262 | * mark parallel scan as done, so that all the workers can finish |
1263 | * their scan |
1264 | */ |
1265 | _bt_parallel_done(scan); |
1266 | BTScanPosInvalidate(so->currPos); |
1267 | |
1268 | return false; |
1269 | } |
1270 | else |
1271 | PredicateLockPage(rel, BufferGetBlockNumber(buf), |
1272 | scan->xs_snapshot); |
1273 | |
1274 | _bt_initialize_more_data(so, dir); |
1275 | |
1276 | /* position to the precise item on the page */ |
1277 | offnum = _bt_binsrch(rel, &inskey, buf); |
1278 | |
1279 | /* |
1280 | * If nextkey = false, we are positioned at the first item >= scan key, or |
1281 | * possibly at the end of a page on which all the existing items are less |
1282 | * than the scan key and we know that everything on later pages is greater |
1283 | * than or equal to scan key. |
1284 | * |
1285 | * If nextkey = true, we are positioned at the first item > scan key, or |
1286 | * possibly at the end of a page on which all the existing items are less |
1287 | * than or equal to the scan key and we know that everything on later |
1288 | * pages is greater than scan key. |
1289 | * |
1290 | * The actually desired starting point is either this item or the prior |
1291 | * one, or in the end-of-page case it's the first item on the next page or |
1292 | * the last item on this page. Adjust the starting offset if needed. (If |
1293 | * this results in an offset before the first item or after the last one, |
1294 | * _bt_readpage will report no items found, and then we'll step to the |
1295 | * next page as needed.) |
1296 | */ |
1297 | if (goback) |
1298 | offnum = OffsetNumberPrev(offnum); |
1299 | |
1300 | /* remember which buffer we have pinned, if any */ |
1301 | Assert(!BTScanPosIsValid(so->currPos)); |
1302 | so->currPos.buf = buf; |
1303 | |
1304 | /* |
1305 | * Now load data from the first page of the scan. |
1306 | */ |
1307 | if (!_bt_readpage(scan, dir, offnum)) |
1308 | { |
1309 | /* |
1310 | * There's no actually-matching data on this page. Try to advance to |
1311 | * the next page. Return false if there's no matching data at all. |
1312 | */ |
1313 | LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); |
1314 | if (!_bt_steppage(scan, dir)) |
1315 | return false; |
1316 | } |
1317 | else |
1318 | { |
1319 | /* Drop the lock, and maybe the pin, on the current page */ |
1320 | _bt_drop_lock_and_maybe_pin(scan, &so->currPos); |
1321 | } |
1322 | |
1323 | readcomplete: |
1324 | /* OK, itemIndex says what to return */ |
1325 | currItem = &so->currPos.items[so->currPos.itemIndex]; |
1326 | scan->xs_heaptid = currItem->heapTid; |
1327 | if (scan->xs_want_itup) |
1328 | scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); |
1329 | |
1330 | return true; |
1331 | } |
1332 | |
1333 | /* |
1334 | * _bt_next() -- Get the next item in a scan. |
1335 | * |
1336 | * On entry, so->currPos describes the current page, which may be pinned |
1337 | * but is not locked, and so->currPos.itemIndex identifies which item was |
1338 | * previously returned. |
1339 | * |
1340 | * On successful exit, scan->xs_ctup.t_self is set to the TID of the |
1341 | * next heap tuple, and if requested, scan->xs_itup points to a copy of |
1342 | * the index tuple. so->currPos is updated as needed. |
1343 | * |
1344 | * On failure exit (no more tuples), we release pin and set |
1345 | * so->currPos.buf to InvalidBuffer. |
1346 | */ |
1347 | bool |
1348 | _bt_next(IndexScanDesc scan, ScanDirection dir) |
1349 | { |
1350 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
1351 | BTScanPosItem *currItem; |
1352 | |
1353 | /* |
1354 | * Advance to next tuple on current page; or if there's no more, try to |
1355 | * step to the next page with data. |
1356 | */ |
1357 | if (ScanDirectionIsForward(dir)) |
1358 | { |
1359 | if (++so->currPos.itemIndex > so->currPos.lastItem) |
1360 | { |
1361 | if (!_bt_steppage(scan, dir)) |
1362 | return false; |
1363 | } |
1364 | } |
1365 | else |
1366 | { |
1367 | if (--so->currPos.itemIndex < so->currPos.firstItem) |
1368 | { |
1369 | if (!_bt_steppage(scan, dir)) |
1370 | return false; |
1371 | } |
1372 | } |
1373 | |
1374 | /* OK, itemIndex says what to return */ |
1375 | currItem = &so->currPos.items[so->currPos.itemIndex]; |
1376 | scan->xs_heaptid = currItem->heapTid; |
1377 | if (scan->xs_want_itup) |
1378 | scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); |
1379 | |
1380 | return true; |
1381 | } |
1382 | |
1383 | /* |
1384 | * _bt_readpage() -- Load data from current index page into so->currPos |
1385 | * |
1386 | * Caller must have pinned and read-locked so->currPos.buf; the buffer's state |
1387 | * is not changed here. Also, currPos.moreLeft and moreRight must be valid; |
1388 | * they are updated as appropriate. All other fields of so->currPos are |
1389 | * initialized from scratch here. |
1390 | * |
1391 | * We scan the current page starting at offnum and moving in the indicated |
1392 | * direction. All items matching the scan keys are loaded into currPos.items. |
1393 | * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports |
1394 | * that there can be no more matching tuples in the current scan direction. |
1395 | * |
1396 | * In the case of a parallel scan, caller must have called _bt_parallel_seize |
1397 | * prior to calling this function; this function will invoke |
1398 | * _bt_parallel_release before returning. |
1399 | * |
1400 | * Returns true if any matching items found on the page, false if none. |
1401 | */ |
1402 | static bool |
1403 | _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) |
1404 | { |
1405 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
1406 | Page page; |
1407 | BTPageOpaque opaque; |
1408 | OffsetNumber minoff; |
1409 | OffsetNumber maxoff; |
1410 | int itemIndex; |
1411 | bool continuescan; |
1412 | int indnatts; |
1413 | |
1414 | /* |
	 * We must have the buffer pinned and locked, but the usual macro can't be
	 * used here; this function is what makes so->currPos consistent enough
	 * for the macro to apply (see the BTScanPosIsPinned assertion below).
1417 | */ |
1418 | Assert(BufferIsValid(so->currPos.buf)); |
1419 | |
1420 | page = BufferGetPage(so->currPos.buf); |
1421 | opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
1422 | |
	/* allow next page to be processed by parallel worker */
1424 | if (scan->parallel_scan) |
1425 | { |
1426 | if (ScanDirectionIsForward(dir)) |
1427 | _bt_parallel_release(scan, opaque->btpo_next); |
1428 | else |
1429 | _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); |
1430 | } |
1431 | |
1432 | continuescan = true; /* default assumption */ |
1433 | indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); |
1434 | minoff = P_FIRSTDATAKEY(opaque); |
1435 | maxoff = PageGetMaxOffsetNumber(page); |
1436 | |
1437 | /* |
1438 | * We note the buffer's block number so that we can release the pin later. |
1439 | * This allows us to re-read the buffer if it is needed again for hinting. |
1440 | */ |
1441 | so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); |
1442 | |
1443 | /* |
	 * We save the LSN of the page as we read it, so that we know whether it
	 * is safe to apply LP_DEAD hints to the page later.  This allows us to
	 * drop the pin for MVCC scans, which allows vacuum to avoid blocking.
1447 | */ |
1448 | so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); |
1449 | |
1450 | /* |
1451 | * we must save the page's right-link while scanning it; this tells us |
1452 | * where to step right to after we're done with these items. There is no |
1453 | * corresponding need for the left-link, since splits always go right. |
1454 | */ |
1455 | so->currPos.nextPage = opaque->btpo_next; |
1456 | |
1457 | /* initialize tuple workspace to empty */ |
1458 | so->currPos.nextTupleOffset = 0; |
1459 | |
1460 | /* |
1461 | * Now that the current page has been made consistent, the macro should be |
1462 | * good. |
1463 | */ |
1464 | Assert(BTScanPosIsPinned(so->currPos)); |
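
	/*
	 * Note (summary of the two branches below): a forward scan fills items[]
	 * upward from slot 0, while a backward scan fills it downward from
	 * MaxIndexTuplesPerPage - 1; either way, firstItem..lastItem ends up
	 * bracketing the saved matches in index order.
	 */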
1465 | |
1466 | if (ScanDirectionIsForward(dir)) |
1467 | { |
1468 | /* load items[] in ascending order */ |
1469 | itemIndex = 0; |
1470 | |
        offnum = Max(offnum, minoff);

        while (offnum <= maxoff)
        {
            ItemId      iid = PageGetItemId(page, offnum);
            IndexTuple  itup;

            /*
             * If the scan specifies not to return killed tuples, then we
             * treat a killed tuple as not passing the qual
             */
            if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
            {
                offnum = OffsetNumberNext(offnum);
                continue;
            }

            itup = (IndexTuple) PageGetItem(page, iid);

            if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
            {
                /* tuple passes all scan key conditions, so remember it */
                _bt_saveitem(so, itemIndex, offnum, itup);
                itemIndex++;
            }
            /* When !continuescan, there can't be any more matches, so stop */
            if (!continuescan)
                break;

            offnum = OffsetNumberNext(offnum);
        }

        /*
         * We don't need to visit the page to the right when the high key
         * indicates that no more matches will be found there.
         *
         * Checking the high key like this works out more often than you might
         * think.  Leaf page splits pick a split point between the two most
         * dissimilar tuples (this is weighed against the need to evenly share
         * free space).  Leaf pages with high key attribute values that can
         * only appear on non-pivot tuples on the right sibling page are
         * common.
         */
        if (continuescan && !P_RIGHTMOST(opaque))
        {
            ItemId      iid = PageGetItemId(page, P_HIKEY);
            IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);
            int         truncatt;

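            /* check only the key columns present in the (possibly truncated) high key */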
            truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
            _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
        }

        if (!continuescan)
            so->currPos.moreRight = false;

        Assert(itemIndex <= MaxIndexTuplesPerPage);
        so->currPos.firstItem = 0;
        so->currPos.lastItem = itemIndex - 1;
        so->currPos.itemIndex = 0;
    }
    else
    {
        /* load items[] in descending order */
        itemIndex = MaxIndexTuplesPerPage;

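        /* start no later than the last data item on the page */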
        offnum = Min(offnum, maxoff);

        while (offnum >= minoff)
        {
            ItemId      iid = PageGetItemId(page, offnum);
            IndexTuple  itup;
            bool        tuple_alive;
            bool        passes_quals;

            /*
             * If the scan specifies not to return killed tuples, then we
             * treat a killed tuple as not passing the qual.  Most of the
             * time, it's a win to not bother examining the tuple's index
             * keys, but just skip to the next tuple (previous, actually,
             * since we're scanning backwards).  However, if this is the first
             * tuple on the page, we do check the index keys, to prevent
             * uselessly advancing to the page to the left.  This is similar
             * to the high key optimization used by forward scans.
             */
            if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
            {
                Assert(offnum >= P_FIRSTDATAKEY(opaque));
                if (offnum > P_FIRSTDATAKEY(opaque))
                {
                    offnum = OffsetNumberPrev(offnum);
                    continue;
                }

                tuple_alive = false;
            }
            else
                tuple_alive = true;

            itup = (IndexTuple) PageGetItem(page, iid);

            passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
                                         &continuescan);
            if (passes_quals && tuple_alive)
            {
                /* tuple passes all scan key conditions, so remember it */
                itemIndex--;
                _bt_saveitem(so, itemIndex, offnum, itup);
            }
            if (!continuescan)
            {
                /* there can't be any more matches, so stop */
                so->currPos.moreLeft = false;
                break;
            }

            offnum = OffsetNumberPrev(offnum);
        }

        Assert(itemIndex >= 0);
        so->currPos.firstItem = itemIndex;
        so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
        so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
    }

    return (so->currPos.firstItem <= so->currPos.lastItem);
}

/* Save an index item into so->currPos.items[itemIndex] */
static void
_bt_saveitem(BTScanOpaque so, int itemIndex,
             OffsetNumber offnum, IndexTuple itup)
{
    BTScanPosItem *currItem = &so->currPos.items[itemIndex];

    currItem->heapTid = itup->t_tid;
    currItem->indexOffset = offnum;
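    /* when the scan needs index tuples (e.g. index-only scans), keep a copy */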
    if (so->currTuples)
    {
        Size        itupsz = IndexTupleSize(itup);

        currItem->tupleOffset = so->currPos.nextTupleOffset;
        memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
        so->currPos.nextTupleOffset += MAXALIGN(itupsz);
    }
}
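
/*
 * Illustrative sketch only (not code that appears elsewhere in this file):
 * once _bt_readpage() has loaded a position, a consumer retrieves the saved
 * items through so->currPos.itemIndex, roughly like
 *
 *      BTScanPosItem *item = &so->currPos.items[so->currPos.itemIndex];
 *
 *      scan->xs_heaptid = item->heapTid;
 *      if (scan->xs_want_itup)
 *          scan->xs_itup = (IndexTuple) (so->currTuples + item->tupleOffset);
 *
 * which mirrors what _bt_endpoint() does after a successful _bt_readpage().
 */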

/*
 * _bt_steppage() -- Step to next page containing valid data for scan
 *
 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
 * if pinned, we'll drop the pin before moving to next page.  The buffer is
 * not locked on exit.
 *
 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
 * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
 * to InvalidBuffer.  We return true to indicate success.
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    BlockNumber blkno = InvalidBlockNumber;
    bool        status = true;

    Assert(BTScanPosIsValid(so->currPos));

    /* Before leaving current page, deal with any killed items */
    if (so->numKilled > 0)
        _bt_killitems(scan);

    /*
     * Before we modify currPos, make a copy of the page data if there was a
     * mark position that needs it.
     */
    if (so->markItemIndex >= 0)
    {
        /* bump pin on current buffer for assignment to mark buffer */
        if (BTScanPosIsPinned(so->currPos))
            IncrBufferRefCount(so->currPos.buf);
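        /* copy the fixed fields of currPos plus only the valid items[] entries */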
        memcpy(&so->markPos, &so->currPos,
               offsetof(BTScanPosData, items[1]) +
               so->currPos.lastItem * sizeof(BTScanPosItem));
        if (so->markTuples)
            memcpy(so->markTuples, so->currTuples,
                   so->currPos.nextTupleOffset);
        so->markPos.itemIndex = so->markItemIndex;
        so->markItemIndex = -1;
    }

    if (ScanDirectionIsForward(dir))
    {
        /* Walk right to the next page with data */
        if (scan->parallel_scan != NULL)
        {
            /*
             * Seize the scan to get the next block number; if the scan has
             * ended already, bail out.
             */
            status = _bt_parallel_seize(scan, &blkno);
            if (!status)
            {
                /* release the previous buffer, if pinned */
                BTScanPosUnpinIfPinned(so->currPos);
                BTScanPosInvalidate(so->currPos);
                return false;
            }
        }
        else
        {
            /* Not parallel, so use the previously-saved nextPage link. */
            blkno = so->currPos.nextPage;
        }

        /* Remember we left a page with data */
        so->currPos.moreLeft = true;

        /* release the previous buffer, if pinned */
        BTScanPosUnpinIfPinned(so->currPos);
    }
    else
    {
        /* Remember we left a page with data */
        so->currPos.moreRight = true;

        if (scan->parallel_scan != NULL)
        {
            /*
             * Seize the scan to get the current block number; if the scan has
             * ended already, bail out.
             */
            status = _bt_parallel_seize(scan, &blkno);
            BTScanPosUnpinIfPinned(so->currPos);
            if (!status)
            {
                BTScanPosInvalidate(so->currPos);
                return false;
            }
        }
        else
        {
            /* Not parallel, so just use our own notion of the current page */
            blkno = so->currPos.currPage;
        }
    }

    if (!_bt_readnextpage(scan, blkno, dir))
        return false;

    /* Drop the lock, and maybe the pin, on the current page */
    _bt_drop_lock_and_maybe_pin(scan, &so->currPos);

    return true;
}

/*
 * _bt_readnextpage() -- Read next page containing valid data for scan
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page.  Caller is responsible for releasing the lock and pin
 * on the buffer on success.  We return true to indicate success.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Relation    rel;
    Page        page;
    BTPageOpaque opaque;
    bool        status = true;

    rel = scan->indexRelation;

    if (ScanDirectionIsForward(dir))
    {
        for (;;)
        {
            /*
             * if we're at end of scan, give up and mark parallel scan as
             * done, so that all the workers can finish their scan
             */
            if (blkno == P_NONE || !so->currPos.moreRight)
            {
                _bt_parallel_done(scan);
                BTScanPosInvalidate(so->currPos);
                return false;
            }
            /* check for interrupts while we're not holding any buffer lock */
            CHECK_FOR_INTERRUPTS();
            /* step right one page */
            so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
            page = BufferGetPage(so->currPos.buf);
            TestForOldSnapshot(scan->xs_snapshot, rel, page);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
            /* check for deleted page */
            if (!P_IGNORE(opaque))
            {
                PredicateLockPage(rel, blkno, scan->xs_snapshot);
                /* see if there are any matches on this page */
                /* note that this will clear moreRight if we can stop */
                if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
                    break;
            }
            else if (scan->parallel_scan != NULL)
            {
                /* allow next page to be processed by parallel worker */
                _bt_parallel_release(scan, opaque->btpo_next);
            }

            /* nope, keep going */
            if (scan->parallel_scan != NULL)
            {
                _bt_relbuf(rel, so->currPos.buf);
                status = _bt_parallel_seize(scan, &blkno);
                if (!status)
                {
                    BTScanPosInvalidate(so->currPos);
                    return false;
                }
            }
            else
            {
                blkno = opaque->btpo_next;
                _bt_relbuf(rel, so->currPos.buf);
            }
        }
    }
    else
    {
        /*
         * Should only happen in parallel cases, when some other backend
         * advanced the scan.
         */
        if (so->currPos.currPage != blkno)
        {
            BTScanPosUnpinIfPinned(so->currPos);
            so->currPos.currPage = blkno;
        }

        /*
         * Walk left to the next page with data.  This is much more complex
         * than the walk-right case because of the possibility that the page
         * to our left splits while we are in flight to it, plus the
         * possibility that the page we were on gets deleted after we leave
         * it.  See nbtree/README for details.
         *
         * It might be possible to rearrange this code to have less overhead
         * in pinning and locking, but that would require capturing the left
         * pointer when the page is initially read, and using it here, along
         * with big changes to _bt_walk_left() and the code below.  It is not
         * clear whether this would be a win, since if the page immediately to
         * the left splits after we read this page and before we step left, we
         * would need to visit more pages than with the current code.
         *
         * Note that if we change the code so that we drop the pin for a scan
         * which uses a non-MVCC snapshot, we will need to modify the code for
         * walking left, to allow for the possibility that a referenced page
         * has been deleted.  As long as the buffer is pinned or the snapshot
         * is MVCC the page cannot move past the half-dead state to fully
         * deleted.
         */
        if (BTScanPosIsPinned(so->currPos))
            LockBuffer(so->currPos.buf, BT_READ);
        else
            so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);

        for (;;)
        {
            /* Done if we know there are no matching keys to the left */
            if (!so->currPos.moreLeft)
            {
                _bt_relbuf(rel, so->currPos.buf);
                _bt_parallel_done(scan);
                BTScanPosInvalidate(so->currPos);
                return false;
            }

            /* Step to next physical page */
            so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
                                            scan->xs_snapshot);

            /* if we're physically at end of index, return failure */
            if (so->currPos.buf == InvalidBuffer)
            {
                _bt_parallel_done(scan);
                BTScanPosInvalidate(so->currPos);
                return false;
            }

            /*
             * Okay, we managed to move left to a non-deleted page.  Done if
             * it's not half-dead and contains matching tuples.  Else loop
             * back and do it all again.
             */
            page = BufferGetPage(so->currPos.buf);
            TestForOldSnapshot(scan->xs_snapshot, rel, page);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
            if (!P_IGNORE(opaque))
            {
                PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf),
                                  scan->xs_snapshot);
                /* see if there are any matches on this page */
                /* note that this will clear moreLeft if we can stop */
                if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
                    break;
            }
            else if (scan->parallel_scan != NULL)
            {
                /* allow next page to be processed by parallel worker */
                _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
            }

            /*
             * For parallel scans, get the last page scanned as it is quite
             * possible that by the time we try to seize the scan, some other
             * worker has already advanced the scan to a different page.  We
             * must continue based on the latest page scanned by any worker.
             */
            if (scan->parallel_scan != NULL)
            {
                _bt_relbuf(rel, so->currPos.buf);
                status = _bt_parallel_seize(scan, &blkno);
                if (!status)
                {
                    BTScanPosInvalidate(so->currPos);
                    return false;
                }
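                /* pin and read-lock the page the parallel machinery handed us */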
                so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
            }
        }
    }

    return true;
}

/*
 * _bt_parallel_readpage() -- Read current page containing valid data for scan
 *
 * On success, we release the lock, and maybe the pin, on the buffer.  We
 * return true to indicate success.
 */
static bool
_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    _bt_initialize_more_data(so, dir);

    if (!_bt_readnextpage(scan, blkno, dir))
        return false;

    /* Drop the lock, and maybe the pin, on the current page */
    _bt_drop_lock_and_maybe_pin(scan, &so->currPos);

    return true;
}

/*
 * _bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held
 * in that case).
 *
 * When working on a non-leaf level, it is possible for the returned page
 * to be half-dead; the caller should check that condition and step left
 * again if it's important.
 */
static Buffer
_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
{
    Page        page;
    BTPageOpaque opaque;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    for (;;)
    {
        BlockNumber obknum;
        BlockNumber lblkno;
        BlockNumber blkno;
        int         tries;

        /* if we're at end of tree, release buf and return failure */
        if (P_LEFTMOST(opaque))
        {
            _bt_relbuf(rel, buf);
            break;
        }
        /* remember original page we are stepping left from */
        obknum = BufferGetBlockNumber(buf);
        /* step left */
        blkno = lblkno = opaque->btpo_prev;
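        /* lblkno saves the left-link we followed; used below to detect lack of progress */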
        _bt_relbuf(rel, buf);
        /* check for interrupts while we're not holding any buffer lock */
        CHECK_FOR_INTERRUPTS();
        buf = _bt_getbuf(rel, blkno, BT_READ);
        page = BufferGetPage(buf);
        TestForOldSnapshot(snapshot, rel, page);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        /*
         * If this isn't the page we want, walk right till we find what we
         * want --- but go no more than four hops (an arbitrary limit).  If we
         * don't find the correct page by then, the most likely bet is that
         * the original page got deleted and isn't in the sibling chain at all
         * anymore, not that its left sibling got split more than four times.
         *
         * Note that it is correct to test P_ISDELETED not P_IGNORE here,
         * because half-dead pages are still in the sibling chain.  Caller
         * must reject half-dead pages if wanted.
         */
        tries = 0;
        for (;;)
        {
            if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
            {
                /* Found desired page, return it */
                return buf;
            }
            if (P_RIGHTMOST(opaque) || ++tries > 4)
                break;
            blkno = opaque->btpo_next;
            buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
            page = BufferGetPage(buf);
            TestForOldSnapshot(snapshot, rel, page);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }

        /* Return to the original page to see what's up */
        buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
        page = BufferGetPage(buf);
        TestForOldSnapshot(snapshot, rel, page);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (P_ISDELETED(opaque))
        {
            /*
             * It was deleted.  Move right to first nondeleted page (there
             * must be one); that is the page that has acquired the deleted
             * one's keyspace, so stepping left from it will take us where we
             * want to be.
             */
            for (;;)
            {
                if (P_RIGHTMOST(opaque))
                    elog(ERROR, "fell off the end of index \"%s\"",
                         RelationGetRelationName(rel));
                blkno = opaque->btpo_next;
                buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
                page = BufferGetPage(buf);
                TestForOldSnapshot(snapshot, rel, page);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                if (!P_ISDELETED(opaque))
                    break;
            }

            /*
             * Now return to top of loop, resetting obknum to point to this
             * nondeleted page, and try again.
             */
        }
        else
        {
            /*
             * It wasn't deleted; the explanation had better be that the page
             * to the left got split or deleted.  Without this check, we'd go
             * into an infinite loop if there's anything wrong.
             */
            if (opaque->btpo_prev == lblkno)
                elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
                     obknum, RelationGetRelationName(rel));
            /* Okay to try again with new lblkno value */
        }
    }

    return InvalidBuffer;
}

/*
 * _bt_get_endpoint() -- Find the first or last page on a given tree level
 *
 * If the index is empty, we will return InvalidBuffer; any other failure
 * condition causes ereport().  We will not return a dead page.
 *
 * The returned buffer is pinned and read-locked.
 */
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
                 Snapshot snapshot)
{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber offnum;
    BlockNumber blkno;
    IndexTuple  itup;

    /*
     * If we are looking for a leaf page, okay to descend from fast root;
     * otherwise better descend from true root.  (There is no point in being
     * smarter about intermediate levels.)
     */
    if (level == 0)
        buf = _bt_getroot(rel, BT_READ);
    else
        buf = _bt_gettrueroot(rel);

    if (!BufferIsValid(buf))
        return InvalidBuffer;

    page = BufferGetPage(buf);
    TestForOldSnapshot(snapshot, rel, page);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    for (;;)
    {
        /*
         * If we landed on a deleted page, step right to find a live page
         * (there must be one).  Also, if we want the rightmost page, step
         * right if needed to get to it (this could happen if the page split
         * since we obtained a pointer to it).
         */
        while (P_IGNORE(opaque) ||
               (rightmost && !P_RIGHTMOST(opaque)))
        {
            blkno = opaque->btpo_next;
            if (blkno == P_NONE)
                elog(ERROR, "fell off the end of index \"%s\"",
                     RelationGetRelationName(rel));
            buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
            page = BufferGetPage(buf);
            TestForOldSnapshot(snapshot, rel, page);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }

        /* Done? */
        if (opaque->btpo.level == level)
            break;
        if (opaque->btpo.level < level)
            elog(ERROR, "btree level %u not found in index \"%s\"",
                 level, RelationGetRelationName(rel));

        /* Descend to leftmost or rightmost child page */
        if (rightmost)
            offnum = PageGetMaxOffsetNumber(page);
        else
            offnum = P_FIRSTDATAKEY(opaque);

        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
        blkno = BTreeInnerTupleGetDownLink(itup);

        buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }

    return buf;
}

/*
 * _bt_endpoint() -- Find the first or last page in the index, and scan
 * from there to the first key satisfying all the quals.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).  Exit conditions are the
 * same as for _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber start;
    BTScanPosItem *currItem;

    /*
     * Scan down to the leftmost or rightmost leaf page.  This is a simplified
     * version of _bt_search().  We don't maintain a stack since we know we
     * won't need it.
     */
    buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir),
                           scan->xs_snapshot);

    if (!BufferIsValid(buf))
    {
        /*
         * Empty index.  Lock the whole relation, as nothing finer to lock
         * exists.
         */
        PredicateLockRelation(rel, scan->xs_snapshot);
        BTScanPosInvalidate(so->currPos);
        return false;
    }

    PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    Assert(P_ISLEAF(opaque));

    if (ScanDirectionIsForward(dir))
    {
        /* There could be dead pages to the left, so not this: */
        /* Assert(P_LEFTMOST(opaque)); */

        start = P_FIRSTDATAKEY(opaque);
    }
    else if (ScanDirectionIsBackward(dir))
    {
        Assert(P_RIGHTMOST(opaque));

        start = PageGetMaxOffsetNumber(page);
    }
    else
    {
        elog(ERROR, "invalid scan direction: %d", (int) dir);
        start = 0;              /* keep compiler quiet */
    }

    /* remember which buffer we have pinned */
    so->currPos.buf = buf;

    _bt_initialize_more_data(so, dir);

    /*
     * Now load data from the first page of the scan.
     */
    if (!_bt_readpage(scan, dir, start))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
        if (!_bt_steppage(scan, dir))
            return false;
    }
    else
    {
        /* Drop the lock, and maybe the pin, on the current page */
        _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    }

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_heaptid = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}

/*
 * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
 * for scan direction
 */
static inline void
_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
{
    /* initialize moreLeft/moreRight appropriately for scan direction */
    if (ScanDirectionIsForward(dir))
    {
        so->currPos.moreLeft = false;
        so->currPos.moreRight = true;
    }
    else
    {
        so->currPos.moreLeft = true;
        so->currPos.moreRight = false;
    }
    so->numKilled = 0;          /* just paranoia */
    so->markItemIndex = -1;     /* ditto */
}
