1/*-------------------------------------------------------------------------
2 *
3 * nbtsearch.c
4 * Search code for postgres btrees.
5 *
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtsearch.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include "access/nbtree.h"
19#include "access/relscan.h"
20#include "miscadmin.h"
21#include "pgstat.h"
22#include "storage/predicate.h"
23#include "utils/lsyscache.h"
24#include "utils/rel.h"
25
26
27static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
28static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
29static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
30 OffsetNumber offnum);
31static void _bt_saveitem(BTScanOpaque so, int itemIndex,
32 OffsetNumber offnum, IndexTuple itup);
33static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
34static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
35static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
36 ScanDirection dir);
37static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
38static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
39static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
40
41
42/*
43 * _bt_drop_lock_and_maybe_pin()
44 *
45 * Unlock the buffer; and if it is safe to release the pin, do that, too. It
46 * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47 * This will prevent vacuum from stalling in a blocked state trying to read a
48 * page when a cursor is sitting on it -- at least in many important cases.
49 *
50 * Set the buffer to invalid if the pin is released, since the buffer may be
51 * re-used. If we need to go back to this block (for example, to apply
52 * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
53 * will remain in shared memory for as long as it takes to scan the index
54 * buffer page.
55 */
56static void
57_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
58{
59 LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
60
61 if (IsMVCCSnapshot(scan->xs_snapshot) &&
62 RelationNeedsWAL(scan->indexRelation) &&
63 !scan->xs_want_itup)
64 {
65 ReleaseBuffer(sp->buf);
66 sp->buf = InvalidBuffer;
67 }
68}
69
/*
 * _bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey is an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * Return value is a stack of parent-page pointers.  *bufP is set to the
 * address of the leaf-page buffer, which is read-locked and pinned.
 * No locks are held on the parent pages, however!
 *
 * If the snapshot parameter is not NULL, "old snapshot" checking will take
 * place during the descent through the tree.  This is not needed when
 * positioning for an insert or delete, so NULL is used for those cases.
 *
 * The returned buffer is locked according to access parameter.  Additionally,
 * access = BT_WRITE will allow an empty root page to be created and returned.
 * When access = BT_READ, an empty index will result in *bufP being set to
 * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
 * during the search will be finished.
 */
BTStack
_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
		   Snapshot snapshot)
{
	BTStack		stack_in = NULL;
	int			page_access = BT_READ;	/* lock mode to take on the next level
										 * down; upgraded to BT_WRITE just
										 * above the leaf when inserting */

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber blkno;
		BlockNumber par_blkno;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * pointer in the parent (or metapage).  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (they are taken care of in _bt_getstackbuf,
		 * if the leaf page is split and we insert to the parent page).  But
		 * this is a good opportunity to finish splits of internal pages too.
		 */
		*bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
							  page_access, snapshot);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate item on the internal page, and get the child
		 * page that it points to.
		 */
		offnum = _bt_binsrch(rel, key, *bufP);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		blkno = BTreeInnerTupleGetDownLink(itup);
		par_blkno = BufferGetBlockNumber(*bufP);

		/*
		 * We need to save the location of the index entry we chose in the
		 * parent page on a stack.  In case we split the tree, we'll use the
		 * stack to work back up to the parent page.  We also save the actual
		 * downlink (block) to uniquely identify the index entry, in case it
		 * moves right while we're working lower in the tree.  See the paper
		 * by Lehman and Yao for how this is detected and handled.  (We use
		 * the child link during the second half of a page split -- if caller
		 * ends up splitting the child it usually ends up inserting a new
		 * pivot tuple for child's new right sibling immediately after the
		 * original bts_offset offset recorded here.  The downlink block will
		 * be needed to check if bts_offset remains the position of this same
		 * pivot tuple.)
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = par_blkno;
		new_stack->bts_offset = offnum;
		new_stack->bts_btentry = blkno;
		new_stack->bts_parent = stack_in;

		/*
		 * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
		 * we're on the level 1 and asked to lock leaf page in write mode,
		 * then lock next page in write mode, because it must be a leaf.
		 */
		if (opaque->btpo.level == 1 && access == BT_WRITE)
			page_access = BT_WRITE;

		/* drop the read lock on the parent page, acquire one on the child */
		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	/*
	 * If we're asked to lock leaf in write mode, but didn't manage to, then
	 * relock.  This should only happen when the root page is a leaf page (and
	 * the only page in the index other than the metapage).
	 */
	if (access == BT_WRITE && page_access == BT_READ)
	{
		/* trade in our read lock for a write lock */
		LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
		LockBuffer(*bufP, BT_WRITE);

		/*
		 * If the page was split between the time that we surrendered our read
		 * lock and acquired our write lock, then this page may no longer be
		 * the right place for the key we want to insert.  In this case, we
		 * need to move right in the tree.  See Lehman and Yao for an
		 * excruciatingly precise description.
		 */
		*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
							  snapshot);
	}

	return stack_in;
}
207
/*
 * _bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry is
 * strictly less than the scankey, or <= the scankey in the
 * key.nextkey=true case, then we followed the wrong link and we need
 * to move right.
 *
 * The passed insertion-type scankey can omit the rightmost column(s) of the
 * index.  (see nbtree/README)
 *
 * When key.nextkey is false (the usual case), we are looking for the first
 * item >= key.  When key.nextkey is true, we are looking for the first item
 * strictly greater than key.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split
 * is completed.  'stack' is only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 *
 * If the snapshot parameter is not NULL, "old snapshot" checking will take
 * place during the descent through the tree.  This is not needed when
 * positioning for an insert or delete, so NULL is used for those cases.
 */
Buffer
_bt_moveright(Relation rel,
			  BTScanInsert key,
			  Buffer buf,
			  bool forupdate,
			  BTStack stack,
			  int access,
			  Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	/*
	 * When nextkey = false (normal case): if the scan key that brought us to
	 * this page is > the high key stored on the page, then the page has split
	 * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
	 * have some duplicates to the right as well as the left, but that's
	 * something that's only ever dealt with on the leaf level, after
	 * _bt_search has found an initial leaf page.)
	 *
	 * When nextkey = true: move right if the scan key is >= page's high key.
	 * (Note that key.scantid cannot be set in this case.)
	 *
	 * The page could even have split more than once, so scan as far as
	 * needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	cmpval = key->nextkey ? 0 : 1;	/* move right iff _bt_compare() result on
									 * the high key is >= this threshold */

	for (;;)
	{
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* rightmost page has no high key, so nowhere further to go */
		if (P_RIGHTMOST(opaque))
			break;

		/*
		 * Finish any incomplete splits we encounter along the way.
		 */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				LockBuffer(buf, BT_WRITE);
			}

			/*
			 * Recheck the flag: the split may have been finished by someone
			 * else while our lock was briefly released during the upgrade.
			 */
			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the lock in the right mode, and re-check */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	/* a dead rightmost page should be impossible -- we'd have nowhere to go */
	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}
324
325/*
326 * _bt_binsrch() -- Do a binary search for a key on a particular page.
327 *
328 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
329 * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
330 * particular, this means it is possible to return a value 1 greater than the
331 * number of keys on the page, if the scankey is > all keys on the page.)
332 *
333 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
334 * of the last key < given scankey, or last key <= given scankey if nextkey
335 * is true. (Since _bt_compare treats the first data key of such a page as
336 * minus infinity, there will be at least one key < scankey, so the result
337 * always points at one of the keys on the page.) This key indicates the
338 * right place to descend to be sure we find all leaf keys >= given scankey
339 * (or leaf keys > given scankey when nextkey is true).
340 *
341 * This procedure is not responsible for walking right, it just examines
342 * the given page. _bt_binsrch() has no lock or refcount side effects
343 * on the buffer.
344 */
345static OffsetNumber
346_bt_binsrch(Relation rel,
347 BTScanInsert key,
348 Buffer buf)
349{
350 Page page;
351 BTPageOpaque opaque;
352 OffsetNumber low,
353 high;
354 int32 result,
355 cmpval;
356
357 /* Requesting nextkey semantics while using scantid seems nonsensical */
358 Assert(!key->nextkey || key->scantid == NULL);
359
360 page = BufferGetPage(buf);
361 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
362
363 low = P_FIRSTDATAKEY(opaque);
364 high = PageGetMaxOffsetNumber(page);
365
366 /*
367 * If there are no keys on the page, return the first available slot. Note
368 * this covers two cases: the page is really empty (no keys), or it
369 * contains only a high key. The latter case is possible after vacuuming.
370 * This can never happen on an internal page, however, since they are
371 * never empty (an internal page must have children).
372 */
373 if (unlikely(high < low))
374 return low;
375
376 /*
377 * Binary search to find the first key on the page >= scan key, or first
378 * key > scankey when nextkey is true.
379 *
380 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
381 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
382 *
383 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
384 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
385 *
386 * We can fall out when high == low.
387 */
388 high++; /* establish the loop invariant for high */
389
390 cmpval = key->nextkey ? 0 : 1; /* select comparison value */
391
392 while (high > low)
393 {
394 OffsetNumber mid = low + ((high - low) / 2);
395
396 /* We have low <= mid < high, so mid points at a real slot */
397
398 result = _bt_compare(rel, key, page, mid);
399
400 if (result >= cmpval)
401 low = mid + 1;
402 else
403 high = mid;
404 }
405
406 /*
407 * At this point we have high == low, but be careful: they could point
408 * past the last slot on the page.
409 *
410 * On a leaf page, we always return the first key >= scan key (resp. >
411 * scan key), which could be the last slot + 1.
412 */
413 if (P_ISLEAF(opaque))
414 return low;
415
416 /*
417 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
418 * There must be one if _bt_compare() is playing by the rules.
419 */
420 Assert(low > P_FIRSTDATAKEY(opaque));
421
422 return OffsetNumberPrev(low);
423}
424
/*
 *
 * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Like _bt_binsrch(), but with support for caching the binary search
 * bounds.  Only used during insertion, and only on the leaf page that it
 * looks like caller will insert tuple on.  Exclusive-locked and pinned
 * leaf page is contained within insertstate.
 *
 * Caches the bounds fields in insertstate so that a subsequent call can
 * reuse the low and strict high bounds of original binary search.  Callers
 * that use these fields directly must be prepared for the case where low
 * and/or stricthigh are not on the same page (one or both exceed maxoff
 * for the page).  The case where there are no items on the page (high <
 * low) makes bounds invalid.
 *
 * Caller is responsible for invalidating bounds when it modifies the page
 * before calling here a second time.
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
	BTScanInsert key = insertstate->itup_key;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high,
				stricthigh;
	int32		result,
				cmpval;

	page = BufferGetPage(insertstate->buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* only ever called for a leaf page, with nextkey = false semantics */
	Assert(P_ISLEAF(opaque));
	Assert(!key->nextkey);

	if (!insertstate->bounds_valid)
	{
		/* Start new binary search */
		low = P_FIRSTDATAKEY(opaque);
		high = PageGetMaxOffsetNumber(page);
	}
	else
	{
		/* Restore result of previous binary search against same page */
		low = insertstate->low;
		high = insertstate->stricthigh;
	}

	/* If there are no keys on the page, return the first available slot */
	if (unlikely(high < low))
	{
		/* Caller can't reuse bounds */
		insertstate->low = InvalidOffsetNumber;
		insertstate->stricthigh = InvalidOffsetNumber;
		insertstate->bounds_valid = false;
		return low;
	}

	/*
	 * Binary search to find the first key on the page >= scan key.  (nextkey
	 * is always false when inserting).
	 *
	 * The loop invariant is: all slots before 'low' are < scan key, all slots
	 * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
	 * maintained to save additional search effort for caller.
	 *
	 * We can fall out when high == low.
	 */
	if (!insertstate->bounds_valid)
		high++;					/* establish the loop invariant for high;
								 * cached stricthigh is already exclusive */
	stricthigh = high;			/* high initially strictly higher */

	cmpval = 1;					/* !nextkey comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
		{
			high = mid;
			/* result < 0 means slot 'mid' is > scan key: tighten stricthigh */
			if (result != 0)
				stricthigh = high;
		}
	}

	/*
	 * On a leaf page, a binary search always returns the first key >= scan
	 * key (at least in !nextkey case), which could be the last slot + 1. This
	 * is also the lower bound of cached search.
	 *
	 * stricthigh may also be the last slot + 1, which prevents caller from
	 * using bounds directly, but is still useful to us if we're called a
	 * second time with cached bounds (cached low will be < stricthigh when
	 * that happens).
	 */
	insertstate->low = low;
	insertstate->stricthigh = stricthigh;
	insertstate->bounds_valid = true;

	return low;
}
535
/*----------
 * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
 *
 *	page/offnum: location of btree item to be compared to.
 *
 *		This routine returns:
 *			<0 if scankey < tuple at offnum;
 *			 0 if scankey == tuple at offnum;
 *			>0 if scankey > tuple at offnum.
 *		NULLs in the keys are treated as sortable values.  Therefore
 *		"equality" does not necessarily mean that the item should be
 *		returned to the caller as a matching key!
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored is explicitly truncated to 0
 * attributes (explicitly minus infinity) with version 3+ indexes, but
 * that isn't relied upon.  This allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first
 * key.  See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
			BTScanInsert key,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	IndexTuple	itup;
	ItemPointer heapTid;
	ScanKey		scankey;
	int			ncmpkey;
	int			ntupatts;

	Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
	Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
	Assert(key->heapkeyspace || key->scantid == NULL);

	/*
	 * Force result ">" if target item is first data item on an internal page
	 * --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	ntupatts = BTreeTupleGetNAtts(itup, rel);

	/*
	 * The scan key is set up with the attribute number associated with each
	 * term in the key.  It is important that, if the index is multi-key, the
	 * scan contain the first k key attributes, and that they be in order.  If
	 * you think about how multi-key ordering works, you'll understand why
	 * this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right (see
	 * _bt_first).
	 */

	/* Compare only attributes present in both scankey and (pivot) tuple */
	ncmpkey = Min(ntupatts, key->keysz);
	Assert(key->heapkeyspace || ncmpkey == key->keysz);
	scankey = key->scankeys;
	for (int i = 1; i <= ncmpkey; i++)
	{
		Datum		datum;
		bool		isNull;
		int32		result;

		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

		/* see comments about NULLs handling in btbuild */
		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = -1;	/* NULL "<" NOT_NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = 1;		/* NOT_NULL ">" NULL */
			else
				result = -1;	/* NOT_NULL "<" NULL */
		}
		else
		{
			/*
			 * The sk_func needs to be passed the index value as left arg and
			 * the sk_argument as right arg (they might be of different
			 * types).  Since it is convenient for callers to think of
			 * _bt_compare as comparing the scankey to the index item, we have
			 * to flip the sign of the comparison result.  (Unless it's a DESC
			 * column, in which case we *don't* flip the sign.)
			 */
			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
													 scankey->sk_collation,
													 datum,
													 scankey->sk_argument));

			if (!(scankey->sk_flags & SK_BT_DESC))
				INVERT_COMPARE_RESULT(result);
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;

		scankey++;
	}

	/*
	 * All non-truncated attributes (other than heap TID) were found to be
	 * equal.  Treat truncated attributes as minus infinity when scankey has a
	 * key attribute value that would otherwise be compared directly.
	 *
	 * Note: it doesn't matter if ntupatts includes non-key attributes;
	 * scankey won't, so explicitly excluding non-key attributes isn't
	 * necessary.
	 */
	if (key->keysz > ntupatts)
		return 1;

	/*
	 * Use the heap TID attribute and scantid to try to break the tie.  The
	 * rules are the same as any other key attribute -- only the
	 * representation differs.
	 */
	heapTid = BTreeTupleGetHeapTID(itup);
	if (key->scantid == NULL)
	{
		/*
		 * Most searches have a scankey that is considered greater than a
		 * truncated pivot tuple if and when the scankey has equal values for
		 * attributes up to and including the least significant untruncated
		 * attribute in tuple.
		 *
		 * For example, if an index has the minimum two attributes (single
		 * user key attribute, plus heap TID attribute), and a page's high key
		 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
		 * will not descend to the page to the left.  The search will descend
		 * right instead.  The truncated attribute in pivot tuple means that
		 * all non-pivot tuples on the page to the left are strictly < 'foo',
		 * so it isn't necessary to descend left.  In other words, search
		 * doesn't have to descend left because it isn't interested in a match
		 * that has a heap TID value of -inf.
		 *
		 * However, some searches (pivotsearch searches) actually require that
		 * we descend left when this happens.  -inf is treated as a possible
		 * match for omitted scankey attribute(s).  This is needed by page
		 * deletion, which must re-find leaf pages that are targets for
		 * deletion using their high keys.
		 *
		 * Note: the heap TID part of the test ensures that scankey is being
		 * compared to a pivot tuple with one or more truncated key
		 * attributes.
		 *
		 * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
		 * left here, since they have no heap TID attribute (and cannot have
		 * any -inf key values in any case, since truncation can only remove
		 * non-key attributes).  !heapkeyspace searches must always be
		 * prepared to deal with matches on both sides of the pivot once the
		 * leaf level is reached.
		 */
		if (key->heapkeyspace && !key->pivotsearch &&
			key->keysz == ntupatts && heapTid == NULL)
			return 1;

		/* All provided scankey arguments found to be equal */
		return 0;
	}

	/*
	 * Treat truncated heap TID as minus infinity, since scankey has a key
	 * attribute value (scantid) that would otherwise be compared directly
	 */
	Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
	if (heapTid == NULL)
		return 1;

	/* both sides have a heap TID: compare them as the tiebreaker */
	Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
	return ItemPointerCompare(key->scantid, heapTid);
}
724
725/*
726 * _bt_first() -- Find the first item in a scan.
727 *
728 * We need to be clever about the direction of scan, the search
729 * conditions, and the tree ordering. We find the first item (or,
730 * if backwards scan, the last item) in the tree that satisfies the
731 * qualifications in the scan key. On success exit, the page containing
732 * the current index tuple is pinned but not locked, and data about
733 * the matching tuple(s) on the page has been loaded into so->currPos.
734 * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
735 * and if requested, scan->xs_itup points to a copy of the index tuple.
736 *
737 * If there are no matching items in the index, we return false, with no
738 * pins or locks held.
739 *
740 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
741 * are both search-type scankeys (see nbtree/README for more about this).
742 * Within this routine, we build a temporary insertion-type scankey to use
743 * in locating the scan start position.
744 */
745bool
746_bt_first(IndexScanDesc scan, ScanDirection dir)
747{
748 Relation rel = scan->indexRelation;
749 BTScanOpaque so = (BTScanOpaque) scan->opaque;
750 Buffer buf;
751 BTStack stack;
752 OffsetNumber offnum;
753 StrategyNumber strat;
754 bool nextkey;
755 bool goback;
756 BTScanInsertData inskey;
757 ScanKey startKeys[INDEX_MAX_KEYS];
758 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
759 int keysCount = 0;
760 int i;
761 bool status = true;
762 StrategyNumber strat_total;
763 BTScanPosItem *currItem;
764 BlockNumber blkno;
765
766 Assert(!BTScanPosIsValid(so->currPos));
767
768 pgstat_count_index_scan(rel);
769
770 /*
771 * Examine the scan keys and eliminate any redundant keys; also mark the
772 * keys that must be matched to continue the scan.
773 */
774 _bt_preprocess_keys(scan);
775
776 /*
777 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
778 * never be satisfied (eg, x == 1 AND x > 2).
779 */
780 if (!so->qual_ok)
781 return false;
782
783 /*
784 * For parallel scans, get the starting page from shared state. If the
785 * scan has not started, proceed to find out first leaf page in the usual
786 * way while keeping other participating processes waiting. If the scan
787 * has already begun, use the page number from the shared structure.
788 */
789 if (scan->parallel_scan != NULL)
790 {
791 status = _bt_parallel_seize(scan, &blkno);
792 if (!status)
793 return false;
794 else if (blkno == P_NONE)
795 {
796 _bt_parallel_done(scan);
797 return false;
798 }
799 else if (blkno != InvalidBlockNumber)
800 {
801 if (!_bt_parallel_readpage(scan, blkno, dir))
802 return false;
803 goto readcomplete;
804 }
805 }
806
807 /*----------
808 * Examine the scan keys to discover where we need to start the scan.
809 *
810 * We want to identify the keys that can be used as starting boundaries;
811 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
812 * a backwards scan. We can use keys for multiple attributes so long as
813 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
814 * a > or < boundary or find an attribute with no boundary (which can be
815 * thought of as the same as "> -infinity"), we can't use keys for any
816 * attributes to its right, because it would break our simplistic notion
817 * of what initial positioning strategy to use.
818 *
819 * When the scan keys include cross-type operators, _bt_preprocess_keys
820 * may not be able to eliminate redundant keys; in such cases we will
821 * arbitrarily pick a usable one for each attribute. This is correct
822 * but possibly not optimal behavior. (For example, with keys like
823 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
824 * x=5 would be more efficient.) Since the situation only arises given
825 * a poorly-worded query plus an incomplete opfamily, live with it.
826 *
827 * When both equality and inequality keys appear for a single attribute
828 * (again, only possible when cross-type operators appear), we *must*
829 * select one of the equality keys for the starting point, because
830 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
831 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
832 * start at x=4, we will fail and stop before reaching x=10. If multiple
833 * equality quals survive preprocessing, however, it doesn't matter which
834 * one we use --- by definition, they are either redundant or
835 * contradictory.
836 *
837 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
838 * If the index stores nulls at the end of the index we'll be starting
839 * from, and we have no boundary key for the column (which means the key
840 * we deduced NOT NULL from is an inequality key that constrains the other
841 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
842 * use as a boundary key. If we didn't do this, we might find ourselves
843 * traversing a lot of null entries at the start of the scan.
844 *
845 * In this loop, row-comparison keys are treated the same as keys on their
846 * first (leftmost) columns. We'll add on lower-order columns of the row
847 * comparison below, if possible.
848 *
849 * The selected scan keys (at most one per index column) are remembered by
850 * storing their addresses into the local startKeys[] array.
851 *----------
852 */
853 strat_total = BTEqualStrategyNumber;
854 if (so->numberOfKeys > 0)
855 {
856 AttrNumber curattr;
857 ScanKey chosen;
858 ScanKey impliesNN;
859 ScanKey cur;
860
861 /*
862 * chosen is the so-far-chosen key for the current attribute, if any.
863 * We don't cast the decision in stone until we reach keys for the
864 * next attribute.
865 */
866 curattr = 1;
867 chosen = NULL;
868 /* Also remember any scankey that implies a NOT NULL constraint */
869 impliesNN = NULL;
870
871 /*
872 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
873 * pass to handle after-last-key processing. Actual exit from the
874 * loop is at one of the "break" statements below.
875 */
876 for (cur = so->keyData, i = 0;; cur++, i++)
877 {
878 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
879 {
880 /*
881 * Done looking at keys for curattr. If we didn't find a
882 * usable boundary key, see if we can deduce a NOT NULL key.
883 */
884 if (chosen == NULL && impliesNN != NULL &&
885 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
886 ScanDirectionIsForward(dir) :
887 ScanDirectionIsBackward(dir)))
888 {
889 /* Yes, so build the key in notnullkeys[keysCount] */
890 chosen = &notnullkeys[keysCount];
891 ScanKeyEntryInitialize(chosen,
892 (SK_SEARCHNOTNULL | SK_ISNULL |
893 (impliesNN->sk_flags &
894 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
895 curattr,
896 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
897 BTGreaterStrategyNumber :
898 BTLessStrategyNumber),
899 InvalidOid,
900 InvalidOid,
901 InvalidOid,
902 (Datum) 0);
903 }
904
905 /*
906 * If we still didn't find a usable boundary key, quit; else
907 * save the boundary key pointer in startKeys.
908 */
909 if (chosen == NULL)
910 break;
911 startKeys[keysCount++] = chosen;
912
913 /*
914 * Adjust strat_total, and quit if we have stored a > or <
915 * key.
916 */
917 strat = chosen->sk_strategy;
918 if (strat != BTEqualStrategyNumber)
919 {
920 strat_total = strat;
921 if (strat == BTGreaterStrategyNumber ||
922 strat == BTLessStrategyNumber)
923 break;
924 }
925
926 /*
927 * Done if that was the last attribute, or if next key is not
928 * in sequence (implying no boundary key is available for the
929 * next attribute).
930 */
931 if (i >= so->numberOfKeys ||
932 cur->sk_attno != curattr + 1)
933 break;
934
935 /*
936 * Reset for next attr.
937 */
938 curattr = cur->sk_attno;
939 chosen = NULL;
940 impliesNN = NULL;
941 }
942
943 /*
944 * Can we use this key as a starting boundary for this attr?
945 *
946 * If not, does it imply a NOT NULL constraint? (Because
947 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
948 * *any* inequality key works for that; we need not test.)
949 */
950 switch (cur->sk_strategy)
951 {
952 case BTLessStrategyNumber:
953 case BTLessEqualStrategyNumber:
954 if (chosen == NULL)
955 {
956 if (ScanDirectionIsBackward(dir))
957 chosen = cur;
958 else
959 impliesNN = cur;
960 }
961 break;
962 case BTEqualStrategyNumber:
963 /* override any non-equality choice */
964 chosen = cur;
965 break;
966 case BTGreaterEqualStrategyNumber:
967 case BTGreaterStrategyNumber:
968 if (chosen == NULL)
969 {
970 if (ScanDirectionIsForward(dir))
971 chosen = cur;
972 else
973 impliesNN = cur;
974 }
975 break;
976 }
977 }
978 }
979
980 /*
981 * If we found no usable boundary keys, we have to start from one end of
982 * the tree. Walk down that edge to the first or last key, and scan from
983 * there.
984 */
985 if (keysCount == 0)
986 {
987 bool match;
988
989 match = _bt_endpoint(scan, dir);
990
991 if (!match)
992 {
993 /* No match, so mark (parallel) scan finished */
994 _bt_parallel_done(scan);
995 }
996
997 return match;
998 }
999
1000 /*
1001 * We want to start the scan somewhere within the index. Set up an
1002 * insertion scankey we can use to search for the boundary point we
1003 * identified above. The insertion scankey is built using the keys
1004 * identified by startKeys[]. (Remaining insertion scankey fields are
1005 * initialized after initial-positioning strategy is finalized.)
1006 */
1007 Assert(keysCount <= INDEX_MAX_KEYS);
1008 for (i = 0; i < keysCount; i++)
1009 {
1010 ScanKey cur = startKeys[i];
1011
1012 Assert(cur->sk_attno == i + 1);
1013
1014 if (cur->sk_flags & SK_ROW_HEADER)
1015 {
1016 /*
1017 * Row comparison header: look to the first row member instead.
1018 *
1019 * The member scankeys are already in insertion format (ie, they
1020 * have sk_func = 3-way-comparison function), but we have to watch
1021 * out for nulls, which _bt_preprocess_keys didn't check. A null
1022 * in the first row member makes the condition unmatchable, just
1023 * like qual_ok = false.
1024 */
1025 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1026
1027 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1028 if (subkey->sk_flags & SK_ISNULL)
1029 {
1030 _bt_parallel_done(scan);
1031 return false;
1032 }
1033 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1034
1035 /*
1036 * If the row comparison is the last positioning key we accepted,
1037 * try to add additional keys from the lower-order row members.
1038 * (If we accepted independent conditions on additional index
1039 * columns, we use those instead --- doesn't seem worth trying to
1040 * determine which is more restrictive.) Note that this is OK
1041 * even if the row comparison is of ">" or "<" type, because the
1042 * condition applied to all but the last row member is effectively
1043 * ">=" or "<=", and so the extra keys don't break the positioning
1044 * scheme. But, by the same token, if we aren't able to use all
1045 * the row members, then the part of the row comparison that we
1046 * did use has to be treated as just a ">=" or "<=" condition, and
1047 * so we'd better adjust strat_total accordingly.
1048 */
1049 if (i == keysCount - 1)
1050 {
1051 bool used_all_subkeys = false;
1052
1053 Assert(!(subkey->sk_flags & SK_ROW_END));
1054 for (;;)
1055 {
1056 subkey++;
1057 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1058 if (subkey->sk_attno != keysCount + 1)
1059 break; /* out-of-sequence, can't use it */
1060 if (subkey->sk_strategy != cur->sk_strategy)
1061 break; /* wrong direction, can't use it */
1062 if (subkey->sk_flags & SK_ISNULL)
1063 break; /* can't use null keys */
1064 Assert(keysCount < INDEX_MAX_KEYS);
1065 memcpy(inskey.scankeys + keysCount, subkey,
1066 sizeof(ScanKeyData));
1067 keysCount++;
1068 if (subkey->sk_flags & SK_ROW_END)
1069 {
1070 used_all_subkeys = true;
1071 break;
1072 }
1073 }
1074 if (!used_all_subkeys)
1075 {
1076 switch (strat_total)
1077 {
1078 case BTLessStrategyNumber:
1079 strat_total = BTLessEqualStrategyNumber;
1080 break;
1081 case BTGreaterStrategyNumber:
1082 strat_total = BTGreaterEqualStrategyNumber;
1083 break;
1084 }
1085 }
1086 break; /* done with outer loop */
1087 }
1088 }
1089 else
1090 {
1091 /*
1092 * Ordinary comparison key. Transform the search-style scan key
1093 * to an insertion scan key by replacing the sk_func with the
1094 * appropriate btree comparison function.
1095 *
1096 * If scankey operator is not a cross-type comparison, we can use
1097 * the cached comparison function; otherwise gotta look it up in
1098 * the catalogs. (That can't lead to infinite recursion, since no
1099 * indexscan initiated by syscache lookup will use cross-data-type
1100 * operators.)
1101 *
1102 * We support the convention that sk_subtype == InvalidOid means
1103 * the opclass input type; this is a hack to simplify life for
1104 * ScanKeyInit().
1105 */
1106 if (cur->sk_subtype == rel->rd_opcintype[i] ||
1107 cur->sk_subtype == InvalidOid)
1108 {
1109 FmgrInfo *procinfo;
1110
1111 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1112 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1113 cur->sk_flags,
1114 cur->sk_attno,
1115 InvalidStrategy,
1116 cur->sk_subtype,
1117 cur->sk_collation,
1118 procinfo,
1119 cur->sk_argument);
1120 }
1121 else
1122 {
1123 RegProcedure cmp_proc;
1124
1125 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1126 rel->rd_opcintype[i],
1127 cur->sk_subtype,
1128 BTORDER_PROC);
1129 if (!RegProcedureIsValid(cmp_proc))
1130 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1131 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1132 cur->sk_attno, RelationGetRelationName(rel));
1133 ScanKeyEntryInitialize(inskey.scankeys + i,
1134 cur->sk_flags,
1135 cur->sk_attno,
1136 InvalidStrategy,
1137 cur->sk_subtype,
1138 cur->sk_collation,
1139 cmp_proc,
1140 cur->sk_argument);
1141 }
1142 }
1143 }
1144
1145 /*----------
1146 * Examine the selected initial-positioning strategy to determine exactly
1147 * where we need to start the scan, and set flag variables to control the
1148 * code below.
1149 *
1150 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1151 * item >= scan key. If nextkey = true, they will locate the first
1152 * item > scan key.
1153 *
1154 * If goback = true, we will then step back one item, while if
1155 * goback = false, we will start the scan on the located item.
1156 *----------
1157 */
1158 switch (strat_total)
1159 {
1160 case BTLessStrategyNumber:
1161
1162 /*
1163 * Find first item >= scankey, then back up one to arrive at last
1164 * item < scankey. (Note: this positioning strategy is only used
1165 * for a backward scan, so that is always the correct starting
1166 * position.)
1167 */
1168 nextkey = false;
1169 goback = true;
1170 break;
1171
1172 case BTLessEqualStrategyNumber:
1173
1174 /*
1175 * Find first item > scankey, then back up one to arrive at last
1176 * item <= scankey. (Note: this positioning strategy is only used
1177 * for a backward scan, so that is always the correct starting
1178 * position.)
1179 */
1180 nextkey = true;
1181 goback = true;
1182 break;
1183
1184 case BTEqualStrategyNumber:
1185
1186 /*
1187 * If a backward scan was specified, need to start with last equal
1188 * item not first one.
1189 */
1190 if (ScanDirectionIsBackward(dir))
1191 {
1192 /*
1193 * This is the same as the <= strategy. We will check at the
1194 * end whether the found item is actually =.
1195 */
1196 nextkey = true;
1197 goback = true;
1198 }
1199 else
1200 {
1201 /*
1202 * This is the same as the >= strategy. We will check at the
1203 * end whether the found item is actually =.
1204 */
1205 nextkey = false;
1206 goback = false;
1207 }
1208 break;
1209
1210 case BTGreaterEqualStrategyNumber:
1211
1212 /*
1213 * Find first item >= scankey. (This is only used for forward
1214 * scans.)
1215 */
1216 nextkey = false;
1217 goback = false;
1218 break;
1219
1220 case BTGreaterStrategyNumber:
1221
1222 /*
1223 * Find first item > scankey. (This is only used for forward
1224 * scans.)
1225 */
1226 nextkey = true;
1227 goback = false;
1228 break;
1229
1230 default:
1231 /* can't get here, but keep compiler quiet */
1232 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1233 return false;
1234 }
1235
1236 /* Initialize remaining insertion scan key fields */
1237 inskey.heapkeyspace = _bt_heapkeyspace(rel);
1238 inskey.anynullkeys = false; /* unused */
1239 inskey.nextkey = nextkey;
1240 inskey.pivotsearch = false;
1241 inskey.scantid = NULL;
1242 inskey.keysz = keysCount;
1243
1244 /*
1245 * Use the manufactured insertion scan key to descend the tree and
1246 * position ourselves on the target leaf page.
1247 */
1248 stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1249
1250 /* don't need to keep the stack around... */
1251 _bt_freestack(stack);
1252
1253 if (!BufferIsValid(buf))
1254 {
1255 /*
1256 * We only get here if the index is completely empty. Lock relation
1257 * because nothing finer to lock exists.
1258 */
1259 PredicateLockRelation(rel, scan->xs_snapshot);
1260
1261 /*
1262 * mark parallel scan as done, so that all the workers can finish
1263 * their scan
1264 */
1265 _bt_parallel_done(scan);
1266 BTScanPosInvalidate(so->currPos);
1267
1268 return false;
1269 }
1270 else
1271 PredicateLockPage(rel, BufferGetBlockNumber(buf),
1272 scan->xs_snapshot);
1273
1274 _bt_initialize_more_data(so, dir);
1275
1276 /* position to the precise item on the page */
1277 offnum = _bt_binsrch(rel, &inskey, buf);
1278
1279 /*
1280 * If nextkey = false, we are positioned at the first item >= scan key, or
1281 * possibly at the end of a page on which all the existing items are less
1282 * than the scan key and we know that everything on later pages is greater
1283 * than or equal to scan key.
1284 *
1285 * If nextkey = true, we are positioned at the first item > scan key, or
1286 * possibly at the end of a page on which all the existing items are less
1287 * than or equal to the scan key and we know that everything on later
1288 * pages is greater than scan key.
1289 *
1290 * The actually desired starting point is either this item or the prior
1291 * one, or in the end-of-page case it's the first item on the next page or
1292 * the last item on this page. Adjust the starting offset if needed. (If
1293 * this results in an offset before the first item or after the last one,
1294 * _bt_readpage will report no items found, and then we'll step to the
1295 * next page as needed.)
1296 */
1297 if (goback)
1298 offnum = OffsetNumberPrev(offnum);
1299
1300 /* remember which buffer we have pinned, if any */
1301 Assert(!BTScanPosIsValid(so->currPos));
1302 so->currPos.buf = buf;
1303
1304 /*
1305 * Now load data from the first page of the scan.
1306 */
1307 if (!_bt_readpage(scan, dir, offnum))
1308 {
1309 /*
1310 * There's no actually-matching data on this page. Try to advance to
1311 * the next page. Return false if there's no matching data at all.
1312 */
1313 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1314 if (!_bt_steppage(scan, dir))
1315 return false;
1316 }
1317 else
1318 {
1319 /* Drop the lock, and maybe the pin, on the current page */
1320 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1321 }
1322
1323readcomplete:
1324 /* OK, itemIndex says what to return */
1325 currItem = &so->currPos.items[so->currPos.itemIndex];
1326 scan->xs_heaptid = currItem->heapTid;
1327 if (scan->xs_want_itup)
1328 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1329
1330 return true;
1331}
1332
1333/*
1334 * _bt_next() -- Get the next item in a scan.
1335 *
1336 * On entry, so->currPos describes the current page, which may be pinned
1337 * but is not locked, and so->currPos.itemIndex identifies which item was
1338 * previously returned.
1339 *
1340 * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1341 * next heap tuple, and if requested, scan->xs_itup points to a copy of
1342 * the index tuple. so->currPos is updated as needed.
1343 *
1344 * On failure exit (no more tuples), we release pin and set
1345 * so->currPos.buf to InvalidBuffer.
1346 */
1347bool
1348_bt_next(IndexScanDesc scan, ScanDirection dir)
1349{
1350 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1351 BTScanPosItem *currItem;
1352
1353 /*
1354 * Advance to next tuple on current page; or if there's no more, try to
1355 * step to the next page with data.
1356 */
1357 if (ScanDirectionIsForward(dir))
1358 {
1359 if (++so->currPos.itemIndex > so->currPos.lastItem)
1360 {
1361 if (!_bt_steppage(scan, dir))
1362 return false;
1363 }
1364 }
1365 else
1366 {
1367 if (--so->currPos.itemIndex < so->currPos.firstItem)
1368 {
1369 if (!_bt_steppage(scan, dir))
1370 return false;
1371 }
1372 }
1373
1374 /* OK, itemIndex says what to return */
1375 currItem = &so->currPos.items[so->currPos.itemIndex];
1376 scan->xs_heaptid = currItem->heapTid;
1377 if (scan->xs_want_itup)
1378 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1379
1380 return true;
1381}
1382
/*
 * _bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 * they are updated as appropriate.  All other fields of so->currPos are
 * initialized from scratch here.
 *
 * We scan the current page starting at offnum and moving in the indicated
 * direction.  All items matching the scan keys are loaded into currPos.items.
 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 * that there can be no more matching tuples in the current scan direction.
 *
 * In the case of a parallel scan, caller must have called _bt_parallel_seize
 * prior to calling this function; this function will invoke
 * _bt_parallel_release before returning.
 *
 * Returns true if any matching items found on the page, false if none.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	int			itemIndex;
	bool		continuescan;
	int			indnatts;

	/*
	 * We must have the buffer pinned and locked, but the usual macro can't be
	 * used here; this function is what makes it good for currPos.
	 */
	Assert(BufferIsValid(so->currPos.buf));

	page = BufferGetPage(so->currPos.buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Allow the next page to be processed by a parallel worker as soon as
	 * possible: release the scan before doing the item-by-item work below.
	 * For a forward scan the next page is our right sibling; for a backward
	 * scan it is this same page (the worker will walk left from here).
	 */
	if (scan->parallel_scan)
	{
		if (ScanDirectionIsForward(dir))
			_bt_parallel_release(scan, opaque->btpo_next);
		else
			_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
	}

	continuescan = true;		/* default assumption */
	indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * We note the buffer's block number so that we can release the pin later.
	 * This allows us to re-read the buffer if it is needed again for hinting.
	 */
	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);

	/*
	 * We save the LSN of the page as we read it, so that we know whether it
	 * safe to apply LP_DEAD hints to the page later.  This allows us to drop
	 * the pin for MVCC scans, which allows vacuum to avoid blocking.
	 */
	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);

	/*
	 * we must save the page's right-link while scanning it; this tells us
	 * where to step right to after we're done with these items.  There is no
	 * corresponding need for the left-link, since splits always go right.
	 */
	so->currPos.nextPage = opaque->btpo_next;

	/* initialize tuple workspace to empty */
	so->currPos.nextTupleOffset = 0;

	/*
	 * Now that the current page has been made consistent, the macro should be
	 * good.
	 */
	Assert(BTScanPosIsPinned(so->currPos));

	if (ScanDirectionIsForward(dir))
	{
		/* load items[] in ascending order */
		itemIndex = 0;

		offnum = Max(offnum, minoff);

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				offnum = OffsetNumberNext(offnum);
				continue;
			}

			itup = (IndexTuple) PageGetItem(page, iid);

			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
			{
				/* tuple passes all scan key conditions, so remember it */
				_bt_saveitem(so, itemIndex, offnum, itup);
				itemIndex++;
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!continuescan)
				break;

			offnum = OffsetNumberNext(offnum);
		}

		/*
		 * We don't need to visit page to the right when the high key
		 * indicates that no more matches will be found there.
		 *
		 * Checking the high key like this works out more often than you might
		 * think.  Leaf page splits pick a split point between the two most
		 * dissimilar tuples (this is weighed against the need to evenly share
		 * free space).  Leaf pages with high key attribute values that can
		 * only appear on non-pivot tuples on the right sibling page are
		 * common.
		 */
		if (continuescan && !P_RIGHTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, P_HIKEY);
			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
			int			truncatt;

			/* high key may be truncated, so only check its present attrs */
			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
		}

		if (!continuescan)
			so->currPos.moreRight = false;

		Assert(itemIndex <= MaxIndexTuplesPerPage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		/*
		 * load items[] in descending order: fill the array from its far end
		 * downward, so firstItem..lastItem ends up a contiguous range in
		 * index (ascending key) order regardless of how many items match.
		 */
		itemIndex = MaxIndexTuplesPerPage;

		offnum = Min(offnum, maxoff);

		while (offnum >= minoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		tuple_alive;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual.  Most of the
			 * time, it's a win to not bother examining the tuple's index
			 * keys, but just skip to the next tuple (previous, actually,
			 * since we're scanning backwards).  However, if this is the first
			 * tuple on the page, we do check the index keys, to prevent
			 * uselessly advancing to the page to the left.  This is similar
			 * to the high key optimization used by forward scans.
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				Assert(offnum >= P_FIRSTDATAKEY(opaque));
				if (offnum > P_FIRSTDATAKEY(opaque))
				{
					offnum = OffsetNumberPrev(offnum);
					continue;
				}

				tuple_alive = false;
			}
			else
				tuple_alive = true;

			itup = (IndexTuple) PageGetItem(page, iid);

			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
										 &continuescan);
			if (passes_quals && tuple_alive)
			{
				/* tuple passes all scan key conditions, so remember it */
				itemIndex--;
				_bt_saveitem(so, itemIndex, offnum, itup);
			}
			if (!continuescan)
			{
				/* there can't be any more matches, so stop */
				so->currPos.moreLeft = false;
				break;
			}

			offnum = OffsetNumberPrev(offnum);
		}

		Assert(itemIndex >= 0);
		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
	}

	return (so->currPos.firstItem <= so->currPos.lastItem);
}
1598
1599/* Save an index item into so->currPos.items[itemIndex] */
1600static void
1601_bt_saveitem(BTScanOpaque so, int itemIndex,
1602 OffsetNumber offnum, IndexTuple itup)
1603{
1604 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1605
1606 currItem->heapTid = itup->t_tid;
1607 currItem->indexOffset = offnum;
1608 if (so->currTuples)
1609 {
1610 Size itupsz = IndexTupleSize(itup);
1611
1612 currItem->tupleOffset = so->currPos.nextTupleOffset;
1613 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1614 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1615 }
1616}
1617
/*
 * _bt_steppage() -- Step to next page containing valid data for scan
 *
 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
 * if pinned, we'll drop the pin before moving to next page.  The buffer is
 * not locked on entry.
 *
 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
 * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
 * to InvalidBuffer.  We return true to indicate success.
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BlockNumber blkno = InvalidBlockNumber;
	bool		status = true;

	Assert(BTScanPosIsValid(so->currPos));

	/* Before leaving current page, deal with any killed items */
	if (so->numKilled > 0)
		_bt_killitems(scan);

	/*
	 * Before we modify currPos, make a copy of the page data if there was a
	 * mark position that needs it.
	 */
	if (so->markItemIndex >= 0)
	{
		/* bump pin on current buffer for assignment to mark buffer */
		if (BTScanPosIsPinned(so->currPos))
			IncrBufferRefCount(so->currPos.buf);
		/*
		 * Copy the header fields plus only the used prefix of items[]
		 * (entries 0..lastItem); offsetof(..., items[1]) covers the header
		 * and the first array entry.
		 */
		memcpy(&so->markPos, &so->currPos,
			   offsetof(BTScanPosData, items[1]) +
			   so->currPos.lastItem * sizeof(BTScanPosItem));
		if (so->markTuples)
			memcpy(so->markTuples, so->currTuples,
				   so->currPos.nextTupleOffset);
		so->markPos.itemIndex = so->markItemIndex;
		so->markItemIndex = -1;
	}

	if (ScanDirectionIsForward(dir))
	{
		/* Walk right to the next page with data */
		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the next block number; if the scan has
			 * ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno);
			if (!status)
			{
				/* release the previous buffer, if pinned */
				BTScanPosUnpinIfPinned(so->currPos);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so use the previously-saved nextPage link. */
			blkno = so->currPos.nextPage;
		}

		/* Remember we left a page with data */
		so->currPos.moreLeft = true;

		/* release the previous buffer, if pinned */
		BTScanPosUnpinIfPinned(so->currPos);
	}
	else
	{
		/* Remember we left a page with data */
		so->currPos.moreRight = true;

		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the current block number; if the scan has
			 * ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno);
			BTScanPosUnpinIfPinned(so->currPos);
			if (!status)
			{
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so just use our own notion of the current page */
			blkno = so->currPos.currPage;
		}
	}

	if (!_bt_readnextpage(scan, blkno, dir))
		return false;

	/* Drop the lock, and maybe the pin, on the current page */
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}
1725
/*
 * _bt_readnextpage() -- Read next page containing valid data for scan
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page.  Caller is responsible to release lock and pin on
 * buffer on success.  We return true to indicate success.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Relation	rel;
	Page		page;
	BTPageOpaque opaque;
	bool		status = true;

	rel = scan->indexRelation;

	if (ScanDirectionIsForward(dir))
	{
		/* Walk right until we find a page with matches, or hit the end */
		for (;;)
		{
			/*
			 * if we're at end of scan, give up and mark parallel scan as
			 * done, so that all the workers can finish their scan
			 */
			if (blkno == P_NONE || !so->currPos.moreRight)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
			/* check for interrupts while we're not holding any buffer lock */
			CHECK_FOR_INTERRUPTS();
			/* step right one page */
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			/* check for deleted page */
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, blkno, scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page be processed by parallel worker */
				_bt_parallel_release(scan, opaque->btpo_next);
			}

			/* nope, keep going */
			if (scan->parallel_scan != NULL)
			{
				/* must release buffer before seizing; seize may block */
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
			}
			else
			{
				blkno = opaque->btpo_next;
				_bt_relbuf(rel, so->currPos.buf);
			}
		}
	}
	else
	{
		/*
		 * Should only happen in parallel cases, when some other backend
		 * advanced the scan.
		 */
		if (so->currPos.currPage != blkno)
		{
			BTScanPosUnpinIfPinned(so->currPos);
			so->currPos.currPage = blkno;
		}

		/*
		 * Walk left to the next page with data.  This is much more complex
		 * than the walk-right case because of the possibility that the page
		 * to our left splits while we are in flight to it, plus the
		 * possibility that the page we were on gets deleted after we leave
		 * it.  See nbtree/README for details.
		 *
		 * It might be possible to rearrange this code to have less overhead
		 * in pinning and locking, but that would require capturing the left
		 * pointer when the page is initially read, and using it here, along
		 * with big changes to _bt_walk_left() and the code below.  It is not
		 * clear whether this would be a win, since if the page immediately to
		 * the left splits after we read this page and before we step left, we
		 * would need to visit more pages than with the current code.
		 *
		 * Note that if we change the code so that we drop the pin for a scan
		 * which uses a non-MVCC snapshot, we will need to modify the code for
		 * walking left, to allow for the possibility that a referenced page
		 * has been deleted.  As long as the buffer is pinned or the snapshot
		 * is MVCC the page cannot move past the half-dead state to fully
		 * deleted.
		 */
		if (BTScanPosIsPinned(so->currPos))
			LockBuffer(so->currPos.buf, BT_READ);
		else
			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);

		for (;;)
		{
			/* Done if we know there are no matching keys to the left */
			if (!so->currPos.moreLeft)
			{
				_bt_relbuf(rel, so->currPos.buf);
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/* Step to next physical page */
			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
											scan->xs_snapshot);

			/* if we're physically at end of index, return failure */
			if (so->currPos.buf == InvalidBuffer)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/*
			 * Okay, we managed to move left to a non-deleted page. Done if
			 * it's not half-dead and contains matching tuples. Else loop back
			 * and do it all again.
			 */
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page be processed by parallel worker */
				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
			}

			/*
			 * For parallel scans, get the last page scanned as it is quite
			 * possible that by the time we try to seize the scan, some other
			 * worker has already advanced the scan to a different page.  We
			 * must continue based on the latest page scanned by any worker.
			 */
			if (scan->parallel_scan != NULL)
			{
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			}
		}
	}

	return true;
}
1907
1908/*
1909 * _bt_parallel_readpage() -- Read current page containing valid data for scan
1910 *
1911 * On success, release lock and maybe pin on buffer. We return true to
1912 * indicate success.
1913 */
1914static bool
1915_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1916{
1917 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1918
1919 _bt_initialize_more_data(so, dir);
1920
1921 if (!_bt_readnextpage(scan, blkno, dir))
1922 return false;
1923
1924 /* Drop the lock, and maybe the pin, on the current page */
1925 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1926
1927 return true;
1928}
1929
/*
 * _bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held
 * in that case).
 *
 * NOTE(review): this routine never holds more than one buffer lock at a
 * time — it releases the current page before acquiring its sibling —
 * presumably to avoid deadlocking against concurrent right-moving
 * operations; confirm against the nbtree locking protocol.
 *
 * When working on a non-leaf level, it is possible for the returned page
 * to be half-dead; the caller should check that condition and step left
 * again if it's important.
 */
static Buffer
_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
		BlockNumber obknum;
		BlockNumber lblkno;
		BlockNumber blkno;
		int			tries;

		/* if we're at end of tree, release buf and return failure */
		if (P_LEFTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			break;
		}
		/* remember original page we are stepping left from */
		obknum = BufferGetBlockNumber(buf);
		/* step left; keep lblkno for the stuck-detection check further down */
		blkno = lblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit). If we
		 * don't find the correct page by then, the most likely bet is that
		 * the original page got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain. Caller
		 * must reject half-dead pages if wanted.
		 */
		tries = 0;
		for (;;)
		{
			/* the desired page is the one whose right-link points back */
			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			/* step right and retest */
			blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			TestForOldSnapshot(snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Return to the original page to see what's up */
		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted. Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				blkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
				page = BufferGetPage(buf);
				TestForOldSnapshot(snapshot, rel, page);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_ISDELETED(opaque))
					break;
			}

			/*
			 * Now return to top of loop, resetting obknum to point to this
			 * nondeleted page, and try again.
			 */
		}
		else
		{
			/*
			 * It wasn't deleted; the explanation had better be that the page
			 * to the left got split or deleted. Without this check, we'd go
			 * into an infinite loop if there's anything wrong.
			 */
			if (opaque->btpo_prev == lblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 obknum, RelationGetRelationName(rel));
			/* Okay to try again with new lblkno value */
		}
	}

	/* reached only by the P_LEFTMOST break above: no page to the left */
	return InvalidBuffer;
}
2054
2055/*
2056 * _bt_get_endpoint() -- Find the first or last page on a given tree level
2057 *
2058 * If the index is empty, we will return InvalidBuffer; any other failure
2059 * condition causes ereport(). We will not return a dead page.
2060 *
2061 * The returned buffer is pinned and read-locked.
2062 */
2063Buffer
2064_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
2065 Snapshot snapshot)
2066{
2067 Buffer buf;
2068 Page page;
2069 BTPageOpaque opaque;
2070 OffsetNumber offnum;
2071 BlockNumber blkno;
2072 IndexTuple itup;
2073
2074 /*
2075 * If we are looking for a leaf page, okay to descend from fast root;
2076 * otherwise better descend from true root. (There is no point in being
2077 * smarter about intermediate levels.)
2078 */
2079 if (level == 0)
2080 buf = _bt_getroot(rel, BT_READ);
2081 else
2082 buf = _bt_gettrueroot(rel);
2083
2084 if (!BufferIsValid(buf))
2085 return InvalidBuffer;
2086
2087 page = BufferGetPage(buf);
2088 TestForOldSnapshot(snapshot, rel, page);
2089 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2090
2091 for (;;)
2092 {
2093 /*
2094 * If we landed on a deleted page, step right to find a live page
2095 * (there must be one). Also, if we want the rightmost page, step
2096 * right if needed to get to it (this could happen if the page split
2097 * since we obtained a pointer to it).
2098 */
2099 while (P_IGNORE(opaque) ||
2100 (rightmost && !P_RIGHTMOST(opaque)))
2101 {
2102 blkno = opaque->btpo_next;
2103 if (blkno == P_NONE)
2104 elog(ERROR, "fell off the end of index \"%s\"",
2105 RelationGetRelationName(rel));
2106 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2107 page = BufferGetPage(buf);
2108 TestForOldSnapshot(snapshot, rel, page);
2109 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2110 }
2111
2112 /* Done? */
2113 if (opaque->btpo.level == level)
2114 break;
2115 if (opaque->btpo.level < level)
2116 elog(ERROR, "btree level %u not found in index \"%s\"",
2117 level, RelationGetRelationName(rel));
2118
2119 /* Descend to leftmost or rightmost child page */
2120 if (rightmost)
2121 offnum = PageGetMaxOffsetNumber(page);
2122 else
2123 offnum = P_FIRSTDATAKEY(opaque);
2124
2125 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2126 blkno = BTreeInnerTupleGetDownLink(itup);
2127
2128 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2129 page = BufferGetPage(buf);
2130 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2131 }
2132
2133 return buf;
2134}
2135
2136/*
2137 * _bt_endpoint() -- Find the first or last page in the index, and scan
2138 * from there to the first key satisfying all the quals.
2139 *
2140 * This is used by _bt_first() to set up a scan when we've determined
2141 * that the scan must start at the beginning or end of the index (for
2142 * a forward or backward scan respectively). Exit conditions are the
2143 * same as for _bt_first().
2144 */
2145static bool
2146_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2147{
2148 Relation rel = scan->indexRelation;
2149 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2150 Buffer buf;
2151 Page page;
2152 BTPageOpaque opaque;
2153 OffsetNumber start;
2154 BTScanPosItem *currItem;
2155
2156 /*
2157 * Scan down to the leftmost or rightmost leaf page. This is a simplified
2158 * version of _bt_search(). We don't maintain a stack since we know we
2159 * won't need it.
2160 */
2161 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
2162
2163 if (!BufferIsValid(buf))
2164 {
2165 /*
2166 * Empty index. Lock the whole relation, as nothing finer to lock
2167 * exists.
2168 */
2169 PredicateLockRelation(rel, scan->xs_snapshot);
2170 BTScanPosInvalidate(so->currPos);
2171 return false;
2172 }
2173
2174 PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2175 page = BufferGetPage(buf);
2176 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2177 Assert(P_ISLEAF(opaque));
2178
2179 if (ScanDirectionIsForward(dir))
2180 {
2181 /* There could be dead pages to the left, so not this: */
2182 /* Assert(P_LEFTMOST(opaque)); */
2183
2184 start = P_FIRSTDATAKEY(opaque);
2185 }
2186 else if (ScanDirectionIsBackward(dir))
2187 {
2188 Assert(P_RIGHTMOST(opaque));
2189
2190 start = PageGetMaxOffsetNumber(page);
2191 }
2192 else
2193 {
2194 elog(ERROR, "invalid scan direction: %d", (int) dir);
2195 start = 0; /* keep compiler quiet */
2196 }
2197
2198 /* remember which buffer we have pinned */
2199 so->currPos.buf = buf;
2200
2201 _bt_initialize_more_data(so, dir);
2202
2203 /*
2204 * Now load data from the first page of the scan.
2205 */
2206 if (!_bt_readpage(scan, dir, start))
2207 {
2208 /*
2209 * There's no actually-matching data on this page. Try to advance to
2210 * the next page. Return false if there's no matching data at all.
2211 */
2212 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
2213 if (!_bt_steppage(scan, dir))
2214 return false;
2215 }
2216 else
2217 {
2218 /* Drop the lock, and maybe the pin, on the current page */
2219 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2220 }
2221
2222 /* OK, itemIndex says what to return */
2223 currItem = &so->currPos.items[so->currPos.itemIndex];
2224 scan->xs_heaptid = currItem->heapTid;
2225 if (scan->xs_want_itup)
2226 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2227
2228 return true;
2229}
2230
2231/*
2232 * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2233 * for scan direction
2234 */
2235static inline void
2236_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2237{
2238 /* initialize moreLeft/moreRight appropriately for scan direction */
2239 if (ScanDirectionIsForward(dir))
2240 {
2241 so->currPos.moreLeft = false;
2242 so->currPos.moreRight = true;
2243 }
2244 else
2245 {
2246 so->currPos.moreLeft = true;
2247 so->currPos.moreRight = false;
2248 }
2249 so->numKilled = 0; /* just paranoia */
2250 so->markItemIndex = -1; /* ditto */
2251}
2252