/*-------------------------------------------------------------------------
 *
 * ginget.c
 *	  fetch tuples from a GIN scan.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *			src/backend/access/gin/ginget.c
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/gin_private.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "storage/predicate.h"
#include "utils/datum.h"
#include "utils/memutils.h"
#include "utils/rel.h"

/* GUC parameter */
int			GinFuzzySearchLimit = 0;

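/*
 * State for scanning the pending list: pendingBuffer is the pending-list
 * page currently being examined (pinned and share-locked), firstOffset and
 * lastOffset delimit the pending-list tuples belonging to the current heap
 * row, item is that heap row's TID, and hasMatchKey holds one flag per scan
 * key recording whether any of that key's entries matched the row.
 */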
typedef struct pendingPosition
{
	Buffer		pendingBuffer;
	OffsetNumber firstOffset;
	OffsetNumber lastOffset;
	ItemPointerData item;
	bool	   *hasMatchKey;
} pendingPosition;
36
37
/*
 * Moves to the next page if the current offset is past the end of the page.
 */
41static bool
42moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot)
43{
44 Page page = BufferGetPage(stack->buffer);
45
46 if (stack->off > PageGetMaxOffsetNumber(page))
47 {
		/*
		 * We've scanned the whole page, so we should move to the right page.
		 */
51 if (GinPageRightMost(page))
52 return false; /* no more pages */
53
54 stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
55 stack->blkno = BufferGetBlockNumber(stack->buffer);
56 stack->off = FirstOffsetNumber;
57 PredicateLockPage(btree->index, stack->blkno, snapshot);
58 }
59
60 return true;
61}
62
/*
 * Scan all pages of a posting tree and save all its heap ItemPointers
 * in scanEntry->matchBitmap.
 */
67static void
68scanPostingTree(Relation index, GinScanEntry scanEntry,
69 BlockNumber rootPostingTree, Snapshot snapshot)
70{
71 GinBtreeData btree;
72 GinBtreeStack *stack;
73 Buffer buffer;
74 Page page;
75
76 /* Descend to the leftmost leaf page */
77 stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot);
78 buffer = stack->buffer;
79
80 IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
81
82 freeGinBtreeStack(stack);
83
84 /*
85 * Loop iterates through all leaf pages of posting tree
86 */
87 for (;;)
88 {
89 page = BufferGetPage(buffer);
90 if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
91 {
92 int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
93
94 scanEntry->predictNumberResult += n;
95 }
96
97 if (GinPageRightMost(page))
98 break; /* no more pages */
99
100 buffer = ginStepRight(buffer, index, GIN_SHARE);
101 }
102
103 UnlockReleaseBuffer(buffer);
104}
105
/*
 * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
 * match the search entry.  This supports three different match modes:
 *
 * 1. Partial-match support: scan from current point until the
 *	  comparePartialFn says we're done.
 * 2. SEARCH_MODE_ALL: scan from current point (which should be first
 *	  key for the current attnum) until we hit null items or end of attnum.
 * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
 *	  key for the current attnum) until we hit end of attnum.
 *
 * Returns true if done, false if it's necessary to restart scan from scratch.
 */
119static bool
120collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
121 GinScanEntry scanEntry, Snapshot snapshot)
122{
123 OffsetNumber attnum;
124 Form_pg_attribute attr;
125
126 /* Initialize empty bitmap result */
127 scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
128
129 /* Null query cannot partial-match anything */
130 if (scanEntry->isPartialMatch &&
131 scanEntry->queryCategory != GIN_CAT_NORM_KEY)
132 return true;
133
134 /* Locate tupdesc entry for key column (for attbyval/attlen data) */
135 attnum = scanEntry->attnum;
136 attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1);
137
	/*
	 * Predicate-lock the entry leaf page; following pages will be locked by
	 * moveRightIfItNeeded().
	 */
142 PredicateLockPage(btree->index, stack->buffer, snapshot);
143
144 for (;;)
145 {
146 Page page;
147 IndexTuple itup;
148 Datum idatum;
149 GinNullCategory icategory;
150
		/*
		 * stack->off points to the entry of interest; the buffer is already
		 * locked.
		 */
154 if (moveRightIfItNeeded(btree, stack, snapshot) == false)
155 return true;
156
157 page = BufferGetPage(stack->buffer);
158 TestForOldSnapshot(snapshot, btree->index, page);
159 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
160
		/*
		 * If the tuple is for another attribute, stop the scan.
		 */
164 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
165 return true;
166
167 /* Safe to fetch attribute value */
168 idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
169
170 /*
171 * Check for appropriate scan stop conditions
172 */
173 if (scanEntry->isPartialMatch)
174 {
175 int32 cmp;
176
177 /*
178 * In partial match, stop scan at any null (including
179 * placeholders); partial matches never match nulls
180 */
181 if (icategory != GIN_CAT_NORM_KEY)
182 return true;
183
			/*----------
			 * Check for partial match.
			 * case cmp == 0 => match
			 * case cmp > 0  => no match, and we can stop the scan
			 * case cmp < 0  => no match, continue the scan
			 *----------
			 */
191 cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
192 btree->ginstate->supportCollation[attnum - 1],
193 scanEntry->queryKey,
194 idatum,
195 UInt16GetDatum(scanEntry->strategy),
196 PointerGetDatum(scanEntry->extra_data)));
197
198 if (cmp > 0)
199 return true;
200 else if (cmp < 0)
201 {
202 stack->off++;
203 continue;
204 }
205 }
206 else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
207 {
208 /*
209 * In ALL mode, we are not interested in null items, so we can
210 * stop if we get to a null-item placeholder (which will be the
211 * last entry for a given attnum). We do want to include NULL_KEY
212 * and EMPTY_ITEM entries, though.
213 */
214 if (icategory == GIN_CAT_NULL_ITEM)
215 return true;
216 }
217
218 /*
219 * OK, we want to return the TIDs listed in this entry.
220 */
221 if (GinIsPostingTree(itup))
222 {
223 BlockNumber rootPostingTree = GinGetPostingTree(itup);
224
			/*
			 * We must unlock the current page (but keep it pinned) during
			 * the tree scan, to prevent deadlock against vacuum processes.
			 *
			 * We save the current entry value (idatum) so that we can
			 * re-find our tuple after re-locking the page.
			 */
232 if (icategory == GIN_CAT_NORM_KEY)
233 idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
234
235 LockBuffer(stack->buffer, GIN_UNLOCK);
236
237 /*
238 * Acquire predicate lock on the posting tree. We already hold a
239 * lock on the entry page, but insertions to the posting tree
240 * don't check for conflicts on that level.
241 */
242 PredicateLockPage(btree->index, rootPostingTree, snapshot);
243
244 /* Collect all the TIDs in this entry's posting tree */
245 scanPostingTree(btree->index, scanEntry, rootPostingTree,
246 snapshot);
247
			/*
			 * Re-lock the entry page.  An insertion might have occurred
			 * while it was unlocked, so we need to re-find our position.
			 */
252 LockBuffer(stack->buffer, GIN_SHARE);
253 page = BufferGetPage(stack->buffer);
254 if (!GinPageIsLeaf(page))
255 {
				/*
				 * The root page became non-leaf while it was unlocked.  We
				 * will start over; this doesn't occur often, since the root
				 * can become non-leaf only once in the life of the index.
				 */
261 return false;
262 }
263
264 /* Search forward to re-find idatum */
265 for (;;)
266 {
267 Datum newDatum;
268 GinNullCategory newCategory;
269
270 if (moveRightIfItNeeded(btree, stack, snapshot) == false)
271 elog(ERROR, "lost saved point in index"); /* must not happen !!! */
272
273 page = BufferGetPage(stack->buffer);
274 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
275
276 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
277 elog(ERROR, "lost saved point in index"); /* must not happen !!! */
278 newDatum = gintuple_get_key(btree->ginstate, itup,
279 &newCategory);
280
281 if (ginCompareEntries(btree->ginstate, attnum,
282 newDatum, newCategory,
283 idatum, icategory) == 0)
284 break; /* Found! */
285
286 stack->off++;
287 }
288
289 if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
290 pfree(DatumGetPointer(idatum));
291 }
292 else
293 {
294 ItemPointer ipd;
295 int nipd;
296
297 ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
298 tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
299 scanEntry->predictNumberResult += GinGetNPosting(itup);
300 pfree(ipd);
301 }
302
303 /*
304 * Done with this entry, go to the next
305 */
306 stack->off++;
307 }
308}
309
/*
 * Start* functions set up the beginning state of a search: they find the
 * correct buffer and pin it.
 */
313static void
314startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
315{
316 GinBtreeData btreeEntry;
317 GinBtreeStack *stackEntry;
318 Page page;
319 bool needUnlock;
320
321restartScanEntry:
322 entry->buffer = InvalidBuffer;
323 ItemPointerSetMin(&entry->curItem);
324 entry->offset = InvalidOffsetNumber;
325 if (entry->list)
326 pfree(entry->list);
327 entry->list = NULL;
328 entry->nlist = 0;
329 entry->matchBitmap = NULL;
330 entry->matchResult = NULL;
331 entry->reduceResult = false;
332 entry->predictNumberResult = 0;
333
	/*
	 * We need to find the entry, then either begin a scan of its posting
	 * tree or just store its posting list in memory.
	 */
338 ginPrepareEntryScan(&btreeEntry, entry->attnum,
339 entry->queryKey, entry->queryCategory,
340 ginstate);
341 stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot);
342 page = BufferGetPage(stackEntry->buffer);
343
344 /* ginFindLeafPage() will have already checked snapshot age. */
345 needUnlock = true;
346
347 entry->isFinished = true;
348
349 if (entry->isPartialMatch ||
350 entry->queryCategory == GIN_CAT_EMPTY_QUERY)
351 {
352 /*
353 * btreeEntry.findItem locates the first item >= given search key.
354 * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
355 * because of the way the GIN_CAT_EMPTY_QUERY category code is
356 * assigned.) We scan forward from there and collect all TIDs needed
357 * for the entry type.
358 */
359 btreeEntry.findItem(&btreeEntry, stackEntry);
360 if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
361 == false)
362 {
			/*
			 * The GIN tree was seriously restructured, so we will clean up
			 * all the data found so far and rescan.  See comments near
			 * 'return false' in collectMatchBitmap().
			 */
368 if (entry->matchBitmap)
369 {
370 if (entry->matchIterator)
371 tbm_end_iterate(entry->matchIterator);
372 entry->matchIterator = NULL;
373 tbm_free(entry->matchBitmap);
374 entry->matchBitmap = NULL;
375 }
376 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
377 freeGinBtreeStack(stackEntry);
378 goto restartScanEntry;
379 }
380
381 if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
382 {
383 entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
384 entry->isFinished = false;
385 }
386 }
387 else if (btreeEntry.findItem(&btreeEntry, stackEntry))
388 {
389 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
390
391 if (GinIsPostingTree(itup))
392 {
393 BlockNumber rootPostingTree = GinGetPostingTree(itup);
394 GinBtreeStack *stack;
395 Page page;
396 ItemPointerData minItem;
397
398 /*
399 * This is an equality scan, so lock the root of the posting tree.
400 * It represents a lock on the exact key value, and covers all the
401 * items in the posting tree.
402 */
403 PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
404
			/*
			 * We should unlock the entry page before touching the posting
			 * tree, to prevent deadlocks with vacuum processes.  Because an
			 * entry is never deleted from its page and a posting tree is
			 * never reduced back to a posting list, we can unlock the page
			 * as soon as we have the block number of the posting tree's
			 * root.
			 */
412 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
413 needUnlock = false;
414
415 stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
416 rootPostingTree, snapshot);
417 entry->buffer = stack->buffer;
418
			/*
			 * We keep the buffer pinned to prevent deletion of the page
			 * during the scan; see GIN's vacuum implementation.  The
			 * reference count is incremented so that the buffer stays
			 * pinned after the freeGinBtreeStack() call.
			 */
424 IncrBufferRefCount(entry->buffer);
425
426 page = BufferGetPage(entry->buffer);
427
428 /*
429 * Load the first page into memory.
430 */
431 ItemPointerSetMin(&minItem);
432 entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
433
434 entry->predictNumberResult = stack->predictNumber * entry->nlist;
435
436 LockBuffer(entry->buffer, GIN_UNLOCK);
437 freeGinBtreeStack(stack);
438 entry->isFinished = false;
439 }
440 else
441 {
442 /*
443 * Lock the entry leaf page. This is more coarse-grained than
444 * necessary, because it will conflict with any insertions that
445 * land on the same leaf page, not only the exact key we searched
446 * for. But locking an individual tuple would require updating
447 * that lock whenever it moves because of insertions or vacuums,
448 * which seems too complicated.
449 */
450 PredicateLockPage(ginstate->index,
451 BufferGetBlockNumber(stackEntry->buffer),
452 snapshot);
453 if (GinGetNPosting(itup) > 0)
454 {
455 entry->list = ginReadTuple(ginstate, entry->attnum, itup,
456 &entry->nlist);
457 entry->predictNumberResult = entry->nlist;
458
459 entry->isFinished = false;
460 }
461 }
462 }
463 else
464 {
465 /*
466 * No entry found. Predicate lock the leaf page, to lock the place
467 * where the entry would've been, had there been one.
468 */
469 PredicateLockPage(ginstate->index,
470 BufferGetBlockNumber(stackEntry->buffer), snapshot);
471 }
472
473 if (needUnlock)
474 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
475 freeGinBtreeStack(stackEntry);
476}
477
478/*
479 * Comparison function for scan entry indexes. Sorts by predictNumberResult,
480 * least frequent items first.
481 */
482static int
483entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
484{
485 const GinScanKey key = (const GinScanKey) arg;
486 int i1 = *(const int *) a1;
487 int i2 = *(const int *) a2;
488 uint32 n1 = key->scanEntry[i1]->predictNumberResult;
489 uint32 n2 = key->scanEntry[i2]->predictNumberResult;
490
491 if (n1 < n2)
492 return -1;
493 else if (n1 == n2)
494 return 0;
495 else
496 return 1;
497}
498
499static void
500startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
501{
502 MemoryContext oldCtx = CurrentMemoryContext;
503 int i;
504 int j;
505 int *entryIndexes;
506
507 ItemPointerSetMin(&key->curItem);
508 key->curItemMatches = false;
509 key->recheckCurItem = false;
510 key->isFinished = false;
511
512 /*
513 * Divide the entries into two distinct sets: required and additional.
514 * Additional entries are not enough for a match alone, without any items
515 * from the required set, but are needed by the consistent function to
516 * decide if an item matches. When scanning, we can skip over items from
517 * additional entries that have no corresponding matches in any of the
518 * required entries. That speeds up queries like "frequent & rare"
519 * considerably, if the frequent term can be put in the additional set.
520 *
	 * There can be many legal ways to divide the entries into these two
	 * sets.  A conservative division is to just put everything in the required
523 * set, but the more you can put in the additional set, the more you can
524 * skip during the scan. To maximize skipping, we try to put as many
525 * frequent items as possible into additional, and less frequent ones into
526 * required. To do that, sort the entries by frequency
527 * (predictNumberResult), and put entries into the required set in that
528 * order, until the consistent function says that none of the remaining
529 * entries can form a match, without any items from the required set. The
530 * rest go to the additional set.
531 */
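	/*
	 * For example (an illustrative sketch, not a case drawn from the
	 * regression tests): for a two-entry AND-style key "rare & frequent",
	 * the sort by predictNumberResult puts "rare" first.  With entryRes set
	 * to FALSE for "rare" and MAYBE for "frequent", the tri-state consistent
	 * function returns FALSE, so the loop stops after making only "rare"
	 * required; "frequent" lands in the additional set, and the scan can
	 * skip its items whenever "rare" has no match nearby.
	 */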
532 if (key->nentries > 1)
533 {
534 MemoryContextSwitchTo(so->tempCtx);
535
536 entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
537 for (i = 0; i < key->nentries; i++)
538 entryIndexes[i] = i;
539 qsort_arg(entryIndexes, key->nentries, sizeof(int),
540 entryIndexByFrequencyCmp, key);
541
542 for (i = 0; i < key->nentries - 1; i++)
543 {
544 /* Pass all entries <= i as FALSE, and the rest as MAYBE */
545 for (j = 0; j <= i; j++)
546 key->entryRes[entryIndexes[j]] = GIN_FALSE;
547 for (j = i + 1; j < key->nentries; j++)
548 key->entryRes[entryIndexes[j]] = GIN_MAYBE;
549
550 if (key->triConsistentFn(key) == GIN_FALSE)
551 break;
552 }
553 /* i is now the last required entry. */
554
555 MemoryContextSwitchTo(so->keyCtx);
556
557 key->nrequired = i + 1;
558 key->nadditional = key->nentries - key->nrequired;
559 key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
560 key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
561
562 j = 0;
563 for (i = 0; i < key->nrequired; i++)
564 key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
565 for (i = 0; i < key->nadditional; i++)
566 key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
567
568 /* clean up after consistentFn calls (also frees entryIndexes) */
569 MemoryContextReset(so->tempCtx);
570 }
571 else
572 {
573 MemoryContextSwitchTo(so->keyCtx);
574
575 key->nrequired = 1;
576 key->nadditional = 0;
577 key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
578 key->requiredEntries[0] = key->scanEntry[0];
579 }
580 MemoryContextSwitchTo(oldCtx);
581}
582
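/*
 * Set up the scan for fetching: initialize every scan entry, apply the
 * gin_fuzzy_search_limit reduction when it is enabled, and then finish
 * initializing the scan keys with startScanKey(), which needs the entries'
 * frequency estimates.
 */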
583static void
584startScan(IndexScanDesc scan)
585{
586 GinScanOpaque so = (GinScanOpaque) scan->opaque;
587 GinState *ginstate = &so->ginstate;
588 uint32 i;
589
590 for (i = 0; i < so->totalentries; i++)
591 startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);
592
593 if (GinFuzzySearchLimit > 0)
594 {
		/*
		 * If all the entries' predicted result counts are above the
		 * threshold, we will try to reduce the result set.  We hope (and
		 * only hope; for the intersection operation of an array query our
		 * supposition isn't necessarily true) that the total result will be
		 * no larger than the smallest predictNumberResult.
		 */
601 bool reduce = true;
602
603 for (i = 0; i < so->totalentries; i++)
604 {
605 if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
606 {
607 reduce = false;
608 break;
609 }
610 }
611 if (reduce)
612 {
613 for (i = 0; i < so->totalentries; i++)
614 {
615 so->entries[i]->predictNumberResult /= so->totalentries;
616 so->entries[i]->reduceResult = true;
617 }
618 }
619 }
620
621 /*
622 * Now that we have the estimates for the entry frequencies, finish
623 * initializing the scan keys.
624 */
625 for (i = 0; i < so->nkeys; i++)
626 startScanKey(ginstate, so, so->keys + i);
627}
628
629/*
630 * Load the next batch of item pointers from a posting tree.
631 *
632 * Note that we copy the page into GinScanEntry->list array and unlock it, but
633 * keep it pinned to prevent interference with vacuum.
634 */
635static void
636entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
637 ItemPointerData advancePast, Snapshot snapshot)
638{
639 Page page;
640 int i;
641 bool stepright;
642
643 if (!BufferIsValid(entry->buffer))
644 {
645 entry->isFinished = true;
646 return;
647 }
648
649 /*
650 * We have two strategies for finding the correct page: step right from
651 * the current page, or descend the tree again from the root. If
652 * advancePast equals the current item, the next matching item should be
653 * on the next page, so we step right. Otherwise, descend from root.
654 */
655 if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
656 {
657 stepright = true;
658 LockBuffer(entry->buffer, GIN_SHARE);
659 }
660 else
661 {
662 GinBtreeStack *stack;
663
664 ReleaseBuffer(entry->buffer);
665
666 /*
667 * Set the search key, and find the correct leaf page.
668 */
669 if (ItemPointerIsLossyPage(&advancePast))
670 {
671 ItemPointerSet(&entry->btree.itemptr,
672 GinItemPointerGetBlockNumber(&advancePast) + 1,
673 FirstOffsetNumber);
674 }
675 else
676 {
677 ItemPointerSet(&entry->btree.itemptr,
678 GinItemPointerGetBlockNumber(&advancePast),
679 OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
680 }
681 entry->btree.fullScan = false;
682 stack = ginFindLeafPage(&entry->btree, true, false, snapshot);
683
684 /* we don't need the stack, just the buffer. */
685 entry->buffer = stack->buffer;
686 IncrBufferRefCount(entry->buffer);
687 freeGinBtreeStack(stack);
688 stepright = false;
689 }
690
691 elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
692 GinItemPointerGetBlockNumber(&advancePast),
693 GinItemPointerGetOffsetNumber(&advancePast),
694 !stepright);
695
696 page = BufferGetPage(entry->buffer);
697 for (;;)
698 {
699 entry->offset = InvalidOffsetNumber;
700 if (entry->list)
701 {
702 pfree(entry->list);
703 entry->list = NULL;
704 entry->nlist = 0;
705 }
706
707 if (stepright)
708 {
709 /*
710 * We've processed all the entries on this page. If it was the
711 * last page in the tree, we're done.
712 */
713 if (GinPageRightMost(page))
714 {
715 UnlockReleaseBuffer(entry->buffer);
716 entry->buffer = InvalidBuffer;
717 entry->isFinished = true;
718 return;
719 }
720
			/*
			 * Step to the next page, following the right link.  Then find
			 * the first ItemPointer greater than advancePast.
			 */
725 entry->buffer = ginStepRight(entry->buffer,
726 ginstate->index,
727 GIN_SHARE);
728 page = BufferGetPage(entry->buffer);
729 }
730 stepright = true;
731
732 if (GinPageGetOpaque(page)->flags & GIN_DELETED)
733 continue; /* page was deleted by concurrent vacuum */
734
735 /*
736 * The first item > advancePast might not be on this page, but
737 * somewhere to the right, if the page was split, or a non-match from
738 * another key in the query allowed us to skip some items from this
739 * entry. Keep following the right-links until we re-find the correct
740 * page.
741 */
742 if (!GinPageRightMost(page) &&
743 ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
744 {
			/*
			 * The item we're looking for is > the right bound of this page,
			 * so it can't be on this page.
			 */
749 continue;
750 }
751
752 entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
753
754 for (i = 0; i < entry->nlist; i++)
755 {
756 if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
757 {
758 entry->offset = i;
759
760 if (GinPageRightMost(page))
761 {
762 /* after processing the copied items, we're done. */
763 UnlockReleaseBuffer(entry->buffer);
764 entry->buffer = InvalidBuffer;
765 }
766 else
767 LockBuffer(entry->buffer, GIN_UNLOCK);
768 return;
769 }
770 }
771 }
772}
773
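/*
 * Support for gin_fuzzy_search_limit: gin_rand() produces a uniform random
 * value in [0, 1], and dropItem(e) randomly discards items from entry e,
 * keeping each one with probability GinFuzzySearchLimit /
 * e->predictNumberResult; this implements the soft limit on the number of
 * returned results.
 */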
#define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
776
777/*
778 * Sets entry->curItem to next heap item pointer > advancePast, for one entry
779 * of one scan key, or sets entry->isFinished to true if there are no more.
780 *
781 * Item pointers are returned in ascending order.
782 *
783 * Note: this can return a "lossy page" item pointer, indicating that the
784 * entry potentially matches all items on that heap page. However, it is
785 * not allowed to return both a lossy page pointer and exact (regular)
786 * item pointers for the same page. (Doing so would break the key-combination
787 * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
788 * current implementation this is guaranteed by the behavior of tidbitmaps.
789 */
790static void
791entryGetItem(GinState *ginstate, GinScanEntry entry,
792 ItemPointerData advancePast, Snapshot snapshot)
793{
794 Assert(!entry->isFinished);
795
796 Assert(!ItemPointerIsValid(&entry->curItem) ||
797 ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
798
799 if (entry->matchBitmap)
800 {
801 /* A bitmap result */
802 BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
803 OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
804 bool gotitem = false;
805
806 do
807 {
808 /*
809 * If we've exhausted all items on this block, move to next block
810 * in the bitmap.
811 */
812 while (entry->matchResult == NULL ||
813 (entry->matchResult->ntuples >= 0 &&
814 entry->offset >= entry->matchResult->ntuples) ||
815 entry->matchResult->blockno < advancePastBlk ||
816 (ItemPointerIsLossyPage(&advancePast) &&
817 entry->matchResult->blockno == advancePastBlk))
818 {
819 entry->matchResult = tbm_iterate(entry->matchIterator);
820
821 if (entry->matchResult == NULL)
822 {
823 ItemPointerSetInvalid(&entry->curItem);
824 tbm_end_iterate(entry->matchIterator);
825 entry->matchIterator = NULL;
826 entry->isFinished = true;
827 break;
828 }
829
				/*
				 * Reset the offset counter to the beginning of
				 * entry->matchResult.  Note: entry->offset is still greater
				 * than matchResult->ntuples if matchResult is lossy.  So, on
				 * the next call we will get the next result from the
				 * TIDBitmap.
				 */
836 entry->offset = 0;
837 }
838 if (entry->isFinished)
839 break;
840
841 /*
842 * We're now on the first page after advancePast which has any
843 * items on it. If it's a lossy result, return that.
844 */
845 if (entry->matchResult->ntuples < 0)
846 {
847 ItemPointerSetLossyPage(&entry->curItem,
848 entry->matchResult->blockno);
849
				/*
				 * We might as well fall out of the loop; we could not
				 * estimate the number of results on this page, so correct
				 * result reduction isn't possible here even if it's enabled.
				 */
855 gotitem = true;
856 break;
857 }
858
859 /*
860 * Not a lossy page. Skip over any offsets <= advancePast, and
861 * return that.
862 */
863 if (entry->matchResult->blockno == advancePastBlk)
864 {
865 /*
866 * First, do a quick check against the last offset on the
867 * page. If that's > advancePast, so are all the other
868 * offsets.
869 */
870 if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
871 {
872 entry->offset = entry->matchResult->ntuples;
873 continue;
874 }
875
876 /* Otherwise scan to find the first item > advancePast */
877 while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
878 entry->offset++;
879 }
880
881 ItemPointerSet(&entry->curItem,
882 entry->matchResult->blockno,
883 entry->matchResult->offsets[entry->offset]);
884 entry->offset++;
885 gotitem = true;
886 } while (!gotitem || (entry->reduceResult == true && dropItem(entry)));
887 }
888 else if (!BufferIsValid(entry->buffer))
889 {
890 /*
891 * A posting list from an entry tuple, or the last page of a posting
892 * tree.
893 */
894 do
895 {
896 if (entry->offset >= entry->nlist)
897 {
898 ItemPointerSetInvalid(&entry->curItem);
899 entry->isFinished = true;
900 break;
901 }
902
903 entry->curItem = entry->list[entry->offset++];
904 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
905 /* XXX: shouldn't we apply the fuzzy search limit here? */
906 }
907 else
908 {
909 /* A posting tree */
910 do
911 {
912 /* If we've processed the current batch, load more items */
913 while (entry->offset >= entry->nlist)
914 {
915 entryLoadMoreItems(ginstate, entry, advancePast, snapshot);
916
917 if (entry->isFinished)
918 {
919 ItemPointerSetInvalid(&entry->curItem);
920 return;
921 }
922 }
923
924 entry->curItem = entry->list[entry->offset++];
925
926 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
927 (entry->reduceResult == true && dropItem(entry)));
928 }
929}
930
931/*
932 * Identify the "current" item among the input entry streams for this scan key
933 * that is greater than advancePast, and test whether it passes the scan key
934 * qual condition.
935 *
936 * The current item is the smallest curItem among the inputs. key->curItem
937 * is set to that value. key->curItemMatches is set to indicate whether that
938 * TID passes the consistentFn test. If so, key->recheckCurItem is set true
939 * iff recheck is needed for this item pointer (including the case where the
940 * item pointer is a lossy page pointer).
941 *
942 * If all entry streams are exhausted, sets key->isFinished to true.
943 *
944 * Item pointers must be returned in ascending order.
945 *
946 * Note: this can return a "lossy page" item pointer, indicating that the
947 * key potentially matches all items on that heap page. However, it is
948 * not allowed to return both a lossy page pointer and exact (regular)
949 * item pointers for the same page. (Doing so would break the key-combination
950 * logic in scanGetItem.)
951 */
952static void
953keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
954 ItemPointerData advancePast, Snapshot snapshot)
955{
956 ItemPointerData minItem;
957 ItemPointerData curPageLossy;
958 uint32 i;
959 bool haveLossyEntry;
960 GinScanEntry entry;
961 GinTernaryValue res;
962 MemoryContext oldCtx;
963 bool allFinished;
964
965 Assert(!key->isFinished);
966
967 /*
968 * We might have already tested this item; if so, no need to repeat work.
969 * (Note: the ">" case can happen, if advancePast is exact but we
970 * previously had to set curItem to a lossy-page pointer.)
971 */
972 if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
973 return;
974
975 /*
976 * Find the minimum item > advancePast among the active entry streams.
977 *
978 * Note: a lossy-page entry is encoded by a ItemPointer with max value for
979 * offset (0xffff), so that it will sort after any exact entries for the
980 * same page. So we'll prefer to return exact pointers not lossy
981 * pointers, which is good.
982 */
983 ItemPointerSetMax(&minItem);
984 allFinished = true;
985 for (i = 0; i < key->nrequired; i++)
986 {
987 entry = key->requiredEntries[i];
988
989 if (entry->isFinished)
990 continue;
991
992 /*
993 * Advance this stream if necessary.
994 *
995 * In particular, since entry->curItem was initialized with
996 * ItemPointerSetMin, this ensures we fetch the first item for each
997 * entry on the first call.
998 */
999 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1000 {
1001 entryGetItem(ginstate, entry, advancePast, snapshot);
1002 if (entry->isFinished)
1003 continue;
1004 }
1005
1006 allFinished = false;
1007 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
1008 minItem = entry->curItem;
1009 }
1010
1011 if (allFinished)
1012 {
1013 /* all entries are finished */
1014 key->isFinished = true;
1015 return;
1016 }
1017
1018 /*
1019 * Ok, we now know that there are no matches < minItem.
1020 *
1021 * If minItem is lossy, it means that there were no exact items on the
1022 * page among requiredEntries, because lossy pointers sort after exact
1023 * items. However, there might be exact items for the same page among
1024 * additionalEntries, so we mustn't advance past them.
1025 */
1026 if (ItemPointerIsLossyPage(&minItem))
1027 {
1028 if (GinItemPointerGetBlockNumber(&advancePast) <
1029 GinItemPointerGetBlockNumber(&minItem))
1030 {
1031 ItemPointerSet(&advancePast,
1032 GinItemPointerGetBlockNumber(&minItem),
1033 InvalidOffsetNumber);
1034 }
1035 }
1036 else
1037 {
1038 Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
1039 ItemPointerSet(&advancePast,
1040 GinItemPointerGetBlockNumber(&minItem),
1041 OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
1042 }
1043
1044 /*
1045 * We might not have loaded all the entry streams for this TID yet. We
1046 * could call the consistent function, passing MAYBE for those entries, to
1047 * see if it can decide if this TID matches based on the information we
1048 * have. But if the consistent-function is expensive, and cannot in fact
1049 * decide with partial information, that could be a big loss. So, load all
1050 * the additional entries, before calling the consistent function.
1051 */
1052 for (i = 0; i < key->nadditional; i++)
1053 {
1054 entry = key->additionalEntries[i];
1055
1056 if (entry->isFinished)
1057 continue;
1058
1059 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1060 {
1061 entryGetItem(ginstate, entry, advancePast, snapshot);
1062 if (entry->isFinished)
1063 continue;
1064 }
1065
1066 /*
1067 * Normally, none of the items in additionalEntries can have a curItem
1068 * larger than minItem. But if minItem is a lossy page, then there
1069 * might be exact items on the same page among additionalEntries.
1070 */
1071 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
1072 {
1073 Assert(ItemPointerIsLossyPage(&minItem));
1074 minItem = entry->curItem;
1075 }
1076 }
1077
1078 /*
1079 * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
1080 * and perform consistentFn test.
1081 *
1082 * Lossy-page entries pose a problem, since we don't know the correct
1083 * entryRes state to pass to the consistentFn, and we also don't know what
1084 * its combining logic will be (could be AND, OR, or even NOT). If the
1085 * logic is OR then the consistentFn might succeed for all items in the
1086 * lossy page even when none of the other entries match.
1087 *
1088 * Our strategy is to call the tri-state consistent function, with the
1089 * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
1090 * returns FALSE, none of the lossy items alone are enough for a match, so
1091 * we don't need to return a lossy-page pointer. Otherwise, return a
1092 * lossy-page pointer to indicate that the whole heap page must be
1093 * checked. (On subsequent calls, we'll do nothing until minItem is past
1094 * the page altogether, thus ensuring that we never return both regular
1095 * and lossy pointers for the same page.)
1096 *
1097 * An exception is that it doesn't matter what we pass for lossy pointers
1098 * in "hidden" entries, because the consistentFn's result can't depend on
1099 * them. We could pass them as MAYBE as well, but if we're using the
1100 * "shim" implementation of a tri-state consistent function (see
1101 * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
1102 * them as true.
1103 *
1104 * Note that only lossy-page entries pointing to the current item's page
1105 * should trigger this processing; we might have future lossy pages in the
1106 * entry array, but they aren't relevant yet.
1107 */
1108 key->curItem = minItem;
1109 ItemPointerSetLossyPage(&curPageLossy,
1110 GinItemPointerGetBlockNumber(&key->curItem));
1111 haveLossyEntry = false;
1112 for (i = 0; i < key->nentries; i++)
1113 {
1114 entry = key->scanEntry[i];
1115 if (entry->isFinished == false &&
1116 ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1117 {
1118 if (i < key->nuserentries)
1119 key->entryRes[i] = GIN_MAYBE;
1120 else
1121 key->entryRes[i] = GIN_TRUE;
1122 haveLossyEntry = true;
1123 }
1124 else
1125 key->entryRes[i] = GIN_FALSE;
1126 }
1127
1128 /* prepare for calling consistentFn in temp context */
1129 oldCtx = MemoryContextSwitchTo(tempCtx);
1130
1131 if (haveLossyEntry)
1132 {
1133 /* Have lossy-page entries, so see if whole page matches */
1134 res = key->triConsistentFn(key);
1135
1136 if (res == GIN_TRUE || res == GIN_MAYBE)
1137 {
1138 /* Yes, so clean up ... */
1139 MemoryContextSwitchTo(oldCtx);
1140 MemoryContextReset(tempCtx);
1141
1142 /* and return lossy pointer for whole page */
1143 key->curItem = curPageLossy;
1144 key->curItemMatches = true;
1145 key->recheckCurItem = true;
1146 return;
1147 }
1148 }
1149
1150 /*
1151 * At this point we know that we don't need to return a lossy whole-page
1152 * pointer, but we might have matches for individual exact item pointers,
1153 * possibly in combination with a lossy pointer. Pass lossy pointers as
1154 * MAYBE to the ternary consistent function, to let it decide if this
1155 * tuple satisfies the overall key, even though we don't know if the lossy
1156 * entries match.
1157 *
1158 * Prepare entryRes array to be passed to consistentFn.
1159 */
1160 for (i = 0; i < key->nentries; i++)
1161 {
1162 entry = key->scanEntry[i];
1163 if (entry->isFinished)
1164 key->entryRes[i] = GIN_FALSE;
1165#if 0
1166
1167 /*
1168 * This case can't currently happen, because we loaded all the entries
1169 * for this item earlier.
1170 */
1171 else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1172 key->entryRes[i] = GIN_MAYBE;
1173#endif
1174 else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1175 key->entryRes[i] = GIN_MAYBE;
1176 else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
1177 key->entryRes[i] = GIN_TRUE;
1178 else
1179 key->entryRes[i] = GIN_FALSE;
1180 }
1181
1182 res = key->triConsistentFn(key);
1183
1184 switch (res)
1185 {
1186 case GIN_TRUE:
1187 key->curItemMatches = true;
1188 /* triConsistentFn set recheckCurItem */
1189 break;
1190
1191 case GIN_FALSE:
1192 key->curItemMatches = false;
1193 break;
1194
1195 case GIN_MAYBE:
1196 key->curItemMatches = true;
1197 key->recheckCurItem = true;
1198 break;
1199
1200 default:
1201
1202 /*
1203 * the 'default' case shouldn't happen, but if the consistent
1204 * function returns something bogus, this is the safe result
1205 */
1206 key->curItemMatches = true;
1207 key->recheckCurItem = true;
1208 break;
1209 }
1210
1211 /*
1212 * We have a tuple, and we know if it matches or not. If it's a non-match,
1213 * we could continue to find the next matching tuple, but let's break out
1214 * and give scanGetItem a chance to advance the other keys. They might be
1215 * able to skip past to a much higher TID, allowing us to save work.
1216 */
1217
1218 /* clean up after consistentFn calls */
1219 MemoryContextSwitchTo(oldCtx);
1220 MemoryContextReset(tempCtx);
1221}
1222
1223/*
1224 * Get next heap item pointer (after advancePast) from scan.
1225 * Returns true if anything found.
1226 * On success, *item and *recheck are set.
1227 *
1228 * Note: this is very nearly the same logic as in keyGetItem(), except
1229 * that we know the keys are to be combined with AND logic, whereas in
1230 * keyGetItem() the combination logic is known only to the consistentFn.
1231 */
1232static bool
1233scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
1234 ItemPointerData *item, bool *recheck)
1235{
1236 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1237 uint32 i;
1238 bool match;
1239
1240 /*----------
1241 * Advance the scan keys in lock-step, until we find an item that matches
1242 * all the keys. If any key reports isFinished, meaning its subset of the
1243 * entries is exhausted, we can stop. Otherwise, set *item to the next
1244 * matching item.
1245 *
1246 * This logic works only if a keyGetItem stream can never contain both
1247 * exact and lossy pointers for the same page. Else we could have a
1248 * case like
1249 *
1250 * stream 1 stream 2
1251 * ... ...
1252 * 42/6 42/7
1253 * 50/1 42/0xffff
1254 * ... ...
1255 *
1256 * We would conclude that 42/6 is not a match and advance stream 1,
1257 * thus never detecting the match to the lossy pointer in stream 2.
1258 * (keyGetItem has a similar problem versus entryGetItem.)
1259 *----------
1260 */
1261 do
1262 {
1263 ItemPointerSetMin(item);
1264 match = true;
1265 for (i = 0; i < so->nkeys && match; i++)
1266 {
1267 GinScanKey key = so->keys + i;
1268
1269 /* Fetch the next item for this key that is > advancePast. */
1270 keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
1271 scan->xs_snapshot);
1272
1273 if (key->isFinished)
1274 return false;
1275
1276 /*
1277 * If it's not a match, we can immediately conclude that nothing
1278 * <= this item matches, without checking the rest of the keys.
1279 */
1280 if (!key->curItemMatches)
1281 {
1282 advancePast = key->curItem;
1283 match = false;
1284 break;
1285 }
1286
1287 /*
1288 * It's a match. We can conclude that nothing < matches, so the
1289 * other key streams can skip to this item.
1290 *
1291 * Beware of lossy pointers, though; from a lossy pointer, we can
1292 * only conclude that nothing smaller than this *block* matches.
1293 */
1294 if (ItemPointerIsLossyPage(&key->curItem))
1295 {
1296 if (GinItemPointerGetBlockNumber(&advancePast) <
1297 GinItemPointerGetBlockNumber(&key->curItem))
1298 {
1299 ItemPointerSet(&advancePast,
1300 GinItemPointerGetBlockNumber(&key->curItem),
1301 InvalidOffsetNumber);
1302 }
1303 }
1304 else
1305 {
1306 Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
1307 ItemPointerSet(&advancePast,
1308 GinItemPointerGetBlockNumber(&key->curItem),
1309 OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
1310 }
1311
1312 /*
1313 * If this is the first key, remember this location as a potential
1314 * match, and proceed to check the rest of the keys.
1315 *
1316 * Otherwise, check if this is the same item that we checked the
1317 * previous keys for (or a lossy pointer for the same page). If
1318 * not, loop back to check the previous keys for this item (we
1319 * will check this key again too, but keyGetItem returns quickly
1320 * for that)
1321 */
1322 if (i == 0)
1323 {
1324 *item = key->curItem;
1325 }
1326 else
1327 {
1328 if (ItemPointerIsLossyPage(&key->curItem) ||
1329 ItemPointerIsLossyPage(item))
1330 {
1331 Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
1332 match = (GinItemPointerGetBlockNumber(&key->curItem) ==
1333 GinItemPointerGetBlockNumber(item));
1334 }
1335 else
1336 {
1337 Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
1338 match = (ginCompareItemPointers(&key->curItem, item) == 0);
1339 }
1340 }
1341 }
1342 } while (!match);
1343
1344 Assert(!ItemPointerIsMin(item));
1345
1346 /*
1347 * Now *item contains the first ItemPointer after previous result that
1348 * satisfied all the keys for that exact TID, or a lossy reference to the
1349 * same page.
1350 *
1351 * We must return recheck = true if any of the keys are marked recheck.
1352 */
1353 *recheck = false;
1354 for (i = 0; i < so->nkeys; i++)
1355 {
1356 GinScanKey key = so->keys + i;
1357
1358 if (key->recheckCurItem)
1359 {
1360 *recheck = true;
1361 break;
1362 }
1363 }
1364
1365 return true;
1366}
1367
1368
1369/*
1370 * Functions for scanning the pending list
1371 */
1372
1373
/*
 * Get the ItemPointer of the next heap row to be checked from the pending
 * list.  Returns false if there are no more.  On pages containing several
 * heap rows it returns each row separately; on a page containing only part
 * of a heap row it returns that page's data.  pos->firstOffset and
 * pos->lastOffset are set to identify the range of pending-list tuples
 * belonging to this heap row.
 *
 * The pendingBuffer is presumed pinned and share-locked on entry, and is
 * pinned and share-locked on success exit.  On failure exit it's released.
 */
1384static bool
1385scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
1386{
1387 OffsetNumber maxoff;
1388 Page page;
1389 IndexTuple itup;
1390
1391 ItemPointerSetInvalid(&pos->item);
1392 for (;;)
1393 {
1394 page = BufferGetPage(pos->pendingBuffer);
1395 TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1396
1397 maxoff = PageGetMaxOffsetNumber(page);
1398 if (pos->firstOffset > maxoff)
1399 {
1400 BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
1401
1402 if (blkno == InvalidBlockNumber)
1403 {
1404 UnlockReleaseBuffer(pos->pendingBuffer);
1405 pos->pendingBuffer = InvalidBuffer;
1406
1407 return false;
1408 }
1409 else
1410 {
				/*
				 * Here we must prevent deletion of the next page by the
				 * insert-cleanup process, which may be trying to obtain an
				 * exclusive lock on the current page.  So, we lock the next
				 * page before releasing the current one.
				 */
1417 Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno);
1418
1419 LockBuffer(tmpbuf, GIN_SHARE);
1420 UnlockReleaseBuffer(pos->pendingBuffer);
1421
1422 pos->pendingBuffer = tmpbuf;
1423 pos->firstOffset = FirstOffsetNumber;
1424 }
1425 }
1426 else
1427 {
1428 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
1429 pos->item = itup->t_tid;
1430 if (GinPageHasFullRow(page))
1431 {
				/*
				 * Find the item pointer of the next heap row.
				 */
1435 for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
1436 {
1437 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
1438 if (!ItemPointerEquals(&pos->item, &itup->t_tid))
1439 break;
1440 }
1441 }
1442 else
1443 {
1444 /*
1445 * All itempointers are the same on this page
1446 */
1447 pos->lastOffset = maxoff + 1;
1448 }
1449
			/*
			 * Now pos->firstOffset points to the first tuple of the current
			 * heap row, and pos->lastOffset points to the first tuple of the
			 * next heap row (or to the end of the page).
			 */
1455 break;
1456 }
1457 }
1458
1459 return true;
1460}
1461
/*
 * Scan a pending-list page from the current tuple (off) until the first of:
 * - a match is found (then return true)
 * - no later match is possible
 * - the tuple's attribute number is not equal to the entry's attrnum
 * - the end of the page is reached
 *
 * The datum[]/category[]/datumExtracted[] arrays are used to cache the
 * results of gintuple_get_key() on the current page.
 */
1472static bool
1473matchPartialInPendingList(GinState *ginstate, Page page,
1474 OffsetNumber off, OffsetNumber maxoff,
1475 GinScanEntry entry,
1476 Datum *datum, GinNullCategory *category,
1477 bool *datumExtracted)
1478{
1479 IndexTuple itup;
1480 int32 cmp;
1481
1482 /* Partial match to a null is not possible */
1483 if (entry->queryCategory != GIN_CAT_NORM_KEY)
1484 return false;
1485
1486 while (off < maxoff)
1487 {
1488 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
1489
1490 if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
1491 return false;
1492
1493 if (datumExtracted[off - 1] == false)
1494 {
1495 datum[off - 1] = gintuple_get_key(ginstate, itup,
1496 &category[off - 1]);
1497 datumExtracted[off - 1] = true;
1498 }
1499
1500 /* Once we hit nulls, no further match is possible */
1501 if (category[off - 1] != GIN_CAT_NORM_KEY)
1502 return false;
1503
		/*----------
		 * Check for partial match.
		 * case cmp == 0 => match
		 * case cmp > 0  => no match, and we can end the scan (no later match
		 *				   is possible)
		 * case cmp < 0  => no match, continue the scan
		 *----------
		 */
1511 cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
1512 ginstate->supportCollation[entry->attnum - 1],
1513 entry->queryKey,
1514 datum[off - 1],
1515 UInt16GetDatum(entry->strategy),
1516 PointerGetDatum(entry->extra_data)));
1517 if (cmp == 0)
1518 return true;
1519 else if (cmp > 0)
1520 return false;
1521
1522 off++;
1523 }
1524
1525 return false;
1526}
1527
1528/*
1529 * Set up the entryRes array for each key by looking at
1530 * every entry for current heap row in pending list.
1531 *
1532 * Returns true if each scan key has at least one entryRes match.
1533 * This corresponds to the situations where the normal index search will
1534 * try to apply the key's consistentFn. (A tuple not meeting that requirement
1535 * cannot be returned by the normal search since no entry stream will
1536 * source its TID.)
1537 *
1538 * The pendingBuffer is presumed pinned and share-locked on entry.
1539 */
1540static bool
1541collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
1542{
1543 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1544 OffsetNumber attrnum;
1545 Page page;
1546 IndexTuple itup;
1547 int i,
1548 j;
1549
1550 /*
1551 * Reset all entryRes and hasMatchKey flags
1552 */
1553 for (i = 0; i < so->nkeys; i++)
1554 {
1555 GinScanKey key = so->keys + i;
1556
1557 memset(key->entryRes, GIN_FALSE, key->nentries);
1558 }
1559 memset(pos->hasMatchKey, false, so->nkeys);
1560
1561 /*
1562 * Outer loop iterates over multiple pending-list pages when a single heap
1563 * row has entries spanning those pages.
1564 */
1565 for (;;)
1566 {
1567 Datum datum[BLCKSZ / sizeof(IndexTupleData)];
1568 GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
1569 bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
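		/*
		 * These arrays are indexed by page offset minus one;
		 * BLCKSZ / sizeof(IndexTupleData) is an upper bound on the number of
		 * tuples that can fit on one pending-list page.
		 */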
1570
1571 Assert(pos->lastOffset > pos->firstOffset);
1572 memset(datumExtracted + pos->firstOffset - 1, 0,
1573 sizeof(bool) * (pos->lastOffset - pos->firstOffset));
1574
1575 page = BufferGetPage(pos->pendingBuffer);
1576 TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1577
1578 for (i = 0; i < so->nkeys; i++)
1579 {
1580 GinScanKey key = so->keys + i;
1581
1582 for (j = 0; j < key->nentries; j++)
1583 {
1584 GinScanEntry entry = key->scanEntry[j];
1585 OffsetNumber StopLow = pos->firstOffset,
1586 StopHigh = pos->lastOffset,
1587 StopMiddle;
1588
1589 /* If already matched on earlier page, do no extra work */
1590 if (key->entryRes[j])
1591 continue;
1592
				/*
				 * The tuples of interest run from pos->firstOffset to
				 * pos->lastOffset, and they are ordered by (attnum, Datum)
				 * just as in the entry tree.  So we can use a binary search
				 * to avoid a linear scan.
				 */
1599 while (StopLow < StopHigh)
1600 {
1601 int res;
1602
1603 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1604
1605 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
1606
1607 attrnum = gintuple_get_attrnum(&so->ginstate, itup);
1608
1609 if (key->attnum < attrnum)
1610 {
1611 StopHigh = StopMiddle;
1612 continue;
1613 }
1614 if (key->attnum > attrnum)
1615 {
1616 StopLow = StopMiddle + 1;
1617 continue;
1618 }
1619
1620 if (datumExtracted[StopMiddle - 1] == false)
1621 {
1622 datum[StopMiddle - 1] =
1623 gintuple_get_key(&so->ginstate, itup,
1624 &category[StopMiddle - 1]);
1625 datumExtracted[StopMiddle - 1] = true;
1626 }
1627
1628 if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
1629 {
1630 /* special behavior depending on searchMode */
1631 if (entry->searchMode == GIN_SEARCH_MODE_ALL)
1632 {
1633 /* match anything except NULL_ITEM */
1634 if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
1635 res = -1;
1636 else
1637 res = 0;
1638 }
1639 else
1640 {
1641 /* match everything */
1642 res = 0;
1643 }
1644 }
1645 else
1646 {
1647 res = ginCompareEntries(&so->ginstate,
1648 entry->attnum,
1649 entry->queryKey,
1650 entry->queryCategory,
1651 datum[StopMiddle - 1],
1652 category[StopMiddle - 1]);
1653 }
1654
1655 if (res == 0)
1656 {
1657 /*
1658 * Found exact match (there can be only one, except in
1659 * EMPTY_QUERY mode).
1660 *
1661 * If doing partial match, scan forward from here to
1662 * end of page to check for matches.
1663 *
1664 * See comment above about tuple's ordering.
1665 */
1666 if (entry->isPartialMatch)
1667 key->entryRes[j] =
1668 matchPartialInPendingList(&so->ginstate,
1669 page,
1670 StopMiddle,
1671 pos->lastOffset,
1672 entry,
1673 datum,
1674 category,
1675 datumExtracted);
1676 else
1677 key->entryRes[j] = true;
1678
1679 /* done with binary search */
1680 break;
1681 }
1682 else if (res < 0)
1683 StopHigh = StopMiddle;
1684 else
1685 StopLow = StopMiddle + 1;
1686 }
1687
1688 if (StopLow >= StopHigh && entry->isPartialMatch)
1689 {
1690 /*
1691 * No exact match on this page. If doing partial match,
1692 * scan from the first tuple greater than target value to
1693 * end of page. Note that since we don't remember whether
1694 * the comparePartialFn told us to stop early on a
1695 * previous page, we will uselessly apply comparePartialFn
1696 * to the first tuple on each subsequent page.
1697 */
1698 key->entryRes[j] =
1699 matchPartialInPendingList(&so->ginstate,
1700 page,
1701 StopHigh,
1702 pos->lastOffset,
1703 entry,
1704 datum,
1705 category,
1706 datumExtracted);
1707 }
1708
1709 pos->hasMatchKey[i] |= key->entryRes[j];
1710 }
1711 }
1712
1713 /* Advance firstOffset over the scanned tuples */
1714 pos->firstOffset = pos->lastOffset;
1715
1716 if (GinPageHasFullRow(page))
1717 {
1718 /*
1719 * We have examined all pending entries for the current heap row.
1720 * Break out of loop over pages.
1721 */
1722 break;
1723 }
1724 else
1725 {
1726 /*
1727 * Advance to next page of pending entries for the current heap
1728 * row. Complain if there isn't one.
1729 */
1730 ItemPointerData item = pos->item;
1731
1732 if (scanGetCandidate(scan, pos) == false ||
1733 !ItemPointerEquals(&pos->item, &item))
1734 elog(ERROR, "could not find additional pending pages for same heap tuple");
1735 }
1736 }
1737
1738 /*
1739 * Now return "true" if all scan keys have at least one matching datum
1740 */
1741 for (i = 0; i < so->nkeys; i++)
1742 {
1743 if (pos->hasMatchKey[i] == false)
1744 return false;
1745 }
1746
1747 return true;
1748}
1749
1750/*
1751 * Collect all matched rows from pending list into bitmap.
1752 */
1753static void
1754scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
1755{
1756 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1757 MemoryContext oldCtx;
1758 bool recheck,
1759 match;
1760 int i;
1761 pendingPosition pos;
1762 Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
1763 Page page;
1764 BlockNumber blkno;
1765
1766 *ntids = 0;
1767
1768 /*
1769 * Acquire predicate lock on the metapage, to conflict with any fastupdate
1770 * insertions.
1771 */
1772 PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
1773
1774 LockBuffer(metabuffer, GIN_SHARE);
1775 page = BufferGetPage(metabuffer);
1776 TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1777 blkno = GinPageGetMeta(page)->head;
1778
	/*
	 * Fetch the head of the pending list before unlocking the metapage.
	 * The head page must be pinned to prevent its deletion by the vacuum
	 * process.
	 */
1783 if (blkno == InvalidBlockNumber)
1784 {
1785 /* No pending list, so proceed with normal scan */
1786 UnlockReleaseBuffer(metabuffer);
1787 return;
1788 }
1789
1790 pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
1791 LockBuffer(pos.pendingBuffer, GIN_SHARE);
1792 pos.firstOffset = FirstOffsetNumber;
1793 UnlockReleaseBuffer(metabuffer);
1794 pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);
1795
	/*
	 * Loop over each heap row.  scanGetCandidate returns either the full
	 * row or the row's tuples from its first page.
	 */
1800 while (scanGetCandidate(scan, &pos))
1801 {
1802 /*
1803 * Check entries in tuple and set up entryRes array.
1804 *
1805 * If pending tuples belonging to the current heap row are spread
1806 * across several pages, collectMatchesForHeapRow will read all of
1807 * those pages.
1808 */
1809 if (!collectMatchesForHeapRow(scan, &pos))
1810 continue;
1811
		/*
		 * Matching of this row's entries is finished, so check the row
		 * using the consistent functions.
		 */
1816 oldCtx = MemoryContextSwitchTo(so->tempCtx);
1817 recheck = false;
1818 match = true;
1819
1820 for (i = 0; i < so->nkeys; i++)
1821 {
1822 GinScanKey key = so->keys + i;
1823
1824 if (!key->boolConsistentFn(key))
1825 {
1826 match = false;
1827 break;
1828 }
1829 recheck |= key->recheckCurItem;
1830 }
1831
1832 MemoryContextSwitchTo(oldCtx);
1833 MemoryContextReset(so->tempCtx);
1834
1835 if (match)
1836 {
1837 tbm_add_tuples(tbm, &pos.item, 1, recheck);
1838 (*ntids)++;
1839 }
1840 }
1841
1842 pfree(pos.hasMatchKey);
1843}
1844
1845
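/*
 * GinIsVoidRes is true when ginNewScanKey() has determined that the query
 * cannot match anything at all, so the scan can return immediately.
 */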
#define GinIsVoidRes(s)		( ((GinScanOpaque) scan->opaque)->isVoidRes )
1847
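/*
 * gingetbitmap() is GIN's amgetbitmap routine: it is reached only through a
 * bitmap index scan (GIN does not support amgettuple).  For example, with a
 * hypothetical table and index such as
 *
 *		CREATE INDEX test_tags_idx ON test USING gin (tags);
 *		SELECT * FROM test WHERE tags @> ARRAY['foo'];
 *
 * the executor collects the TIDBitmap built here and visits the heap with a
 * bitmap heap scan, rechecking tuples where recheck was requested.
 */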
1848int64
1849gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
1850{
1851 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1852 int64 ntids;
1853 ItemPointerData iptr;
1854 bool recheck;
1855
1856 /*
1857 * Set up the scan keys, and check for unsatisfiable query.
1858 */
1859 ginFreeScanKeys(so); /* there should be no keys yet, but just to be
1860 * sure */
1861 ginNewScanKey(scan);
1862
1863 if (GinIsVoidRes(scan))
1864 return 0;
1865
1866 ntids = 0;
1867
1868 /*
1869 * First, scan the pending list and collect any matching entries into the
1870 * bitmap. After we scan a pending item, some other backend could post it
1871 * into the main index, and so we might visit it a second time during the
1872 * main scan. This is okay because we'll just re-set the same bit in the
1873 * bitmap. (The possibility of duplicate visits is a major reason why GIN
1874 * can't support the amgettuple API, however.) Note that it would not do
1875 * to scan the main index before the pending list, since concurrent
1876 * cleanup could then make us miss entries entirely.
1877 */
1878 scanPendingInsert(scan, tbm, &ntids);
1879
1880 /*
1881 * Now scan the main index.
1882 */
1883 startScan(scan);
1884
1885 ItemPointerSetMin(&iptr);
1886
1887 for (;;)
1888 {
1889 CHECK_FOR_INTERRUPTS();
1890
1891 if (!scanGetItem(scan, iptr, &iptr, &recheck))
1892 break;
1893
1894 if (ItemPointerIsLossyPage(&iptr))
1895 tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
1896 else
1897 tbm_add_tuples(tbm, &iptr, 1, recheck);
1898 ntids++;
1899 }
1900
1901 return ntids;
1902}
1903