/*-------------------------------------------------------------------------
 *
 * ginget.c
 *	  fetch tuples from a GIN scan.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *			src/backend/access/gin/ginget.c
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/gin_private.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "storage/predicate.h"
#include "utils/datum.h"
#include "utils/memutils.h"
#include "utils/rel.h"

/* GUC parameter */
int			GinFuzzySearchLimit = 0;

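/*
 * State for scanning the pending list: pendingBuffer is the pending-list
 * page currently being examined (pinned and share-locked), firstOffset and
 * lastOffset delimit the pending-list tuples belonging to the current heap
 * row, item is that heap row's TID, and hasMatchKey holds one flag per scan
 * key recording whether any of that key's entries matched the row.
 */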
typedef struct pendingPosition
{
	Buffer		pendingBuffer;
	OffsetNumber firstOffset;
	OffsetNumber lastOffset;
	ItemPointerData item;
	bool	   *hasMatchKey;
} pendingPosition;
36
37
/*
 * Moves to the next page if the current offset is past the end of the page.
 */
41static bool
42moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot)
43{
44 Page page = BufferGetPage(stack->buffer);
45
46 if (stack->off > PageGetMaxOffsetNumber(page))
47 {
		/*
		 * We've scanned the whole page, so we should move to the right page.
		 */
51 if (GinPageRightMost(page))
52 return false; /* no more pages */
53
54 stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
55 stack->blkno = BufferGetBlockNumber(stack->buffer);
56 stack->off = FirstOffsetNumber;
57 PredicateLockPage(btree->index, stack->blkno, snapshot);
58 }
59
60 return true;
61}
62
/*
 * Scan all pages of a posting tree and save all its heap ItemPointers
 * in scanEntry->matchBitmap.
 */
67static void
68scanPostingTree(Relation index, GinScanEntry scanEntry,
69 BlockNumber rootPostingTree, Snapshot snapshot)
70{
71 GinBtreeData btree;
72 GinBtreeStack *stack;
73 Buffer buffer;
74 Page page;
75
76 /* Descend to the leftmost leaf page */
77 stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot);
78 buffer = stack->buffer;
79
80 IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
81
82 freeGinBtreeStack(stack);
83
84 /*
85 * Loop iterates through all leaf pages of posting tree
86 */
87 for (;;)
88 {
89 page = BufferGetPage(buffer);
90 if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
91 {
92 int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
93
94 scanEntry->predictNumberResult += n;
95 }
96
97 if (GinPageRightMost(page))
98 break; /* no more pages */
99
100 buffer = ginStepRight(buffer, index, GIN_SHARE);
101 }
102
103 UnlockReleaseBuffer(buffer);
104}
105
/*
 * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
 * match the search entry.  This supports three different match modes:
 *
 * 1. Partial-match support: scan from current point until the
 *	  comparePartialFn says we're done.
 * 2. SEARCH_MODE_ALL: scan from current point (which should be first
 *	  key for the current attnum) until we hit null items or end of attnum.
 * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
 *	  key for the current attnum) until we hit end of attnum.
 *
 * Returns true if done, false if it's necessary to restart scan from scratch.
 */
119static bool
120collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
121 GinScanEntry scanEntry, Snapshot snapshot)
122{
123 OffsetNumber attnum;
124 Form_pg_attribute attr;
125
126 /* Initialize empty bitmap result */
127 scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
128
129 /* Null query cannot partial-match anything */
130 if (scanEntry->isPartialMatch &&
131 scanEntry->queryCategory != GIN_CAT_NORM_KEY)
132 return true;
133
134 /* Locate tupdesc entry for key column (for attbyval/attlen data) */
135 attnum = scanEntry->attnum;
136 attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1);
137
	/*
	 * Predicate-lock the entry leaf page; following pages will be locked by
	 * moveRightIfItNeeded().
	 */
142 PredicateLockPage(btree->index, stack->buffer, snapshot);
143
144 for (;;)
145 {
146 Page page;
147 IndexTuple itup;
148 Datum idatum;
149 GinNullCategory icategory;
150
		/*
		 * stack->off points to the entry of interest; the buffer is already
		 * locked.
		 */
154 if (moveRightIfItNeeded(btree, stack, snapshot) == false)
155 return true;
156
157 page = BufferGetPage(stack->buffer);
158 TestForOldSnapshot(snapshot, btree->index, page);
159 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
160
		/*
		 * If the tuple is for another attribute, stop the scan.
		 */
164 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
165 return true;
166
167 /* Safe to fetch attribute value */
168 idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
169
170 /*
171 * Check for appropriate scan stop conditions
172 */
173 if (scanEntry->isPartialMatch)
174 {
175 int32 cmp;
176
177 /*
178 * In partial match, stop scan at any null (including
179 * placeholders); partial matches never match nulls
180 */
181 if (icategory != GIN_CAT_NORM_KEY)
182 return true;
183
			/*----------
			 * Check for partial match.
			 * case cmp == 0 => match
			 * case cmp > 0  => no match, and we can stop the scan
			 * case cmp < 0  => no match, continue the scan
			 *----------
			 */
191 cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
192 btree->ginstate->supportCollation[attnum - 1],
193 scanEntry->queryKey,
194 idatum,
195 UInt16GetDatum(scanEntry->strategy),
196 PointerGetDatum(scanEntry->extra_data)));
197
198 if (cmp > 0)
199 return true;
200 else if (cmp < 0)
201 {
202 stack->off++;
203 continue;
204 }
205 }
206 else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
207 {
208 /*
209 * In ALL mode, we are not interested in null items, so we can
210 * stop if we get to a null-item placeholder (which will be the
211 * last entry for a given attnum). We do want to include NULL_KEY
212 * and EMPTY_ITEM entries, though.
213 */
214 if (icategory == GIN_CAT_NULL_ITEM)
215 return true;
216 }
217
218 /*
219 * OK, we want to return the TIDs listed in this entry.
220 */
221 if (GinIsPostingTree(itup))
222 {
223 BlockNumber rootPostingTree = GinGetPostingTree(itup);
224
			/*
			 * We must unlock the current page (but keep it pinned) during
			 * the tree scan, to prevent deadlock against vacuum processes.
			 *
			 * We save the current entry value (idatum) so that we can
			 * re-find our tuple after re-locking the page.
			 */
232 if (icategory == GIN_CAT_NORM_KEY)
233 idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
234
235 LockBuffer(stack->buffer, GIN_UNLOCK);
236
237 /*
238 * Acquire predicate lock on the posting tree. We already hold a
239 * lock on the entry page, but insertions to the posting tree
240 * don't check for conflicts on that level.
241 */
242 PredicateLockPage(btree->index, rootPostingTree, snapshot);
243
244 /* Collect all the TIDs in this entry's posting tree */
245 scanPostingTree(btree->index, scanEntry, rootPostingTree,
246 snapshot);
247
			/*
			 * Re-lock the entry page.  An insertion might have occurred
			 * while it was unlocked, so we need to re-find our position.
			 */
252 LockBuffer(stack->buffer, GIN_SHARE);
253 page = BufferGetPage(stack->buffer);
254 if (!GinPageIsLeaf(page))
255 {
				/*
				 * The root page became non-leaf while it was unlocked.  We
				 * will start over; this doesn't occur often, since the root
				 * can become non-leaf only once in the life of the index.
				 */
261 return false;
262 }
263
264 /* Search forward to re-find idatum */
265 for (;;)
266 {
267 Datum newDatum;
268 GinNullCategory newCategory;
269
270 if (moveRightIfItNeeded(btree, stack, snapshot) == false)
271 elog(ERROR, "lost saved point in index"); /* must not happen !!! */
272
273 page = BufferGetPage(stack->buffer);
274 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
275
276 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
277 elog(ERROR, "lost saved point in index"); /* must not happen !!! */
278 newDatum = gintuple_get_key(btree->ginstate, itup,
279 &newCategory);
280
281 if (ginCompareEntries(btree->ginstate, attnum,
282 newDatum, newCategory,
283 idatum, icategory) == 0)
284 break; /* Found! */
285
286 stack->off++;
287 }
288
289 if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
290 pfree(DatumGetPointer(idatum));
291 }
292 else
293 {
294 ItemPointer ipd;
295 int nipd;
296
297 ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
298 tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
299 scanEntry->predictNumberResult += GinGetNPosting(itup);
300 pfree(ipd);
301 }
302
303 /*
304 * Done with this entry, go to the next
305 */
306 stack->off++;
307 }
308}
309
/*
 * Start* functions set up the beginning state of a search: they find the
 * correct buffer and pin it.
 */
313static void
314startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
315{
316 GinBtreeData btreeEntry;
317 GinBtreeStack *stackEntry;
318 Page page;
319 bool needUnlock;
320
321restartScanEntry:
322 entry->buffer = InvalidBuffer;
323 ItemPointerSetMin(&entry->curItem);
324 entry->offset = InvalidOffsetNumber;
325 if (entry->list)
326 pfree(entry->list);
327 entry->list = NULL;
328 entry->nlist = 0;
329 entry->matchBitmap = NULL;
330 entry->matchResult = NULL;
331 entry->reduceResult = false;
332 entry->predictNumberResult = 0;
333
	/*
	 * We need to find the entry, then either begin a scan of its posting
	 * tree or just store its posting list in memory.
	 */
338 ginPrepareEntryScan(&btreeEntry, entry->attnum,
339 entry->queryKey, entry->queryCategory,
340 ginstate);
341 stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot);
342 page = BufferGetPage(stackEntry->buffer);
343
344 /* ginFindLeafPage() will have already checked snapshot age. */
345 needUnlock = true;
346
347 entry->isFinished = true;
348
349 if (entry->isPartialMatch ||
350 entry->queryCategory == GIN_CAT_EMPTY_QUERY)
351 {
352 /*
353 * btreeEntry.findItem locates the first item >= given search key.
354 * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
355 * because of the way the GIN_CAT_EMPTY_QUERY category code is
356 * assigned.) We scan forward from there and collect all TIDs needed
357 * for the entry type.
358 */
359 btreeEntry.findItem(&btreeEntry, stackEntry);
360 if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
361 == false)
362 {
			/*
			 * The GIN tree was seriously restructured, so we will clean up
			 * all the data found so far and rescan.  See comments near
			 * 'return false' in collectMatchBitmap().
			 */
368 if (entry->matchBitmap)
369 {
370 if (entry->matchIterator)
371 tbm_end_iterate(entry->matchIterator);
372 entry->matchIterator = NULL;
373 tbm_free(entry->matchBitmap);
374 entry->matchBitmap = NULL;
375 }
376 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
377 freeGinBtreeStack(stackEntry);
378 goto restartScanEntry;
379 }
380
381 if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
382 {
383 entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
384 entry->isFinished = false;
385 }
386 }
387 else if (btreeEntry.findItem(&btreeEntry, stackEntry))
388 {
389 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
390
391 if (GinIsPostingTree(itup))
392 {
393 BlockNumber rootPostingTree = GinGetPostingTree(itup);
394 GinBtreeStack *stack;
395 Page page;
396 ItemPointerData minItem;
397
398 /*
399 * This is an equality scan, so lock the root of the posting tree.
400 * It represents a lock on the exact key value, and covers all the
401 * items in the posting tree.
402 */
403 PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
404
			/*
			 * We should unlock the entry page before touching the posting
			 * tree, to prevent deadlocks with vacuum processes.  Because an
			 * entry is never deleted from its page and a posting tree is
			 * never reduced back to a posting list, we can unlock the page
			 * as soon as we have the block number of the posting tree's
			 * root.
			 */
412 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
413 needUnlock = false;
414
415 stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
416 rootPostingTree, snapshot);
417 entry->buffer = stack->buffer;
418
			/*
			 * We keep the buffer pinned to prevent deletion of the page
			 * during the scan; see GIN's vacuum implementation.  The
			 * reference count is incremented so that the buffer stays
			 * pinned after the freeGinBtreeStack() call.
			 */
424 IncrBufferRefCount(entry->buffer);
425
426 page = BufferGetPage(entry->buffer);
427
428 /*
429 * Load the first page into memory.
430 */
431 ItemPointerSetMin(&minItem);
432 entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
433
434 entry->predictNumberResult = stack->predictNumber * entry->nlist;
435
436 LockBuffer(entry->buffer, GIN_UNLOCK);
437 freeGinBtreeStack(stack);
438 entry->isFinished = false;
439 }
440 else
441 {
442 /*
443 * Lock the entry leaf page. This is more coarse-grained than
444 * necessary, because it will conflict with any insertions that
445 * land on the same leaf page, not only the exact key we searched
446 * for. But locking an individual tuple would require updating
447 * that lock whenever it moves because of insertions or vacuums,
448 * which seems too complicated.
449 */
450 PredicateLockPage(ginstate->index,
451 BufferGetBlockNumber(stackEntry->buffer),
452 snapshot);
453 if (GinGetNPosting(itup) > 0)
454 {
455 entry->list = ginReadTuple(ginstate, entry->attnum, itup,
456 &entry->nlist);
457 entry->predictNumberResult = entry->nlist;
458
459 entry->isFinished = false;
460 }
461 }
462 }
463 else
464 {
465 /*
466 * No entry found. Predicate lock the leaf page, to lock the place
467 * where the entry would've been, had there been one.
468 */
469 PredicateLockPage(ginstate->index,
470 BufferGetBlockNumber(stackEntry->buffer), snapshot);
471 }
472
473 if (needUnlock)
474 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
475 freeGinBtreeStack(stackEntry);
476}
477
478/*
479 * Comparison function for scan entry indexes. Sorts by predictNumberResult,
480 * least frequent items first.
481 */
482static int
483entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
484{
485 const GinScanKey key = (const GinScanKey) arg;
486 int i1 = *(const int *) a1;
487 int i2 = *(const int *) a2;
488 uint32 n1 = key->scanEntry[i1]->predictNumberResult;
489 uint32 n2 = key->scanEntry[i2]->predictNumberResult;
490
491 if (n1 < n2)
492 return -1;
493 else if (n1 == n2)
494 return 0;
495 else
496 return 1;
497}
498
499static void
500startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
501{
502 MemoryContext oldCtx = CurrentMemoryContext;
503 int i;
504 int j;
505 int *entryIndexes;
506
507 ItemPointerSetMin(&key->curItem);
508 key->curItemMatches = false;
509 key->recheckCurItem = false;
510 key->isFinished = false;
511
512 /*
513 * Divide the entries into two distinct sets: required and additional.
514 * Additional entries are not enough for a match alone, without any items
515 * from the required set, but are needed by the consistent function to
516 * decide if an item matches. When scanning, we can skip over items from
517 * additional entries that have no corresponding matches in any of the
518 * required entries. That speeds up queries like "frequent & rare"
519 * considerably, if the frequent term can be put in the additional set.
520 *
	 * There can be many legal ways to divide the entries into these two
	 * sets.  A conservative division is to just put everything in the required
523 * set, but the more you can put in the additional set, the more you can
524 * skip during the scan. To maximize skipping, we try to put as many
525 * frequent items as possible into additional, and less frequent ones into
526 * required. To do that, sort the entries by frequency
527 * (predictNumberResult), and put entries into the required set in that
528 * order, until the consistent function says that none of the remaining
529 * entries can form a match, without any items from the required set. The
530 * rest go to the additional set.
531 */
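	/*
	 * For example (an illustrative sketch, not a case drawn from the
	 * regression tests): for a two-entry AND-style key "rare & frequent",
	 * the sort by predictNumberResult puts "rare" first.  With entryRes set
	 * to FALSE for "rare" and MAYBE for "frequent", the tri-state consistent
	 * function returns FALSE, so the loop stops after making only "rare"
	 * required; "frequent" lands in the additional set, and the scan can
	 * skip its items whenever "rare" has no match nearby.
	 */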
532 if (key->nentries > 1)
533 {
534 MemoryContextSwitchTo(so->tempCtx);
535
536 entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
537 for (i = 0; i < key->nentries; i++)
538 entryIndexes[i] = i;
539 qsort_arg(entryIndexes, key->nentries, sizeof(int),
540 entryIndexByFrequencyCmp, key);
541
542 for (i = 0; i < key->nentries - 1; i++)
543 {
544 /* Pass all entries <= i as FALSE, and the rest as MAYBE */
545 for (j = 0; j <= i; j++)
546 key->entryRes[entryIndexes[j]] = GIN_FALSE;
547 for (j = i + 1; j < key->nentries; j++)
548 key->entryRes[entryIndexes[j]] = GIN_MAYBE;
549
550 if (key->triConsistentFn(key) == GIN_FALSE)
551 break;
552 }
553 /* i is now the last required entry. */
554
555 MemoryContextSwitchTo(so->keyCtx);
556
557 key->nrequired = i + 1;
558 key->nadditional = key->nentries - key->nrequired;
559 key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
560 key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
561
562 j = 0;
563 for (i = 0; i < key->nrequired; i++)
564 key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
565 for (i = 0; i < key->nadditional; i++)
566 key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
567
568 /* clean up after consistentFn calls (also frees entryIndexes) */
569 MemoryContextReset(so->tempCtx);
570 }
571 else
572 {
573 MemoryContextSwitchTo(so->keyCtx);
574
575 key->nrequired = 1;
576 key->nadditional = 0;
577 key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
578 key->requiredEntries[0] = key->scanEntry[0];
579 }
580 MemoryContextSwitchTo(oldCtx);
581}
582
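/*
 * Set up the scan for fetching: initialize every scan entry, apply the
 * gin_fuzzy_search_limit reduction when it is enabled, and then finish
 * initializing the scan keys with startScanKey(), which needs the entries'
 * frequency estimates.
 */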
583static void
584startScan(IndexScanDesc scan)
585{
586 GinScanOpaque so = (GinScanOpaque) scan->opaque;
587 GinState *ginstate = &so->ginstate;
588 uint32 i;
589
590 for (i = 0; i < so->totalentries; i++)
591 startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);
592
593 if (GinFuzzySearchLimit > 0)
594 {
		/*
		 * If all the entries' predicted result counts are above the
		 * threshold, we will try to reduce the result set.  We hope (and
		 * only hope; for the intersection operation of an array query our
		 * supposition isn't necessarily true) that the total result will be
		 * no larger than the smallest predictNumberResult.
		 */
601 bool reduce = true;
602
603 for (i = 0; i < so->totalentries; i++)
604 {
605 if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
606 {
607 reduce = false;
608 break;
609 }
610 }
611 if (reduce)
612 {
613 for (i = 0; i < so->totalentries; i++)
614 {
615 so->entries[i]->predictNumberResult /= so->totalentries;
616 so->entries[i]->reduceResult = true;
617 }
618 }
619 }
620
621 /*
622 * Now that we have the estimates for the entry frequencies, finish
623 * initializing the scan keys.
624 */
625 for (i = 0; i < so->nkeys; i++)
626 startScanKey(ginstate, so, so->keys + i);
627}
628
629/*
630 * Load the next batch of item pointers from a posting tree.
631 *
632 * Note that we copy the page into GinScanEntry->list array and unlock it, but
633 * keep it pinned to prevent interference with vacuum.
634 */
635static void
636entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
637 ItemPointerData advancePast, Snapshot snapshot)
638{
639 Page page;
640 int i;
641 bool stepright;
642
643 if (!BufferIsValid(entry->buffer))
644 {
645 entry->isFinished = true;
646 return;
647 }
648
649 /*
650 * We have two strategies for finding the correct page: step right from
651 * the current page, or descend the tree again from the root. If
652 * advancePast equals the current item, the next matching item should be
653 * on the next page, so we step right. Otherwise, descend from root.
654 */
655 if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
656 {
657 stepright = true;
658 LockBuffer(entry->buffer, GIN_SHARE);
659 }
660 else
661 {
662 GinBtreeStack *stack;
663
664 ReleaseBuffer(entry->buffer);
665
666 /*
667 * Set the search key, and find the correct leaf page.
668 */
669 if (ItemPointerIsLossyPage(&advancePast))
670 {
671 ItemPointerSet(&entry->btree.itemptr,
672 GinItemPointerGetBlockNumber(&advancePast) + 1,
673 FirstOffsetNumber);
674 }
675 else
676 {
677 ItemPointerSet(&entry->btree.itemptr,
678 GinItemPointerGetBlockNumber(&advancePast),
679 OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
680 }
681 entry->btree.fullScan = false;
682 stack = ginFindLeafPage(&entry->btree, true, false, snapshot);
683
684 /* we don't need the stack, just the buffer. */
685 entry->buffer = stack->buffer;
686 IncrBufferRefCount(entry->buffer);
687 freeGinBtreeStack(stack);
688 stepright = false;
689 }
690
691 elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
692 GinItemPointerGetBlockNumber(&advancePast),
693 GinItemPointerGetOffsetNumber(&advancePast),
694 !stepright);
695
696 page = BufferGetPage(entry->buffer);
697 for (;;)
698 {
699 entry->offset = InvalidOffsetNumber;
700 if (entry->list)
701 {
702 pfree(entry->list);
703 entry->list = NULL;
704 entry->nlist = 0;
705 }
706
707 if (stepright)
708 {
709 /*
710 * We've processed all the entries on this page. If it was the
711 * last page in the tree, we're done.
712 */
713 if (GinPageRightMost(page))
714 {
715 UnlockReleaseBuffer(entry->buffer);
716 entry->buffer = InvalidBuffer;
717 entry->isFinished = true;
718 return;
719 }
720
			/*
			 * Step to the next page, following the right link.  Then find
			 * the first ItemPointer greater than advancePast.
			 */
725 entry->buffer = ginStepRight(entry->buffer,
726 ginstate->index,
727 GIN_SHARE);
728 page = BufferGetPage(entry->buffer);
729 }
730 stepright = true;
731
732 if (GinPageGetOpaque(page)->flags & GIN_DELETED)
733 continue; /* page was deleted by concurrent vacuum */
734
735 /*
736 * The first item > advancePast might not be on this page, but
737 * somewhere to the right, if the page was split, or a non-match from
738 * another key in the query allowed us to skip some items from this
739 * entry. Keep following the right-links until we re-find the correct
740 * page.
741 */
742 if (!GinPageRightMost(page) &&
743 ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
744 {
			/*
			 * The item we're looking for is > the right bound of this page,
			 * so it can't be on this page.
			 */
749 continue;
750 }
751
752 entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
753
754 for (i = 0; i < entry->nlist; i++)
755 {
756 if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
757 {
758 entry->offset = i;
759
760 if (GinPageRightMost(page))
761 {
762 /* after processing the copied items, we're done. */
763 UnlockReleaseBuffer(entry->buffer);
764 entry->buffer = InvalidBuffer;
765 }
766 else
767 LockBuffer(entry->buffer, GIN_UNLOCK);
768 return;
769 }
770 }
771 }
772}
773
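/*
 * Support for gin_fuzzy_search_limit: gin_rand() produces a uniform random
 * value in [0, 1], and dropItem(e) randomly discards items from entry e,
 * keeping each one with probability GinFuzzySearchLimit /
 * e->predictNumberResult; this implements the soft limit on the number of
 * returned results.
 */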
#define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
776
777/*
778 * Sets entry->curItem to next heap item pointer > advancePast, for one entry
779 * of one scan key, or sets entry->isFinished to true if there are no more.
780 *
781 * Item pointers are returned in ascending order.
782 *
783 * Note: this can return a "lossy page" item pointer, indicating that the
784 * entry potentially matches all items on that heap page. However, it is
785 * not allowed to return both a lossy page pointer and exact (regular)
786 * item pointers for the same page. (Doing so would break the key-combination
787 * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
788 * current implementation this is guaranteed by the behavior of tidbitmaps.
789 */
790static void
791entryGetItem(GinState *ginstate, GinScanEntry entry,
792 ItemPointerData advancePast, Snapshot snapshot)
793{
794 Assert(!entry->isFinished);
795
796 Assert(!ItemPointerIsValid(&entry->curItem) ||
797 ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
798
799 if (entry->matchBitmap)
800 {
801 /* A bitmap result */
802 BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
803 OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
804 bool gotitem = false;
805
806 do
807 {
808 /*
809 * If we've exhausted all items on this block, move to next block
810 * in the bitmap.
811 */
812 while (entry->matchResult == NULL ||
813 (entry->matchResult->ntuples >= 0 &&
814 entry->offset >= entry->matchResult->ntuples) ||
815 entry->matchResult->blockno < advancePastBlk ||
816 (ItemPointerIsLossyPage(&advancePast) &&
817 entry->matchResult->blockno == advancePastBlk))
818 {
819 entry->matchResult = tbm_iterate(entry->matchIterator);
820
821 if (entry->matchResult == NULL)
822 {
823 ItemPointerSetInvalid(&entry->curItem);
824 tbm_end_iterate(entry->matchIterator);
825 entry->matchIterator = NULL;
826 entry->isFinished = true;
827 break;
828 }
829
				/*
				 * Reset the offset counter to the beginning of
				 * entry->matchResult.  Note: entry->offset is still greater
				 * than matchResult->ntuples if matchResult is lossy.  So, on
				 * the next call we will get the next result from the
				 * TIDBitmap.
				 */
836 entry->offset = 0;
837 }
838 if (entry->isFinished)
839 break;
840
841 /*
842 * We're now on the first page after advancePast which has any
843 * items on it. If it's a lossy result, return that.
844 */
845 if (entry->matchResult->ntuples < 0)
846 {
847 ItemPointerSetLossyPage(&entry->curItem,
848 entry->matchResult->blockno);
849
				/*
				 * We might as well fall out of the loop; we could not
				 * estimate the number of results on this page, so correct
				 * result reduction isn't possible here even if it's enabled.
				 */
855 gotitem = true;
856 break;
857 }
858
859 /*
860 * Not a lossy page. Skip over any offsets <= advancePast, and
861 * return that.
862 */
863 if (entry->matchResult->blockno == advancePastBlk)
864 {
865 /*
866 * First, do a quick check against the last offset on the
867 * page. If that's > advancePast, so are all the other
868 * offsets.
869 */
870 if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
871 {
872 entry->offset = entry->matchResult->ntuples;
873 continue;
874 }
875
876 /* Otherwise scan to find the first item > advancePast */
877 while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
878 entry->offset++;
879 }
880
881 ItemPointerSet(&entry->curItem,
882 entry->matchResult->blockno,
883 entry->matchResult->offsets[entry->offset]);
884 entry->offset++;
885 gotitem = true;
886 } while (!gotitem || (entry->reduceResult == true && dropItem(entry)));
887 }
888 else if (!BufferIsValid(entry->buffer))
889 {
890 /*
891 * A posting list from an entry tuple, or the last page of a posting
892 * tree.
893 */
894 do
895 {
896 if (entry->offset >= entry->nlist)
897 {
898 ItemPointerSetInvalid(&entry->curItem);
899 entry->isFinished = true;
900 break;
901 }
902
903 entry->curItem = entry->list[entry->offset++];
904 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
905 /* XXX: shouldn't we apply the fuzzy search limit here? */
906 }
907 else
908 {
909 /* A posting tree */
910 do
911 {
912 /* If we've processed the current batch, load more items */
913 while (entry->offset >= entry->nlist)
914 {
915 entryLoadMoreItems(ginstate, entry, advancePast, snapshot);
916
917 if (entry->isFinished)
918 {
919 ItemPointerSetInvalid(&entry->curItem);
920 return;
921 }
922 }
923
924 entry->curItem = entry->list[entry->offset++];
925
926 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
927 (entry->reduceResult == true && dropItem(entry)));
928 }
929}
930
931/*
932 * Identify the "current" item among the input entry streams for this scan key
933 * that is greater than advancePast, and test whether it passes the scan key
934 * qual condition.
935 *
936 * The current item is the smallest curItem among the inputs. key->curItem
937 * is set to that value. key->curItemMatches is set to indicate whether that
938 * TID passes the consistentFn test. If so, key->recheckCurItem is set true
939 * iff recheck is needed for this item pointer (including the case where the
940 * item pointer is a lossy page pointer).
941 *
942 * If all entry streams are exhausted, sets key->isFinished to true.
943 *
944 * Item pointers must be returned in ascending order.
945 *
946 * Note: this can return a "lossy page" item pointer, indicating that the
947 * key potentially matches all items on that heap page. However, it is
948 * not allowed to return both a lossy page pointer and exact (regular)
949 * item pointers for the same page. (Doing so would break the key-combination
950 * logic in scanGetItem.)
951 */
952static void
953keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
954 ItemPointerData advancePast, Snapshot snapshot)
955{
956 ItemPointerData minItem;
957 ItemPointerData curPageLossy;
958 uint32 i;
959 bool haveLossyEntry;
960 GinScanEntry entry;
961 GinTernaryValue res;
962 MemoryContext oldCtx;
963 bool allFinished;
964
965 Assert(!key->isFinished);
966
967 /*
968 * We might have already tested this item; if so, no need to repeat work.
969 * (Note: the ">" case can happen, if advancePast is exact but we
970 * previously had to set curItem to a lossy-page pointer.)
971 */
972 if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
973 return;
974
975 /*
976 * Find the minimum item > advancePast among the active entry streams.
977 *
978 * Note: a lossy-page entry is encoded by a ItemPointer with max value for
979 * offset (0xffff), so that it will sort after any exact entries for the
980 * same page. So we'll prefer to return exact pointers not lossy
981 * pointers, which is good.
982 */
983 ItemPointerSetMax(&minItem);
984 allFinished = true;
985 for (i = 0; i < key->nrequired; i++)
986 {
987 entry = key->requiredEntries[i];
988
989 if (entry->isFinished)
990 continue;
991
992 /*
993 * Advance this stream if necessary.
994 *
995 * In particular, since entry->curItem was initialized with
996 * ItemPointerSetMin, this ensures we fetch the first item for each
997 * entry on the first call.
998 */
999 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1000 {
1001 entryGetItem(ginstate, entry, advancePast, snapshot);
1002 if (entry->isFinished)
1003 continue;
1004 }
1005
1006 allFinished = false;
1007 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
1008 minItem = entry->curItem;
1009 }
1010
1011 if (allFinished)
1012 {
1013 /* all entries are finished */
1014 key->isFinished = true;
1015 return;
1016 }
1017
1018 /*
1019 * Ok, we now know that there are no matches < minItem.
1020 *
1021 * If minItem is lossy, it means that there were no exact items on the
1022 * page among requiredEntries, because lossy pointers sort after exact
1023 * items. However, there might be exact items for the same page among
1024 * additionalEntries, so we mustn't advance past them.
1025 */
1026 if (ItemPointerIsLossyPage(&minItem))
1027 {
1028 if (GinItemPointerGetBlockNumber(&advancePast) <
1029 GinItemPointerGetBlockNumber(&minItem))
1030 {
1031 ItemPointerSet(&advancePast,
1032 GinItemPointerGetBlockNumber(&minItem),
1033 InvalidOffsetNumber);
1034 }
1035 }
1036 else
1037 {
1038 Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
1039 ItemPointerSet(&advancePast,
1040 GinItemPointerGetBlockNumber(&minItem),
1041 OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
1042 }
1043
1044 /*
1045 * We might not have loaded all the entry streams for this TID yet. We
1046 * could call the consistent function, passing MAYBE for those entries, to
1047 * see if it can decide if this TID matches based on the information we
1048 * have. But if the consistent-function is expensive, and cannot in fact
1049 * decide with partial information, that could be a big loss. So, load all
1050 * the additional entries, before calling the consistent function.
1051 */
1052 for (i = 0; i < key->nadditional; i++)
1053 {
1054 entry = key->additionalEntries[i];
1055
1056 if (entry->isFinished)
1057 continue;
1058
1059 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1060 {
1061 entryGetItem(ginstate, entry, advancePast, snapshot);
1062 if (entry->isFinished)
1063 continue;
1064 }
1065
1066 /*
1067 * Normally, none of the items in additionalEntries can have a curItem
1068 * larger than minItem. But if minItem is a lossy page, then there
1069 * might be exact items on the same page among additionalEntries.
1070 */
1071 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
1072 {
1073 Assert(ItemPointerIsLossyPage(&minItem));
1074 minItem = entry->curItem;
1075 }
1076 }
1077
1078 /*
1079 * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
1080 * and perform consistentFn test.
1081 *
1082 * Lossy-page entries pose a problem, since we don't know the correct
1083 * entryRes state to pass to the consistentFn, and we also don't know what
1084 * its combining logic will be (could be AND, OR, or even NOT). If the
1085 * logic is OR then the consistentFn might succeed for all items in the
1086 * lossy page even when none of the other entries match.
1087 *
1088 * Our strategy is to call the tri-state consistent function, with the
1089 * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
1090 * returns FALSE, none of the lossy items alone are enough for a match, so
1091 * we don't need to return a lossy-page pointer. Otherwise, return a
1092 * lossy-page pointer to indicate that the whole heap page must be
1093 * checked. (On subsequent calls, we'll do nothing until minItem is past
1094 * the page altogether, thus ensuring that we never return both regular
1095 * and lossy pointers for the same page.)
1096 *
1097 * An exception is that it doesn't matter what we pass for lossy pointers
1098 * in "hidden" entries, because the consistentFn's result can't depend on
1099 * them. We could pass them as MAYBE as well, but if we're using the
1100 * "shim" implementation of a tri-state consistent function (see
1101 * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
1102 * them as true.
1103 *
1104 * Note that only lossy-page entries pointing to the current item's page
1105 * should trigger this processing; we might have future lossy pages in the
1106 * entry array, but they aren't relevant yet.
1107 */
1108 key->curItem = minItem;
1109 ItemPointerSetLossyPage(&curPageLossy,
1110 GinItemPointerGetBlockNumber(&key->curItem));
1111 haveLossyEntry = false;
1112 for (i = 0; i < key->nentries; i++)
1113 {
1114 entry = key->scanEntry[i];
1115 if (entry->isFinished == false &&
1116 ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1117 {
1118 if (i < key->nuserentries)
1119 key->entryRes[i] = GIN_MAYBE;
1120 else
1121 key->entryRes[i] = GIN_TRUE;
1122 haveLossyEntry = true;
1123 }
1124 else
1125 key->entryRes[i] = GIN_FALSE;
1126 }
1127
1128 /* prepare for calling consistentFn in temp context */
1129 oldCtx = MemoryContextSwitchTo(tempCtx);
1130
1131 if (haveLossyEntry)
1132 {
1133 /* Have lossy-page entries, so see if whole page matches */
1134 res = key->triConsistentFn(key);
1135
1136 if (res == GIN_TRUE || res == GIN_MAYBE)
1137 {
1138 /* Yes, so clean up ... */
1139 MemoryContextSwitchTo(oldCtx);
1140 MemoryContextReset(tempCtx);
1141
1142 /* and return lossy pointer for whole page */
1143 key->curItem = curPageLossy;
1144 key->curItemMatches = true;
1145 key->recheckCurItem = true;
1146 return;
1147 }
1148 }
1149
1150 /*
1151 * At this point we know that we don't need to return a lossy whole-page
1152 * pointer, but we might have matches for individual exact item pointers,
1153 * possibly in combination with a lossy pointer. Pass lossy pointers as
1154 * MAYBE to the ternary consistent function, to let it decide if this
1155 * tuple satisfies the overall key, even though we don't know if the lossy
1156 * entries match.
1157 *
1158 * Prepare entryRes array to be passed to consistentFn.
1159 */
1160 for (i = 0; i < key->nentries; i++)
1161 {
1162 entry = key->scanEntry[i];
1163 if (entry->isFinished)
1164 key->entryRes[i] = GIN_FALSE;
1165#if 0
1166
1167 /*
1168 * This case can't currently happen, because we loaded all the entries
1169 * for this item earlier.
1170 */
1171 else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1172 key->entryRes[i] = GIN_MAYBE;
1173#endif
1174 else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1175 key->entryRes[i] = GIN_MAYBE;
1176 else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
1177 key->entryRes[i] = GIN_TRUE;
1178 else
1179 key->entryRes[i] = GIN_FALSE;
1180 }
1181
1182 res = key->triConsistentFn(key);
1183
1184 switch (res)
1185 {
1186 case GIN_TRUE:
1187 key->curItemMatches = true;
1188 /* triConsistentFn set recheckCurItem */
1189 break;
1190
1191 case GIN_FALSE:
1192 key->curItemMatches = false;
1193 break;
1194
1195 case GIN_MAYBE:
1196 key->curItemMatches = true;
1197 key->recheckCurItem = true;
1198 break;
1199
1200 default:
1201
1202 /*
1203 * the 'default' case shouldn't happen, but if the consistent
1204 * function returns something bogus, this is the safe result
1205 */
1206 key->curItemMatches = true;
1207 key->recheckCurItem = true;
1208 break;
1209 }
1210
1211 /*
1212 * We have a tuple, and we know if it matches or not. If it's a non-match,
1213 * we could continue to find the next matching tuple, but let's break out
1214 * and give scanGetItem a chance to advance the other keys. They might be
1215 * able to skip past to a much higher TID, allowing us to save work.
1216 */
1217
1218 /* clean up after consistentFn calls */
1219 MemoryContextSwitchTo(oldCtx);
1220 MemoryContextReset(tempCtx);
1221}
1222
1223/*
1224 * Get next heap item pointer (after advancePast) from scan.
1225 * Returns true if anything found.
1226 * On success, *item and *recheck are set.
1227 *
1228 * Note: this is very nearly the same logic as in keyGetItem(), except
1229 * that we know the keys are to be combined with AND logic, whereas in
1230 * keyGetItem() the combination logic is known only to the consistentFn.
1231 */
1232static bool
1233scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
1234 ItemPointerData *item, bool *recheck)
1235{
1236 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1237 uint32 i;
1238 bool match;
1239
1240 /*----------
1241 * Advance the scan keys in lock-step, until we find an item that matches
1242 * all the keys. If any key reports isFinished, meaning its subset of the
1243 * entries is exhausted, we can stop. Otherwise, set *item to the next
1244 * matching item.
1245 *
1246 * This logic works only if a keyGetItem stream can never contain both
1247 * exact and lossy pointers for the same page. Else we could have a
1248 * case like
1249 *
1250 * stream 1 stream 2
1251 * ... ...
1252 * 42/6 42/7
1253 * 50/1 42/0xffff
1254 * ... ...
1255 *
1256 * We would conclude that 42/6 is not a match and advance stream 1,
1257 * thus never detecting the match to the lossy pointer in stream 2.
1258 * (keyGetItem has a similar problem versus entryGetItem.)
1259 *----------
1260 */
1261 do
1262 {
1263 ItemPointerSetMin(item);
1264 match = true;
1265 for (i = 0; i < so->nkeys && match; i++)
1266 {
1267 GinScanKey key = so->keys + i;
1268
1269 /* Fetch the next item for this key that is > advancePast. */
1270 keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
1271 scan->xs_snapshot);
1272
1273 if (key->isFinished)
1274 return false;
1275
1276 /*
1277 * If it's not a match, we can immediately conclude that nothing
1278 * <= this item matches, without checking the rest of the keys.
1279 */
1280 if (!key->curItemMatches)
1281 {
1282 advancePast = key->curItem;
1283 match = false;
1284 break;
1285 }
1286
1287 /*
1288 * It's a match. We can conclude that nothing < matches, so the
1289 * other key streams can skip to this item.
1290 *
1291 * Beware of lossy pointers, though; from a lossy pointer, we can
1292 * only conclude that nothing smaller than this *block* matches.
1293 */
1294 if (ItemPointerIsLossyPage(&key->curItem))
1295 {
1296 if (GinItemPointerGetBlockNumber(&advancePast) <
1297 GinItemPointerGetBlockNumber(&key->curItem))
1298 {
1299 ItemPointerSet(&advancePast,
1300 GinItemPointerGetBlockNumber(&key->curItem),
1301 InvalidOffsetNumber);
1302 }
1303 }
1304 else
1305 {
1306 Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
1307 ItemPointerSet(&advancePast,
1308 GinItemPointerGetBlockNumber(&key->curItem),
1309 OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
1310 }
1311
1312 /*
1313 * If this is the first key, remember this location as a potential
1314 * match, and proceed to check the rest of the keys.
1315 *
1316 * Otherwise, check if this is the same item that we checked the
1317 * previous keys for (or a lossy pointer for the same page). If
1318 * not, loop back to check the previous keys for this item (we
1319 * will check this key again too, but keyGetItem returns quickly
1320 * for that)
1321 */
1322 if (i == 0)
1323 {
1324 *item = key->curItem;
1325 }
1326 else
1327 {
1328 if (ItemPointerIsLossyPage(&key->curItem) ||
1329 ItemPointerIsLossyPage(item))
1330 {
1331 Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
1332 match = (GinItemPointerGetBlockNumber(&key->curItem) ==
1333 GinItemPointerGetBlockNumber(item));
1334 }
1335 else
1336 {
1337 Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
1338 match = (ginCompareItemPointers(&key->curItem, item) == 0);
1339 }
1340 }
1341 }
1342 } while (!match);
1343
1344 Assert(!ItemPointerIsMin(item));
1345
1346 /*
1347 * Now *item contains the first ItemPointer after previous result that
1348 * satisfied all the keys for that exact TID, or a lossy reference to the
1349 * same page.
1350 *
1351 * We must return recheck = true if any of the keys are marked recheck.
1352 */
1353 *recheck = false;
1354 for (i = 0; i < so->nkeys; i++)
1355 {
1356 GinScanKey key = so->keys + i;
1357
1358 if (key->recheckCurItem)
1359 {
1360 *recheck = true;
1361 break;
1362 }
1363 }
1364
1365 return true;
1366}
1367
1368
1369/*
1370 * Functions for scanning the pending list
1371 */
1372
1373
/*
 * Get the ItemPointer of the next heap row to be checked from the pending
 * list.  Returns false if there are no more.  On pages containing several
 * heap rows it returns each row separately; on a page containing only part
 * of a heap row it returns that page's data.  pos->firstOffset and
 * pos->lastOffset are set to identify the range of pending-list tuples
 * belonging to this heap row.
 *
 * The pendingBuffer is presumed pinned and share-locked on entry, and is
 * pinned and share-locked on success exit.  On failure exit it's released.
 */
1384static bool
1385scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
1386{
1387 OffsetNumber maxoff;
1388 Page page;
1389 IndexTuple itup;
1390
1391 ItemPointerSetInvalid(&pos->item);
1392 for (;;)
1393 {
1394 page = BufferGetPage(pos->pendingBuffer);
1395 TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1396
1397 maxoff = PageGetMaxOffsetNumber(page);
1398 if (pos->firstOffset > maxoff)
1399 {
1400 BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
1401
1402 if (blkno == InvalidBlockNumber)
1403 {
1404 UnlockReleaseBuffer(pos->pendingBuffer);
1405 pos->pendingBuffer = InvalidBuffer;
1406
1407 return false;
1408 }
1409 else
1410 {
				/*
				 * Here we must prevent deletion of the next page by the
				 * insert-cleanup process, which may be trying to obtain an
				 * exclusive lock on the current page.  So, we lock the next
				 * page before releasing the current one.
				 */
1417 Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno);
1418
1419 LockBuffer(tmpbuf, GIN_SHARE);
1420 UnlockReleaseBuffer(pos->pendingBuffer);
1421
1422 pos->pendingBuffer = tmpbuf;
1423 pos->firstOffset = FirstOffsetNumber;
1424 }
1425 }
1426 else
1427 {
1428 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
1429 pos->item = itup->t_tid;
1430 if (GinPageHasFullRow(page))
1431 {
				/*
				 * Find the item pointer of the next heap row.
				 */
1435 for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
1436 {
1437 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
1438 if (!ItemPointerEquals(&pos->item, &itup->t_tid))
1439 break;
1440 }
1441 }
1442 else
1443 {
1444 /*
1445 * All itempointers are the same on this page
1446 */
1447 pos->lastOffset = maxoff + 1;
1448 }
1449
			/*
			 * Now pos->firstOffset points to the first tuple of the current
			 * heap row, and pos->lastOffset points to the first tuple of the
			 * next heap row (or to the end of the page).
			 */
1455 break;
1456 }
1457 }
1458
1459 return true;
1460}
1461
/*
 * Scan a pending-list page from the current tuple (off) until the first of:
 * - a match is found (then return true)
 * - no later match is possible
 * - the tuple's attribute number is not equal to the entry's attrnum
 * - the end of the page is reached
 *
 * The datum[]/category[]/datumExtracted[] arrays are used to cache the
 * results of gintuple_get_key() on the current page.
 */
1472static bool
1473matchPartialInPendingList(GinState *ginstate, Page page,
1474 OffsetNumber off, OffsetNumber maxoff,
1475 GinScanEntry entry,
1476 Datum *datum, GinNullCategory *category,
1477 bool *datumExtracted)
1478{
1479 IndexTuple itup;
1480 int32 cmp;
1481
1482 /* Partial match to a null is not possible */
1483 if (entry->queryCategory != GIN_CAT_NORM_KEY)
1484 return false;
1485
1486 while (off < maxoff)
1487 {
1488 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
1489
1490 if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
1491 return false;
1492
1493 if (datumExtracted[off - 1] == false)
1494 {
1495 datum[off - 1] = gintuple_get_key(ginstate, itup,
1496 &category[off - 1]);
1497 datumExtracted[off - 1] = true;
1498 }
1499
1500 /* Once we hit nulls, no further match is possible */
1501 if (category[off - 1] != GIN_CAT_NORM_KEY)
1502 return false;
1503
		/*----------
		 * Check for partial match.
		 * case cmp == 0 => match
		 * case cmp > 0  => no match, and we can end the scan (no later match
		 *				   is possible)
		 * case cmp < 0  => no match, continue the scan
		 *----------
		 */
1511 cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
1512 ginstate->supportCollation[entry->attnum - 1],
1513 entry->queryKey,
1514 datum[off - 1],
1515 UInt16GetDatum(entry->strategy),
1516 PointerGetDatum(entry->extra_data)));
1517 if (cmp == 0)
1518 return true;
1519 else if (cmp > 0)
1520 return false;
1521
1522 off++;
1523 }
1524
1525 return false;
1526}
1527
1528/*
1529 * Set up the entryRes array for each key by looking at
1530 * every entry for current heap row in pending list.
1531 *
1532 * Returns true if each scan key has at least one entryRes match.
1533 * This corresponds to the situations where the normal index search will
1534 * try to apply the key's consistentFn. (A tuple not meeting that requirement
1535 * cannot be returned by the normal search since no entry stream will
1536 * source its TID.)
1537 *
1538 * The pendingBuffer is presumed pinned and share-locked on entry.
1539 */
1540static bool
1541collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
1542{
1543 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1544 OffsetNumber attrnum;
1545 Page page;
1546 IndexTuple itup;
1547 int i,
1548 j;
1549
1550 /*
1551 * Reset all entryRes and hasMatchKey flags
1552 */
1553 for (i = 0; i < so->nkeys; i++)
1554 {
1555 GinScanKey key = so->keys + i;
1556
1557 memset(key->entryRes, GIN_FALSE, key->nentries);
1558 }
1559 memset(pos->hasMatchKey, false, so->nkeys);
1560
1561 /*
1562 * Outer loop iterates over multiple pending-list pages when a single heap
1563 * row has entries spanning those pages.
1564 */
1565 for (;;)
1566 {
1567 Datum datum[BLCKSZ / sizeof(IndexTupleData)];
1568 GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
1569 bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
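		/*
		 * These arrays are indexed by page offset minus one;
		 * BLCKSZ / sizeof(IndexTupleData) is an upper bound on the number of
		 * tuples that can fit on one pending-list page.
		 */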
1570
1571 Assert(pos->lastOffset > pos->firstOffset);
1572 memset(datumExtracted + pos->firstOffset - 1, 0,
1573 sizeof(bool) * (pos->lastOffset - pos->firstOffset));
1574
1575 page = BufferGetPage(pos->pendingBuffer);
1576 TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1577
1578 for (i = 0; i < so->nkeys; i++)
1579 {
1580 GinScanKey key = so->keys + i;
1581
1582 for (j = 0; j < key->nentries; j++)
1583 {
1584 GinScanEntry entry = key->scanEntry[j];
1585 OffsetNumber StopLow = pos->firstOffset,
1586 StopHigh = pos->lastOffset,
1587 StopMiddle;
1588
1589 /* If already matched on earlier page, do no extra work */
1590 if (key->entryRes[j])
1591 continue;
1592
				/*
				 * The tuples of interest run from pos->firstOffset to
				 * pos->lastOffset, and they are ordered by (attnum, Datum)
				 * just as in the entry tree.  So we can use a binary search
				 * to avoid a linear scan.
				 */
1599 while (StopLow < StopHigh)
1600 {
1601 int res;
1602
1603 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1604
1605 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
1606
1607 attrnum = gintuple_get_attrnum(&so->ginstate, itup);
1608
1609 if (key->attnum < attrnum)
1610 {
1611 StopHigh = StopMiddle;
1612 continue;
1613 }
1614 if (key->attnum > attrnum)
1615 {
1616 StopLow = StopMiddle + 1;
1617 continue;
1618 }
1619
1620 if (datumExtracted[StopMiddle - 1] == false)
1621 {
1622 datum[StopMiddle - 1] =
1623 gintuple_get_key(&so->ginstate, itup,
1624 &category[StopMiddle - 1]);
1625 datumExtracted[StopMiddle - 1] = true;
1626 }
1627
1628 if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
1629 {
1630 /* special behavior depending on searchMode */
1631 if (entry->searchMode == GIN_SEARCH_MODE_ALL)
1632 {
1633 /* match anything except NULL_ITEM */
1634 if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
1635 res = -1;
1636 else
1637 res = 0;
1638 }
1639 else
1640 {
1641 /* match everything */
1642 res = 0;
1643 }
1644 }
1645 else
1646 {
1647 res = ginCompareEntries(&so->ginstate,
1648 entry->attnum,
1649 entry->queryKey,
1650 entry->queryCategory,
1651 datum[StopMiddle - 1],
1652 category[StopMiddle - 1]);
1653 }
1654
1655 if (res == 0)
1656 {
1657 /*
1658 * Found exact match (there can be only one, except in
1659 * EMPTY_QUERY mode).
1660 *
1661 * If doing partial match, scan forward from here to
1662 * end of page to check for matches.
1663 *
1664 * See comment above about tuple's ordering.
1665 */
1666 if (entry->isPartialMatch)
1667 key->entryRes[j] =
1668 matchPartialInPendingList(&so->ginstate,
1669 page,
1670 StopMiddle,
1671 pos->lastOffset,
1672 entry,
1673 datum,
1674 category,
1675 datumExtracted);
1676 else
1677 key->entryRes[j] = true;
1678
1679 /* done with binary search */
1680 break;
1681 }
1682 else if (res < 0)
1683 StopHigh = StopMiddle;
1684 else
1685 StopLow = StopMiddle + 1;
1686 }
1687
1688 if (StopLow >= StopHigh && entry->isPartialMatch)
1689 {
1690 /*
1691 * No exact match on this page. If doing partial match,
1692 * scan from the first tuple greater than target value to
1693 * end of page. Note that since we don't remember whether
1694 * the comparePartialFn told us to stop early on a
1695 * previous page, we will uselessly apply comparePartialFn
1696 * to the first tuple on each subsequent page.
1697 */
1698 key->entryRes[j] =
1699 matchPartialInPendingList(&so->ginstate,
1700 page,
1701 StopHigh,
1702 pos->lastOffset,
1703 entry,
1704 datum,
1705 category,
1706 datumExtracted);
1707 }
1708
1709 pos->hasMatchKey[i] |= key->entryRes[j];
1710 }
1711 }
1712
1713 /* Advance firstOffset over the scanned tuples */
1714 pos->firstOffset = pos->lastOffset;
1715
1716 if (GinPageHasFullRow(page))
1717 {
1718 /*
1719 * We have examined all pending entries for the current heap row.
1720 * Break out of loop over pages.
1721 */
1722 break;
1723 }
1724 else
1725 {
1726 /*
1727 * Advance to next page of pending entries for the current heap
1728 * row. Complain if there isn't one.
1729 */
1730 ItemPointerData item = pos->item;
1731
1732 if (scanGetCandidate(scan, pos) == false ||
1733 !ItemPointerEquals(&pos->item, &item))
1734 elog(ERROR, "could not find additional pending pages for same heap tuple");
1735 }
1736 }
1737
1738 /*
1739 * Now return "true" if all scan keys have at least one matching datum
1740 */
1741 for (i = 0; i < so->nkeys; i++)
1742 {
1743 if (pos->hasMatchKey[i] == false)
1744 return false;
1745 }
1746
1747 return true;
1748}
1749
1750/*
1751 * Collect all matched rows from pending list into bitmap.
1752 */
1753static void
1754scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
1755{
1756 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1757 MemoryContext oldCtx;
1758 bool recheck,
1759 match;
1760 int i;
1761 pendingPosition pos;
1762 Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
1763 Page page;
1764 BlockNumber blkno;
1765
1766 *ntids = 0;
1767
1768 /*
1769 * Acquire predicate lock on the metapage, to conflict with any fastupdate
1770 * insertions.
1771 */
1772 PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
1773
1774 LockBuffer(metabuffer, GIN_SHARE);
1775 page = BufferGetPage(metabuffer);
1776 TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1777 blkno = GinPageGetMeta(page)->head;
1778
	/*
	 * Fetch the head of the pending list before unlocking the metapage.
	 * The head page must be pinned to prevent its deletion by the vacuum
	 * process.
	 */
1783 if (blkno == InvalidBlockNumber)
1784 {
1785 /* No pending list, so proceed with normal scan */
1786 UnlockReleaseBuffer(metabuffer);
1787 return;
1788 }
1789
1790 pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
1791 LockBuffer(pos.pendingBuffer, GIN_SHARE);
1792 pos.firstOffset = FirstOffsetNumber;
1793 UnlockReleaseBuffer(metabuffer);
1794 pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);
1795
	/*
	 * Loop over each heap row.  scanGetCandidate returns either the full
	 * row or the row's tuples from its first page.
	 */
1800 while (scanGetCandidate(scan, &pos))
1801 {
1802 /*
1803 * Check entries in tuple and set up entryRes array.
1804 *
1805 * If pending tuples belonging to the current heap row are spread
1806 * across several pages, collectMatchesForHeapRow will read all of
1807 * those pages.
1808 */
1809 if (!collectMatchesForHeapRow(scan, &pos))
1810 continue;
1811
		/*
		 * Matching of this row's entries is finished, so check the row
		 * using the consistent functions.
		 */
1816 oldCtx = MemoryContextSwitchTo(so->tempCtx);
1817 recheck = false;
1818 match = true;
1819
1820 for (i = 0; i < so->nkeys; i++)
1821 {
1822 GinScanKey key = so->keys + i;
1823
1824 if (!key->boolConsistentFn(key))
1825 {
1826 match = false;
1827 break;
1828 }
1829 recheck |= key->recheckCurItem;
1830 }
1831
1832 MemoryContextSwitchTo(oldCtx);
1833 MemoryContextReset(so->tempCtx);
1834
1835 if (match)
1836 {
1837 tbm_add_tuples(tbm, &pos.item, 1, recheck);
1838 (*ntids)++;
1839 }
1840 }
1841
1842 pfree(pos.hasMatchKey);
1843}
1844
1845
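/*
 * GinIsVoidRes is true when ginNewScanKey() has determined that the query
 * cannot match anything at all, so the scan can return immediately.
 */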
#define GinIsVoidRes(s)		( ((GinScanOpaque) scan->opaque)->isVoidRes )
1847
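/*
 * gingetbitmap() is GIN's amgetbitmap routine: it is reached only through a
 * bitmap index scan (GIN does not support amgettuple).  For example, with a
 * hypothetical table and index such as
 *
 *		CREATE INDEX test_tags_idx ON test USING gin (tags);
 *		SELECT * FROM test WHERE tags @> ARRAY['foo'];
 *
 * the executor collects the TIDBitmap built here and visits the heap with a
 * bitmap heap scan, rechecking tuples where recheck was requested.
 */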
1848int64
1849gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
1850{
1851 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1852 int64 ntids;
1853 ItemPointerData iptr;
1854 bool recheck;
1855
1856 /*
1857 * Set up the scan keys, and check for unsatisfiable query.
1858 */
1859 ginFreeScanKeys(so); /* there should be no keys yet, but just to be
1860 * sure */
1861 ginNewScanKey(scan);
1862
1863 if (GinIsVoidRes(scan))
1864 return 0;
1865
1866 ntids = 0;
1867
1868 /*
1869 * First, scan the pending list and collect any matching entries into the
1870 * bitmap. After we scan a pending item, some other backend could post it
1871 * into the main index, and so we might visit it a second time during the
1872 * main scan. This is okay because we'll just re-set the same bit in the
1873 * bitmap. (The possibility of duplicate visits is a major reason why GIN
1874 * can't support the amgettuple API, however.) Note that it would not do
1875 * to scan the main index before the pending list, since concurrent
1876 * cleanup could then make us miss entries entirely.
1877 */
1878 scanPendingInsert(scan, tbm, &ntids);
1879
1880 /*
1881 * Now scan the main index.
1882 */
1883 startScan(scan);
1884
1885 ItemPointerSetMin(&iptr);
1886
1887 for (;;)
1888 {
1889 CHECK_FOR_INTERRUPTS();
1890
1891 if (!scanGetItem(scan, iptr, &iptr, &recheck))
1892 break;
1893
1894 if (ItemPointerIsLossyPage(&iptr))
1895 tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
1896 else
1897 tbm_add_tuples(tbm, &iptr, 1, recheck);
1898 ntids++;
1899 }
1900
1901 return ntids;
1902}
1903