1/*-------------------------------------------------------------------------
2 *
3 * nbtutils.c
4 * Utility code for Postgres btree implementation.
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtutils.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include <time.h>
19
20#include "access/nbtree.h"
21#include "access/reloptions.h"
22#include "access/relscan.h"
23#include "commands/progress.h"
24#include "miscadmin.h"
25#include "utils/array.h"
26#include "utils/datum.h"
27#include "utils/lsyscache.h"
28#include "utils/memutils.h"
29#include "utils/rel.h"
30
31
/*
 * Working state passed through qsort_arg() when sorting and de-duplicating
 * the elements of an SK_SEARCHARRAY scan key (see _bt_sort_array_elements
 * and _bt_compare_array_elements).
 */
typedef struct BTSortArrayContext
{
	FmgrInfo	flinfo;			/* ordering support proc (BTORDER_PROC) */
	Oid			collation;		/* collation to pass to the support proc */
	bool		reverse;		/* true => sort in descending index order */
} BTSortArrayContext;
38
/* Forward declarations for file-local helper routines */
static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
									  StrategyNumber strat,
									  Datum *elems, int nelems);
static int	_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
									bool reverse,
									Datum *elems, int nelems);
static int	_bt_compare_array_elements(const void *a, const void *b, void *arg);
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
									 ScanKey leftarg, ScanKey rightarg,
									 bool *result);
static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
static void _bt_mark_scankey_required(ScanKey skey);
static bool _bt_check_rowcompare(ScanKey skey,
								 IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
								 ScanDirection dir, bool *continuescan);
static int	_bt_keep_natts(Relation rel, IndexTuple lastleft,
						   IndexTuple firstright, BTScanInsert itup_key);
56
57
58/*
59 * _bt_mkscankey
60 * Build an insertion scan key that contains comparison data from itup
61 * as well as comparator routines appropriate to the key datatypes.
62 *
63 * When itup is a non-pivot tuple, the returned insertion scan key is
64 * suitable for finding a place for it to go on the leaf level. Pivot
65 * tuples can be used to re-find leaf page with matching high key, but
66 * then caller needs to set scan key's pivotsearch field to true. This
67 * allows caller to search for a leaf page with a matching high key,
68 * which is usually to the left of the first leaf page a non-pivot match
69 * might appear on.
70 *
71 * The result is intended for use with _bt_compare() and _bt_truncate().
72 * Callers that don't need to fill out the insertion scankey arguments
73 * (e.g. they use an ad-hoc comparison routine, or only need a scankey
74 * for _bt_truncate()) can pass a NULL index tuple. The scankey will
75 * be initialized as if an "all truncated" pivot tuple was passed
76 * instead.
77 *
78 * Note that we may occasionally have to share lock the metapage to
79 * determine whether or not the keys in the index are expected to be
80 * unique (i.e. if this is a "heapkeyspace" index). We assume a
81 * heapkeyspace index when caller passes a NULL tuple, allowing index
82 * build callers to avoid accessing the non-existent metapage.
83 */
84BTScanInsert
85_bt_mkscankey(Relation rel, IndexTuple itup)
86{
87 BTScanInsert key;
88 ScanKey skey;
89 TupleDesc itupdesc;
90 int indnkeyatts;
91 int16 *indoption;
92 int tupnatts;
93 int i;
94
95 itupdesc = RelationGetDescr(rel);
96 indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
97 indoption = rel->rd_indoption;
98 tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
99
100 Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
101
102 /*
103 * We'll execute search using scan key constructed on key columns.
104 * Truncated attributes and non-key attributes are omitted from the final
105 * scan key.
106 */
107 key = palloc(offsetof(BTScanInsertData, scankeys) +
108 sizeof(ScanKeyData) * indnkeyatts);
109 key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel);
110 key->anynullkeys = false; /* initial assumption */
111 key->nextkey = false;
112 key->pivotsearch = false;
113 key->keysz = Min(indnkeyatts, tupnatts);
114 key->scantid = key->heapkeyspace && itup ?
115 BTreeTupleGetHeapTID(itup) : NULL;
116 skey = key->scankeys;
117 for (i = 0; i < indnkeyatts; i++)
118 {
119 FmgrInfo *procinfo;
120 Datum arg;
121 bool null;
122 int flags;
123
124 /*
125 * We can use the cached (default) support procs since no cross-type
126 * comparison can be needed.
127 */
128 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
129
130 /*
131 * Key arguments built from truncated attributes (or when caller
132 * provides no tuple) are defensively represented as NULL values. They
133 * should never be used.
134 */
135 if (i < tupnatts)
136 arg = index_getattr(itup, i + 1, itupdesc, &null);
137 else
138 {
139 arg = (Datum) 0;
140 null = true;
141 }
142 flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
143 ScanKeyEntryInitializeWithInfo(&skey[i],
144 flags,
145 (AttrNumber) (i + 1),
146 InvalidStrategy,
147 InvalidOid,
148 rel->rd_indcollation[i],
149 procinfo,
150 arg);
151 /* Record if any key attribute is NULL (or truncated) */
152 if (null)
153 key->anynullkeys = true;
154 }
155
156 return key;
157}
158
159/*
160 * free a retracement stack made by _bt_search.
161 */
162void
163_bt_freestack(BTStack stack)
164{
165 BTStack ostack;
166
167 while (stack != NULL)
168 {
169 ostack = stack;
170 stack = stack->bts_parent;
171 pfree(ostack);
172 }
173}
174
175
176/*
177 * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
178 *
179 * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
180 * set up BTArrayKeyInfo info for each one that is an equality-type key.
181 * Prepare modified scan keys in so->arrayKeyData, which will hold the current
182 * array elements during each primitive indexscan operation. For inequality
183 * array keys, it's sufficient to find the extreme element value and replace
184 * the whole array with that scalar value.
185 *
186 * Note: the reason we need so->arrayKeyData, rather than just scribbling
187 * on scan->keyData, is that callers are permitted to call btrescan without
188 * supplying a new set of scankey data.
189 */
void
_bt_preprocess_array_keys(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int			numberOfKeys = scan->numberOfKeys;
	int16	   *indoption = scan->indexRelation->rd_indoption;
	int			numArrayKeys;
	ScanKey		cur;
	int			i;
	MemoryContext oldContext;

	/* Quick check to see if there are any array keys */
	numArrayKeys = 0;
	for (i = 0; i < numberOfKeys; i++)
	{
		cur = &scan->keyData[i];
		if (cur->sk_flags & SK_SEARCHARRAY)
		{
			numArrayKeys++;
			Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
			/* If any arrays are null as a whole, we can quit right now. */
			if (cur->sk_flags & SK_ISNULL)
			{
				/* numArrayKeys == -1 marks the qual as unsatisfiable */
				so->numArrayKeys = -1;
				so->arrayKeyData = NULL;
				return;
			}
		}
	}

	/* Quit if nothing to do. */
	if (numArrayKeys == 0)
	{
		so->numArrayKeys = 0;
		so->arrayKeyData = NULL;
		return;
	}

	/*
	 * Make a scan-lifespan context to hold array-associated data, or reset it
	 * if we already have one from a previous rescan cycle.
	 */
	if (so->arrayContext == NULL)
		so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
												 "BTree array context",
												 ALLOCSET_SMALL_SIZES);
	else
		MemoryContextReset(so->arrayContext);

	oldContext = MemoryContextSwitchTo(so->arrayContext);

	/* Create modifiable copy of scan->keyData in the workspace context */
	so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	memcpy(so->arrayKeyData,
		   scan->keyData,
		   scan->numberOfKeys * sizeof(ScanKeyData));

	/* Allocate space for per-array data in the workspace context */
	so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));

	/* Now process each array key */
	numArrayKeys = 0;			/* recounted as equality-type arrays are set up */
	for (i = 0; i < numberOfKeys; i++)
	{
		ArrayType  *arrayval;
		int16		elmlen;
		bool		elmbyval;
		char		elmalign;
		int			num_elems;
		Datum	   *elem_values;
		bool	   *elem_nulls;
		int			num_nonnulls;
		int			j;

		cur = &so->arrayKeyData[i];
		if (!(cur->sk_flags & SK_SEARCHARRAY))
			continue;

		/*
		 * First, deconstruct the array into elements.  Anything allocated
		 * here (including a possibly detoasted array value) is in the
		 * workspace context.
		 */
		arrayval = DatumGetArrayTypeP(cur->sk_argument);
		/* We could cache this data, but not clear it's worth it */
		get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
							 &elmlen, &elmbyval, &elmalign);
		deconstruct_array(arrayval,
						  ARR_ELEMTYPE(arrayval),
						  elmlen, elmbyval, elmalign,
						  &elem_values, &elem_nulls, &num_elems);

		/*
		 * Compress out any null elements.  We can ignore them since we assume
		 * all btree operators are strict.
		 */
		num_nonnulls = 0;
		for (j = 0; j < num_elems; j++)
		{
			if (!elem_nulls[j])
				elem_values[num_nonnulls++] = elem_values[j];
		}

		/* We could pfree(elem_nulls) now, but not worth the cycles */

		/* If there's no non-nulls, the scan qual is unsatisfiable */
		if (num_nonnulls == 0)
		{
			/* report unsatisfiable qual via numArrayKeys = -1 below */
			numArrayKeys = -1;
			break;
		}

		/*
		 * If the comparison operator is not equality, then the array qual
		 * degenerates to a simple comparison against the smallest or largest
		 * non-null array element, as appropriate.
		 */
		switch (cur->sk_strategy)
		{
			case BTLessStrategyNumber:
			case BTLessEqualStrategyNumber:
				cur->sk_argument =
					_bt_find_extreme_element(scan, cur,
											 BTGreaterStrategyNumber,
											 elem_values, num_nonnulls);
				continue;
			case BTEqualStrategyNumber:
				/* proceed with rest of loop */
				break;
			case BTGreaterEqualStrategyNumber:
			case BTGreaterStrategyNumber:
				cur->sk_argument =
					_bt_find_extreme_element(scan, cur,
											 BTLessStrategyNumber,
											 elem_values, num_nonnulls);
				continue;
			default:
				elog(ERROR, "unrecognized StrategyNumber: %d",
					 (int) cur->sk_strategy);
				break;
		}

		/*
		 * Sort the non-null elements and eliminate any duplicates.  We must
		 * sort in the same ordering used by the index column, so that the
		 * successive primitive indexscans produce data in index order.
		 */
		num_elems = _bt_sort_array_elements(scan, cur,
											(indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
											elem_values, num_nonnulls);

		/*
		 * And set up the BTArrayKeyInfo data.
		 */
		so->arrayKeys[numArrayKeys].scan_key = i;
		so->arrayKeys[numArrayKeys].num_elems = num_elems;
		so->arrayKeys[numArrayKeys].elem_values = elem_values;
		numArrayKeys++;
	}

	so->numArrayKeys = numArrayKeys;

	MemoryContextSwitchTo(oldContext);
}
354
355/*
356 * _bt_find_extreme_element() -- get least or greatest array element
357 *
358 * scan and skey identify the index column, whose opfamily determines the
359 * comparison semantics. strat should be BTLessStrategyNumber to get the
360 * least element, or BTGreaterStrategyNumber to get the greatest.
361 */
362static Datum
363_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
364 StrategyNumber strat,
365 Datum *elems, int nelems)
366{
367 Relation rel = scan->indexRelation;
368 Oid elemtype,
369 cmp_op;
370 RegProcedure cmp_proc;
371 FmgrInfo flinfo;
372 Datum result;
373 int i;
374
375 /*
376 * Determine the nominal datatype of the array elements. We have to
377 * support the convention that sk_subtype == InvalidOid means the opclass
378 * input type; this is a hack to simplify life for ScanKeyInit().
379 */
380 elemtype = skey->sk_subtype;
381 if (elemtype == InvalidOid)
382 elemtype = rel->rd_opcintype[skey->sk_attno - 1];
383
384 /*
385 * Look up the appropriate comparison operator in the opfamily.
386 *
387 * Note: it's possible that this would fail, if the opfamily is
388 * incomplete, but it seems quite unlikely that an opfamily would omit
389 * non-cross-type comparison operators for any datatype that it supports
390 * at all.
391 */
392 cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
393 elemtype,
394 elemtype,
395 strat);
396 if (!OidIsValid(cmp_op))
397 elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
398 strat, elemtype, elemtype,
399 rel->rd_opfamily[skey->sk_attno - 1]);
400 cmp_proc = get_opcode(cmp_op);
401 if (!RegProcedureIsValid(cmp_proc))
402 elog(ERROR, "missing oprcode for operator %u", cmp_op);
403
404 fmgr_info(cmp_proc, &flinfo);
405
406 Assert(nelems > 0);
407 result = elems[0];
408 for (i = 1; i < nelems; i++)
409 {
410 if (DatumGetBool(FunctionCall2Coll(&flinfo,
411 skey->sk_collation,
412 elems[i],
413 result)))
414 result = elems[i];
415 }
416
417 return result;
418}
419
420/*
421 * _bt_sort_array_elements() -- sort and de-dup array elements
422 *
423 * The array elements are sorted in-place, and the new number of elements
424 * after duplicate removal is returned.
425 *
426 * scan and skey identify the index column, whose opfamily determines the
427 * comparison semantics. If reverse is true, we sort in descending order.
428 */
static int
_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
						bool reverse,
						Datum *elems, int nelems)
{
	Relation	rel = scan->indexRelation;
	Oid			elemtype;
	RegProcedure cmp_proc;
	BTSortArrayContext cxt;
	int			last_non_dup;
	int			i;

	if (nelems <= 1)
		return nelems;			/* no work to do */

	/*
	 * Determine the nominal datatype of the array elements.  We have to
	 * support the convention that sk_subtype == InvalidOid means the opclass
	 * input type; this is a hack to simplify life for ScanKeyInit().
	 */
	elemtype = skey->sk_subtype;
	if (elemtype == InvalidOid)
		elemtype = rel->rd_opcintype[skey->sk_attno - 1];

	/*
	 * Look up the appropriate comparison function in the opfamily.
	 *
	 * Note: it's possible that this would fail, if the opfamily is
	 * incomplete, but it seems quite unlikely that an opfamily would omit
	 * non-cross-type support functions for any datatype that it supports at
	 * all.
	 */
	cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
								 elemtype,
								 elemtype,
								 BTORDER_PROC);
	if (!RegProcedureIsValid(cmp_proc))
		elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
			 BTORDER_PROC, elemtype, elemtype,
			 rel->rd_opfamily[skey->sk_attno - 1]);

	/* Sort the array elements */
	fmgr_info(cmp_proc, &cxt.flinfo);
	cxt.collation = skey->sk_collation;
	cxt.reverse = reverse;
	qsort_arg((void *) elems, nelems, sizeof(Datum),
			  _bt_compare_array_elements, (void *) &cxt);

	/* Now scan the sorted elements and remove duplicates */
	last_non_dup = 0;			/* index of last distinct value retained */
	for (i = 1; i < nelems; i++)
	{
		int32		compare;

		/* compare candidate against the most recently retained element */
		compare = DatumGetInt32(FunctionCall2Coll(&cxt.flinfo,
												  cxt.collation,
												  elems[last_non_dup],
												  elems[i]));
		if (compare != 0)
			elems[++last_non_dup] = elems[i];	/* distinct: keep it */
	}

	/* number of unique elements now occupying the front of elems[] */
	return last_non_dup + 1;
}
493
494/*
495 * qsort_arg comparator for sorting array elements
496 */
497static int
498_bt_compare_array_elements(const void *a, const void *b, void *arg)
499{
500 Datum da = *((const Datum *) a);
501 Datum db = *((const Datum *) b);
502 BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
503 int32 compare;
504
505 compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo,
506 cxt->collation,
507 da, db));
508 if (cxt->reverse)
509 INVERT_COMPARE_RESULT(compare);
510 return compare;
511}
512
513/*
514 * _bt_start_array_keys() -- Initialize array keys at start of a scan
515 *
516 * Set up the cur_elem counters and fill in the first sk_argument value for
517 * each array scankey. We can't do this until we know the scan direction.
518 */
519void
520_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
521{
522 BTScanOpaque so = (BTScanOpaque) scan->opaque;
523 int i;
524
525 for (i = 0; i < so->numArrayKeys; i++)
526 {
527 BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
528 ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
529
530 Assert(curArrayKey->num_elems > 0);
531 if (ScanDirectionIsBackward(dir))
532 curArrayKey->cur_elem = curArrayKey->num_elems - 1;
533 else
534 curArrayKey->cur_elem = 0;
535 skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
536 }
537}
538
539/*
540 * _bt_advance_array_keys() -- Advance to next set of array elements
541 *
542 * Returns true if there is another set of values to consider, false if not.
543 * On true result, the scankeys are initialized with the next set of values.
544 */
545bool
546_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
547{
548 BTScanOpaque so = (BTScanOpaque) scan->opaque;
549 bool found = false;
550 int i;
551
552 /*
553 * We must advance the last array key most quickly, since it will
554 * correspond to the lowest-order index column among the available
555 * qualifications. This is necessary to ensure correct ordering of output
556 * when there are multiple array keys.
557 */
558 for (i = so->numArrayKeys - 1; i >= 0; i--)
559 {
560 BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
561 ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
562 int cur_elem = curArrayKey->cur_elem;
563 int num_elems = curArrayKey->num_elems;
564
565 if (ScanDirectionIsBackward(dir))
566 {
567 if (--cur_elem < 0)
568 {
569 cur_elem = num_elems - 1;
570 found = false; /* need to advance next array key */
571 }
572 else
573 found = true;
574 }
575 else
576 {
577 if (++cur_elem >= num_elems)
578 {
579 cur_elem = 0;
580 found = false; /* need to advance next array key */
581 }
582 else
583 found = true;
584 }
585
586 curArrayKey->cur_elem = cur_elem;
587 skey->sk_argument = curArrayKey->elem_values[cur_elem];
588 if (found)
589 break;
590 }
591
592 /* advance parallel scan */
593 if (scan->parallel_scan != NULL)
594 _bt_parallel_advance_array_keys(scan);
595
596 return found;
597}
598
599/*
600 * _bt_mark_array_keys() -- Handle array keys during btmarkpos
601 *
602 * Save the current state of the array keys as the "mark" position.
603 */
604void
605_bt_mark_array_keys(IndexScanDesc scan)
606{
607 BTScanOpaque so = (BTScanOpaque) scan->opaque;
608 int i;
609
610 for (i = 0; i < so->numArrayKeys; i++)
611 {
612 BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
613
614 curArrayKey->mark_elem = curArrayKey->cur_elem;
615 }
616}
617
618/*
619 * _bt_restore_array_keys() -- Handle array keys during btrestrpos
620 *
621 * Restore the array keys to where they were when the mark was set.
622 */
623void
624_bt_restore_array_keys(IndexScanDesc scan)
625{
626 BTScanOpaque so = (BTScanOpaque) scan->opaque;
627 bool changed = false;
628 int i;
629
630 /* Restore each array key to its position when the mark was set */
631 for (i = 0; i < so->numArrayKeys; i++)
632 {
633 BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
634 ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
635 int mark_elem = curArrayKey->mark_elem;
636
637 if (curArrayKey->cur_elem != mark_elem)
638 {
639 curArrayKey->cur_elem = mark_elem;
640 skey->sk_argument = curArrayKey->elem_values[mark_elem];
641 changed = true;
642 }
643 }
644
645 /*
646 * If we changed any keys, we must redo _bt_preprocess_keys. That might
647 * sound like overkill, but in cases with multiple keys per index column
648 * it seems necessary to do the full set of pushups.
649 */
650 if (changed)
651 {
652 _bt_preprocess_keys(scan);
653 /* The mark should have been set on a consistent set of keys... */
654 Assert(so->qual_ok);
655 }
656}
657
658
659/*
660 * _bt_preprocess_keys() -- Preprocess scan keys
661 *
662 * The given search-type keys (in scan->keyData[] or so->arrayKeyData[])
663 * are copied to so->keyData[] with possible transformation.
664 * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
665 * the number of output keys (possibly less, never greater).
666 *
667 * The output keys are marked with additional sk_flag bits beyond the
668 * system-standard bits supplied by the caller. The DESC and NULLS_FIRST
669 * indoption bits for the relevant index attribute are copied into the flags.
670 * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
671 * so that the index sorts in the desired direction.
672 *
673 * One key purpose of this routine is to discover which scan keys must be
674 * satisfied to continue the scan. It also attempts to eliminate redundant
675 * keys and detect contradictory keys. (If the index opfamily provides
676 * incomplete sets of cross-type operators, we may fail to detect redundant
677 * or contradictory keys, but we can survive that.)
678 *
679 * The output keys must be sorted by index attribute. Presently we expect
680 * (but verify) that the input keys are already so sorted --- this is done
681 * by match_clauses_to_index() in indxpath.c. Some reordering of the keys
682 * within each attribute may be done as a byproduct of the processing here,
683 * but no other code depends on that.
684 *
685 * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
686 * if they must be satisfied in order to continue the scan forward or backward
687 * respectively. _bt_checkkeys uses these flags. For example, if the quals
688 * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
689 * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
690 * But once we reach tuples like (1,4,z) we can stop scanning because no
691 * later tuples could match. This is reflected by marking the x and y keys,
692 * but not the z key, with SK_BT_REQFWD. In general, the keys for leading
693 * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
694 * For the first attribute without an "=" key, any "<" and "<=" keys are
695 * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
696 * This can be seen to be correct by considering the above example. Note
697 * in particular that if there are no keys for a given attribute, the keys for
698 * subsequent attributes can never be required; for instance "WHERE y = 4"
699 * requires a full-index scan.
700 *
701 * If possible, redundant keys are eliminated: we keep only the tightest
702 * >/>= bound and the tightest </<= bound, and if there's an = key then
703 * that's the only one returned. (So, we return either a single = key,
704 * or one or two boundary-condition keys for each attr.) However, if we
705 * cannot compare two keys for lack of a suitable cross-type operator,
706 * we cannot eliminate either. If there are two such keys of the same
707 * operator strategy, the second one is just pushed into the output array
708 * without further processing here. We may also emit both >/>= or both
709 * </<= keys if we can't compare them. The logic about required keys still
710 * works if we don't eliminate redundant keys.
711 *
712 * Note that one reason we need direction-sensitive required-key flags is
713 * precisely that we may not be able to eliminate redundant keys. Suppose
714 * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
715 * which key is more restrictive for lack of a suitable cross-type operator.
716 * _bt_first will arbitrarily pick one of the keys to do the initial
717 * positioning with. If it picks x > 4, then the x > 10 condition will fail
718 * until we reach index entries > 10; but we can't stop the scan just because
719 * x > 10 is failing. On the other hand, if we are scanning backwards, then
720 * failure of either key is indeed enough to stop the scan. (In general, when
721 * inequality keys are present, the initial-positioning code only promises to
722 * position before the first possible match, not exactly at the first match,
723 * for a forward scan; or after the last match for a backward scan.)
724 *
725 * As a byproduct of this work, we can detect contradictory quals such
726 * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = false,
727 * indicating the scan need not be run at all since no tuples can match.
728 * (In this case we do not bother completing the output key array!)
729 * Again, missing cross-type operators might cause us to fail to prove the
730 * quals contradictory when they really are, but the scan will work correctly.
731 *
732 * Row comparison keys are currently also treated without any smarts:
733 * we just transfer them into the preprocessed array without any
734 * editorialization. We can treat them the same as an ordinary inequality
735 * comparison on the row's first index column, for the purposes of the logic
736 * about required keys.
737 *
738 * Note: the reason we have to copy the preprocessed scan keys into private
739 * storage is that we are modifying the array based on comparisons of the
740 * key argument values, which could change on a rescan or after moving to
741 * new elements of array keys. Therefore we can't overwrite the source data.
742 */
void
_bt_preprocess_keys(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int			numberOfKeys = scan->numberOfKeys;
	int16	   *indoption = scan->indexRelation->rd_indoption;
	int			new_numberOfKeys;
	int			numberOfEqualCols;
	ScanKey		inkeys;			/* source keys (scan or array copy) */
	ScanKey		outkeys;		/* destination: so->keyData */
	ScanKey		cur;
	ScanKey		xform[BTMaxStrategyNumber]; /* best key so far per strategy */
	bool		test_result;
	int			i,
				j;
	AttrNumber	attno;

	/* initialize result variables */
	so->qual_ok = true;
	so->numberOfKeys = 0;

	if (numberOfKeys < 1)
		return;					/* done if qual-less scan */

	/*
	 * Read so->arrayKeyData if array keys are present, else scan->keyData
	 */
	if (so->arrayKeyData != NULL)
		inkeys = so->arrayKeyData;
	else
		inkeys = scan->keyData;

	outkeys = so->keyData;
	cur = &inkeys[0];
	/* we check that input keys are correctly ordered */
	if (cur->sk_attno < 1)
		elog(ERROR, "btree index keys must be ordered by attribute");

	/* We can short-circuit most of the work if there's just one key */
	if (numberOfKeys == 1)
	{
		/* Apply indoption to scankey (might change sk_strategy!) */
		if (!_bt_fix_scankey_strategy(cur, indoption))
			so->qual_ok = false;
		memcpy(outkeys, cur, sizeof(ScanKeyData));
		so->numberOfKeys = 1;
		/* We can mark the qual as required if it's for first index col */
		if (cur->sk_attno == 1)
			_bt_mark_scankey_required(outkeys);
		return;
	}

	/*
	 * Otherwise, do the full set of pushups.
	 */
	new_numberOfKeys = 0;
	numberOfEqualCols = 0;

	/*
	 * Initialize for processing of keys for attr 1.
	 *
	 * xform[i] points to the currently best scan key of strategy type i+1; it
	 * is NULL if we haven't yet found such a key for this attr.
	 */
	attno = 1;
	memset(xform, 0, sizeof(xform));

	/*
	 * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
	 * handle after-last-key processing.  Actual exit from the loop is at the
	 * "break" statement below.
	 */
	for (i = 0;; cur++, i++)
	{
		if (i < numberOfKeys)
		{
			/* Apply indoption to scankey (might change sk_strategy!) */
			if (!_bt_fix_scankey_strategy(cur, indoption))
			{
				/* NULL can't be matched, so give up */
				so->qual_ok = false;
				return;
			}
		}

		/*
		 * If we are at the end of the keys for a particular attr, finish up
		 * processing and emit the cleaned-up keys.
		 */
		if (i == numberOfKeys || cur->sk_attno != attno)
		{
			int			priorNumberOfEqualCols = numberOfEqualCols;

			/* check input keys are correctly ordered */
			if (i < numberOfKeys && cur->sk_attno < attno)
				elog(ERROR, "btree index keys must be ordered by attribute");

			/*
			 * If = has been specified, all other keys can be eliminated as
			 * redundant.  If we have a case like key = 1 AND key > 2, we can
			 * set qual_ok to false and abandon further processing.
			 *
			 * We also have to deal with the case of "key IS NULL", which is
			 * unsatisfiable in combination with any other index condition. By
			 * the time we get here, that's been classified as an equality
			 * check, and we've rejected any combination of it with a regular
			 * equality condition; but not with other types of conditions.
			 */
			if (xform[BTEqualStrategyNumber - 1])
			{
				ScanKey		eq = xform[BTEqualStrategyNumber - 1];

				for (j = BTMaxStrategyNumber; --j >= 0;)
				{
					ScanKey		chk = xform[j];

					if (!chk || j == (BTEqualStrategyNumber - 1))
						continue;

					if (eq->sk_flags & SK_SEARCHNULL)
					{
						/* IS NULL is contradictory to anything else */
						so->qual_ok = false;
						return;
					}

					if (_bt_compare_scankey_args(scan, chk, eq, chk,
												 &test_result))
					{
						if (!test_result)
						{
							/* keys proven mutually contradictory */
							so->qual_ok = false;
							return;
						}
						/* else discard the redundant non-equality key */
						xform[j] = NULL;
					}
					/* else, cannot determine redundancy, keep both keys */
				}
				/* track number of attrs for which we have "=" keys */
				numberOfEqualCols++;
			}

			/* try to keep only one of <, <= */
			if (xform[BTLessStrategyNumber - 1]
				&& xform[BTLessEqualStrategyNumber - 1])
			{
				ScanKey		lt = xform[BTLessStrategyNumber - 1];
				ScanKey		le = xform[BTLessEqualStrategyNumber - 1];

				if (_bt_compare_scankey_args(scan, le, lt, le,
											 &test_result))
				{
					if (test_result)
						xform[BTLessEqualStrategyNumber - 1] = NULL;
					else
						xform[BTLessStrategyNumber - 1] = NULL;
				}
			}

			/* try to keep only one of >, >= */
			if (xform[BTGreaterStrategyNumber - 1]
				&& xform[BTGreaterEqualStrategyNumber - 1])
			{
				ScanKey		gt = xform[BTGreaterStrategyNumber - 1];
				ScanKey		ge = xform[BTGreaterEqualStrategyNumber - 1];

				if (_bt_compare_scankey_args(scan, ge, gt, ge,
											 &test_result))
				{
					if (test_result)
						xform[BTGreaterEqualStrategyNumber - 1] = NULL;
					else
						xform[BTGreaterStrategyNumber - 1] = NULL;
				}
			}

			/*
			 * Emit the cleaned-up keys into the outkeys[] array, and then
			 * mark them if they are required.  They are required (possibly
			 * only in one direction) if all attrs before this one had "=".
			 */
			for (j = BTMaxStrategyNumber; --j >= 0;)
			{
				if (xform[j])
				{
					ScanKey		outkey = &outkeys[new_numberOfKeys++];

					memcpy(outkey, xform[j], sizeof(ScanKeyData));
					if (priorNumberOfEqualCols == attno - 1)
						_bt_mark_scankey_required(outkey);
				}
			}

			/*
			 * Exit loop here if done.
			 */
			if (i == numberOfKeys)
				break;

			/* Re-initialize for new attno */
			attno = cur->sk_attno;
			memset(xform, 0, sizeof(xform));
		}

		/* check strategy this key's operator corresponds to */
		j = cur->sk_strategy - 1;

		/* if row comparison, push it directly to the output array */
		if (cur->sk_flags & SK_ROW_HEADER)
		{
			ScanKey		outkey = &outkeys[new_numberOfKeys++];

			memcpy(outkey, cur, sizeof(ScanKeyData));
			if (numberOfEqualCols == attno - 1)
				_bt_mark_scankey_required(outkey);

			/*
			 * We don't support RowCompare using equality; such a qual would
			 * mess up the numberOfEqualCols tracking.
			 */
			Assert(j != (BTEqualStrategyNumber - 1));
			continue;
		}

		/* have we seen one of these before? */
		if (xform[j] == NULL)
		{
			/* nope, so remember this scankey */
			xform[j] = cur;
		}
		else
		{
			/* yup, keep only the more restrictive key */
			if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
										 &test_result))
			{
				if (test_result)
					xform[j] = cur;
				else if (j == (BTEqualStrategyNumber - 1))
				{
					/* key == a && key == b, but a != b */
					so->qual_ok = false;
					return;
				}
				/* else old key is more restrictive, keep it */
			}
			else
			{
				/*
				 * We can't determine which key is more restrictive.  Keep the
				 * previous one in xform[j] and push this one directly to the
				 * output array.
				 */
				ScanKey		outkey = &outkeys[new_numberOfKeys++];

				memcpy(outkey, cur, sizeof(ScanKeyData));
				if (numberOfEqualCols == attno - 1)
					_bt_mark_scankey_required(outkey);
			}
		}
	}

	so->numberOfKeys = new_numberOfKeys;
}
1009
1010/*
1011 * Compare two scankey values using a specified operator.
1012 *
1013 * The test we want to perform is logically "leftarg op rightarg", where
1014 * leftarg and rightarg are the sk_argument values in those ScanKeys, and
1015 * the comparison operator is the one in the op ScanKey. However, in
1016 * cross-data-type situations we may need to look up the correct operator in
1017 * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
1018 * and amoplefttype/amoprighttype equal to the two argument datatypes.
1019 *
1020 * If the opfamily doesn't supply a complete set of cross-type operators we
1021 * may not be able to make the comparison. If we can make the comparison
1022 * we store the operator result in *result and return true. We return false
1023 * if the comparison could not be made.
1024 *
1025 * Note: op always points at the same ScanKey as either leftarg or rightarg.
1026 * Since we don't scribble on the scankeys, this aliasing should cause no
1027 * trouble.
1028 *
1029 * Note: this routine needs to be insensitive to any DESC option applied
1030 * to the index column. For example, "x < 4" is a tighter constraint than
1031 * "x < 5" regardless of which way the index is sorted.
1032 */
1033static bool
1034_bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
1035 ScanKey leftarg, ScanKey rightarg,
1036 bool *result)
1037{
1038 Relation rel = scan->indexRelation;
1039 Oid lefttype,
1040 righttype,
1041 optype,
1042 opcintype,
1043 cmp_op;
1044 StrategyNumber strat;
1045
1046 /*
1047 * First, deal with cases where one or both args are NULL. This should
1048 * only happen when the scankeys represent IS NULL/NOT NULL conditions.
1049 */
1050 if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
1051 {
1052 bool leftnull,
1053 rightnull;
1054
1055 if (leftarg->sk_flags & SK_ISNULL)
1056 {
1057 Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
1058 leftnull = true;
1059 }
1060 else
1061 leftnull = false;
1062 if (rightarg->sk_flags & SK_ISNULL)
1063 {
1064 Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
1065 rightnull = true;
1066 }
1067 else
1068 rightnull = false;
1069
1070 /*
1071 * We treat NULL as either greater than or less than all other values.
1072 * Since true > false, the tests below work correctly for NULLS LAST
1073 * logic. If the index is NULLS FIRST, we need to flip the strategy.
1074 */
1075 strat = op->sk_strategy;
1076 if (op->sk_flags & SK_BT_NULLS_FIRST)
1077 strat = BTCommuteStrategyNumber(strat);
1078
1079 switch (strat)
1080 {
1081 case BTLessStrategyNumber:
1082 *result = (leftnull < rightnull);
1083 break;
1084 case BTLessEqualStrategyNumber:
1085 *result = (leftnull <= rightnull);
1086 break;
1087 case BTEqualStrategyNumber:
1088 *result = (leftnull == rightnull);
1089 break;
1090 case BTGreaterEqualStrategyNumber:
1091 *result = (leftnull >= rightnull);
1092 break;
1093 case BTGreaterStrategyNumber:
1094 *result = (leftnull > rightnull);
1095 break;
1096 default:
1097 elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat);
1098 *result = false; /* keep compiler quiet */
1099 break;
1100 }
1101 return true;
1102 }
1103
1104 /*
1105 * The opfamily we need to worry about is identified by the index column.
1106 */
1107 Assert(leftarg->sk_attno == rightarg->sk_attno);
1108
1109 opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
1110
1111 /*
1112 * Determine the actual datatypes of the ScanKey arguments. We have to
1113 * support the convention that sk_subtype == InvalidOid means the opclass
1114 * input type; this is a hack to simplify life for ScanKeyInit().
1115 */
1116 lefttype = leftarg->sk_subtype;
1117 if (lefttype == InvalidOid)
1118 lefttype = opcintype;
1119 righttype = rightarg->sk_subtype;
1120 if (righttype == InvalidOid)
1121 righttype = opcintype;
1122 optype = op->sk_subtype;
1123 if (optype == InvalidOid)
1124 optype = opcintype;
1125
1126 /*
1127 * If leftarg and rightarg match the types expected for the "op" scankey,
1128 * we can use its already-looked-up comparison function.
1129 */
1130 if (lefttype == opcintype && righttype == optype)
1131 {
1132 *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
1133 op->sk_collation,
1134 leftarg->sk_argument,
1135 rightarg->sk_argument));
1136 return true;
1137 }
1138
1139 /*
1140 * Otherwise, we need to go to the syscache to find the appropriate
1141 * operator. (This cannot result in infinite recursion, since no
1142 * indexscan initiated by syscache lookup will use cross-data-type
1143 * operators.)
1144 *
1145 * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to
1146 * un-flip it to get the correct opfamily member.
1147 */
1148 strat = op->sk_strategy;
1149 if (op->sk_flags & SK_BT_DESC)
1150 strat = BTCommuteStrategyNumber(strat);
1151
1152 cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
1153 lefttype,
1154 righttype,
1155 strat);
1156 if (OidIsValid(cmp_op))
1157 {
1158 RegProcedure cmp_proc = get_opcode(cmp_op);
1159
1160 if (RegProcedureIsValid(cmp_proc))
1161 {
1162 *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
1163 op->sk_collation,
1164 leftarg->sk_argument,
1165 rightarg->sk_argument));
1166 return true;
1167 }
1168 }
1169
1170 /* Can't make the comparison */
1171 *result = false; /* suppress compiler warnings */
1172 return false;
1173}
1174
1175/*
1176 * Adjust a scankey's strategy and flags setting as needed for indoptions.
1177 *
1178 * We copy the appropriate indoption value into the scankey sk_flags
1179 * (shifting to avoid clobbering system-defined flag bits). Also, if
1180 * the DESC option is set, commute (flip) the operator strategy number.
1181 *
1182 * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up
1183 * the strategy field correctly for them.
1184 *
1185 * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
1186 * NULL comparison value. Since all btree operators are assumed strict,
1187 * a NULL means that the qual cannot be satisfied. We return true if the
1188 * comparison value isn't NULL, or false if the scan should be abandoned.
1189 *
1190 * This function is applied to the *input* scankey structure; therefore
1191 * on a rescan we will be looking at already-processed scankeys. Hence
1192 * we have to be careful not to re-commute the strategy if we already did it.
1193 * It's a bit ugly to modify the caller's copy of the scankey but in practice
1194 * there shouldn't be any problem, since the index's indoptions are certainly
1195 * not going to change while the scankey survives.
1196 */
1197static bool
1198_bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
1199{
1200 int addflags;
1201
1202 addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
1203
1204 /*
1205 * We treat all btree operators as strict (even if they're not so marked
1206 * in pg_proc). This means that it is impossible for an operator condition
1207 * with a NULL comparison constant to succeed, and we can reject it right
1208 * away.
1209 *
1210 * However, we now also support "x IS NULL" clauses as search conditions,
1211 * so in that case keep going. The planner has not filled in any
1212 * particular strategy in this case, so set it to BTEqualStrategyNumber
1213 * --- we can treat IS NULL as an equality operator for purposes of search
1214 * strategy.
1215 *
1216 * Likewise, "x IS NOT NULL" is supported. We treat that as either "less
1217 * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
1218 * FIRST index.
1219 *
1220 * Note: someday we might have to fill in sk_collation from the index
1221 * column's collation. At the moment this is a non-issue because we'll
1222 * never actually call the comparison operator on a NULL.
1223 */
1224 if (skey->sk_flags & SK_ISNULL)
1225 {
1226 /* SK_ISNULL shouldn't be set in a row header scankey */
1227 Assert(!(skey->sk_flags & SK_ROW_HEADER));
1228
1229 /* Set indoption flags in scankey (might be done already) */
1230 skey->sk_flags |= addflags;
1231
1232 /* Set correct strategy for IS NULL or NOT NULL search */
1233 if (skey->sk_flags & SK_SEARCHNULL)
1234 {
1235 skey->sk_strategy = BTEqualStrategyNumber;
1236 skey->sk_subtype = InvalidOid;
1237 skey->sk_collation = InvalidOid;
1238 }
1239 else if (skey->sk_flags & SK_SEARCHNOTNULL)
1240 {
1241 if (skey->sk_flags & SK_BT_NULLS_FIRST)
1242 skey->sk_strategy = BTGreaterStrategyNumber;
1243 else
1244 skey->sk_strategy = BTLessStrategyNumber;
1245 skey->sk_subtype = InvalidOid;
1246 skey->sk_collation = InvalidOid;
1247 }
1248 else
1249 {
1250 /* regular qual, so it cannot be satisfied */
1251 return false;
1252 }
1253
1254 /* Needn't do the rest */
1255 return true;
1256 }
1257
1258 /* Adjust strategy for DESC, if we didn't already */
1259 if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
1260 skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
1261 skey->sk_flags |= addflags;
1262
1263 /* If it's a row header, fix row member flags and strategies similarly */
1264 if (skey->sk_flags & SK_ROW_HEADER)
1265 {
1266 ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
1267
1268 for (;;)
1269 {
1270 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1271 addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
1272 if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
1273 subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
1274 subkey->sk_flags |= addflags;
1275 if (subkey->sk_flags & SK_ROW_END)
1276 break;
1277 subkey++;
1278 }
1279 }
1280
1281 return true;
1282}
1283
1284/*
1285 * Mark a scankey as "required to continue the scan".
1286 *
1287 * Depending on the operator type, the key may be required for both scan
1288 * directions or just one. Also, if the key is a row comparison header,
1289 * we have to mark its first subsidiary ScanKey as required. (Subsequent
1290 * subsidiary ScanKeys are normally for lower-order columns, and thus
1291 * cannot be required, since they're after the first non-equality scankey.)
1292 *
1293 * Note: when we set required-key flag bits in a subsidiary scankey, we are
1294 * scribbling on a data structure belonging to the index AM's caller, not on
1295 * our private copy. This should be OK because the marking will not change
1296 * from scan to scan within a query, and so we'd just re-mark the same way
1297 * anyway on a rescan. Something to keep an eye on though.
1298 */
1299static void
1300_bt_mark_scankey_required(ScanKey skey)
1301{
1302 int addflags;
1303
1304 switch (skey->sk_strategy)
1305 {
1306 case BTLessStrategyNumber:
1307 case BTLessEqualStrategyNumber:
1308 addflags = SK_BT_REQFWD;
1309 break;
1310 case BTEqualStrategyNumber:
1311 addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
1312 break;
1313 case BTGreaterEqualStrategyNumber:
1314 case BTGreaterStrategyNumber:
1315 addflags = SK_BT_REQBKWD;
1316 break;
1317 default:
1318 elog(ERROR, "unrecognized StrategyNumber: %d",
1319 (int) skey->sk_strategy);
1320 addflags = 0; /* keep compiler quiet */
1321 break;
1322 }
1323
1324 skey->sk_flags |= addflags;
1325
1326 if (skey->sk_flags & SK_ROW_HEADER)
1327 {
1328 ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
1329
1330 /* First subkey should be same column/operator as the header */
1331 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1332 Assert(subkey->sk_attno == skey->sk_attno);
1333 Assert(subkey->sk_strategy == skey->sk_strategy);
1334 subkey->sk_flags |= addflags;
1335 }
1336}
1337
1338/*
1339 * Test whether an indextuple satisfies all the scankey conditions.
1340 *
1341 * Return true if so, false if not. If the tuple fails to pass the qual,
1342 * we also determine whether there's any need to continue the scan beyond
1343 * this tuple, and set *continuescan accordingly. See comments for
1344 * _bt_preprocess_keys(), above, about how this is done.
1345 *
1346 * Forward scan callers can pass a high key tuple in the hopes of having
1347 * us set *continuescan to false, and avoiding an unnecessary visit to
1348 * the page to the right.
1349 *
1350 * scan: index scan descriptor (containing a search-type scankey)
1351 * tuple: index tuple to test
1352 * tupnatts: number of attributes in tupnatts (high key may be truncated)
1353 * dir: direction we are scanning in
1354 * continuescan: output parameter (will be set correctly in all cases)
1355 */
bool
_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
			  ScanDirection dir, bool *continuescan)
{
	TupleDesc	tupdesc;
	BTScanOpaque so;
	int			keysz;
	int			ikey;
	ScanKey		key;

	Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);

	*continuescan = true;		/* default assumption */

	tupdesc = RelationGetDescr(scan->indexRelation);
	so = (BTScanOpaque) scan->opaque;
	keysz = so->numberOfKeys;

	/* Check the tuple against each preprocessed scankey in turn */
	for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
	{
		Datum		datum;
		bool		isNull;
		Datum		test;

		if (key->sk_attno > tupnatts)
		{
			/*
			 * This attribute is truncated (must be high key).  The value for
			 * this attribute in the first non-pivot tuple on the page to the
			 * right could be any possible value.  Assume that truncated
			 * attribute passes the qual.
			 */
			Assert(ScanDirectionIsForward(dir));
			continue;
		}

		/* row-comparison keys need special processing */
		if (key->sk_flags & SK_ROW_HEADER)
		{
			if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
									 continuescan))
				continue;
			return false;
		}

		datum = index_getattr(tuple,
							  key->sk_attno,
							  tupdesc,
							  &isNull);

		if (key->sk_flags & SK_ISNULL)
		{
			/* Handle IS NULL/NOT NULL tests */
			if (key->sk_flags & SK_SEARCHNULL)
			{
				if (isNull)
					continue;	/* tuple satisfies this qual */
			}
			else
			{
				Assert(key->sk_flags & SK_SEARCHNOTNULL);
				if (!isNull)
					continue;	/* tuple satisfies this qual */
			}

			/*
			 * Tuple fails this qual.  If it's a required qual for the current
			 * scan direction, then we can conclude no further tuples will
			 * pass, either.
			 */
			if ((key->sk_flags & SK_BT_REQFWD) &&
				ScanDirectionIsForward(dir))
				*continuescan = false;
			else if ((key->sk_flags & SK_BT_REQBKWD) &&
					 ScanDirectionIsBackward(dir))
				*continuescan = false;

			/*
			 * In any case, this indextuple doesn't match the qual.
			 */
			return false;
		}

		if (isNull)
		{
			if (key->sk_flags & SK_BT_NULLS_FIRST)
			{
				/*
				 * Since NULLs are sorted before non-NULLs, we know we have
				 * reached the lower limit of the range of values for this
				 * index attr.  On a backward scan, we can stop if this qual
				 * is one of the "must match" subset.  We can stop regardless
				 * of whether the qual is > or <, so long as it's required,
				 * because it's not possible for any future tuples to pass. On
				 * a forward scan, however, we must keep going, because we may
				 * have initially positioned to the start of the index.
				 */
				if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
					ScanDirectionIsBackward(dir))
					*continuescan = false;
			}
			else
			{
				/*
				 * Since NULLs are sorted after non-NULLs, we know we have
				 * reached the upper limit of the range of values for this
				 * index attr.  On a forward scan, we can stop if this qual is
				 * one of the "must match" subset.  We can stop regardless of
				 * whether the qual is > or <, so long as it's required,
				 * because it's not possible for any future tuples to pass. On
				 * a backward scan, however, we must keep going, because we
				 * may have initially positioned to the end of the index.
				 */
				if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
					ScanDirectionIsForward(dir))
					*continuescan = false;
			}

			/*
			 * In any case, this indextuple doesn't match the qual.
			 */
			return false;
		}

		/* Apply the scankey's operator: "index attribute op constant" */
		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
								 datum, key->sk_argument);

		if (!DatumGetBool(test))
		{
			/*
			 * Tuple fails this qual.  If it's a required qual for the current
			 * scan direction, then we can conclude no further tuples will
			 * pass, either.
			 *
			 * Note: because we stop the scan as soon as any required equality
			 * qual fails, it is critical that equality quals be used for the
			 * initial positioning in _bt_first() when they are available. See
			 * comments in _bt_first().
			 */
			if ((key->sk_flags & SK_BT_REQFWD) &&
				ScanDirectionIsForward(dir))
				*continuescan = false;
			else if ((key->sk_flags & SK_BT_REQBKWD) &&
					 ScanDirectionIsBackward(dir))
				*continuescan = false;

			/*
			 * In any case, this indextuple doesn't match the qual.
			 */
			return false;
		}
	}

	/* If we get here, the tuple passes all index quals. */
	return true;
}
1512
1513/*
1514 * Test whether an indextuple satisfies a row-comparison scan condition.
1515 *
1516 * Return true if so, false if not. If not, also clear *continuescan if
1517 * it's not possible for any future tuples in the current scan direction
1518 * to pass the qual.
1519 *
1520 * This is a subroutine for _bt_checkkeys, which see for more info.
1521 */
1522static bool
1523_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
1524 TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
1525{
1526 ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
1527 int32 cmpresult = 0;
1528 bool result;
1529
1530 /* First subkey should be same as the header says */
1531 Assert(subkey->sk_attno == skey->sk_attno);
1532
1533 /* Loop over columns of the row condition */
1534 for (;;)
1535 {
1536 Datum datum;
1537 bool isNull;
1538
1539 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1540
1541 if (subkey->sk_attno > tupnatts)
1542 {
1543 /*
1544 * This attribute is truncated (must be high key). The value for
1545 * this attribute in the first non-pivot tuple on the page to the
1546 * right could be any possible value. Assume that truncated
1547 * attribute passes the qual.
1548 */
1549 Assert(ScanDirectionIsForward(dir));
1550 cmpresult = 0;
1551 if (subkey->sk_flags & SK_ROW_END)
1552 break;
1553 subkey++;
1554 continue;
1555 }
1556
1557 datum = index_getattr(tuple,
1558 subkey->sk_attno,
1559 tupdesc,
1560 &isNull);
1561
1562 if (isNull)
1563 {
1564 if (subkey->sk_flags & SK_BT_NULLS_FIRST)
1565 {
1566 /*
1567 * Since NULLs are sorted before non-NULLs, we know we have
1568 * reached the lower limit of the range of values for this
1569 * index attr. On a backward scan, we can stop if this qual
1570 * is one of the "must match" subset. We can stop regardless
1571 * of whether the qual is > or <, so long as it's required,
1572 * because it's not possible for any future tuples to pass. On
1573 * a forward scan, however, we must keep going, because we may
1574 * have initially positioned to the start of the index.
1575 */
1576 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1577 ScanDirectionIsBackward(dir))
1578 *continuescan = false;
1579 }
1580 else
1581 {
1582 /*
1583 * Since NULLs are sorted after non-NULLs, we know we have
1584 * reached the upper limit of the range of values for this
1585 * index attr. On a forward scan, we can stop if this qual is
1586 * one of the "must match" subset. We can stop regardless of
1587 * whether the qual is > or <, so long as it's required,
1588 * because it's not possible for any future tuples to pass. On
1589 * a backward scan, however, we must keep going, because we
1590 * may have initially positioned to the end of the index.
1591 */
1592 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1593 ScanDirectionIsForward(dir))
1594 *continuescan = false;
1595 }
1596
1597 /*
1598 * In any case, this indextuple doesn't match the qual.
1599 */
1600 return false;
1601 }
1602
1603 if (subkey->sk_flags & SK_ISNULL)
1604 {
1605 /*
1606 * Unlike the simple-scankey case, this isn't a disallowed case.
1607 * But it can never match. If all the earlier row comparison
1608 * columns are required for the scan direction, we can stop the
1609 * scan, because there can't be another tuple that will succeed.
1610 */
1611 if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
1612 subkey--;
1613 if ((subkey->sk_flags & SK_BT_REQFWD) &&
1614 ScanDirectionIsForward(dir))
1615 *continuescan = false;
1616 else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
1617 ScanDirectionIsBackward(dir))
1618 *continuescan = false;
1619 return false;
1620 }
1621
1622 /* Perform the test --- three-way comparison not bool operator */
1623 cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
1624 subkey->sk_collation,
1625 datum,
1626 subkey->sk_argument));
1627
1628 if (subkey->sk_flags & SK_BT_DESC)
1629 INVERT_COMPARE_RESULT(cmpresult);
1630
1631 /* Done comparing if unequal, else advance to next column */
1632 if (cmpresult != 0)
1633 break;
1634
1635 if (subkey->sk_flags & SK_ROW_END)
1636 break;
1637 subkey++;
1638 }
1639
1640 /*
1641 * At this point cmpresult indicates the overall result of the row
1642 * comparison, and subkey points to the deciding column (or the last
1643 * column if the result is "=").
1644 */
1645 switch (subkey->sk_strategy)
1646 {
1647 /* EQ and NE cases aren't allowed here */
1648 case BTLessStrategyNumber:
1649 result = (cmpresult < 0);
1650 break;
1651 case BTLessEqualStrategyNumber:
1652 result = (cmpresult <= 0);
1653 break;
1654 case BTGreaterEqualStrategyNumber:
1655 result = (cmpresult >= 0);
1656 break;
1657 case BTGreaterStrategyNumber:
1658 result = (cmpresult > 0);
1659 break;
1660 default:
1661 elog(ERROR, "unrecognized RowCompareType: %d",
1662 (int) subkey->sk_strategy);
1663 result = 0; /* keep compiler quiet */
1664 break;
1665 }
1666
1667 if (!result)
1668 {
1669 /*
1670 * Tuple fails this qual. If it's a required qual for the current
1671 * scan direction, then we can conclude no further tuples will pass,
1672 * either. Note we have to look at the deciding column, not
1673 * necessarily the first or last column of the row condition.
1674 */
1675 if ((subkey->sk_flags & SK_BT_REQFWD) &&
1676 ScanDirectionIsForward(dir))
1677 *continuescan = false;
1678 else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
1679 ScanDirectionIsBackward(dir))
1680 *continuescan = false;
1681 }
1682
1683 return result;
1684}
1685
1686/*
1687 * _bt_killitems - set LP_DEAD state for items an indexscan caller has
1688 * told us were killed
1689 *
1690 * scan->opaque, referenced locally through so, contains information about the
1691 * current page and killed tuples thereon (generally, this should only be
1692 * called if so->numKilled > 0).
1693 *
1694 * The caller does not have a lock on the page and may or may not have the
1695 * page pinned in a buffer. Note that read-lock is sufficient for setting
1696 * LP_DEAD status (which is only a hint).
1697 *
1698 * We match items by heap TID before assuming they are the right ones to
1699 * delete. We cope with cases where items have moved right due to insertions.
1700 * If an item has moved off the current page due to a split, we'll fail to
1701 * find it and do nothing (this is not an error case --- we assume the item
1702 * will eventually get marked in a future indexscan).
1703 *
1704 * Note that if we hold a pin on the target page continuously from initially
1705 * reading the items until applying this function, VACUUM cannot have deleted
1706 * any items from the page, and so there is no need to search left from the
1707 * recorded offset. (This observation also guarantees that the item is still
1708 * the right one to delete, which might otherwise be questionable since heap
1709 * TIDs can get recycled.) This holds true even if the page has been modified
1710 * by inserts and page splits, so there is no need to consult the LSN.
1711 *
1712 * If the pin was released after reading the page, then we re-read it. If it
1713 * has been modified since we read it (as determined by the LSN), we dare not
1714 * flag any entries because it is possible that the old entry was vacuumed
1715 * away and the TID was re-used by a completely different heap tuple.
1716 */
void
_bt_killitems(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	int			i;
	int			numKilled = so->numKilled;
	bool		killedsomething = false;

	Assert(BTScanPosIsValid(so->currPos));

	/*
	 * Always reset the scan state, so we don't look for same items on other
	 * pages.
	 */
	so->numKilled = 0;

	if (BTScanPosIsPinned(so->currPos))
	{
		/*
		 * We have held the pin on this page since we read the index tuples,
		 * so all we need to do is lock it.  The pin will have prevented
		 * re-use of any TID on the page, so there is no need to check the
		 * LSN.
		 */
		LockBuffer(so->currPos.buf, BT_READ);

		page = BufferGetPage(so->currPos.buf);
	}
	else
	{
		Buffer		buf;

		/* Attempt to re-read the buffer, getting pin and lock. */
		buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);

		/* It might not exist anymore; in which case we can't hint it. */
		if (!BufferIsValid(buf))
			return;

		page = BufferGetPage(buf);
		if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
			so->currPos.buf = buf;
		else
		{
			/* Modified while not pinned means hinting is not safe. */
			_bt_relbuf(scan->indexRelation, buf);
			return;
		}
	}

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/* Try to locate each killed item on the page, matching by heap TID */
	for (i = 0; i < numKilled; i++)
	{
		int			itemIndex = so->killedItems[i];
		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
		OffsetNumber offnum = kitem->indexOffset;

		Assert(itemIndex >= so->currPos.firstItem &&
			   itemIndex <= so->currPos.lastItem);
		if (offnum < minoff)
			continue;			/* pure paranoia */

		/*
		 * Items may have moved right within the page due to insertions, so
		 * search forward from the recorded offset (see header comment).
		 */
		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, mark as dirty hint.
	 *
	 * Whenever we mark anything LP_DEAD, we also set the page's
	 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->btpo_flags |= BTP_HAS_GARBAGE;
		MarkBufferDirtyHint(so->currPos.buf, true);
	}

	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
}
1815
1816
1817/*
1818 * The following routines manage a shared-memory area in which we track
1819 * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
1820 * operations. There is a single counter which increments each time we
1821 * start a vacuum to assign it a cycle ID. Since multiple vacuums could
1822 * be active concurrently, we have to track the cycle ID for each active
1823 * vacuum; this requires at most MaxBackends entries (usually far fewer).
1824 * We assume at most one vacuum can be active for a given index.
1825 *
1826 * Access to the shared memory area is controlled by BtreeVacuumLock.
1827 * In principle we could use a separate lmgr locktag for each index,
1828 * but a single LWLock is much cheaper, and given the short time that
1829 * the lock is ever held, the concurrency hit should be minimal.
1830 */
1831
/* Per-VACUUM entry in the shared tracking array */
typedef struct BTOneVacInfo
{
	LockRelId	relid;			/* global identifier of an index */
	BTCycleId	cycleid;		/* cycle ID for its active VACUUM */
} BTOneVacInfo;

/* Header of the shared-memory area tracking active btree VACUUMs */
typedef struct BTVacInfo
{
	BTCycleId	cycle_ctr;		/* cycle ID most recently assigned */
	int			num_vacuums;	/* number of currently active VACUUMs */
	int			max_vacuums;	/* allocated length of vacuums[] array */
	BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
} BTVacInfo;

/* Points into shared memory; attached/initialized by BTreeShmemInit() */
static BTVacInfo *btvacinfo;
1847
1848
1849/*
1850 * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
1851 * or zero if there is no active VACUUM
1852 *
1853 * Note: for correct interlocking, the caller must already hold pin and
1854 * exclusive lock on each buffer it will store the cycle ID into. This
1855 * ensures that even if a VACUUM starts immediately afterwards, it cannot
1856 * process those pages until the page split is complete.
1857 */
1858BTCycleId
1859_bt_vacuum_cycleid(Relation rel)
1860{
1861 BTCycleId result = 0;
1862 int i;
1863
1864 /* Share lock is enough since this is a read-only operation */
1865 LWLockAcquire(BtreeVacuumLock, LW_SHARED);
1866
1867 for (i = 0; i < btvacinfo->num_vacuums; i++)
1868 {
1869 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1870
1871 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1872 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1873 {
1874 result = vac->cycleid;
1875 break;
1876 }
1877 }
1878
1879 LWLockRelease(BtreeVacuumLock);
1880 return result;
1881}
1882
1883/*
1884 * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
1885 *
1886 * Note: the caller must guarantee that it will eventually call
1887 * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
1888 * that this happens even in elog(FATAL) scenarios, the appropriate coding
1889 * is not just a PG_TRY, but
1890 * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
1891 */
1892BTCycleId
1893_bt_start_vacuum(Relation rel)
1894{
1895 BTCycleId result;
1896 int i;
1897 BTOneVacInfo *vac;
1898
1899 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1900
1901 /*
1902 * Assign the next cycle ID, being careful to avoid zero as well as the
1903 * reserved high values.
1904 */
1905 result = ++(btvacinfo->cycle_ctr);
1906 if (result == 0 || result > MAX_BT_CYCLE_ID)
1907 result = btvacinfo->cycle_ctr = 1;
1908
1909 /* Let's just make sure there's no entry already for this index */
1910 for (i = 0; i < btvacinfo->num_vacuums; i++)
1911 {
1912 vac = &btvacinfo->vacuums[i];
1913 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1914 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1915 {
1916 /*
1917 * Unlike most places in the backend, we have to explicitly
1918 * release our LWLock before throwing an error. This is because
1919 * we expect _bt_end_vacuum() to be called before transaction
1920 * abort cleanup can run to release LWLocks.
1921 */
1922 LWLockRelease(BtreeVacuumLock);
1923 elog(ERROR, "multiple active vacuums for index \"%s\"",
1924 RelationGetRelationName(rel));
1925 }
1926 }
1927
1928 /* OK, add an entry */
1929 if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
1930 {
1931 LWLockRelease(BtreeVacuumLock);
1932 elog(ERROR, "out of btvacinfo slots");
1933 }
1934 vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
1935 vac->relid = rel->rd_lockInfo.lockRelId;
1936 vac->cycleid = result;
1937 btvacinfo->num_vacuums++;
1938
1939 LWLockRelease(BtreeVacuumLock);
1940 return result;
1941}
1942
1943/*
1944 * _bt_end_vacuum --- mark a btree VACUUM operation as done
1945 *
1946 * Note: this is deliberately coded not to complain if no entry is found;
1947 * this allows the caller to put PG_TRY around the start_vacuum operation.
1948 */
1949void
1950_bt_end_vacuum(Relation rel)
1951{
1952 int i;
1953
1954 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1955
1956 /* Find the array entry */
1957 for (i = 0; i < btvacinfo->num_vacuums; i++)
1958 {
1959 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1960
1961 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1962 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1963 {
1964 /* Remove it by shifting down the last entry */
1965 *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
1966 btvacinfo->num_vacuums--;
1967 break;
1968 }
1969 }
1970
1971 LWLockRelease(BtreeVacuumLock);
1972}
1973
1974/*
1975 * _bt_end_vacuum wrapped as an on_shmem_exit callback function
1976 */
1977void
1978_bt_end_vacuum_callback(int code, Datum arg)
1979{
1980 _bt_end_vacuum((Relation) DatumGetPointer(arg));
1981}
1982
1983/*
1984 * BTreeShmemSize --- report amount of shared memory space needed
1985 */
1986Size
1987BTreeShmemSize(void)
1988{
1989 Size size;
1990
1991 size = offsetof(BTVacInfo, vacuums);
1992 size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
1993 return size;
1994}
1995
1996/*
1997 * BTreeShmemInit --- initialize this module's shared memory
1998 */
1999void
2000BTreeShmemInit(void)
2001{
2002 bool found;
2003
2004 btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
2005 BTreeShmemSize(),
2006 &found);
2007
2008 if (!IsUnderPostmaster)
2009 {
2010 /* Initialize shared memory area */
2011 Assert(!found);
2012
2013 /*
2014 * It doesn't really matter what the cycle counter starts at, but
2015 * having it always start the same doesn't seem good. Seed with
2016 * low-order bits of time() instead.
2017 */
2018 btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
2019
2020 btvacinfo->num_vacuums = 0;
2021 btvacinfo->max_vacuums = MaxBackends;
2022 }
2023 else
2024 Assert(found);
2025}
2026
/*
 * btoptions() -- parse and validate reloptions for a btree index.
 *
 * Delegates to the generic reloptions machinery with the btree relopt kind;
 * returns the parsed options as a bytea, or NULL if there are none.
 */
bytea *
btoptions(Datum reloptions, bool validate)
{
	return default_reloptions(reloptions, validate, RELOPT_KIND_BTREE);
}
2032
2033/*
2034 * btproperty() -- Check boolean properties of indexes.
2035 *
2036 * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
2037 * to call btcanreturn.
2038 */
2039bool
2040btproperty(Oid index_oid, int attno,
2041 IndexAMProperty prop, const char *propname,
2042 bool *res, bool *isnull)
2043{
2044 switch (prop)
2045 {
2046 case AMPROP_RETURNABLE:
2047 /* answer only for columns, not AM or whole index */
2048 if (attno == 0)
2049 return false;
2050 /* otherwise, btree can always return data */
2051 *res = true;
2052 return true;
2053
2054 default:
2055 return false; /* punt to generic code */
2056 }
2057}
2058
2059/*
2060 * btbuildphasename() -- Return name of index build phase.
2061 */
2062char *
2063btbuildphasename(int64 phasenum)
2064{
2065 switch (phasenum)
2066 {
2067 case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
2068 return "initializing";
2069 case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
2070 return "scanning table";
2071 case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
2072 return "sorting live tuples";
2073 case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
2074 return "sorting dead tuples";
2075 case PROGRESS_BTREE_PHASE_LEAF_LOAD:
2076 return "loading tuples in tree";
2077 default:
2078 return NULL;
2079 }
2080}
2081
2082/*
2083 * _bt_truncate() -- create tuple without unneeded suffix attributes.
2084 *
2085 * Returns truncated pivot index tuple allocated in caller's memory context,
2086 * with key attributes copied from caller's firstright argument. If rel is
2087 * an INCLUDE index, non-key attributes will definitely be truncated away,
2088 * since they're not part of the key space. More aggressive suffix
2089 * truncation can take place when it's clear that the returned tuple does not
2090 * need one or more suffix key attributes. We only need to keep firstright
2091 * attributes up to and including the first non-lastleft-equal attribute.
2092 * Caller's insertion scankey is used to compare the tuples; the scankey's
2093 * argument values are not considered here.
2094 *
2095 * Sometimes this routine will return a new pivot tuple that takes up more
2096 * space than firstright, because a new heap TID attribute had to be added to
2097 * distinguish lastleft from firstright. This should only happen when the
2098 * caller is in the process of splitting a leaf page that has many logical
2099 * duplicates, where it's unavoidable.
2100 *
2101 * Note that returned tuple's t_tid offset will hold the number of attributes
2102 * present, so the original item pointer offset is not represented. Caller
2103 * should only change truncated tuple's downlink. Note also that truncated
2104 * key attributes are treated as containing "minus infinity" values by
2105 * _bt_compare().
2106 *
2107 * In the worst case (when a heap TID is appended) the size of the returned
2108 * tuple is the size of the first right tuple plus an additional MAXALIGN()'d
2109 * item pointer. This guarantee is important, since callers need to stay
2110 * under the 1/3 of a page restriction on tuple size. If this routine is ever
2111 * taught to truncate within an attribute/datum, it will need to avoid
2112 * returning an enlarged tuple to caller when truncation + TOAST compression
2113 * ends up enlarging the final datum.
2114 */
2115IndexTuple
2116_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
2117 BTScanInsert itup_key)
2118{
2119 TupleDesc itupdesc = RelationGetDescr(rel);
2120 int16 natts = IndexRelationGetNumberOfAttributes(rel);
2121 int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
2122 int keepnatts;
2123 IndexTuple pivot;
2124 ItemPointer pivotheaptid;
2125 Size newsize;
2126
2127 /*
2128 * We should only ever truncate leaf index tuples. It's never okay to
2129 * truncate a second time.
2130 */
2131 Assert(BTreeTupleGetNAtts(lastleft, rel) == natts);
2132 Assert(BTreeTupleGetNAtts(firstright, rel) == natts);
2133
2134 /* Determine how many attributes must be kept in truncated tuple */
2135 keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
2136
2137#ifdef DEBUG_NO_TRUNCATE
2138 /* Force truncation to be ineffective for testing purposes */
2139 keepnatts = nkeyatts + 1;
2140#endif
2141
2142 if (keepnatts <= natts)
2143 {
2144 IndexTuple tidpivot;
2145
2146 pivot = index_truncate_tuple(itupdesc, firstright, keepnatts);
2147
2148 /*
2149 * If there is a distinguishing key attribute within new pivot tuple,
2150 * there is no need to add an explicit heap TID attribute
2151 */
2152 if (keepnatts <= nkeyatts)
2153 {
2154 BTreeTupleSetNAtts(pivot, keepnatts);
2155 return pivot;
2156 }
2157
2158 /*
2159 * Only truncation of non-key attributes was possible, since key
2160 * attributes are all equal. It's necessary to add a heap TID
2161 * attribute to the new pivot tuple.
2162 */
2163 Assert(natts != nkeyatts);
2164 newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData));
2165 tidpivot = palloc0(newsize);
2166 memcpy(tidpivot, pivot, IndexTupleSize(pivot));
2167 /* cannot leak memory here */
2168 pfree(pivot);
2169 pivot = tidpivot;
2170 }
2171 else
2172 {
2173 /*
2174 * No truncation was possible, since key attributes are all equal.
2175 * It's necessary to add a heap TID attribute to the new pivot tuple.
2176 */
2177 Assert(natts == nkeyatts);
2178 newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData));
2179 pivot = palloc0(newsize);
2180 memcpy(pivot, firstright, IndexTupleSize(firstright));
2181 }
2182
2183 /*
2184 * We have to use heap TID as a unique-ifier in the new pivot tuple, since
2185 * no non-TID key attribute in the right item readily distinguishes the
2186 * right side of the split from the left side. Use enlarged space that
2187 * holds a copy of first right tuple; place a heap TID value within the
2188 * extra space that remains at the end.
2189 *
2190 * nbtree conceptualizes this case as an inability to truncate away any
2191 * key attribute. We must use an alternative representation of heap TID
2192 * within pivots because heap TID is only treated as an attribute within
2193 * nbtree (e.g., there is no pg_attribute entry).
2194 */
2195 Assert(itup_key->heapkeyspace);
2196 pivot->t_info &= ~INDEX_SIZE_MASK;
2197 pivot->t_info |= newsize;
2198
2199 /*
2200 * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
2201 * consider suffix truncation. It seems like a good idea to follow that
2202 * example in cases where no truncation takes place -- use lastleft's heap
2203 * TID. (This is also the closest value to negative infinity that's
2204 * legally usable.)
2205 */
2206 pivotheaptid = (ItemPointer) ((char *) pivot + newsize -
2207 sizeof(ItemPointerData));
2208 ItemPointerCopy(&lastleft->t_tid, pivotheaptid);
2209
2210 /*
2211 * Lehman and Yao require that the downlink to the right page, which is to
2212 * be inserted into the parent page in the second phase of a page split be
2213 * a strict lower bound on items on the right page, and a non-strict upper
2214 * bound for items on the left page. Assert that heap TIDs follow these
2215 * invariants, since a heap TID value is apparently needed as a
2216 * tiebreaker.
2217 */
2218#ifndef DEBUG_NO_TRUNCATE
2219 Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0);
2220 Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0);
2221 Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
2222#else
2223
2224 /*
2225 * Those invariants aren't guaranteed to hold for lastleft + firstright
2226 * heap TID attribute values when they're considered here only because
2227 * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
2228 * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
2229 * TID value that always works as a strict lower bound for items to the
2230 * right. In particular, it must avoid using firstright's leading key
2231 * attribute values along with lastleft's heap TID value when lastleft's
2232 * TID happens to be greater than firstright's TID.
2233 */
2234 ItemPointerCopy(&firstright->t_tid, pivotheaptid);
2235
2236 /*
2237 * Pivot heap TID should never be fully equal to firstright. Note that
2238 * the pivot heap TID will still end up equal to lastleft's heap TID when
2239 * that's the only usable value.
2240 */
2241 ItemPointerSetOffsetNumber(pivotheaptid,
2242 OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
2243 Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
2244#endif
2245
2246 BTreeTupleSetNAtts(pivot, nkeyatts);
2247 BTreeTupleSetAltHeapTID(pivot);
2248
2249 return pivot;
2250}
2251
2252/*
2253 * _bt_keep_natts - how many key attributes to keep when truncating.
2254 *
2255 * Caller provides two tuples that enclose a split point. Caller's insertion
2256 * scankey is used to compare the tuples; the scankey's argument values are
2257 * not considered here.
2258 *
2259 * This can return a number of attributes that is one greater than the
2260 * number of key attributes for the index relation. This indicates that the
2261 * caller must use a heap TID as a unique-ifier in new pivot tuple.
2262 */
2263static int
2264_bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
2265 BTScanInsert itup_key)
2266{
2267 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
2268 TupleDesc itupdesc = RelationGetDescr(rel);
2269 int keepnatts;
2270 ScanKey scankey;
2271
2272 /*
2273 * Be consistent about the representation of BTREE_VERSION 2/3 tuples
2274 * across Postgres versions; don't allow new pivot tuples to have
2275 * truncated key attributes there. _bt_compare() treats truncated key
2276 * attributes as having the value minus infinity, which would break
2277 * searches within !heapkeyspace indexes.
2278 */
2279 if (!itup_key->heapkeyspace)
2280 {
2281 Assert(nkeyatts != IndexRelationGetNumberOfAttributes(rel));
2282 return nkeyatts;
2283 }
2284
2285 scankey = itup_key->scankeys;
2286 keepnatts = 1;
2287 for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
2288 {
2289 Datum datum1,
2290 datum2;
2291 bool isNull1,
2292 isNull2;
2293
2294 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
2295 datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
2296
2297 if (isNull1 != isNull2)
2298 break;
2299
2300 if (!isNull1 &&
2301 DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
2302 scankey->sk_collation,
2303 datum1,
2304 datum2)) != 0)
2305 break;
2306
2307 keepnatts++;
2308 }
2309
2310 return keepnatts;
2311}
2312
2313/*
2314 * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
2315 *
2316 * This is exported so that a candidate split point can have its effect on
2317 * suffix truncation inexpensively evaluated ahead of time when finding a
2318 * split location. A naive bitwise approach to datum comparisons is used to
2319 * save cycles.
2320 *
2321 * The approach taken here usually provides the same answer as _bt_keep_natts
2322 * will (for the same pair of tuples from a heapkeyspace index), since the
2323 * majority of btree opclasses can never indicate that two datums are equal
2324 * unless they're bitwise equal (once detoasted). Similarly, result may
2325 * differ from the _bt_keep_natts result when either tuple has TOASTed datums,
2326 * though this is barely possible in practice.
2327 *
2328 * These issues must be acceptable to callers, typically because they're only
2329 * concerned about making suffix truncation as effective as possible without
2330 * leaving excessive amounts of free space on either side of page split.
2331 * Callers can rely on the fact that attributes considered equal here are
2332 * definitely also equal according to _bt_keep_natts.
2333 */
2334int
2335_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
2336{
2337 TupleDesc itupdesc = RelationGetDescr(rel);
2338 int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
2339 int keepnatts;
2340
2341 keepnatts = 1;
2342 for (int attnum = 1; attnum <= keysz; attnum++)
2343 {
2344 Datum datum1,
2345 datum2;
2346 bool isNull1,
2347 isNull2;
2348 Form_pg_attribute att;
2349
2350 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
2351 datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
2352 att = TupleDescAttr(itupdesc, attnum - 1);
2353
2354 if (isNull1 != isNull2)
2355 break;
2356
2357 if (!isNull1 &&
2358 !datumIsEqual(datum1, datum2, att->attbyval, att->attlen))
2359 break;
2360
2361 keepnatts++;
2362 }
2363
2364 return keepnatts;
2365}
2366
2367/*
2368 * _bt_check_natts() -- Verify tuple has expected number of attributes.
2369 *
2370 * Returns value indicating if the expected number of attributes were found
2371 * for a particular offset on page. This can be used as a general purpose
2372 * sanity check.
2373 *
2374 * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
2375 * preferred to calling here. That's usually more convenient, and is always
2376 * more explicit. Call here instead when offnum's tuple may be a negative
2377 * infinity tuple that uses the pre-v11 on-disk representation, or when a low
2378 * context check is appropriate. This routine is as strict as possible about
2379 * what is expected on each version of btree.
2380 */
2381bool
2382_bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
2383{
2384 int16 natts = IndexRelationGetNumberOfAttributes(rel);
2385 int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
2386 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2387 IndexTuple itup;
2388 int tupnatts;
2389
2390 /*
2391 * We cannot reliably test a deleted or half-deleted page, since they have
2392 * dummy high keys
2393 */
2394 if (P_IGNORE(opaque))
2395 return true;
2396
2397 Assert(offnum >= FirstOffsetNumber &&
2398 offnum <= PageGetMaxOffsetNumber(page));
2399
2400 /*
2401 * Mask allocated for number of keys in index tuple must be able to fit
2402 * maximum possible number of index attributes
2403 */
2404 StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS,
2405 "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
2406
2407 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2408 tupnatts = BTreeTupleGetNAtts(itup, rel);
2409
2410 if (P_ISLEAF(opaque))
2411 {
2412 if (offnum >= P_FIRSTDATAKEY(opaque))
2413 {
2414 /*
2415 * Non-pivot tuples currently never use alternative heap TID
2416 * representation -- even those within heapkeyspace indexes
2417 */
2418 if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
2419 return false;
2420
2421 /*
2422 * Leaf tuples that are not the page high key (non-pivot tuples)
2423 * should never be truncated. (Note that tupnatts must have been
2424 * inferred, rather than coming from an explicit on-disk
2425 * representation.)
2426 */
2427 return tupnatts == natts;
2428 }
2429 else
2430 {
2431 /*
2432 * Rightmost page doesn't contain a page high key, so tuple was
2433 * checked above as ordinary leaf tuple
2434 */
2435 Assert(!P_RIGHTMOST(opaque));
2436
2437 /*
2438 * !heapkeyspace high key tuple contains only key attributes. Note
2439 * that tupnatts will only have been explicitly represented in
2440 * !heapkeyspace indexes that happen to have non-key attributes.
2441 */
2442 if (!heapkeyspace)
2443 return tupnatts == nkeyatts;
2444
2445 /* Use generic heapkeyspace pivot tuple handling */
2446 }
2447 }
2448 else /* !P_ISLEAF(opaque) */
2449 {
2450 if (offnum == P_FIRSTDATAKEY(opaque))
2451 {
2452 /*
2453 * The first tuple on any internal page (possibly the first after
2454 * its high key) is its negative infinity tuple. Negative
2455 * infinity tuples are always truncated to zero attributes. They
2456 * are a particular kind of pivot tuple.
2457 */
2458 if (heapkeyspace)
2459 return tupnatts == 0;
2460
2461 /*
2462 * The number of attributes won't be explicitly represented if the
2463 * negative infinity tuple was generated during a page split that
2464 * occurred with a version of Postgres before v11. There must be
2465 * a problem when there is an explicit representation that is
2466 * non-zero, or when there is no explicit representation and the
2467 * tuple is evidently not a pre-pg_upgrade tuple.
2468 *
2469 * Prior to v11, downlinks always had P_HIKEY as their offset. Use
2470 * that to decide if the tuple is a pre-v11 tuple.
2471 */
2472 return tupnatts == 0 ||
2473 ((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
2474 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
2475 }
2476 else
2477 {
2478 /*
2479 * !heapkeyspace downlink tuple with separator key contains only
2480 * key attributes. Note that tupnatts will only have been
2481 * explicitly represented in !heapkeyspace indexes that happen to
2482 * have non-key attributes.
2483 */
2484 if (!heapkeyspace)
2485 return tupnatts == nkeyatts;
2486
2487 /* Use generic heapkeyspace pivot tuple handling */
2488 }
2489
2490 }
2491
2492 /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
2493 Assert(heapkeyspace);
2494
2495 /*
2496 * Explicit representation of the number of attributes is mandatory with
2497 * heapkeyspace index pivot tuples, regardless of whether or not there are
2498 * non-key attributes.
2499 */
2500 if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
2501 return false;
2502
2503 /*
2504 * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
2505 * when any other key attribute is truncated
2506 */
2507 if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
2508 return false;
2509
2510 /*
2511 * Pivot tuple must have at least one untruncated key attribute (minus
2512 * infinity pivot tuples are the only exception). Pivot tuples can never
2513 * represent that there is a value present for a key attribute that
2514 * exceeds pg_index.indnkeyatts for the index.
2515 */
2516 return tupnatts > 0 && tupnatts <= nkeyatts;
2517}
2518
2519/*
2520 *
2521 * _bt_check_third_page() -- check whether tuple fits on a btree page at all.
2522 *
2523 * We actually need to be able to fit three items on every page, so restrict
2524 * any one item to 1/3 the per-page available space. Note that itemsz should
2525 * not include the ItemId overhead.
2526 *
2527 * It might be useful to apply TOAST methods rather than throw an error here.
2528 * Using out of line storage would break assumptions made by suffix truncation
2529 * and by contrib/amcheck, though.
2530 */
2531void
2532_bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
2533 Page page, IndexTuple newtup)
2534{
2535 Size itemsz;
2536 BTPageOpaque opaque;
2537
2538 itemsz = MAXALIGN(IndexTupleSize(newtup));
2539
2540 /* Double check item size against limit */
2541 if (itemsz <= BTMaxItemSize(page))
2542 return;
2543
2544 /*
2545 * Tuple is probably too large to fit on page, but it's possible that the
2546 * index uses version 2 or version 3, or that page is an internal page, in
2547 * which case a slightly higher limit applies.
2548 */
2549 if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
2550 return;
2551
2552 /*
2553 * Internal page insertions cannot fail here, because that would mean that
2554 * an earlier leaf level insertion that should have failed didn't
2555 */
2556 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2557 if (!P_ISLEAF(opaque))
2558 elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
2559 itemsz, RelationGetRelationName(rel));
2560
2561 ereport(ERROR,
2562 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2563 errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
2564 itemsz,
2565 needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
2566 needheaptidspace ? BTMaxItemSize(page) :
2567 BTMaxItemSizeNoHeapTid(page),
2568 RelationGetRelationName(rel)),
2569 errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
2570 ItemPointerGetBlockNumber(&newtup->t_tid),
2571 ItemPointerGetOffsetNumber(&newtup->t_tid),
2572 RelationGetRelationName(heap)),
2573 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
2574 "Consider a function index of an MD5 hash of the value, "
2575 "or use full text indexing."),
2576 errtableconstraint(heap, RelationGetRelationName(rel))));
2577}
2578