/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"


/* Working state needed by btvacuumpage */
typedef struct
{
	IndexVacuumInfo *info;
	IndexBulkDeleteResult *stats;
	IndexBulkDeleteCallback callback;
	void	   *callback_state;
	BTCycleId	cycleid;
	BlockNumber lastBlockVacuumed;	/* highest blkno actually vacuumed */
	BlockNumber lastBlockLocked;	/* highest blkno we've cleanup-locked */
	BlockNumber totFreePages;	/* true total # of free pages */
	TransactionId oldestBtpoXact;
	MemoryContext pagedelcontext;
} BTVacState;

/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 * We reach this state once for every distinct combination of array keys.
 */
typedef enum
{
	BTPARALLEL_NOT_INITIALIZED,
	BTPARALLEL_ADVANCING,
	BTPARALLEL_IDLE,
	BTPARALLEL_DONE
} BTPS_State;
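
/*
 * A sketch of the resulting state machine (derived from the comments above;
 * _bt_parallel_advance_array_keys resets DONE back to NOT_INITIALIZED when
 * another set of array keys remains):
 *
 *	NOT_INITIALIZED -> ADVANCING -> IDLE -> ADVANCING -> ... -> DONE
 */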

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
	BTPS_State	btps_pageStatus;	/* indicates whether the next page is
									 * available for scan; see above for
									 * possible states of parallel scan */
	int			btps_arrayKeyCount; /* count indicating number of array scan
									 * keys processed by parallel scan */
	slock_t		btps_mutex;		/* protects above variables */
	ConditionVariable btps_cv;	/* used to synchronize parallel scan */
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;


static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
						 IndexBulkDeleteCallback callback, void *callback_state,
						 BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
						 BlockNumber orig_blkno);


/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = BTMaxStrategyNumber;
	amroutine->amsupport = BTNProcs;
	amroutine->amcanorder = true;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = true;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = true;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = false;
	amroutine->amclusterable = true;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = true;
	amroutine->amcaninclude = true;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = btbuild;
	amroutine->ambuildempty = btbuildempty;
	amroutine->aminsert = btinsert;
	amroutine->ambulkdelete = btbulkdelete;
	amroutine->amvacuumcleanup = btvacuumcleanup;
	amroutine->amcanreturn = btcanreturn;
	amroutine->amcostestimate = btcostestimate;
	amroutine->amoptions = btoptions;
	amroutine->amproperty = btproperty;
	amroutine->ambuildphasename = btbuildphasename;
	amroutine->amvalidate = btvalidate;
	amroutine->ambeginscan = btbeginscan;
	amroutine->amrescan = btrescan;
	amroutine->amgettuple = btgettuple;
	amroutine->amgetbitmap = btgetbitmap;
	amroutine->amendscan = btendscan;
	amroutine->ammarkpos = btmarkpos;
	amroutine->amrestrpos = btrestrpos;
	amroutine->amestimateparallelscan = btestimateparallelscan;
	amroutine->aminitparallelscan = btinitparallelscan;
	amroutine->amparallelrescan = btparallelrescan;

	PG_RETURN_POINTER(amroutine);
}
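
/*
 * Note that nothing calls bthandler directly: it is recorded as the
 * amhandler of the btree row in pg_am, e.g.
 *
 *		SELECT amhandler FROM pg_am WHERE amname = 'btree';
 *
 * and core code invokes it through that catalog entry to obtain the
 * callback table built above.
 */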

/*
 *	btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, P_NONE, 0);

	/*
	 * Write the page and log it.  It might seem that an immediate sync would
	 * be sufficient to guarantee that the file exists on disk, but recovery
	 * itself might remove it while replaying, for example, an
	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
	 * this even when wal_level=minimal.
	 */
	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
			  (char *) metapage, true);
	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
				BTREE_METAPAGE, metapage, true);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}

/*
 *	btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	result = _bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);

	return result;
}

/*
 *	btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* btree indexes are never lossy */
	scan->xs_recheck = false;

	/*
	 * If we have any array keys, initialize them during first call for a
	 * scan.  We can't do this in btrescan because we don't know the scan
	 * direction at that time.
	 */
	if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return false;

		_bt_start_array_keys(scan, dir);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/*
		 * If we've already initialized this scan, we can just advance it in
		 * the appropriate direction.  If we haven't done so yet, we call
		 * _bt_first() to get the first item in the scan.
		 */
		if (!BTScanPosIsValid(so->currPos))
			res = _bt_first(scan, dir);
		else
		{
			/*
			 * Check to see if we should kill the previously-fetched tuple.
			 */
			if (scan->kill_prior_tuple)
			{
				/*
				 * Yes, remember it for later. (We'll deal with all such
				 * tuples at once right before leaving the index page.)  The
				 * test for numKilled overrun is not just paranoia: if the
				 * caller reverses direction in the indexscan then the same
				 * item might get entered multiple times.  It's not worth
				 * trying to optimize that, so we don't detect it, but instead
				 * just forget any excess entries.
				 */
				if (so->killedItems == NULL)
					so->killedItems = (int *)
						palloc(MaxIndexTuplesPerPage * sizeof(int));
				if (so->numKilled < MaxIndexTuplesPerPage)
					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
			}

			/*
			 * Now continue the scan.
			 */
			res = _bt_next(scan, dir);
		}

		/* If we have a tuple, return it ... */
		if (res)
			break;
		/* ... otherwise see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

	return res;
}

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int64		ntids = 0;
	ItemPointer heapTid;

	/*
	 * If we have any array keys, initialize them.
	 */
	if (so->numArrayKeys)
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return ntids;

		_bt_start_array_keys(scan, ForwardScanDirection);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/* Fetch the first page & tuple */
		if (_bt_first(scan, ForwardScanDirection))
		{
			/* Save tuple ID, and continue scanning */
			heapTid = &scan->xs_heaptid;
			tbm_add_tuples(tbm, heapTid, 1, false);
			ntids++;

			for (;;)
			{
				/*
				 * Advance to next tuple within page.  This is the same as the
				 * easy case in _bt_next().
				 */
				if (++so->currPos.itemIndex > so->currPos.lastItem)
				{
					/* let _bt_next do the heavy lifting */
					if (!_bt_next(scan, ForwardScanDirection))
						break;
				}

				/* Save tuple ID, and continue scanning */
				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
				tbm_add_tuples(tbm, heapTid, 1, false);
				ntids++;
			}
		}
		/* Now see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

	return ntids;
}

/*
 *	btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BTScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	/* get the scan */
	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	/* allocate private workspace */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	BTScanPosInvalidate(so->currPos);
	BTScanPosInvalidate(so->markPos);
	if (scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	else
		so->keyData = NULL;

	so->arrayKeyData = NULL;	/* assume no array keys for now */
	so->numArrayKeys = 0;
	so->arrayKeys = NULL;
	so->arrayContext = NULL;

	so->killedItems = NULL;		/* until needed */
	so->numKilled = 0;

	/*
	 * We don't know yet whether the scan will be index-only, so we do not
	 * allocate the tuple workspace arrays until btrescan.  However, we set up
	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
	 */
	so->currTuples = so->markTuples = NULL;

	scan->xs_itupdesc = RelationGetDescr(rel);

	scan->opaque = so;

	return scan;
}

/*
 *	btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	so->markItemIndex = -1;
	so->arrayKeyCount = 0;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/*
	 * Allocate tuple workspace arrays, if needed for an index-only scan and
	 * not already done in a previous rescan call.  To save on palloc
	 * overhead, both workspaces are allocated as one palloc block; only this
	 * function and btendscan know that.
	 *
	 * NOTE: this data structure also makes it safe to return data from a
	 * "name" column, even though btree name_ops uses an underlying storage
	 * datatype of cstring.  The risk there is that "name" is supposed to be
	 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
	 * However, since we only return data out of tuples sitting in the
	 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
	 * data out of the markTuples array --- running off the end of memory for
	 * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
	 * adding special-case treatment for name_ops elsewhere.
	 */
	if (scan->xs_want_itup && so->currTuples == NULL)
	{
		so->currTuples = (char *) palloc(BLCKSZ * 2);
		so->markTuples = so->currTuples + BLCKSZ;
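		/*
		 * Resulting layout, as a sketch: a single 2 * BLCKSZ allocation,
		 * with currTuples as bytes [0, BLCKSZ) and markTuples as bytes
		 * [BLCKSZ, 2 * BLCKSZ).
		 */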
	}

	/*
	 * Reset the scan keys.  Note that keys ordering stuff moved to _bt_first.
	 * - vadim 05/05/97
	 */
	if (scankey && scan->numberOfKeys > 0)
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */

	/* If any keys are SK_SEARCHARRAY type, set up array-key info */
	_bt_preprocess_array_keys(scan);
}

/*
 *	btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
	}

	so->markItemIndex = -1;
	BTScanPosUnpinIfPinned(so->markPos);

	/* No need to invalidate positions, the RAM is about to be freed. */

	/* Release storage */
	if (so->keyData != NULL)
		pfree(so->keyData);
	/* so->arrayKeyData and so->arrayKeys are in arrayContext */
	if (so->arrayContext != NULL)
		MemoryContextDelete(so->arrayContext);
	if (so->killedItems != NULL)
		pfree(so->killedItems);
	if (so->currTuples != NULL)
		pfree(so->currTuples);
	/* so->markTuples should not be pfree'd, see btrescan */
	pfree(so);
}

/*
 *	btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* There may be an old mark with a pin (but no lock). */
	BTScanPosUnpinIfPinned(so->markPos);

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
	{
		BTScanPosInvalidate(so->markPos);
		so->markItemIndex = -1;
	}

	/* Also record the current positions of any array keys */
	if (so->numArrayKeys)
		_bt_mark_array_keys(scan);
}

/*
 *	btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* Restore the marked positions of any array keys */
	if (so->numArrayKeys)
		_bt_restore_array_keys(scan);

	if (so->markItemIndex >= 0)
	{
		/*
		 * The scan has never moved to a new page since the last mark.  Just
		 * restore the itemIndex.
		 *
		 * NB: In this case we can't count on anything in so->markPos to be
		 * accurate.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/*
		 * The scan moved to a new page after last mark or restore, and we are
		 * now restoring to the marked page.  We aren't holding any read
		 * locks, but if we're still holding the pin for the current position,
		 * we must drop it.
		 */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0)
				_bt_killitems(scan);
			BTScanPosUnpinIfPinned(so->currPos);
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			if (BTScanPosIsPinned(so->markPos))
				IncrBufferRefCount(so->markPos.buf);
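			/*
			 * Copy the BTScanPosData header plus entries items[0..lastItem]:
			 * offsetof(BTScanPosData, items[1]) covers the header and
			 * items[0]; the remaining lastItem entries are added explicitly.
			 */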
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
			if (so->currTuples)
				memcpy(so->currTuples, so->markTuples,
					   so->markPos.nextTupleOffset);
		}
		else
			BTScanPosInvalidate(so->currPos);
	}
}
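
/*
 * The three functions below are reached via the amestimateparallelscan,
 * aminitparallelscan and amparallelrescan callbacks registered in bthandler:
 * the executor sizes the parallel-scan shared memory with the estimate,
 * initializes it once at scan startup, and resets it between rescans.
 */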

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(void)
{
	return sizeof(BTParallelScanDescData);
}

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

	SpinLockInit(&bt_target->btps_mutex);
	bt_target->btps_scanPage = InvalidBlockNumber;
	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	bt_target->btps_arrayKeyCount = 0;
	ConditionVariableInit(&bt_target->btps_cv);
}

/*
 *	btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
	BTParallelScanDesc btscan;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	Assert(parallel_scan);

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * In theory, we don't need to acquire the spinlock here, because there
	 * shouldn't be any other workers running at this point, but we do so for
	 * consistency.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = InvalidBlockNumber;
	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	btscan->btps_arrayKeyCount = 0;
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release()
 *		or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs if no pages remain for the current
 * set of scankeys.
 *
 * If the return value is true, *pageno returns the next or current page
 * of the scan (depending on the scan direction).  An invalid block number
 * means the scan hasn't yet started, and P_NONE means we've reached the end.
 * The first time a participating process reaches the last page, it will
 * return true and set *pageno to P_NONE; after that, further attempts to
 * seize the scan will return false.
 *
 * Callers should ignore the value of pageno if the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTPS_State	pageStatus;
	bool		exit_loop = false;
	bool		status = true;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	*pageno = P_NONE;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	while (1)
	{
		SpinLockAcquire(&btscan->btps_mutex);
		pageStatus = btscan->btps_pageStatus;

		if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
		{
			/* Parallel scan has already advanced to a new set of scankeys. */
			status = false;
		}
		else if (pageStatus == BTPARALLEL_DONE)
		{
			/*
			 * We're done with this set of scankeys.  This may be the end, or
			 * there could be more sets to try.
			 */
			status = false;
		}
		else if (pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			*pageno = btscan->btps_scanPage;
			exit_loop = true;
		}
		SpinLockRelease(&btscan->btps_mutex);
		if (exit_loop || !status)
			break;
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	ConditionVariableCancelSleep();

	return status;
}
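
/*
 * A simplified sketch of how the seize/release protocol is used by the scan
 * code in nbtsearch.c (the real callers also deal with scan direction and
 * array keys):
 *
 *		BlockNumber blkno;
 *
 *		if (!_bt_parallel_seize(scan, &blkno))
 *			return false;		/* scan already done for these keys */
 *		if (blkno == P_NONE)
 *		{
 *			_bt_parallel_done(scan);	/* notify other participants */
 *			return false;
 *		}
 *		... read page blkno, determine the next page to visit ...
 *		_bt_parallel_release(scan, next_blkno);
 */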

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *		new page.  We now have the new value btps_scanPage; some other backend
 *		can now begin advancing the scan.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
{
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = scan_page;
	btscan->btps_pageStatus = BTPARALLEL_IDLE;
	SpinLockRelease(&btscan->btps_mutex);
	ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
	bool		status_changed = false;

	/* Do nothing, for non-parallel scans */
	if (parallel_scan == NULL)
		return;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * Mark the parallel scan as done for this combination of scan keys,
	 * unless some other process already did so.  See also
	 * _bt_advance_array_keys.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
		btscan->btps_pageStatus != BTPARALLEL_DONE)
	{
		btscan->btps_pageStatus = BTPARALLEL_DONE;
		status_changed = true;
	}
	SpinLockRelease(&btscan->btps_mutex);

	/* wake up all the workers associated with this parallel scan */
	if (status_changed)
		ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
 *		keys.
 *
 * Updates the count of array keys processed for both local and parallel
 * scans.
 */
void
_bt_parallel_advance_array_keys(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	so->arrayKeyCount++;
	SpinLockAcquire(&btscan->btps_mutex);
	if (btscan->btps_pageStatus == BTPARALLEL_DONE)
	{
		btscan->btps_scanPage = InvalidBlockNumber;
		btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
		btscan->btps_arrayKeyCount++;
	}
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
 *		btbulkdelete() wasn't called.
 */
static bool
_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	bool		result = false;

	metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	if (metad->btm_version < BTREE_NOVAC_VERSION)
	{
		/*
		 * Do cleanup if metapage needs upgrade, because we don't have
		 * cleanup-related meta-information yet.
		 */
		result = true;
	}
	else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
			 TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
								   RecentGlobalXmin))
	{
		/*
		 * If oldest btpo.xact in the deleted pages is older than
		 * RecentGlobalXmin, then at least one deleted page can be recycled.
		 */
		result = true;
	}
	else
	{
		StdRdOptions *relopts;
		float8		cleanup_scale_factor;
		float8		prev_num_heap_tuples;

		/*
		 * If the table has received enough insertions and no cleanup has
		 * been performed, the index would appear to have stale statistics.
		 * If the scale factor is set, we avoid that by performing cleanup
		 * when the number of inserted tuples exceeds the
		 * vacuum_cleanup_index_scale_factor fraction of the original tuple
		 * count.
		 */
		relopts = (StdRdOptions *) info->index->rd_options;
		cleanup_scale_factor = (relopts &&
								relopts->vacuum_cleanup_index_scale_factor >= 0)
			? relopts->vacuum_cleanup_index_scale_factor
			: vacuum_cleanup_index_scale_factor;
		prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;

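		/*
		 * For example, with a scale factor of 0.1 and 1,000,000 heap tuples
		 * recorded at the last cleanup, the test below forces a cleanup once
		 * num_heap_tuples reaches 1,100,000.
		 */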
		if (cleanup_scale_factor <= 0 ||
			prev_num_heap_tuples <= 0 ||
			(info->num_heap_tuples - prev_num_heap_tuples) /
			prev_num_heap_tuples >= cleanup_scale_factor)
			result = true;
	}

	_bt_relbuf(info->index, metabuf);
	return result;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		TransactionId oldestBtpoXact;

		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid,
					 &oldestBtpoXact);

		/*
		 * Update cleanup-related information in the metapage.  This
		 * information is used only for cleanup, but keeping it up to date
		 * can avoid unnecessary cleanup even after bulkdelete.
		 */
		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
									 info->num_heap_tuples);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);

	return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything, just return the
	 * stats from the latest btbulkdelete call.  If it wasn't called, we might
	 * still need to do a pass over the index, to recycle any newly-recyclable
	 * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
	 * determines if either is needed.
	 *
	 * Since we aren't going to actually delete any leaf items, there's no
	 * need to go through all the vacuum-cycle-ID pushups.
	 */
	if (stats == NULL)
	{
		TransactionId oldestBtpoXact;

		/* Check if we need a cleanup */
		if (!_bt_vacuum_needs_cleanup(info))
			return NULL;

		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);

		/* Update cleanup-related information in the metapage */
		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
									 info->num_heap_tuples);
	}

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid, TransactionId *oldestBtpoXact)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber blkno;
	bool		needLock;

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command
	 */
	stats->estimated_count = false;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;
	vstate.lastBlockVacuumed = BTREE_METAPAGE;	/* Initialise at first block */
	vstate.lastBlockLocked = BTREE_METAPAGE;
	vstate.totFreePages = 0;
	vstate.oldestBtpoXact = InvalidTransactionId;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_SIZES);

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  (See also vacuumlazy.c, which has the same
	 * issue.)  Also, we need not worry if a page is added immediately after
	 * we look; the page splitting code already has write-lock on the left
	 * page before it adds a right page, so we must already have processed any
	 * tuples due to be moved into such a page.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	blkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		if (info->report_progress)
			pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
										 num_pages);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
		{
			btvacuumpage(&vstate, blkno, blkno);
			if (info->report_progress)
				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
											 blkno);
		}
	}

	/*
	 * Check to see if we need to issue one final WAL record for this index,
	 * which may be needed for correctness on a hot standby node when non-MVCC
	 * index scans could take place.
	 *
	 * If the WAL is replayed in hot standby, the replay process needs to get
	 * cleanup locks on all index leaf pages, just as we've been doing here.
	 * However, we won't issue any WAL records about pages that have no items
	 * to be deleted.  For pages between pages we've vacuumed, the replay code
	 * will take locks under the direction of the lastBlockVacuumed fields in
	 * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
	 * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
	 * against the last leaf page in the index, if that one wasn't vacuumed.
	 */
	if (XLogStandbyInfoActive() &&
		vstate.lastBlockVacuumed < vstate.lastBlockLocked)
	{
		Buffer		buf;

		/*
		 * The page should be valid, but we can't use _bt_getbuf() because we
		 * want to use a nondefault buffer access strategy.  Since we aren't
		 * going to delete any items, getting cleanup lock again is probably
		 * overkill, but for consistency do that anyway.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
								 RBM_NORMAL, info->strategy);
		LockBufferForCleanup(buf);
		_bt_checkpage(rel, buf);
		_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
		_bt_relbuf(rel, buf);
	}

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If we found any recyclable pages (and recorded them in the FSM), then
	 * forcibly update the upper-level FSM pages to ensure that searchers can
	 * find them.  It's possible that the pages were also found during
	 * previous scans and so this is a waste of time, but it's cheap enough
	 * relative to scanning the index that it shouldn't matter much, and
	 * making sure that free pages are available sooner not later seems
	 * worthwhile.
	 *
	 * Note that if no recyclable pages exist, we don't bother vacuuming the
	 * FSM at all.
	 */
	if (vstate.totFreePages > 0)
		IndexFreeSpaceMapVacuum(rel);

	/* update statistics */
	stats->num_pages = num_pages;
	stats->pages_free = vstate.totFreePages;

	if (oldestBtpoXact)
		*oldestBtpoXact = vstate.oldestBtpoXact;
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque = NULL;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page.  We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;

		/* Update the oldest btpo.xact */
		if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
			TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
			vstate->oldestBtpoXact = opaque->btpo.xact;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Remember highest leaf page number we've taken cleanup lock on; see
		 * notes in btvacuumscan
		 */
		if (blkno > vstate->lastBlockLocked)
			vstate->lastBlockLocked = blkno;

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);

				/*
				 * During Hot Standby we currently assume that
				 * XLOG_BTREE_VACUUM records do not produce conflicts.  That is
				 * only true as long as the callback function depends only
				 * upon whether the index tuple refers to heap tuples removed
				 * in the initial heap scan.  When vacuum starts it derives a
				 * value of OldestXmin.  Backends taking later snapshots could
				 * have a RecentGlobalXmin with a later xid than the vacuum's
				 * OldestXmin, so it is possible that row versions deleted
				 * after OldestXmin could be marked as killed by other
				 * backends.  The callback function *could* look at the index
				 * tuple state in isolation and decide to delete the index
				 * tuple, though currently it does not.  If it ever did, we
				 * would need to reconsider whether XLOG_BTREE_VACUUM records
				 * should cause conflicts.  If they did cause conflicts they
				 * would be fairly harsh conflicts, since we haven't yet
				 * worked out a way to pass a useful value for
				 * latestRemovedXid on the XLOG_BTREE_VACUUM records.  This
				 * applies to *any* type of index that marks index tuples as
				 * killed.
				 */
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
		 * call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			/*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
			 * all information to the replay code to allow it to get a cleanup
			 * lock on all pages between the previous lastBlockVacuumed and
			 * this page.  This ensures that WAL replay locks all leaf pages at
			 * some point, which is important should non-MVCC scans be
			 * requested.  This is currently unused on standby, but we record
			 * it anyway, so that the WAL contains the required information.
			 *
			 * Since we can visit leaf pages out-of-order when recursing,
			 * replay might end up locking such pages an extra time, but it
			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
			 * that.
			 */
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
								vstate->lastBlockVacuumed);

			/*
			 * Remember highest leaf page number we've issued a
			 * XLOG_BTREE_VACUUM WAL record for.
			 */
			if (blkno > vstate->lastBlockVacuumed)
				vstate->lastBlockVacuumed = blkno;

			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
			 * care of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples.  We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf);

		/* count only this page, else may double-count parent */
		if (ndel)
		{
			stats->pages_deleted++;
			if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
				TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
				vstate->oldestBtpoXact = opaque->btpo.xact;
		}

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array).  A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}

/*
 *	btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
	return true;
}