/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"

41/* Working state needed by btvacuumpage */
42typedef struct
43{
44 IndexVacuumInfo *info;
45 IndexBulkDeleteResult *stats;
46 IndexBulkDeleteCallback callback;
47 void *callback_state;
48 BTCycleId cycleid;
49 BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
50 BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
51 BlockNumber totFreePages; /* true total # of free pages */
52 TransactionId oldestBtpoXact;
53 MemoryContext pagedelcontext;
54} BTVacState;
55
56/*
57 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
58 *
59 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
60 * a new page; others must wait.
61 *
62 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
63 * to a new page; some process can start doing that.
64 *
65 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
66 * We reach this state once for every distinct combination of array keys.
67 */
68typedef enum
69{
70 BTPARALLEL_NOT_INITIALIZED,
71 BTPARALLEL_ADVANCING,
72 BTPARALLEL_IDLE,
73 BTPARALLEL_DONE
74} BTPS_State;
75
76/*
77 * BTParallelScanDescData contains btree specific shared information required
78 * for parallel scan.
79 */
80typedef struct BTParallelScanDescData
81{
82 BlockNumber btps_scanPage; /* latest or next page to be scanned */
83 BTPS_State btps_pageStatus; /* indicates whether next page is
84 * available for scan. see above for
85 * possible states of parallel scan. */
86 int btps_arrayKeyCount; /* count indicating number of array scan
87 * keys processed by parallel scan */
88 slock_t btps_mutex; /* protects above variables */
89 ConditionVariable btps_cv; /* used to synchronize parallel scan */
90} BTParallelScanDescData;
91
92typedef struct BTParallelScanDescData *BTParallelScanDesc;
93
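/*
 * Informal sketch of how btps_pageStatus moves between the states above.
 * This comment is illustrative only; the authoritative logic is in
 * _bt_parallel_seize/_release/_done and _bt_parallel_advance_array_keys
 * below.
 *
 *	NOT_INITIALIZED --seize--> ADVANCING --release--> IDLE
 *	IDLE            --seize--> ADVANCING --release--> IDLE   (repeat)
 *	ADVANCING/IDLE  --done---> DONE
 *	DONE  --_bt_parallel_advance_array_keys--> NOT_INITIALIZED
 *
 * btps_mutex protects the fields; btps_cv is what a backend sleeps on when
 * it finds the status ADVANCING and must wait its turn.
 */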
94
95static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
96 IndexBulkDeleteCallback callback, void *callback_state,
97 BTCycleId cycleid, TransactionId *oldestBtpoXact);
98static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
99 BlockNumber orig_blkno);
100
101
102/*
103 * Btree handler function: return IndexAmRoutine with access method parameters
104 * and callbacks.
105 */
106Datum
107bthandler(PG_FUNCTION_ARGS)
108{
109 IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
110
111 amroutine->amstrategies = BTMaxStrategyNumber;
112 amroutine->amsupport = BTNProcs;
113 amroutine->amcanorder = true;
114 amroutine->amcanorderbyop = false;
115 amroutine->amcanbackward = true;
116 amroutine->amcanunique = true;
117 amroutine->amcanmulticol = true;
118 amroutine->amoptionalkey = true;
119 amroutine->amsearcharray = true;
120 amroutine->amsearchnulls = true;
121 amroutine->amstorage = false;
122 amroutine->amclusterable = true;
123 amroutine->ampredlocks = true;
124 amroutine->amcanparallel = true;
125 amroutine->amcaninclude = true;
126 amroutine->amkeytype = InvalidOid;
127
128 amroutine->ambuild = btbuild;
129 amroutine->ambuildempty = btbuildempty;
130 amroutine->aminsert = btinsert;
131 amroutine->ambulkdelete = btbulkdelete;
132 amroutine->amvacuumcleanup = btvacuumcleanup;
133 amroutine->amcanreturn = btcanreturn;
134 amroutine->amcostestimate = btcostestimate;
135 amroutine->amoptions = btoptions;
136 amroutine->amproperty = btproperty;
137 amroutine->ambuildphasename = btbuildphasename;
138 amroutine->amvalidate = btvalidate;
139 amroutine->ambeginscan = btbeginscan;
140 amroutine->amrescan = btrescan;
141 amroutine->amgettuple = btgettuple;
142 amroutine->amgetbitmap = btgetbitmap;
143 amroutine->amendscan = btendscan;
144 amroutine->ammarkpos = btmarkpos;
145 amroutine->amrestrpos = btrestrpos;
146 amroutine->amestimateparallelscan = btestimateparallelscan;
147 amroutine->aminitparallelscan = btinitparallelscan;
148 amroutine->amparallelrescan = btparallelrescan;
149
150 PG_RETURN_POINTER(amroutine);
151}
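
/*
 * For reference, core code reaches this handler through the pg_am catalog
 * rather than by calling it directly: the built-in row for btree looks
 * roughly like
 *
 *		amname = 'btree', amhandler = 'bthandler', amtype = 'i'
 *
 * and index code invokes the handler (via GetIndexAmRoutine) to obtain a
 * palloc'd IndexAmRoutine filled in as above.  This is an informal sketch,
 * not a definition of the catalog contents.
 */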
152
153/*
154 * btbuildempty() -- build an empty btree index in the initialization fork
155 */
156void
157btbuildempty(Relation index)
158{
159 Page metapage;
160
161 /* Construct metapage. */
162 metapage = (Page) palloc(BLCKSZ);
163 _bt_initmetapage(metapage, P_NONE, 0);
164
165 /*
166 * Write the page and log it. It might seem that an immediate sync would
167 * be sufficient to guarantee that the file exists on disk, but recovery
168 * itself might remove it while replaying, for example, an
169 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
170 * this even when wal_level=minimal.
171 */
172 PageSetChecksumInplace(metapage, BTREE_METAPAGE);
173 smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
174 (char *) metapage, true);
175 log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
176 BTREE_METAPAGE, metapage, true);
177
178 /*
179 * An immediate sync is required even if we xlog'd the page, because the
180 * write did not go through shared_buffers and therefore a concurrent
181 * checkpoint may have moved the redo pointer past our xlog record.
182 */
183 smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
184}
185
186/*
187 * btinsert() -- insert an index tuple into a btree.
188 *
189 * Descend the tree recursively, find the appropriate location for our
190 * new tuple, and put it there.
191 */
192bool
193btinsert(Relation rel, Datum *values, bool *isnull,
194 ItemPointer ht_ctid, Relation heapRel,
195 IndexUniqueCheck checkUnique,
196 IndexInfo *indexInfo)
197{
198 bool result;
199 IndexTuple itup;
200
201 /* generate an index tuple */
202 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
203 itup->t_tid = *ht_ctid;
204
205 result = _bt_doinsert(rel, itup, checkUnique, heapRel);
206
207 pfree(itup);
208
209 return result;
210}
211
212/*
213 * btgettuple() -- Get the next tuple in the scan.
214 */
215bool
216btgettuple(IndexScanDesc scan, ScanDirection dir)
217{
218 BTScanOpaque so = (BTScanOpaque) scan->opaque;
219 bool res;
220
221 /* btree indexes are never lossy */
222 scan->xs_recheck = false;
223
	/*
	 * If we have any array keys, initialize them during the first call for
	 * a scan.  We can't do this in btrescan because we don't know the scan
	 * direction at that time.
	 */
229 if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
230 {
231 /* punt if we have any unsatisfiable array keys */
232 if (so->numArrayKeys < 0)
233 return false;
234
235 _bt_start_array_keys(scan, dir);
236 }
237
238 /* This loop handles advancing to the next array elements, if any */
239 do
240 {
241 /*
242 * If we've already initialized this scan, we can just advance it in
243 * the appropriate direction. If we haven't done so yet, we call
244 * _bt_first() to get the first item in the scan.
245 */
246 if (!BTScanPosIsValid(so->currPos))
247 res = _bt_first(scan, dir);
248 else
249 {
250 /*
251 * Check to see if we should kill the previously-fetched tuple.
252 */
253 if (scan->kill_prior_tuple)
254 {
255 /*
256 * Yes, remember it for later. (We'll deal with all such
257 * tuples at once right before leaving the index page.) The
258 * test for numKilled overrun is not just paranoia: if the
259 * caller reverses direction in the indexscan then the same
260 * item might get entered multiple times. It's not worth
261 * trying to optimize that, so we don't detect it, but instead
262 * just forget any excess entries.
263 */
264 if (so->killedItems == NULL)
265 so->killedItems = (int *)
266 palloc(MaxIndexTuplesPerPage * sizeof(int));
267 if (so->numKilled < MaxIndexTuplesPerPage)
268 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
269 }
270
271 /*
272 * Now continue the scan.
273 */
274 res = _bt_next(scan, dir);
275 }
276
277 /* If we have a tuple, return it ... */
278 if (res)
279 break;
280 /* ... otherwise see if we have more array keys to deal with */
281 } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
282
283 return res;
284}
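
/*
 * Example of the array-key case handled by the loop above (informal and
 * illustrative only): a qual such as
 *
 *		WHERE indexed_col = ANY (ARRAY[1, 42, 7])
 *
 * arrives as a single SK_SEARCHARRAY scan key.  _bt_preprocess_array_keys
 * expands it during btrescan, and each time the ordinary scan is exhausted,
 * _bt_advance_array_keys installs the next element and we loop back to
 * _bt_first to start a fresh descent for that element.
 */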
285
286/*
287 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
288 */
289int64
290btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
291{
292 BTScanOpaque so = (BTScanOpaque) scan->opaque;
293 int64 ntids = 0;
294 ItemPointer heapTid;
295
296 /*
297 * If we have any array keys, initialize them.
298 */
299 if (so->numArrayKeys)
300 {
301 /* punt if we have any unsatisfiable array keys */
302 if (so->numArrayKeys < 0)
303 return ntids;
304
305 _bt_start_array_keys(scan, ForwardScanDirection);
306 }
307
308 /* This loop handles advancing to the next array elements, if any */
309 do
310 {
311 /* Fetch the first page & tuple */
312 if (_bt_first(scan, ForwardScanDirection))
313 {
314 /* Save tuple ID, and continue scanning */
315 heapTid = &scan->xs_heaptid;
316 tbm_add_tuples(tbm, heapTid, 1, false);
317 ntids++;
318
319 for (;;)
320 {
321 /*
322 * Advance to next tuple within page. This is the same as the
323 * easy case in _bt_next().
324 */
325 if (++so->currPos.itemIndex > so->currPos.lastItem)
326 {
327 /* let _bt_next do the heavy lifting */
328 if (!_bt_next(scan, ForwardScanDirection))
329 break;
330 }
331
332 /* Save tuple ID, and continue scanning */
333 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
334 tbm_add_tuples(tbm, heapTid, 1, false);
335 ntids++;
336 }
337 }
338 /* Now see if we have more array keys to deal with */
339 } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
340
341 return ntids;
342}
343
344/*
345 * btbeginscan() -- start a scan on a btree index
346 */
347IndexScanDesc
348btbeginscan(Relation rel, int nkeys, int norderbys)
349{
350 IndexScanDesc scan;
351 BTScanOpaque so;
352
353 /* no order by operators allowed */
354 Assert(norderbys == 0);
355
356 /* get the scan */
357 scan = RelationGetIndexScan(rel, nkeys, norderbys);
358
359 /* allocate private workspace */
360 so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
361 BTScanPosInvalidate(so->currPos);
362 BTScanPosInvalidate(so->markPos);
363 if (scan->numberOfKeys > 0)
364 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
365 else
366 so->keyData = NULL;
367
368 so->arrayKeyData = NULL; /* assume no array keys for now */
369 so->numArrayKeys = 0;
370 so->arrayKeys = NULL;
371 so->arrayContext = NULL;
372
373 so->killedItems = NULL; /* until needed */
374 so->numKilled = 0;
375
376 /*
377 * We don't know yet whether the scan will be index-only, so we do not
378 * allocate the tuple workspace arrays until btrescan. However, we set up
379 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
380 */
381 so->currTuples = so->markTuples = NULL;
382
383 scan->xs_itupdesc = RelationGetDescr(rel);
384
385 scan->opaque = so;
386
387 return scan;
388}
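
/*
 * Rough sketch of how the executor drives these entry points through the
 * generic index AM API (the wrapper names are the real indexam.c functions;
 * the control flow is simplified):
 *
 *	scan = index_beginscan(heapRel, indexRel, snapshot, nkeys, 0);
 *	index_rescan(scan, scankeys, nkeys, NULL, 0);		// -> btrescan
 *	while (index_getnext_tid(scan, ForwardScanDirection))	// -> btgettuple
 *		... process scan->xs_heaptid ...
 *	index_endscan(scan);					// -> btendscan
 *
 * btbeginscan itself is reached from index_beginscan's internal helper,
 * which is why it only sets up workspace and leaves key installation to
 * btrescan.
 */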
389
390/*
391 * btrescan() -- rescan an index relation
392 */
393void
394btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
395 ScanKey orderbys, int norderbys)
396{
397 BTScanOpaque so = (BTScanOpaque) scan->opaque;
398
399 /* we aren't holding any read locks, but gotta drop the pins */
400 if (BTScanPosIsValid(so->currPos))
401 {
402 /* Before leaving current page, deal with any killed items */
403 if (so->numKilled > 0)
404 _bt_killitems(scan);
405 BTScanPosUnpinIfPinned(so->currPos);
406 BTScanPosInvalidate(so->currPos);
407 }
408
409 so->markItemIndex = -1;
410 so->arrayKeyCount = 0;
411 BTScanPosUnpinIfPinned(so->markPos);
412 BTScanPosInvalidate(so->markPos);
413
414 /*
415 * Allocate tuple workspace arrays, if needed for an index-only scan and
416 * not already done in a previous rescan call. To save on palloc
417 * overhead, both workspaces are allocated as one palloc block; only this
418 * function and btendscan know that.
419 *
420 * NOTE: this data structure also makes it safe to return data from a
421 * "name" column, even though btree name_ops uses an underlying storage
422 * datatype of cstring. The risk there is that "name" is supposed to be
423 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
424 * However, since we only return data out of tuples sitting in the
425 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
426 * data out of the markTuples array --- running off the end of memory for
427 * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
428 * adding special-case treatment for name_ops elsewhere.
429 */
430 if (scan->xs_want_itup && so->currTuples == NULL)
431 {
432 so->currTuples = (char *) palloc(BLCKSZ * 2);
433 so->markTuples = so->currTuples + BLCKSZ;
434 }
435
436 /*
437 * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
438 * - vadim 05/05/97
439 */
440 if (scankey && scan->numberOfKeys > 0)
441 memmove(scan->keyData,
442 scankey,
443 scan->numberOfKeys * sizeof(ScanKeyData));
444 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
445
446 /* If any keys are SK_SEARCHARRAY type, set up array-key info */
447 _bt_preprocess_array_keys(scan);
448}
449
450/*
451 * btendscan() -- close down a scan
452 */
453void
454btendscan(IndexScanDesc scan)
455{
456 BTScanOpaque so = (BTScanOpaque) scan->opaque;
457
458 /* we aren't holding any read locks, but gotta drop the pins */
459 if (BTScanPosIsValid(so->currPos))
460 {
461 /* Before leaving current page, deal with any killed items */
462 if (so->numKilled > 0)
463 _bt_killitems(scan);
464 BTScanPosUnpinIfPinned(so->currPos);
465 }
466
467 so->markItemIndex = -1;
468 BTScanPosUnpinIfPinned(so->markPos);
469
470 /* No need to invalidate positions, the RAM is about to be freed. */
471
472 /* Release storage */
473 if (so->keyData != NULL)
474 pfree(so->keyData);
475 /* so->arrayKeyData and so->arrayKeys are in arrayContext */
476 if (so->arrayContext != NULL)
477 MemoryContextDelete(so->arrayContext);
478 if (so->killedItems != NULL)
479 pfree(so->killedItems);
480 if (so->currTuples != NULL)
481 pfree(so->currTuples);
482 /* so->markTuples should not be pfree'd, see btrescan */
483 pfree(so);
484}
485
486/*
487 * btmarkpos() -- save current scan position
488 */
489void
490btmarkpos(IndexScanDesc scan)
491{
492 BTScanOpaque so = (BTScanOpaque) scan->opaque;
493
494 /* There may be an old mark with a pin (but no lock). */
495 BTScanPosUnpinIfPinned(so->markPos);
496
497 /*
498 * Just record the current itemIndex. If we later step to next page
499 * before releasing the marked position, _bt_steppage makes a full copy of
500 * the currPos struct in markPos. If (as often happens) the mark is moved
501 * before we leave the page, we don't have to do that work.
502 */
503 if (BTScanPosIsValid(so->currPos))
504 so->markItemIndex = so->currPos.itemIndex;
505 else
506 {
507 BTScanPosInvalidate(so->markPos);
508 so->markItemIndex = -1;
509 }
510
511 /* Also record the current positions of any array keys */
512 if (so->numArrayKeys)
513 _bt_mark_array_keys(scan);
514}
515
516/*
517 * btrestrpos() -- restore scan to last saved position
518 */
519void
520btrestrpos(IndexScanDesc scan)
521{
522 BTScanOpaque so = (BTScanOpaque) scan->opaque;
523
524 /* Restore the marked positions of any array keys */
525 if (so->numArrayKeys)
526 _bt_restore_array_keys(scan);
527
528 if (so->markItemIndex >= 0)
529 {
530 /*
531 * The scan has never moved to a new page since the last mark. Just
532 * restore the itemIndex.
533 *
534 * NB: In this case we can't count on anything in so->markPos to be
535 * accurate.
536 */
537 so->currPos.itemIndex = so->markItemIndex;
538 }
539 else
540 {
541 /*
542 * The scan moved to a new page after last mark or restore, and we are
543 * now restoring to the marked page. We aren't holding any read
544 * locks, but if we're still holding the pin for the current position,
545 * we must drop it.
546 */
547 if (BTScanPosIsValid(so->currPos))
548 {
549 /* Before leaving current page, deal with any killed items */
550 if (so->numKilled > 0)
551 _bt_killitems(scan);
552 BTScanPosUnpinIfPinned(so->currPos);
553 }
554
555 if (BTScanPosIsValid(so->markPos))
556 {
557 /* bump pin on mark buffer for assignment to current buffer */
558 if (BTScanPosIsPinned(so->markPos))
559 IncrBufferRefCount(so->markPos.buf);
560 memcpy(&so->currPos, &so->markPos,
561 offsetof(BTScanPosData, items[1]) +
562 so->markPos.lastItem * sizeof(BTScanPosItem));
563 if (so->currTuples)
564 memcpy(so->currTuples, so->markTuples,
565 so->markPos.nextTupleOffset);
566 }
567 else
568 BTScanPosInvalidate(so->currPos);
569 }
570}
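
/*
 * Informal usage note: mark/restore is exercised mainly by merge join,
 * through the index_markpos()/index_restrpos() wrappers, roughly:
 *
 *	index_markpos(scan);		// -> btmarkpos, remembers itemIndex
 *	... read a few more tuples ...
 *	index_restrpos(scan);		// -> btrestrpos, rewinds to the mark
 *
 * The cheap path (markItemIndex >= 0) covers the common case where the
 * scan never left the marked page between the two calls.
 */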
571
572/*
573 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
574 */
575Size
576btestimateparallelscan(void)
577{
578 return sizeof(BTParallelScanDescData);
579}
580
581/*
582 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
583 */
584void
585btinitparallelscan(void *target)
586{
587 BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
588
589 SpinLockInit(&bt_target->btps_mutex);
590 bt_target->btps_scanPage = InvalidBlockNumber;
591 bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
592 bt_target->btps_arrayKeyCount = 0;
593 ConditionVariableInit(&bt_target->btps_cv);
594}
595
596/*
597 * btparallelrescan() -- reset parallel scan
598 */
599void
600btparallelrescan(IndexScanDesc scan)
601{
602 BTParallelScanDesc btscan;
603 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
604
605 Assert(parallel_scan);
606
607 btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
608 parallel_scan->ps_offset);
609
610 /*
611 * In theory, we don't need to acquire the spinlock here, because there
612 * shouldn't be any other workers running at this point, but we do so for
613 * consistency.
614 */
615 SpinLockAcquire(&btscan->btps_mutex);
616 btscan->btps_scanPage = InvalidBlockNumber;
617 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
618 btscan->btps_arrayKeyCount = 0;
619 SpinLockRelease(&btscan->btps_mutex);
620}
621
622/*
623 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
624 * page. Other scans must wait until we call _bt_parallel_release()
625 * or _bt_parallel_done().
626 *
627 * The return value is true if we successfully seized the scan and false
628 * if we did not. The latter case occurs if no pages remain for the current
629 * set of scankeys.
630 *
631 * If the return value is true, *pageno returns the next or current page
632 * of the scan (depending on the scan direction). An invalid block number
633 * means the scan hasn't yet started, and P_NONE means we've reached the end.
634 * The first time a participating process reaches the last page, it will return
635 * true and set *pageno to P_NONE; after that, further attempts to seize the
636 * scan will return false.
637 *
638 * Callers should ignore the value of pageno if the return value is false.
639 */
640bool
641_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
642{
643 BTScanOpaque so = (BTScanOpaque) scan->opaque;
644 BTPS_State pageStatus;
645 bool exit_loop = false;
646 bool status = true;
647 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
648 BTParallelScanDesc btscan;
649
650 *pageno = P_NONE;
651
652 btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
653 parallel_scan->ps_offset);
654
655 while (1)
656 {
657 SpinLockAcquire(&btscan->btps_mutex);
658 pageStatus = btscan->btps_pageStatus;
659
660 if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
661 {
662 /* Parallel scan has already advanced to a new set of scankeys. */
663 status = false;
664 }
665 else if (pageStatus == BTPARALLEL_DONE)
666 {
667 /*
668 * We're done with this set of scankeys. This may be the end, or
669 * there could be more sets to try.
670 */
671 status = false;
672 }
673 else if (pageStatus != BTPARALLEL_ADVANCING)
674 {
675 /*
676 * We have successfully seized control of the scan for the purpose
677 * of advancing it to a new page!
678 */
679 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
680 *pageno = btscan->btps_scanPage;
681 exit_loop = true;
682 }
683 SpinLockRelease(&btscan->btps_mutex);
684 if (exit_loop || !status)
685 break;
686 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
687 }
688 ConditionVariableCancelSleep();
689
690 return status;
691}
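
/*
 * Illustrative caller loop (comment-only sketch; the real logic lives in
 * nbtsearch.c and is more involved):
 *
 *	BlockNumber blkno;
 *
 *	if (!_bt_parallel_seize(scan, &blkno))
 *		return false;			// another worker finished this key set
 *	if (blkno == P_NONE)
 *	{
 *		_bt_parallel_done(scan);	// we were the one to hit the end
 *		return false;
 *	}
 *	... read and process page blkno ...
 *	_bt_parallel_release(scan, next_blkno);	// let another worker advance
 *
 * "next_blkno" above is a placeholder for whatever page the caller decides
 * the scan should continue from.
 */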
692
693/*
694 * _bt_parallel_release() -- Complete the process of advancing the scan to a
695 * new page. We now have the new value btps_scanPage; some other backend
696 * can now begin advancing the scan.
697 */
698void
699_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
700{
701 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
702 BTParallelScanDesc btscan;
703
704 btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
705 parallel_scan->ps_offset);
706
707 SpinLockAcquire(&btscan->btps_mutex);
708 btscan->btps_scanPage = scan_page;
709 btscan->btps_pageStatus = BTPARALLEL_IDLE;
710 SpinLockRelease(&btscan->btps_mutex);
711 ConditionVariableSignal(&btscan->btps_cv);
712}
713
714/*
715 * _bt_parallel_done() -- Mark the parallel scan as complete.
716 *
717 * When there are no pages left to scan, this function should be called to
718 * notify other workers. Otherwise, they might wait forever for the scan to
719 * advance to the next page.
720 */
721void
722_bt_parallel_done(IndexScanDesc scan)
723{
724 BTScanOpaque so = (BTScanOpaque) scan->opaque;
725 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
726 BTParallelScanDesc btscan;
727 bool status_changed = false;
728
729 /* Do nothing, for non-parallel scans */
730 if (parallel_scan == NULL)
731 return;
732
733 btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
734 parallel_scan->ps_offset);
735
736 /*
737 * Mark the parallel scan as done for this combination of scan keys,
738 * unless some other process already did so. See also
739 * _bt_advance_array_keys.
740 */
741 SpinLockAcquire(&btscan->btps_mutex);
742 if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
743 btscan->btps_pageStatus != BTPARALLEL_DONE)
744 {
745 btscan->btps_pageStatus = BTPARALLEL_DONE;
746 status_changed = true;
747 }
748 SpinLockRelease(&btscan->btps_mutex);
749
750 /* wake up all the workers associated with this parallel scan */
751 if (status_changed)
752 ConditionVariableBroadcast(&btscan->btps_cv);
753}
754
755/*
756 * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
757 * keys.
758 *
759 * Updates the count of array keys processed for both local and parallel
760 * scans.
761 */
762void
763_bt_parallel_advance_array_keys(IndexScanDesc scan)
764{
765 BTScanOpaque so = (BTScanOpaque) scan->opaque;
766 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
767 BTParallelScanDesc btscan;
768
769 btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
770 parallel_scan->ps_offset);
771
772 so->arrayKeyCount++;
773 SpinLockAcquire(&btscan->btps_mutex);
774 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
775 {
776 btscan->btps_scanPage = InvalidBlockNumber;
777 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
778 btscan->btps_arrayKeyCount++;
779 }
780 SpinLockRelease(&btscan->btps_mutex);
781}
782
783/*
784 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
785 * btbulkdelete() wasn't called.
786 */
787static bool
788_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
789{
790 Buffer metabuf;
791 Page metapg;
792 BTMetaPageData *metad;
793 bool result = false;
794
795 metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
796 metapg = BufferGetPage(metabuf);
797 metad = BTPageGetMeta(metapg);
798
799 if (metad->btm_version < BTREE_NOVAC_VERSION)
800 {
801 /*
802 * Do cleanup if metapage needs upgrade, because we don't have
803 * cleanup-related meta-information yet.
804 */
805 result = true;
806 }
807 else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
808 TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
809 RecentGlobalXmin))
810 {
811 /*
812 * If oldest btpo.xact in the deleted pages is older than
813 * RecentGlobalXmin, then at least one deleted page can be recycled.
814 */
815 result = true;
816 }
817 else
818 {
819 StdRdOptions *relopts;
820 float8 cleanup_scale_factor;
821 float8 prev_num_heap_tuples;
822
		/*
		 * If the table receives enough insertions and no cleanup has been
		 * performed, the index statistics would become stale.  If the scale
		 * factor is set, we avoid that by performing cleanup once the number
		 * of newly inserted tuples exceeds vacuum_cleanup_index_scale_factor
		 * times the tuple count recorded at the last cleanup.
		 */
830 relopts = (StdRdOptions *) info->index->rd_options;
831 cleanup_scale_factor = (relopts &&
832 relopts->vacuum_cleanup_index_scale_factor >= 0)
833 ? relopts->vacuum_cleanup_index_scale_factor
834 : vacuum_cleanup_index_scale_factor;
835 prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
836
837 if (cleanup_scale_factor <= 0 ||
838 prev_num_heap_tuples <= 0 ||
839 (info->num_heap_tuples - prev_num_heap_tuples) /
840 prev_num_heap_tuples >= cleanup_scale_factor)
841 result = true;
842 }
843
844 _bt_relbuf(info->index, metabuf);
845 return result;
846}
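
/*
 * Worked example of the scale-factor test above (numbers are made up):
 * with vacuum_cleanup_index_scale_factor = 0.1 and
 * btm_last_cleanup_num_heap_tuples = 1,000,000, cleanup is triggered once
 *
 *	(info->num_heap_tuples - 1000000) / 1000000 >= 0.1
 *
 * i.e. once the table has grown to 1,100,000 or more tuples since the last
 * cleanup that updated the metapage.  Cleanup is also forced when the scale
 * factor is <= 0 or no previous tuple count was recorded.
 */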
847
848/*
849 * Bulk deletion of all index entries pointing to a set of heap tuples.
850 * The set of target tuples is specified via a callback routine that tells
851 * whether any given heap tuple (identified by ItemPointer) is being deleted.
852 *
853 * Result: a palloc'd struct containing statistical info for VACUUM displays.
854 */
855IndexBulkDeleteResult *
856btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
857 IndexBulkDeleteCallback callback, void *callback_state)
858{
859 Relation rel = info->index;
860 BTCycleId cycleid;
861
862 /* allocate stats if first time through, else re-use existing struct */
863 if (stats == NULL)
864 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
865
866 /* Establish the vacuum cycle ID to use for this scan */
867 /* The ENSURE stuff ensures we clean up shared memory on failure */
868 PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
869 {
870 TransactionId oldestBtpoXact;
871
872 cycleid = _bt_start_vacuum(rel);
873
874 btvacuumscan(info, stats, callback, callback_state, cycleid,
875 &oldestBtpoXact);
876
		/*
		 * Update cleanup-related information in the metapage.  This
		 * information is used only for cleanup, but keeping it up to date
		 * can avoid unnecessary cleanup even after bulkdelete.
		 */
882 _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
883 info->num_heap_tuples);
884 }
885 PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
886 _bt_end_vacuum(rel);
887
888 return stats;
889}
890
891/*
892 * Post-VACUUM cleanup.
893 *
894 * Result: a palloc'd struct containing statistical info for VACUUM displays.
895 */
896IndexBulkDeleteResult *
897btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
898{
899 /* No-op in ANALYZE ONLY mode */
900 if (info->analyze_only)
901 return stats;
902
	/*
	 * If btbulkdelete was called, we need not do anything, just return the
	 * stats from the latest btbulkdelete call.  If it wasn't called, we might
	 * still need to do a pass over the index, to recycle any newly-recyclable
	 * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
	 * determines if either is needed.
	 *
	 * Since we aren't going to actually delete any leaf items, there's no
	 * need to go through all the vacuum-cycle-ID pushups.
	 */
913 if (stats == NULL)
914 {
915 TransactionId oldestBtpoXact;
916
917 /* Check if we need a cleanup */
918 if (!_bt_vacuum_needs_cleanup(info))
919 return NULL;
920
921 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
922 btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
923
924 /* Update cleanup-related information in the metapage */
925 _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
926 info->num_heap_tuples);
927 }
928
929 /*
930 * It's quite possible for us to be fooled by concurrent page splits into
931 * double-counting some index tuples, so disbelieve any total that exceeds
932 * the underlying heap's count ... if we know that accurately. Otherwise
933 * this might just make matters worse.
934 */
935 if (!info->estimated_count)
936 {
937 if (stats->num_index_tuples > info->num_heap_tuples)
938 stats->num_index_tuples = info->num_heap_tuples;
939 }
940
941 return stats;
942}
943
944/*
945 * btvacuumscan --- scan the index for VACUUMing purposes
946 *
947 * This combines the functions of looking for leaf tuples that are deletable
948 * according to the vacuum callback, looking for empty pages that can be
949 * deleted, and looking for old deleted pages that can be recycled. Both
950 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
951 * btbulkdelete call occurred).
952 *
953 * The caller is responsible for initially allocating/zeroing a stats struct
954 * and for obtaining a vacuum cycle ID if necessary.
955 */
956static void
957btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
958 IndexBulkDeleteCallback callback, void *callback_state,
959 BTCycleId cycleid, TransactionId *oldestBtpoXact)
960{
961 Relation rel = info->index;
962 BTVacState vstate;
963 BlockNumber num_pages;
964 BlockNumber blkno;
965 bool needLock;
966
967 /*
968 * Reset counts that will be incremented during the scan; needed in case
969 * of multiple scans during a single VACUUM command
970 */
971 stats->estimated_count = false;
972 stats->num_index_tuples = 0;
973 stats->pages_deleted = 0;
974
975 /* Set up info to pass down to btvacuumpage */
976 vstate.info = info;
977 vstate.stats = stats;
978 vstate.callback = callback;
979 vstate.callback_state = callback_state;
980 vstate.cycleid = cycleid;
981 vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
982 vstate.lastBlockLocked = BTREE_METAPAGE;
983 vstate.totFreePages = 0;
984 vstate.oldestBtpoXact = InvalidTransactionId;
985
986 /* Create a temporary memory context to run _bt_pagedel in */
987 vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
988 "_bt_pagedel",
989 ALLOCSET_DEFAULT_SIZES);
990
991 /*
992 * The outer loop iterates over all index pages except the metapage, in
993 * physical order (we hope the kernel will cooperate in providing
994 * read-ahead for speed). It is critical that we visit all leaf pages,
995 * including ones added after we start the scan, else we might fail to
996 * delete some deletable tuples. Hence, we must repeatedly check the
997 * relation length. We must acquire the relation-extension lock while
998 * doing so to avoid a race condition: if someone else is extending the
999 * relation, there is a window where bufmgr/smgr have created a new
1000 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1001 * we manage to scan such a page here, we'll improperly assume it can be
1002 * recycled. Taking the lock synchronizes things enough to prevent a
1003 * problem: either num_pages won't include the new page, or _bt_getbuf
1004 * already has write lock on the buffer and it will be fully initialized
1005 * before we can examine it. (See also vacuumlazy.c, which has the same
1006 * issue.) Also, we need not worry if a page is added immediately after
1007 * we look; the page splitting code already has write-lock on the left
1008 * page before it adds a right page, so we must already have processed any
1009 * tuples due to be moved into such a page.
1010 *
1011 * We can skip locking for new or temp relations, however, since no one
1012 * else could be accessing them.
1013 */
1014 needLock = !RELATION_IS_LOCAL(rel);
1015
1016 blkno = BTREE_METAPAGE + 1;
1017 for (;;)
1018 {
1019 /* Get the current relation length */
1020 if (needLock)
1021 LockRelationForExtension(rel, ExclusiveLock);
1022 num_pages = RelationGetNumberOfBlocks(rel);
1023 if (needLock)
1024 UnlockRelationForExtension(rel, ExclusiveLock);
1025
1026 if (info->report_progress)
1027 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1028 num_pages);
1029
1030 /* Quit if we've scanned the whole relation */
1031 if (blkno >= num_pages)
1032 break;
1033 /* Iterate over pages, then loop back to recheck length */
1034 for (; blkno < num_pages; blkno++)
1035 {
1036 btvacuumpage(&vstate, blkno, blkno);
1037 if (info->report_progress)
1038 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1039 blkno);
1040 }
1041 }
1042
1043 /*
1044 * Check to see if we need to issue one final WAL record for this index,
1045 * which may be needed for correctness on a hot standby node when non-MVCC
1046 * index scans could take place.
1047 *
1048 * If the WAL is replayed in hot standby, the replay process needs to get
1049 * cleanup locks on all index leaf pages, just as we've been doing here.
1050 * However, we won't issue any WAL records about pages that have no items
1051 * to be deleted. For pages between pages we've vacuumed, the replay code
1052 * will take locks under the direction of the lastBlockVacuumed fields in
1053 * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one
1054 * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
1055 * against the last leaf page in the index, if that one wasn't vacuumed.
1056 */
1057 if (XLogStandbyInfoActive() &&
1058 vstate.lastBlockVacuumed < vstate.lastBlockLocked)
1059 {
1060 Buffer buf;
1061
1062 /*
1063 * The page should be valid, but we can't use _bt_getbuf() because we
1064 * want to use a nondefault buffer access strategy. Since we aren't
1065 * going to delete any items, getting cleanup lock again is probably
1066 * overkill, but for consistency do that anyway.
1067 */
1068 buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
1069 RBM_NORMAL, info->strategy);
1070 LockBufferForCleanup(buf);
1071 _bt_checkpage(rel, buf);
1072 _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
1073 _bt_relbuf(rel, buf);
1074 }
1075
1076 MemoryContextDelete(vstate.pagedelcontext);
1077
1078 /*
1079 * If we found any recyclable pages (and recorded them in the FSM), then
1080 * forcibly update the upper-level FSM pages to ensure that searchers can
1081 * find them. It's possible that the pages were also found during
1082 * previous scans and so this is a waste of time, but it's cheap enough
1083 * relative to scanning the index that it shouldn't matter much, and
1084 * making sure that free pages are available sooner not later seems
1085 * worthwhile.
1086 *
1087 * Note that if no recyclable pages exist, we don't bother vacuuming the
1088 * FSM at all.
1089 */
1090 if (vstate.totFreePages > 0)
1091 IndexFreeSpaceMapVacuum(rel);
1092
1093 /* update statistics */
1094 stats->num_pages = num_pages;
1095 stats->pages_free = vstate.totFreePages;
1096
1097 if (oldestBtpoXact)
1098 *oldestBtpoXact = vstate.oldestBtpoXact;
1099}
1100
1101/*
1102 * btvacuumpage --- VACUUM one page
1103 *
1104 * This processes a single page for btvacuumscan(). In some cases we
1105 * must go back and re-examine previously-scanned pages; this routine
1106 * recurses when necessary to handle that case.
1107 *
1108 * blkno is the page to process. orig_blkno is the highest block number
1109 * reached by the outer btvacuumscan loop (the same as blkno, unless we
1110 * are recursing to re-examine a previous page).
1111 */
1112static void
1113btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1114{
1115 IndexVacuumInfo *info = vstate->info;
1116 IndexBulkDeleteResult *stats = vstate->stats;
1117 IndexBulkDeleteCallback callback = vstate->callback;
1118 void *callback_state = vstate->callback_state;
1119 Relation rel = info->index;
1120 bool delete_now;
1121 BlockNumber recurse_to;
1122 Buffer buf;
1123 Page page;
1124 BTPageOpaque opaque = NULL;
1125
1126restart:
1127 delete_now = false;
1128 recurse_to = P_NONE;
1129
1130 /* call vacuum_delay_point while not holding any buffer lock */
1131 vacuum_delay_point();
1132
1133 /*
1134 * We can't use _bt_getbuf() here because it always applies
1135 * _bt_checkpage(), which will barf on an all-zero page. We want to
1136 * recycle all-zero pages, not fail. Also, we want to use a nondefault
1137 * buffer access strategy.
1138 */
1139 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1140 info->strategy);
1141 LockBuffer(buf, BT_READ);
1142 page = BufferGetPage(buf);
1143 if (!PageIsNew(page))
1144 {
1145 _bt_checkpage(rel, buf);
1146 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1147 }
1148
1149 /*
1150 * If we are recursing, the only case we want to do anything with is a
1151 * live leaf page having the current vacuum cycle ID. Any other state
1152 * implies we already saw the page (eg, deleted it as being empty).
1153 */
1154 if (blkno != orig_blkno)
1155 {
1156 if (_bt_page_recyclable(page) ||
1157 P_IGNORE(opaque) ||
1158 !P_ISLEAF(opaque) ||
1159 opaque->btpo_cycleid != vstate->cycleid)
1160 {
1161 _bt_relbuf(rel, buf);
1162 return;
1163 }
1164 }
1165
1166 /* Page is valid, see what to do with it */
1167 if (_bt_page_recyclable(page))
1168 {
1169 /* Okay to recycle this page */
1170 RecordFreeIndexPage(rel, blkno);
1171 vstate->totFreePages++;
1172 stats->pages_deleted++;
1173 }
1174 else if (P_ISDELETED(opaque))
1175 {
1176 /* Already deleted, but can't recycle yet */
1177 stats->pages_deleted++;
1178
1179 /* Update the oldest btpo.xact */
1180 if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1181 TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1182 vstate->oldestBtpoXact = opaque->btpo.xact;
1183 }
1184 else if (P_ISHALFDEAD(opaque))
1185 {
1186 /* Half-dead, try to delete */
1187 delete_now = true;
1188 }
1189 else if (P_ISLEAF(opaque))
1190 {
1191 OffsetNumber deletable[MaxOffsetNumber];
1192 int ndeletable;
1193 OffsetNumber offnum,
1194 minoff,
1195 maxoff;
1196
1197 /*
1198 * Trade in the initial read lock for a super-exclusive write lock on
1199 * this page. We must get such a lock on every leaf page over the
1200 * course of the vacuum scan, whether or not it actually contains any
1201 * deletable tuples --- see nbtree/README.
1202 */
1203 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1204 LockBufferForCleanup(buf);
1205
1206 /*
1207 * Remember highest leaf page number we've taken cleanup lock on; see
1208 * notes in btvacuumscan
1209 */
1210 if (blkno > vstate->lastBlockLocked)
1211 vstate->lastBlockLocked = blkno;
1212
1213 /*
1214 * Check whether we need to recurse back to earlier pages. What we
1215 * are concerned about is a page split that happened since we started
1216 * the vacuum scan. If the split moved some tuples to a lower page
1217 * then we might have missed 'em. If so, set up for tail recursion.
1218 * (Must do this before possibly clearing btpo_cycleid below!)
1219 */
1220 if (vstate->cycleid != 0 &&
1221 opaque->btpo_cycleid == vstate->cycleid &&
1222 !(opaque->btpo_flags & BTP_SPLIT_END) &&
1223 !P_RIGHTMOST(opaque) &&
1224 opaque->btpo_next < orig_blkno)
1225 recurse_to = opaque->btpo_next;
1226
		/*
		 * Scan over all items to see which ones need to be deleted
		 * according to the callback function.
		 */
1231 ndeletable = 0;
1232 minoff = P_FIRSTDATAKEY(opaque);
1233 maxoff = PageGetMaxOffsetNumber(page);
1234 if (callback)
1235 {
1236 for (offnum = minoff;
1237 offnum <= maxoff;
1238 offnum = OffsetNumberNext(offnum))
1239 {
1240 IndexTuple itup;
1241 ItemPointer htup;
1242
1243 itup = (IndexTuple) PageGetItem(page,
1244 PageGetItemId(page, offnum));
1245 htup = &(itup->t_tid);
1246
1247 /*
1248 * During Hot Standby we currently assume that
1249 * XLOG_BTREE_VACUUM records do not produce conflicts. That is
1250 * only true as long as the callback function depends only
1251 * upon whether the index tuple refers to heap tuples removed
1252 * in the initial heap scan. When vacuum starts it derives a
1253 * value of OldestXmin. Backends taking later snapshots could
1254 * have a RecentGlobalXmin with a later xid than the vacuum's
1255 * OldestXmin, so it is possible that row versions deleted
1256 * after OldestXmin could be marked as killed by other
1257 * backends. The callback function *could* look at the index
1258 * tuple state in isolation and decide to delete the index
1259 * tuple, though currently it does not. If it ever did, we
1260 * would need to reconsider whether XLOG_BTREE_VACUUM records
1261 * should cause conflicts. If they did cause conflicts they
1262 * would be fairly harsh conflicts, since we haven't yet
1263 * worked out a way to pass a useful value for
1264 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
1265 * applies to *any* type of index that marks index tuples as
1266 * killed.
1267 */
1268 if (callback(htup, callback_state))
1269 deletable[ndeletable++] = offnum;
1270 }
1271 }
1272
1273 /*
1274 * Apply any needed deletes. We issue just one _bt_delitems_vacuum()
1275 * call per page, so as to minimize WAL traffic.
1276 */
1277 if (ndeletable > 0)
1278 {
1279 /*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record gives the
			 * replay code all the information needed to get a cleanup
1282 * lock on all pages between the previous lastBlockVacuumed and
1283 * this page. This ensures that WAL replay locks all leaf pages at
1284 * some point, which is important should non-MVCC scans be
1285 * requested. This is currently unused on standby, but we record
1286 * it anyway, so that the WAL contains the required information.
1287 *
1288 * Since we can visit leaf pages out-of-order when recursing,
1289 * replay might end up locking such pages an extra time, but it
1290 * doesn't seem worth the amount of bookkeeping it'd take to avoid
1291 * that.
1292 */
1293 _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
1294 vstate->lastBlockVacuumed);
1295
1296 /*
1297 * Remember highest leaf page number we've issued a
1298 * XLOG_BTREE_VACUUM WAL record for.
1299 */
1300 if (blkno > vstate->lastBlockVacuumed)
1301 vstate->lastBlockVacuumed = blkno;
1302
1303 stats->tuples_removed += ndeletable;
1304 /* must recompute maxoff */
1305 maxoff = PageGetMaxOffsetNumber(page);
1306 }
1307 else
1308 {
1309 /*
1310 * If the page has been split during this vacuum cycle, it seems
1311 * worth expending a write to clear btpo_cycleid even if we don't
1312 * have any deletions to do. (If we do, _bt_delitems_vacuum takes
1313 * care of this.) This ensures we won't process the page again.
1314 *
1315 * We treat this like a hint-bit update because there's no need to
1316 * WAL-log it.
1317 */
1318 if (vstate->cycleid != 0 &&
1319 opaque->btpo_cycleid == vstate->cycleid)
1320 {
1321 opaque->btpo_cycleid = 0;
1322 MarkBufferDirtyHint(buf, true);
1323 }
1324 }
1325
1326 /*
1327 * If it's now empty, try to delete; else count the live tuples. We
1328 * don't delete when recursing, though, to avoid putting entries into
1329 * freePages out-of-order (doesn't seem worth any extra code to handle
1330 * the case).
1331 */
1332 if (minoff > maxoff)
1333 delete_now = (blkno == orig_blkno);
1334 else
1335 stats->num_index_tuples += maxoff - minoff + 1;
1336 }
1337
1338 if (delete_now)
1339 {
1340 MemoryContext oldcontext;
1341 int ndel;
1342
1343 /* Run pagedel in a temp context to avoid memory leakage */
1344 MemoryContextReset(vstate->pagedelcontext);
1345 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1346
1347 ndel = _bt_pagedel(rel, buf);
1348
1349 /* count only this page, else may double-count parent */
1350 if (ndel)
1351 {
1352 stats->pages_deleted++;
1353 if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1354 TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1355 vstate->oldestBtpoXact = opaque->btpo.xact;
1356 }
1357
1358 MemoryContextSwitchTo(oldcontext);
1359 /* pagedel released buffer, so we shouldn't */
1360 }
1361 else
1362 _bt_relbuf(rel, buf);
1363
1364 /*
1365 * This is really tail recursion, but if the compiler is too stupid to
1366 * optimize it as such, we'd eat an uncomfortably large amount of stack
1367 * space per recursion level (due to the deletable[] array). A failure is
1368 * improbable since the number of levels isn't likely to be large ... but
1369 * just in case, let's hand-optimize into a loop.
1370 */
1371 if (recurse_to != P_NONE)
1372 {
1373 blkno = recurse_to;
1374 goto restart;
1375 }
1376}
1377
1378/*
1379 * btcanreturn() -- Check whether btree indexes support index-only scans.
1380 *
1381 * btrees always do, so this is trivial.
1382 */
1383bool
1384btcanreturn(Relation index, int attno)
1385{
1386 return true;
1387}
1388