/*-------------------------------------------------------------------------
 *
 * gistvacuum.c
 *    vacuuming routines for the postgres GiST index access method.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/gist/gistvacuum.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/gist_private.h"
#include "access/transam.h"
#include "commands/vacuum.h"
#include "lib/integerset.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"

/*
 * State kept across vacuum stages.
 */
typedef struct
{
    IndexBulkDeleteResult stats;    /* must be first */

    /*
     * These are used to memorize all internal and empty leaf pages in the 1st
     * vacuum stage.  They are used in the 2nd stage, to delete all the empty
     * pages.
     */
    IntegerSet *internal_page_set;
    IntegerSet *empty_leaf_set;
    MemoryContext page_set_context;
} GistBulkDeleteResult;

/* Working state needed by gistbulkdelete */
typedef struct
{
    IndexVacuumInfo *info;
    GistBulkDeleteResult *stats;
    IndexBulkDeleteCallback callback;
    void       *callback_state;
    GistNSN     startNSN;
} GistVacState;

static void gistvacuumscan(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
                           IndexBulkDeleteCallback callback, void *callback_state);
static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno,
                           BlockNumber orig_blkno);
static void gistvacuum_delete_empty_pages(IndexVacuumInfo *info,
                                          GistBulkDeleteResult *stats);
static bool gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
                           Buffer buffer, OffsetNumber downlink,
                           Buffer leafBuffer);

/* allocate the 'stats' struct that's kept over vacuum stages */
static GistBulkDeleteResult *
create_GistBulkDeleteResult(void)
{
    GistBulkDeleteResult *gist_stats;

    gist_stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
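
    /*
     * The page sets created in gistvacuumscan() live in this dedicated
     * context, so that the cleanup stage can free them all at once by
     * deleting the context.
     */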
    gist_stats->page_set_context =
        GenerationContextCreate(CurrentMemoryContext,
                                "GiST VACUUM page set context",
                                16 * 1024);

    return gist_stats;
}

/*
 * VACUUM bulkdelete stage: remove index entries.
 */
IndexBulkDeleteResult *
gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    GistBulkDeleteResult *gist_stats = (GistBulkDeleteResult *) stats;

    /* allocate stats if first time through, else re-use existing struct */
    if (gist_stats == NULL)
        gist_stats = create_GistBulkDeleteResult();

    gistvacuumscan(info, gist_stats, callback, callback_state);

    return (IndexBulkDeleteResult *) gist_stats;
}

/*
 * VACUUM cleanup stage: delete empty pages, and update index statistics.
 */
IndexBulkDeleteResult *
gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    GistBulkDeleteResult *gist_stats = (GistBulkDeleteResult *) stats;

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        return stats;

    /*
     * If gistbulkdelete was called, we need not do anything, just return the
     * stats from the latest gistbulkdelete call.  If it wasn't called, we
     * still need to do a pass over the index, to obtain index statistics.
     */
    if (gist_stats == NULL)
    {
        gist_stats = create_GistBulkDeleteResult();
        gistvacuumscan(info, gist_stats, NULL, NULL);
    }

    /*
     * If we saw any empty pages, try to unlink them from the tree so that
     * they can be reused.
     */
    gistvacuum_delete_empty_pages(info, gist_stats);

    /* we don't need the internal and empty page sets anymore */
    MemoryContextDelete(gist_stats->page_set_context);
    gist_stats->page_set_context = NULL;
    gist_stats->internal_page_set = NULL;
    gist_stats->empty_leaf_set = NULL;

    /*
     * It's quite possible for us to be fooled by concurrent page splits into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
    if (!info->estimated_count)
    {
        if (gist_stats->stats.num_index_tuples > info->num_heap_tuples)
            gist_stats->stats.num_index_tuples = info->num_heap_tuples;
    }

    return (IndexBulkDeleteResult *) gist_stats;
}

/*
 * gistvacuumscan --- scan the index for VACUUMing purposes
 *
 * This scans the index for leaf tuples that are deletable according to the
 * vacuum callback, and updates the stats.  Both gistbulkdelete and
 * gistvacuumcleanup invoke this (the latter only if no gistbulkdelete call
 * occurred).
 *
 * This also makes note of any empty leaf pages, as well as all internal
 * pages.  The second stage, gistvacuum_delete_empty_pages(), needs that
 * information.  Any deleted pages are added directly to the free space map.
 * (They should already have been added when they were originally deleted, but
 * it's possible that the FSM was lost in a crash, for example.)
 *
 * The caller is responsible for initially allocating/zeroing a stats struct.
 */
static void
gistvacuumscan(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    rel = info->index;
    GistVacState vstate;
    BlockNumber num_pages;
    bool        needLock;
    BlockNumber blkno;
    MemoryContext oldctx;

    /*
     * Reset counts that will be incremented during the scan; needed in case
     * of multiple scans during a single VACUUM command.
     */
    stats->stats.estimated_count = false;
    stats->stats.num_index_tuples = 0;
    stats->stats.pages_deleted = 0;
    stats->stats.pages_free = 0;

    /*
     * Create the integer sets that remember the internal and the empty leaf
     * pages.  They must be allocated in page_set_context, so that deleting
     * that context in the cleanup stage frees them; intset_create() keeps
     * using the memory context that was current at creation time.
     */
    MemoryContextReset(stats->page_set_context);
    oldctx = MemoryContextSwitchTo(stats->page_set_context);
    stats->internal_page_set = intset_create();
    stats->empty_leaf_set = intset_create();
    MemoryContextSwitchTo(oldctx);

    /* Set up info to pass down to gistvacuumpage */
    vstate.info = info;
    vstate.stats = stats;
    vstate.callback = callback;
    vstate.callback_state = callback_state;
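
    /*
     * Remember the current WAL insert position (or a fake LSN, for a
     * relation that isn't WAL-logged).  In gistvacuumpage(), a leaf page
     * whose NSN is newer than this was split after the scan started, and we
     * may have to follow its rightlink to catch tuples that were moved to an
     * already-scanned page.
     */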
    if (RelationNeedsWAL(rel))
        vstate.startNSN = GetInsertRecPtr();
    else
        vstate.startNSN = gistGetFakeLSN(rel);

    /*
     * The outer loop iterates over all index pages, in physical order (we
     * hope the kernel will cooperate in providing read-ahead for speed).  It
     * is critical that we visit all leaf pages, including ones added after we
     * start the scan, else we might fail to delete some deletable tuples.
     * Hence, we must repeatedly check the relation length.  We must acquire
     * the relation-extension lock while doing so to avoid a race condition:
     * if someone else is extending the relation, there is a window where
     * bufmgr/smgr have created a new all-zero page but it hasn't yet been
     * write-locked by gistNewBuffer().  If we manage to scan such a page
     * here, we'll improperly assume it can be recycled.  Taking the lock
     * synchronizes things enough to prevent a problem: either num_pages won't
     * include the new page, or gistNewBuffer already has write lock on the
     * buffer and it will be fully initialized before we can examine it.  (See
     * also vacuumlazy.c, which has the same issue.)  Also, we need not worry
     * if a page is added immediately after we look; the page splitting code
     * already has write-lock on the left page before it adds a right page, so
     * we must already have processed any tuples due to be moved into such a
     * page.
     *
     * We can skip locking for new or temp relations, however, since no one
     * else could be accessing them.
     */
    needLock = !RELATION_IS_LOCAL(rel);

    blkno = GIST_ROOT_BLKNO;
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(rel);
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Quit if we've scanned the whole relation */
        if (blkno >= num_pages)
            break;
        /* Iterate over pages, then loop back to recheck length */
        for (; blkno < num_pages; blkno++)
            gistvacuumpage(&vstate, blkno, blkno);
    }

    /*
     * If we found any recyclable pages (and recorded them in the FSM), then
     * forcibly update the upper-level FSM pages to ensure that searchers can
     * find them.  It's possible that the pages were also found during
     * previous scans and so this is a waste of time, but it's cheap enough
     * relative to scanning the index that it shouldn't matter much, and
     * making sure that free pages are available sooner not later seems
     * worthwhile.
     *
     * Note that if no recyclable pages exist, we don't bother vacuuming the
     * FSM at all.
     */
    if (stats->stats.pages_free > 0)
        IndexFreeSpaceMapVacuum(rel);

    /* update statistics */
    stats->stats.num_pages = num_pages;
}

/*
 * gistvacuumpage --- VACUUM one page
 *
 * This processes a single page for gistbulkdelete().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer gistvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
    GistBulkDeleteResult *stats = vstate->stats;
    IndexVacuumInfo *info = vstate->info;
    IndexBulkDeleteCallback callback = vstate->callback;
    void       *callback_state = vstate->callback_state;
    Relation    rel = info->index;
    Buffer      buffer;
    Page        page;
    BlockNumber recurse_to;

restart:
    recurse_to = InvalidBlockNumber;

    /* call vacuum_delay_point while not holding any buffer lock */
    vacuum_delay_point();

    buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                                info->strategy);

    /*
     * We are not going to stay here for long, so aggressively grab an
     * exclusive lock.
     */
    LockBuffer(buffer, GIST_EXCLUSIVE);
    page = (Page) BufferGetPage(buffer);

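    /*
     * What we do with the page depends on its state: an already-deleted page
     * that is old enough can be recycled; a deleted but not-yet-recyclable
     * page is just counted; otherwise we vacuum a leaf page's tuples, or
     * check an internal page for old-style invalid tuples.
     */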
    if (gistPageRecyclable(page))
    {
        /* Okay to recycle this page */
        RecordFreeIndexPage(rel, blkno);
        stats->stats.pages_free++;
        stats->stats.pages_deleted++;
    }
    else if (GistPageIsDeleted(page))
    {
        /* Already deleted, but can't recycle yet */
        stats->stats.pages_deleted++;
    }
    else if (GistPageIsLeaf(page))
    {
        OffsetNumber todelete[MaxOffsetNumber];
        int         ntodelete = 0;
        int         nremain;
        GISTPageOpaque opaque = GistPageGetOpaque(page);
        OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

        /*
         * Check whether we need to recurse back to earlier pages.  What we
         * are concerned about is a page split that happened since we started
         * the vacuum scan.  If the split moved some tuples to a lower page
         * then we might have missed 'em.  If so, set up for tail recursion.
         *
         * This is similar to the checks we do during searches, when following
         * a downlink, but we don't need to jump to higher-numbered pages,
         * because we will process them later, anyway.
         */
        if ((GistFollowRight(page) ||
             vstate->startNSN < GistPageGetNSN(page)) &&
            (opaque->rightlink != InvalidBlockNumber) &&
            (opaque->rightlink < orig_blkno))
        {
            recurse_to = opaque->rightlink;
        }

        /*
         * Scan over all items to see which ones need to be deleted according
         * to the callback function.
         */
        if (callback)
        {
            OffsetNumber off;

            for (off = FirstOffsetNumber;
                 off <= maxoff;
                 off = OffsetNumberNext(off))
            {
                ItemId      iid = PageGetItemId(page, off);
                IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

                if (callback(&(idxtuple->t_tid), callback_state))
                    todelete[ntodelete++] = off;
            }
        }

        /*
         * Apply any needed deletes.  We issue just one WAL record per page,
         * so as to minimize WAL traffic.
         */
        if (ntodelete > 0)
        {
            START_CRIT_SECTION();

            MarkBufferDirty(buffer);

            PageIndexMultiDelete(page, todelete, ntodelete);
            GistMarkTuplesDeleted(page);

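            /*
             * For a relation that isn't WAL-logged, we still advance the
             * page LSN with a fake value, so that NSN-based split detection
             * keeps working.
             */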
            if (RelationNeedsWAL(rel))
            {
                XLogRecPtr  recptr;

                recptr = gistXLogUpdate(buffer,
                                        todelete, ntodelete,
                                        NULL, 0, InvalidBuffer);
                PageSetLSN(page, recptr);
            }
            else
                PageSetLSN(page, gistGetFakeLSN(rel));

            END_CRIT_SECTION();

            stats->stats.tuples_removed += ntodelete;
            /* must recompute maxoff */
            maxoff = PageGetMaxOffsetNumber(page);
        }

        nremain = maxoff - FirstOffsetNumber + 1;
        if (nremain == 0)
        {
            /*
             * The page is now completely empty.  Remember its block number,
             * so that we will try to delete the page in the second stage.
             *
             * Skip this when recursing, because IntegerSet requires that the
             * values are added in ascending order.  The next VACUUM will pick
             * it up.
             */
            if (blkno == orig_blkno)
                intset_add_member(stats->empty_leaf_set, blkno);
        }
        else
            stats->stats.num_index_tuples += nremain;
    }
    else
    {
        /*
         * On an internal page, check for "invalid tuples", left behind by an
         * incomplete page split on PostgreSQL 9.0 or below.  These are not
         * created by newer PostgreSQL versions, but unfortunately, there is
         * no version number anywhere in a GiST index, so we don't know
         * whether this index might still contain invalid tuples or not.
         */
        OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
        OffsetNumber off;

        for (off = FirstOffsetNumber;
             off <= maxoff;
             off = OffsetNumberNext(off))
        {
            ItemId      iid = PageGetItemId(page, off);
            IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

            if (GistTupleIsInvalid(idxtuple))
                ereport(LOG,
                        (errmsg("index \"%s\" contains an inner tuple marked as invalid",
                                RelationGetRelationName(rel)),
                         errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
                         errhint("Please REINDEX it.")));
        }

        /*
         * Remember the block number of this page, so that we can revisit it
         * later in gistvacuum_delete_empty_pages(), when we search for
         * parents of empty leaf pages.
         */
        if (blkno == orig_blkno)
            intset_add_member(stats->internal_page_set, blkno);
    }

    UnlockReleaseBuffer(buffer);

    /*
     * This is really tail recursion, but if the compiler is too stupid to
     * optimize it as such, we'd eat an uncomfortably large amount of stack
     * space per recursion level (due to the todelete[] array).  A failure is
     * improbable since the number of levels isn't likely to be large ... but
     * just in case, let's hand-optimize into a loop.
     */
    if (recurse_to != InvalidBlockNumber)
    {
        blkno = recurse_to;
        goto restart;
    }
}

/*
 * Scan all internal pages, and try to delete their empty child pages.
 */
static void
gistvacuum_delete_empty_pages(IndexVacuumInfo *info, GistBulkDeleteResult *stats)
{
    Relation    rel = info->index;
    BlockNumber empty_pages_remaining;
    uint64      blkno;

    /*
     * Rescan all inner pages to find those that have empty child pages.
     */
    empty_pages_remaining = intset_num_entries(stats->empty_leaf_set);
    intset_begin_iterate(stats->internal_page_set);
    while (empty_pages_remaining > 0 &&
           intset_iterate_next(stats->internal_page_set, &blkno))
    {
        Buffer      buffer;
        Page        page;
        OffsetNumber off,
                    maxoff;
        OffsetNumber todelete[MaxOffsetNumber];
        BlockNumber leafs_to_delete[MaxOffsetNumber];
        int         ntodelete;
        int         deleted;

        buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber) blkno,
                                    RBM_NORMAL, info->strategy);

        LockBuffer(buffer, GIST_SHARE);
        page = (Page) BufferGetPage(buffer);

        if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
        {
            /*
             * This page was an internal page earlier, but now it's something
             * else.  Shouldn't happen...
             */
            Assert(false);
            UnlockReleaseBuffer(buffer);
            continue;
        }

        /*
         * Scan all the downlinks, and see if any of them point to empty leaf
         * pages.
         */
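        /*
         * Note that we stop collecting once all but one of the downlinks on
         * this page have been selected; we never remove a parent's last
         * remaining downlink (see also the check in the deletion loop below).
         */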
        maxoff = PageGetMaxOffsetNumber(page);
        ntodelete = 0;
        for (off = FirstOffsetNumber;
             off <= maxoff && ntodelete < maxoff - 1;
             off = OffsetNumberNext(off))
        {
            ItemId      iid = PageGetItemId(page, off);
            IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
            BlockNumber leafblk;

            leafblk = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
            if (intset_is_member(stats->empty_leaf_set, leafblk))
            {
                leafs_to_delete[ntodelete] = leafblk;
                todelete[ntodelete++] = off;
            }
        }

        /*
         * In order to avoid deadlock, the child page must be locked before
         * the parent, so we must release the lock on the parent, lock the
         * child, and then re-acquire the lock on the parent.  (And we
         * wouldn't want to do I/O while holding a lock, anyway.)
         *
         * At the instant that we're not holding a lock on the parent, the
         * downlink might get moved by a concurrent insert, so we must
         * re-check that it still points to the same child page after we have
         * acquired both locks.  Also, another backend might have inserted a
         * tuple to the page, so that it is no longer empty.  gistdeletepage()
         * re-checks all these conditions.
         */
        LockBuffer(buffer, GIST_UNLOCK);

        deleted = 0;
        for (int i = 0; i < ntodelete; i++)
        {
            Buffer      leafbuf;

            /*
             * Don't remove the last downlink from the parent.  That would
             * confuse the insertion code.
             */
            if (PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
                break;

            leafbuf = ReadBufferExtended(rel, MAIN_FORKNUM, leafs_to_delete[i],
                                         RBM_NORMAL, info->strategy);
            LockBuffer(leafbuf, GIST_EXCLUSIVE);
            gistcheckpage(rel, leafbuf);

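            /*
             * Re-lock the parent, and try to delete the child.  The downlink
             * offsets were recorded before we released the parent's lock;
             * each tuple already removed from the parent in this loop shifts
             * the later offsets down by one, hence the "- deleted"
             * adjustment.
             */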
            LockBuffer(buffer, GIST_EXCLUSIVE);
            if (gistdeletepage(info, stats,
                               buffer, todelete[i] - deleted,
                               leafbuf))
                deleted++;
            LockBuffer(buffer, GIST_UNLOCK);

            UnlockReleaseBuffer(leafbuf);
        }

        ReleaseBuffer(buffer);

        /* update stats */
        stats->stats.pages_removed += deleted;

        /*
         * We can stop the scan as soon as we have seen the downlinks, even if
         * we were not able to remove them all.
         */
        empty_pages_remaining -= ntodelete;
    }
}

/*
 * gistdeletepage takes a leaf page, and its parent, and tries to delete the
 * leaf.  Both pages must be locked.
 *
 * Even if the page was empty when we first saw it, a concurrent inserter might
 * have added a tuple to it since.  Similarly, the downlink might have moved.
 * We re-check all the conditions, to make sure the page is still deletable,
 * before modifying anything.
 *
 * Returns true if the page was deleted, and false if a concurrent update
 * prevented it.
 */
static bool
gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
               Buffer parentBuffer, OffsetNumber downlink,
               Buffer leafBuffer)
{
    Page        parentPage = BufferGetPage(parentBuffer);
    Page        leafPage = BufferGetPage(leafBuffer);
    ItemId      iid;
    IndexTuple  idxtuple;
    XLogRecPtr  recptr;
    FullTransactionId txid;

    /*
     * Check that the leaf is still empty and deletable.
     */
    if (!GistPageIsLeaf(leafPage))
    {
        /* a leaf page should never become a non-leaf page */
        Assert(false);
        return false;
    }

    if (GistFollowRight(leafPage))
        return false;           /* don't mess with a concurrent page split */

    if (PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber)
        return false;           /* not empty anymore */

    /*
     * Ok, the leaf is deletable.  Is the downlink in the parent page still
     * valid?  It might have been moved by a concurrent insert.  We could try
     * to re-find it by scanning the page again, possibly moving right if the
     * page was split.  But for now, let's keep it simple and just give up.
     * The next VACUUM will pick it up.
     */
    if (PageIsNew(parentPage) || GistPageIsDeleted(parentPage) ||
        GistPageIsLeaf(parentPage))
    {
        /* shouldn't happen, internal pages are never deleted */
        Assert(false);
        return false;
    }

    if (PageGetMaxOffsetNumber(parentPage) < downlink
        || PageGetMaxOffsetNumber(parentPage) <= FirstOffsetNumber)
        return false;

    iid = PageGetItemId(parentPage, downlink);
    idxtuple = (IndexTuple) PageGetItem(parentPage, iid);
    if (BufferGetBlockNumber(leafBuffer) !=
        ItemPointerGetBlockNumber(&(idxtuple->t_tid)))
        return false;

    /*
     * All good, proceed with the deletion.
     *
     * The page cannot be immediately recycled, because in-progress scans that
     * saw the downlink might still visit it.  Mark the page with the current
     * next-XID counter, so that we know when it can be recycled.  Once that
     * XID becomes older than GlobalXmin, we know that all scans that are
     * currently in progress must have ended.  (That's much more conservative
     * than needed, but let's keep it safe and simple.)
     */
    txid = ReadNextFullTransactionId();

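    /*
     * Mark the leaf as deleted and remove its downlink from the parent in a
     * single critical section, covered by one WAL record, so that the two
     * changes are applied atomically.
     */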
    START_CRIT_SECTION();

    /* mark the page as deleted */
    MarkBufferDirty(leafBuffer);
    GistPageSetDeleted(leafPage, txid);
    stats->stats.pages_deleted++;

    /* remove the downlink from the parent */
    MarkBufferDirty(parentBuffer);
    PageIndexTupleDelete(parentPage, downlink);

    if (RelationNeedsWAL(info->index))
        recptr = gistXLogPageDelete(leafBuffer, txid, parentBuffer, downlink);
    else
        recptr = gistGetFakeLSN(info->index);
    PageSetLSN(parentPage, recptr);
    PageSetLSN(leafPage, recptr);

    END_CRIT_SECTION();

    return true;
}