1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * gistvacuum.c |
4 | * vacuuming routines for the postgres GiST index access method. |
5 | * |
6 | * |
7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
8 | * Portions Copyright (c) 1994, Regents of the University of California |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/access/gist/gistvacuum.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | #include "postgres.h" |
16 | |
17 | #include "access/genam.h" |
18 | #include "access/gist_private.h" |
19 | #include "access/transam.h" |
20 | #include "commands/vacuum.h" |
21 | #include "lib/integerset.h" |
22 | #include "miscadmin.h" |
23 | #include "storage/indexfsm.h" |
24 | #include "storage/lmgr.h" |
25 | #include "utils/memutils.h" |
26 | |
27 | /* |
28 | * State kept across vacuum stages. |
29 | */ |
30 | typedef struct |
31 | { |
32 | IndexBulkDeleteResult stats; /* must be first */ |
33 | |
34 | /* |
	 * These are used to remember all internal pages, and all empty leaf
	 * pages, visited in the 1st vacuum stage.  The 2nd stage uses them to
	 * delete all the empty pages.
38 | */ |
39 | IntegerSet *internal_page_set; |
40 | IntegerSet *empty_leaf_set; |
41 | MemoryContext page_set_context; |
42 | } GistBulkDeleteResult; |
43 | |
44 | /* Working state needed by gistbulkdelete */ |
45 | typedef struct |
46 | { |
47 | IndexVacuumInfo *info; |
48 | GistBulkDeleteResult *stats; |
49 | IndexBulkDeleteCallback callback; |
50 | void *callback_state; |
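	/* LSN at the start of the scan, used to detect concurrent page splits */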
51 | GistNSN startNSN; |
52 | } GistVacState; |
53 | |
54 | static void gistvacuumscan(IndexVacuumInfo *info, GistBulkDeleteResult *stats, |
55 | IndexBulkDeleteCallback callback, void *callback_state); |
56 | static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno, |
57 | BlockNumber orig_blkno); |
58 | static void gistvacuum_delete_empty_pages(IndexVacuumInfo *info, |
59 | GistBulkDeleteResult *stats); |
60 | static bool gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats, |
61 | Buffer buffer, OffsetNumber downlink, |
62 | Buffer leafBuffer); |
63 | |
64 | /* allocate the 'stats' struct that's kept over vacuum stages */ |
65 | static GistBulkDeleteResult * |
66 | create_GistBulkDeleteResult(void) |
67 | { |
68 | GistBulkDeleteResult *gist_stats; |
69 | |
70 | gist_stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); |
71 | gist_stats->page_set_context = |
		GenerationContextCreate(CurrentMemoryContext,
								"GiST VACUUM page set context",
								16 * 1024);
75 | |
76 | return gist_stats; |
77 | } |
78 | |
79 | /* |
80 | * VACUUM bulkdelete stage: remove index entries. |
81 | */ |
82 | IndexBulkDeleteResult * |
83 | gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
84 | IndexBulkDeleteCallback callback, void *callback_state) |
85 | { |
86 | GistBulkDeleteResult *gist_stats = (GistBulkDeleteResult *) stats; |
87 | |
88 | /* allocate stats if first time through, else re-use existing struct */ |
89 | if (gist_stats == NULL) |
90 | gist_stats = create_GistBulkDeleteResult(); |
91 | |
92 | gistvacuumscan(info, gist_stats, callback, callback_state); |
93 | |
94 | return (IndexBulkDeleteResult *) gist_stats; |
95 | } |
96 | |
97 | /* |
98 | * VACUUM cleanup stage: delete empty pages, and update index statistics. |
99 | */ |
100 | IndexBulkDeleteResult * |
101 | gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) |
102 | { |
103 | GistBulkDeleteResult *gist_stats = (GistBulkDeleteResult *) stats; |
104 | |
105 | /* No-op in ANALYZE ONLY mode */ |
106 | if (info->analyze_only) |
107 | return stats; |
108 | |
109 | /* |
	 * If gistbulkdelete was called, we need not do anything; just return the
	 * stats from the latest gistbulkdelete call.  If it wasn't called, we
	 * still need to do a pass over the index, to obtain index statistics.
113 | */ |
114 | if (gist_stats == NULL) |
115 | { |
116 | gist_stats = create_GistBulkDeleteResult(); |
117 | gistvacuumscan(info, gist_stats, NULL, NULL); |
118 | } |
119 | |
120 | /* |
121 | * If we saw any empty pages, try to unlink them from the tree so that |
122 | * they can be reused. |
123 | */ |
124 | gistvacuum_delete_empty_pages(info, gist_stats); |
125 | |
126 | /* we don't need the internal and empty page sets anymore */ |
127 | MemoryContextDelete(gist_stats->page_set_context); |
128 | gist_stats->page_set_context = NULL; |
129 | gist_stats->internal_page_set = NULL; |
130 | gist_stats->empty_leaf_set = NULL; |
131 | |
132 | /* |
133 | * It's quite possible for us to be fooled by concurrent page splits into |
134 | * double-counting some index tuples, so disbelieve any total that exceeds |
135 | * the underlying heap's count ... if we know that accurately. Otherwise |
136 | * this might just make matters worse. |
137 | */ |
138 | if (!info->estimated_count) |
139 | { |
140 | if (gist_stats->stats.num_index_tuples > info->num_heap_tuples) |
141 | gist_stats->stats.num_index_tuples = info->num_heap_tuples; |
142 | } |
143 | |
144 | return (IndexBulkDeleteResult *) gist_stats; |
145 | } |
146 | |
147 | /* |
148 | * gistvacuumscan --- scan the index for VACUUMing purposes |
149 | * |
150 | * This scans the index for leaf tuples that are deletable according to the |
 * vacuum callback, and updates the stats.  Both gistbulkdelete and
 * gistvacuumcleanup invoke this (the latter only if no gistbulkdelete call
 * occurred).
154 | * |
155 | * This also makes note of any empty leaf pages, as well as all internal |
156 | * pages. The second stage, gistvacuum_delete_empty_pages(), needs that |
157 | * information. Any deleted pages are added directly to the free space map. |
 * (They should've been added there already when they were originally deleted,
 * but it's possible that the FSM was lost at a crash, for example.)
160 | * |
161 | * The caller is responsible for initially allocating/zeroing a stats struct. |
162 | */ |
163 | static void |
164 | gistvacuumscan(IndexVacuumInfo *info, GistBulkDeleteResult *stats, |
165 | IndexBulkDeleteCallback callback, void *callback_state) |
166 | { |
167 | Relation rel = info->index; |
168 | GistVacState vstate; |
169 | BlockNumber num_pages; |
170 | bool needLock; |
	BlockNumber blkno;
	MemoryContext oldctx;
172 | |
173 | /* |
174 | * Reset counts that will be incremented during the scan; needed in case |
175 | * of multiple scans during a single VACUUM command. |
176 | */ |
177 | stats->stats.estimated_count = false; |
178 | stats->stats.num_index_tuples = 0; |
179 | stats->stats.pages_deleted = 0; |
180 | stats->stats.pages_free = 0; |
	MemoryContextReset(stats->page_set_context);

	/*
	 * Create the integer sets in page_set_context.  An IntegerSet remembers
	 * the memory context it was created in, so all subsequent additions to
	 * these sets are allocated in that context as well.
	 */
	oldctx = MemoryContextSwitchTo(stats->page_set_context);
	stats->internal_page_set = intset_create();
	stats->empty_leaf_set = intset_create();
	MemoryContextSwitchTo(oldctx);
184 | |
185 | /* Set up info to pass down to gistvacuumpage */ |
186 | vstate.info = info; |
187 | vstate.stats = stats; |
188 | vstate.callback = callback; |
189 | vstate.callback_state = callback_state; |
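	/* unlogged and temp relations generate no WAL, so use a fake LSN for them */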
190 | if (RelationNeedsWAL(rel)) |
191 | vstate.startNSN = GetInsertRecPtr(); |
192 | else |
193 | vstate.startNSN = gistGetFakeLSN(rel); |
194 | |
195 | /* |
196 | * The outer loop iterates over all index pages, in physical order (we |
197 | * hope the kernel will cooperate in providing read-ahead for speed). It |
198 | * is critical that we visit all leaf pages, including ones added after we |
199 | * start the scan, else we might fail to delete some deletable tuples. |
200 | * Hence, we must repeatedly check the relation length. We must acquire |
201 | * the relation-extension lock while doing so to avoid a race condition: |
202 | * if someone else is extending the relation, there is a window where |
203 | * bufmgr/smgr have created a new all-zero page but it hasn't yet been |
204 | * write-locked by gistNewBuffer(). If we manage to scan such a page |
205 | * here, we'll improperly assume it can be recycled. Taking the lock |
206 | * synchronizes things enough to prevent a problem: either num_pages won't |
207 | * include the new page, or gistNewBuffer already has write lock on the |
208 | * buffer and it will be fully initialized before we can examine it. (See |
209 | * also vacuumlazy.c, which has the same issue.) Also, we need not worry |
210 | * if a page is added immediately after we look; the page splitting code |
211 | * already has write-lock on the left page before it adds a right page, so |
212 | * we must already have processed any tuples due to be moved into such a |
213 | * page. |
214 | * |
215 | * We can skip locking for new or temp relations, however, since no one |
216 | * else could be accessing them. |
217 | */ |
218 | needLock = !RELATION_IS_LOCAL(rel); |
219 | |
220 | blkno = GIST_ROOT_BLKNO; |
221 | for (;;) |
222 | { |
223 | /* Get the current relation length */ |
224 | if (needLock) |
225 | LockRelationForExtension(rel, ExclusiveLock); |
226 | num_pages = RelationGetNumberOfBlocks(rel); |
227 | if (needLock) |
228 | UnlockRelationForExtension(rel, ExclusiveLock); |
229 | |
230 | /* Quit if we've scanned the whole relation */ |
231 | if (blkno >= num_pages) |
232 | break; |
233 | /* Iterate over pages, then loop back to recheck length */ |
234 | for (; blkno < num_pages; blkno++) |
235 | gistvacuumpage(&vstate, blkno, blkno); |
236 | } |
237 | |
238 | /* |
239 | * If we found any recyclable pages (and recorded them in the FSM), then |
240 | * forcibly update the upper-level FSM pages to ensure that searchers can |
241 | * find them. It's possible that the pages were also found during |
242 | * previous scans and so this is a waste of time, but it's cheap enough |
243 | * relative to scanning the index that it shouldn't matter much, and |
244 | * making sure that free pages are available sooner not later seems |
245 | * worthwhile. |
246 | * |
247 | * Note that if no recyclable pages exist, we don't bother vacuuming the |
248 | * FSM at all. |
249 | */ |
250 | if (stats->stats.pages_free > 0) |
251 | IndexFreeSpaceMapVacuum(rel); |
252 | |
253 | /* update statistics */ |
254 | stats->stats.num_pages = num_pages; |
255 | } |
256 | |
257 | /* |
258 | * gistvacuumpage --- VACUUM one page |
259 | * |
260 | * This processes a single page for gistbulkdelete(). In some cases we |
261 | * must go back and re-examine previously-scanned pages; this routine |
262 | * recurses when necessary to handle that case. |
263 | * |
264 | * blkno is the page to process. orig_blkno is the highest block number |
265 | * reached by the outer gistvacuumscan loop (the same as blkno, unless we |
266 | * are recursing to re-examine a previous page). |
267 | */ |
268 | static void |
269 | gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) |
270 | { |
271 | GistBulkDeleteResult *stats = vstate->stats; |
272 | IndexVacuumInfo *info = vstate->info; |
273 | IndexBulkDeleteCallback callback = vstate->callback; |
274 | void *callback_state = vstate->callback_state; |
275 | Relation rel = info->index; |
276 | Buffer buffer; |
277 | Page page; |
278 | BlockNumber recurse_to; |
279 | |
280 | restart: |
281 | recurse_to = InvalidBlockNumber; |
282 | |
283 | /* call vacuum_delay_point while not holding any buffer lock */ |
284 | vacuum_delay_point(); |
285 | |
286 | buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, |
287 | info->strategy); |
288 | |
289 | /* |
	 * We are not going to stay here for a long time, so aggressively grab an
	 * exclusive lock.
292 | */ |
293 | LockBuffer(buffer, GIST_EXCLUSIVE); |
294 | page = (Page) BufferGetPage(buffer); |
295 | |
296 | if (gistPageRecyclable(page)) |
297 | { |
298 | /* Okay to recycle this page */ |
299 | RecordFreeIndexPage(rel, blkno); |
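		/* a recyclable page counts as both free and deleted in the stats */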
300 | stats->stats.pages_free++; |
301 | stats->stats.pages_deleted++; |
302 | } |
303 | else if (GistPageIsDeleted(page)) |
304 | { |
305 | /* Already deleted, but can't recycle yet */ |
306 | stats->stats.pages_deleted++; |
307 | } |
308 | else if (GistPageIsLeaf(page)) |
309 | { |
310 | OffsetNumber todelete[MaxOffsetNumber]; |
311 | int ntodelete = 0; |
312 | int nremain; |
313 | GISTPageOpaque opaque = GistPageGetOpaque(page); |
314 | OffsetNumber maxoff = PageGetMaxOffsetNumber(page); |
315 | |
316 | /* |
317 | * Check whether we need to recurse back to earlier pages. What we |
318 | * are concerned about is a page split that happened since we started |
319 | * the vacuum scan. If the split moved some tuples to a lower page |
320 | * then we might have missed 'em. If so, set up for tail recursion. |
321 | * |
322 | * This is similar to the checks we do during searches, when following |
323 | * a downlink, but we don't need to jump to higher-numbered pages, |
324 | * because we will process them later, anyway. |
325 | */ |
326 | if ((GistFollowRight(page) || |
327 | vstate->startNSN < GistPageGetNSN(page)) && |
328 | (opaque->rightlink != InvalidBlockNumber) && |
329 | (opaque->rightlink < orig_blkno)) |
330 | { |
331 | recurse_to = opaque->rightlink; |
332 | } |
333 | |
334 | /* |
335 | * Scan over all items to see which ones need to be deleted according |
336 | * to the callback function. |
337 | */ |
338 | if (callback) |
339 | { |
340 | OffsetNumber off; |
341 | |
342 | for (off = FirstOffsetNumber; |
343 | off <= maxoff; |
344 | off = OffsetNumberNext(off)) |
345 | { |
346 | ItemId iid = PageGetItemId(page, off); |
347 | IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
348 | |
349 | if (callback(&(idxtuple->t_tid), callback_state)) |
350 | todelete[ntodelete++] = off; |
351 | } |
352 | } |
353 | |
354 | /* |
355 | * Apply any needed deletes. We issue just one WAL record per page, |
356 | * so as to minimize WAL traffic. |
357 | */ |
358 | if (ntodelete > 0) |
359 | { |
360 | START_CRIT_SECTION(); |
361 | |
362 | MarkBufferDirty(buffer); |
363 | |
364 | PageIndexMultiDelete(page, todelete, ntodelete); |
365 | GistMarkTuplesDeleted(page); |
366 | |
367 | if (RelationNeedsWAL(rel)) |
368 | { |
369 | XLogRecPtr recptr; |
370 | |
371 | recptr = gistXLogUpdate(buffer, |
372 | todelete, ntodelete, |
373 | NULL, 0, InvalidBuffer); |
374 | PageSetLSN(page, recptr); |
375 | } |
376 | else |
377 | PageSetLSN(page, gistGetFakeLSN(rel)); |
378 | |
379 | END_CRIT_SECTION(); |
380 | |
381 | stats->stats.tuples_removed += ntodelete; |
382 | /* must recompute maxoff */ |
383 | maxoff = PageGetMaxOffsetNumber(page); |
384 | } |
385 | |
		nremain = maxoff - FirstOffsetNumber + 1;	/* tuples left on the page */
387 | if (nremain == 0) |
388 | { |
389 | /* |
390 | * The page is now completely empty. Remember its block number, |
391 | * so that we will try to delete the page in the second stage. |
392 | * |
393 | * Skip this when recursing, because IntegerSet requires that the |
394 | * values are added in ascending order. The next VACUUM will pick |
395 | * it up. |
396 | */ |
397 | if (blkno == orig_blkno) |
398 | intset_add_member(stats->empty_leaf_set, blkno); |
399 | } |
400 | else |
401 | stats->stats.num_index_tuples += nremain; |
402 | } |
403 | else |
404 | { |
405 | /* |
406 | * On an internal page, check for "invalid tuples", left behind by an |
407 | * incomplete page split on PostgreSQL 9.0 or below. These are not |
408 | * created by newer PostgreSQL versions, but unfortunately, there is |
409 | * no version number anywhere in a GiST index, so we don't know |
410 | * whether this index might still contain invalid tuples or not. |
411 | */ |
412 | OffsetNumber maxoff = PageGetMaxOffsetNumber(page); |
413 | OffsetNumber off; |
414 | |
415 | for (off = FirstOffsetNumber; |
416 | off <= maxoff; |
417 | off = OffsetNumberNext(off)) |
418 | { |
419 | ItemId iid = PageGetItemId(page, off); |
420 | IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
421 | |
422 | if (GistTupleIsInvalid(idxtuple)) |
423 | ereport(LOG, |
					(errmsg("index \"%s\" contains an inner tuple marked as invalid",
							RelationGetRelationName(rel)),
					 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
					 errhint("Please REINDEX it.")));
428 | } |
429 | |
430 | /* |
431 | * Remember the block number of this page, so that we can revisit it |
432 | * later in gistvacuum_delete_empty_pages(), when we search for |
433 | * parents of empty leaf pages. |
434 | */ |
435 | if (blkno == orig_blkno) |
436 | intset_add_member(stats->internal_page_set, blkno); |
437 | } |
438 | |
439 | UnlockReleaseBuffer(buffer); |
440 | |
441 | /* |
442 | * This is really tail recursion, but if the compiler is too stupid to |
443 | * optimize it as such, we'd eat an uncomfortably large amount of stack |
 * space per recursion level (due to the todelete[] array).  A failure is
445 | * improbable since the number of levels isn't likely to be large ... but |
446 | * just in case, let's hand-optimize into a loop. |
447 | */ |
448 | if (recurse_to != InvalidBlockNumber) |
449 | { |
450 | blkno = recurse_to; |
451 | goto restart; |
452 | } |
453 | } |
454 | |
455 | /* |
456 | * Scan all internal pages, and try to delete their empty child pages. |
457 | */ |
458 | static void |
459 | gistvacuum_delete_empty_pages(IndexVacuumInfo *info, GistBulkDeleteResult *stats) |
460 | { |
461 | Relation rel = info->index; |
462 | BlockNumber empty_pages_remaining; |
463 | uint64 blkno; |
464 | |
465 | /* |
466 | * Rescan all inner pages to find those that have empty child pages. |
467 | */ |
468 | empty_pages_remaining = intset_num_entries(stats->empty_leaf_set); |
469 | intset_begin_iterate(stats->internal_page_set); |
470 | while (empty_pages_remaining > 0 && |
471 | intset_iterate_next(stats->internal_page_set, &blkno)) |
472 | { |
473 | Buffer buffer; |
474 | Page page; |
475 | OffsetNumber off, |
476 | maxoff; |
477 | OffsetNumber todelete[MaxOffsetNumber]; |
478 | BlockNumber leafs_to_delete[MaxOffsetNumber]; |
479 | int ntodelete; |
480 | int deleted; |
481 | |
482 | buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber) blkno, |
483 | RBM_NORMAL, info->strategy); |
484 | |
485 | LockBuffer(buffer, GIST_SHARE); |
486 | page = (Page) BufferGetPage(buffer); |
487 | |
488 | if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page)) |
489 | { |
490 | /* |
491 | * This page was an internal page earlier, but now it's something |
492 | * else. Shouldn't happen... |
493 | */ |
494 | Assert(false); |
495 | UnlockReleaseBuffer(buffer); |
496 | continue; |
497 | } |
498 | |
499 | /* |
500 | * Scan all the downlinks, and see if any of them point to empty leaf |
501 | * pages. |
502 | */ |
503 | maxoff = PageGetMaxOffsetNumber(page); |
504 | ntodelete = 0; |
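		/* the loop condition below ensures at least one downlink is left alone */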
505 | for (off = FirstOffsetNumber; |
506 | off <= maxoff && ntodelete < maxoff - 1; |
507 | off = OffsetNumberNext(off)) |
508 | { |
509 | ItemId iid = PageGetItemId(page, off); |
510 | IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
511 | BlockNumber leafblk; |
512 | |
513 | leafblk = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); |
514 | if (intset_is_member(stats->empty_leaf_set, leafblk)) |
515 | { |
516 | leafs_to_delete[ntodelete] = leafblk; |
517 | todelete[ntodelete++] = off; |
518 | } |
519 | } |
520 | |
521 | /* |
		 * In order to avoid deadlock, the child page must be locked before
		 * the parent, so we must release the lock on the parent, lock the
		 * child, and then re-acquire the lock on the parent.  (And we
		 * wouldn't want to do I/O while holding a lock, anyway.)
526 | * |
527 | * At the instant that we're not holding a lock on the parent, the |
528 | * downlink might get moved by a concurrent insert, so we must |
529 | * re-check that it still points to the same child page after we have |
530 | * acquired both locks. Also, another backend might have inserted a |
531 | * tuple to the page, so that it is no longer empty. gistdeletepage() |
532 | * re-checks all these conditions. |
533 | */ |
534 | LockBuffer(buffer, GIST_UNLOCK); |
535 | |
536 | deleted = 0; |
537 | for (int i = 0; i < ntodelete; i++) |
538 | { |
539 | Buffer leafbuf; |
540 | |
541 | /* |
542 | * Don't remove the last downlink from the parent. That would |
543 | * confuse the insertion code. |
544 | */ |
545 | if (PageGetMaxOffsetNumber(page) == FirstOffsetNumber) |
546 | break; |
547 | |
548 | leafbuf = ReadBufferExtended(rel, MAIN_FORKNUM, leafs_to_delete[i], |
549 | RBM_NORMAL, info->strategy); |
550 | LockBuffer(leafbuf, GIST_EXCLUSIVE); |
551 | gistcheckpage(rel, leafbuf); |
552 | |
			LockBuffer(buffer, GIST_EXCLUSIVE);

			/*
			 * Downlinks already removed from this parent page shift the
			 * remaining offsets down by one each, hence the "- deleted"
			 * adjustment to the offset we remembered.
			 */
			if (gistdeletepage(info, stats,
							   buffer, todelete[i] - deleted,
							   leafbuf))
557 | deleted++; |
558 | LockBuffer(buffer, GIST_UNLOCK); |
559 | |
560 | UnlockReleaseBuffer(leafbuf); |
561 | } |
562 | |
563 | ReleaseBuffer(buffer); |
564 | |
565 | /* update stats */ |
566 | stats->stats.pages_removed += deleted; |
567 | |
568 | /* |
		 * We can stop the scan as soon as we have seen downlinks to all the
		 * empty leaf pages, even if we were not able to remove them all.
571 | */ |
572 | empty_pages_remaining -= ntodelete; |
573 | } |
574 | } |
575 | |
576 | /* |
577 | * gistdeletepage takes a leaf page, and its parent, and tries to delete the |
578 | * leaf. Both pages must be locked. |
579 | * |
580 | * Even if the page was empty when we first saw it, a concurrent inserter might |
581 | * have added a tuple to it since. Similarly, the downlink might have moved. |
582 | * We re-check all the conditions, to make sure the page is still deletable, |
583 | * before modifying anything. |
584 | * |
 * Returns true if the page was deleted, and false if a concurrent update
586 | * prevented it. |
587 | */ |
588 | static bool |
589 | gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats, |
590 | Buffer parentBuffer, OffsetNumber downlink, |
591 | Buffer leafBuffer) |
592 | { |
593 | Page parentPage = BufferGetPage(parentBuffer); |
594 | Page leafPage = BufferGetPage(leafBuffer); |
595 | ItemId iid; |
596 | IndexTuple idxtuple; |
597 | XLogRecPtr recptr; |
598 | FullTransactionId txid; |
599 | |
600 | /* |
601 | * Check that the leaf is still empty and deletable. |
602 | */ |
603 | if (!GistPageIsLeaf(leafPage)) |
604 | { |
605 | /* a leaf page should never become a non-leaf page */ |
606 | Assert(false); |
607 | return false; |
608 | } |
609 | |
610 | if (GistFollowRight(leafPage)) |
611 | return false; /* don't mess with a concurrent page split */ |
612 | |
613 | if (PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber) |
614 | return false; /* not empty anymore */ |
615 | |
616 | /* |
617 | * Ok, the leaf is deletable. Is the downlink in the parent page still |
618 | * valid? It might have been moved by a concurrent insert. We could try |
619 | * to re-find it by scanning the page again, possibly moving right if the |
 * page was split.  But for now, let's keep it simple and just give up.  The
621 | * next VACUUM will pick it up. |
622 | */ |
623 | if (PageIsNew(parentPage) || GistPageIsDeleted(parentPage) || |
624 | GistPageIsLeaf(parentPage)) |
625 | { |
626 | /* shouldn't happen, internal pages are never deleted */ |
627 | Assert(false); |
628 | return false; |
629 | } |
630 | |
	/*
	 * Give up if the remembered downlink offset is now beyond the end of the
	 * parent page, or if removing it would leave the parent with no
	 * downlinks at all.
	 */
	if (PageGetMaxOffsetNumber(parentPage) < downlink
		|| PageGetMaxOffsetNumber(parentPage) <= FirstOffsetNumber)
		return false;
634 | |
635 | iid = PageGetItemId(parentPage, downlink); |
636 | idxtuple = (IndexTuple) PageGetItem(parentPage, iid); |
637 | if (BufferGetBlockNumber(leafBuffer) != |
638 | ItemPointerGetBlockNumber(&(idxtuple->t_tid))) |
639 | return false; |
640 | |
641 | /* |
642 | * All good, proceed with the deletion. |
643 | * |
644 | * The page cannot be immediately recycled, because in-progress scans that |
645 | * saw the downlink might still visit it. Mark the page with the current |
646 | * next-XID counter, so that we know when it can be recycled. Once that |
647 | * XID becomes older than GlobalXmin, we know that all scans that are |
648 | * currently in progress must have ended. (That's much more conservative |
649 | * than needed, but let's keep it safe and simple.) |
650 | */ |
651 | txid = ReadNextFullTransactionId(); |
652 | |
653 | START_CRIT_SECTION(); |
654 | |
655 | /* mark the page as deleted */ |
656 | MarkBufferDirty(leafBuffer); |
657 | GistPageSetDeleted(leafPage, txid); |
658 | stats->stats.pages_deleted++; |
659 | |
660 | /* remove the downlink from the parent */ |
661 | MarkBufferDirty(parentBuffer); |
662 | PageIndexTupleDelete(parentPage, downlink); |
663 | |
664 | if (RelationNeedsWAL(info->index)) |
665 | recptr = gistXLogPageDelete(leafBuffer, txid, parentBuffer, downlink); |
666 | else |
667 | recptr = gistGetFakeLSN(info->index); |
668 | PageSetLSN(parentPage, recptr); |
669 | PageSetLSN(leafPage, recptr); |
670 | |
671 | END_CRIT_SECTION(); |
672 | |
673 | return true; |
674 | } |
675 | |