/*-------------------------------------------------------------------------
 *
 * gistvacuum.c
 *	  vacuuming routines for the postgres GiST index access method.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gist/gistvacuum.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/gist_private.h"
#include "access/transam.h"
#include "commands/vacuum.h"
#include "lib/integerset.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"

/*
 * State kept across vacuum stages.
 */
typedef struct
{
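	/*
	 * This struct is handed back to the caller as a plain
	 * IndexBulkDeleteResult * and cast back to GistBulkDeleteResult * on the
	 * next call, which is why the embedded stats struct must stay the first
	 * field.
	 */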
	IndexBulkDeleteResult stats;	/* must be first */

	/*
	 * These are used to memorize all internal and empty leaf pages in the 1st
	 * vacuum stage.  They are used in the 2nd stage, to delete all the empty
	 * pages.
	 */
	IntegerSet *internal_page_set;
	IntegerSet *empty_leaf_set;
	MemoryContext page_set_context;
} GistBulkDeleteResult;

/* Working state needed by gistbulkdelete */
typedef struct
{
	IndexVacuumInfo *info;
	GistBulkDeleteResult *stats;
	IndexBulkDeleteCallback callback;
	void	   *callback_state;
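	/*
	 * Insert-pointer LSN (or fake LSN, for relations that don't need WAL)
	 * taken when the scan starts.  Pages whose NSN is newer than this were
	 * split after the scan began, which tells gistvacuumpage() that it may
	 * have to revisit a lower-numbered right sibling.
	 */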
	GistNSN		startNSN;
} GistVacState;

static void gistvacuumscan(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
						   IndexBulkDeleteCallback callback, void *callback_state);
static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno,
						   BlockNumber orig_blkno);
static void gistvacuum_delete_empty_pages(IndexVacuumInfo *info,
										  GistBulkDeleteResult *stats);
static bool gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
						   Buffer buffer, OffsetNumber downlink,
						   Buffer leafBuffer);

/* allocate the 'stats' struct that's kept over vacuum stages */
static GistBulkDeleteResult *
create_GistBulkDeleteResult(void)
{
	GistBulkDeleteResult *gist_stats;

	gist_stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
	gist_stats->page_set_context =
		GenerationContextCreate(CurrentMemoryContext,
								"GiST VACUUM page set context",
								16 * 1024);

	return gist_stats;
}

/*
 * VACUUM bulkdelete stage: remove index entries.
 */
IndexBulkDeleteResult *
gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	GistBulkDeleteResult *gist_stats = (GistBulkDeleteResult *) stats;

	/* allocate stats if first time through, else re-use existing struct */
	if (gist_stats == NULL)
		gist_stats = create_GistBulkDeleteResult();

	gistvacuumscan(info, gist_stats, callback, callback_state);

	return (IndexBulkDeleteResult *) gist_stats;
}

/*
 * VACUUM cleanup stage: delete empty pages, and update index statistics.
 */
IndexBulkDeleteResult *
gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	GistBulkDeleteResult *gist_stats = (GistBulkDeleteResult *) stats;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If gistbulkdelete was called, we need not do anything; just return the
	 * stats from the latest gistbulkdelete call.  If it wasn't called, we
	 * still need to do a pass over the index, to obtain index statistics.
	 */
	if (gist_stats == NULL)
	{
		gist_stats = create_GistBulkDeleteResult();
		gistvacuumscan(info, gist_stats, NULL, NULL);
	}

	/*
	 * If we saw any empty pages, try to unlink them from the tree so that
	 * they can be reused.
	 */
	gistvacuum_delete_empty_pages(info, gist_stats);

	/* we don't need the internal and empty page sets anymore */
	MemoryContextDelete(gist_stats->page_set_context);
	gist_stats->page_set_context = NULL;
	gist_stats->internal_page_set = NULL;
	gist_stats->empty_leaf_set = NULL;

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (gist_stats->stats.num_index_tuples > info->num_heap_tuples)
			gist_stats->stats.num_index_tuples = info->num_heap_tuples;
	}

	return (IndexBulkDeleteResult *) gist_stats;
}

/*
 * gistvacuumscan --- scan the index for VACUUMing purposes
 *
 * This scans the index for leaf tuples that are deletable according to the
 * vacuum callback, and updates the stats.  Both gistbulkdelete and
 * gistvacuumcleanup invoke this (the latter only if no gistbulkdelete call
 * occurred).
 *
 * This also makes note of any empty leaf pages, as well as all internal
 * pages.  The second stage, gistvacuum_delete_empty_pages(), needs that
 * information.  Any deleted pages are added directly to the free space map.
 * (They should already have been added there when they were originally
 * deleted, but it's possible that the FSM was lost at a crash, for example.)
 *
 * The caller is responsible for initially allocating/zeroing a stats struct.
 */
static void
gistvacuumscan(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	GistVacState vstate;
	BlockNumber num_pages;
	bool		needLock;
	BlockNumber blkno;
	MemoryContext oldctx;

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command.
	 */
	stats->stats.estimated_count = false;
	stats->stats.num_index_tuples = 0;
	stats->stats.pages_deleted = 0;
	stats->stats.pages_free = 0;
	MemoryContextReset(stats->page_set_context);

	/*
	 * Create the integer sets to remember the internal and empty leaf pages
	 * in page_set_context.  intset_create() allocates in the current memory
	 * context and remembers it for its later allocations, so switch to
	 * page_set_context first; otherwise the sets would not be freed when that
	 * context is reset or deleted above.
	 */
	oldctx = MemoryContextSwitchTo(stats->page_set_context);
	stats->internal_page_set = intset_create();
	stats->empty_leaf_set = intset_create();
	MemoryContextSwitchTo(oldctx);

	/* Set up info to pass down to gistvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
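	/*
	 * Remember the insert LSN as of the start of the scan.  Relations that
	 * don't need WAL have no real LSNs, so use the same fake, monotonically
	 * increasing LSN source that GiST uses for page NSNs on such relations.
	 */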
	if (RelationNeedsWAL(rel))
		vstate.startNSN = GetInsertRecPtr();
	else
		vstate.startNSN = gistGetFakeLSN(rel);

	/*
	 * The outer loop iterates over all index pages, in physical order (we
	 * hope the kernel will cooperate in providing read-ahead for speed).  It
	 * is critical that we visit all leaf pages, including ones added after we
	 * start the scan, else we might fail to delete some deletable tuples.
	 * Hence, we must repeatedly check the relation length.  We must acquire
	 * the relation-extension lock while doing so to avoid a race condition:
	 * if someone else is extending the relation, there is a window where
	 * bufmgr/smgr have created a new all-zero page but it hasn't yet been
	 * write-locked by gistNewBuffer().  If we manage to scan such a page
	 * here, we'll improperly assume it can be recycled.  Taking the lock
	 * synchronizes things enough to prevent a problem: either num_pages won't
	 * include the new page, or gistNewBuffer already has write lock on the
	 * buffer and it will be fully initialized before we can examine it.  (See
	 * also vacuumlazy.c, which has the same issue.)  Also, we need not worry
	 * if a page is added immediately after we look; the page splitting code
	 * already has write-lock on the left page before it adds a right page, so
	 * we must already have processed any tuples due to be moved into such a
	 * page.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	blkno = GIST_ROOT_BLKNO;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
			gistvacuumpage(&vstate, blkno, blkno);
	}

	/*
	 * If we found any recyclable pages (and recorded them in the FSM), then
	 * forcibly update the upper-level FSM pages to ensure that searchers can
	 * find them.  It's possible that the pages were also found during
	 * previous scans and so this is a waste of time, but it's cheap enough
	 * relative to scanning the index that it shouldn't matter much, and
	 * making sure that free pages are available sooner not later seems
	 * worthwhile.
	 *
	 * Note that if no recyclable pages exist, we don't bother vacuuming the
	 * FSM at all.
	 */
	if (stats->stats.pages_free > 0)
		IndexFreeSpaceMapVacuum(rel);

	/* update statistics */
	stats->stats.num_pages = num_pages;
}

/*
 * gistvacuumpage --- VACUUM one page
 *
 * This processes a single page for gistbulkdelete().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer gistvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	GistBulkDeleteResult *stats = vstate->stats;
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	Buffer		buffer;
	Page		page;
	BlockNumber recurse_to;

restart:
	recurse_to = InvalidBlockNumber;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
								info->strategy);

	/*
	 * We are not going to stay here for a long time, so aggressively grab an
	 * exclusive lock.
	 */
	LockBuffer(buffer, GIST_EXCLUSIVE);
	page = (Page) BufferGetPage(buffer);

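	/*
	 * A deleted page can be recycled (returned to the FSM) only once no
	 * in-flight scan could still follow a link to it; gistPageRecyclable()
	 * makes that check using the deletion XID stamped on the page when it was
	 * deleted.
	 */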
	if (gistPageRecyclable(page))
	{
		/* Okay to recycle this page */
		RecordFreeIndexPage(rel, blkno);
		stats->stats.pages_free++;
		stats->stats.pages_deleted++;
	}
	else if (GistPageIsDeleted(page))
	{
		/* Already deleted, but can't recycle yet */
		stats->stats.pages_deleted++;
	}
	else if (GistPageIsLeaf(page))
	{
		OffsetNumber todelete[MaxOffsetNumber];
		int			ntodelete = 0;
		int			nremain;
		GISTPageOpaque opaque = GistPageGetOpaque(page);
		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 *
		 * This is similar to the checks we do during searches, when following
		 * a downlink, but we don't need to jump to higher-numbered pages,
		 * because we will process them later, anyway.
		 */
		if ((GistFollowRight(page) ||
			 vstate->startNSN < GistPageGetNSN(page)) &&
			(opaque->rightlink != InvalidBlockNumber) &&
			(opaque->rightlink < orig_blkno))
		{
			recurse_to = opaque->rightlink;
		}

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
		 */
		if (callback)
		{
			OffsetNumber off;

			for (off = FirstOffsetNumber;
				 off <= maxoff;
				 off = OffsetNumberNext(off))
			{
				ItemId		iid = PageGetItemId(page, off);
				IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);

				if (callback(&(idxtuple->t_tid), callback_state))
					todelete[ntodelete++] = off;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one WAL record per page,
		 * so as to minimize WAL traffic.
		 */
		if (ntodelete > 0)
		{
			START_CRIT_SECTION();

			MarkBufferDirty(buffer);

			PageIndexMultiDelete(page, todelete, ntodelete);
			GistMarkTuplesDeleted(page);

			if (RelationNeedsWAL(rel))
			{
				XLogRecPtr	recptr;

				recptr = gistXLogUpdate(buffer,
										todelete, ntodelete,
										NULL, 0, InvalidBuffer);
				PageSetLSN(page, recptr);
			}
			else
				PageSetLSN(page, gistGetFakeLSN(rel));

			END_CRIT_SECTION();

			stats->stats.tuples_removed += ntodelete;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}

		nremain = maxoff - FirstOffsetNumber + 1;
		if (nremain == 0)
		{
			/*
			 * The page is now completely empty.  Remember its block number,
			 * so that we will try to delete the page in the second stage.
			 *
			 * Skip this when recursing, because IntegerSet requires that the
			 * values are added in ascending order.  The next VACUUM will pick
			 * it up.
			 */
			if (blkno == orig_blkno)
				intset_add_member(stats->empty_leaf_set, blkno);
		}
		else
			stats->stats.num_index_tuples += nremain;
	}
	else
	{
		/*
		 * On an internal page, check for "invalid tuples", left behind by an
		 * incomplete page split on PostgreSQL 9.0 or below.  These are not
		 * created by newer PostgreSQL versions, but unfortunately, there is
		 * no version number anywhere in a GiST index, so we don't know
		 * whether this index might still contain invalid tuples or not.
		 */
		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
		OffsetNumber off;

		for (off = FirstOffsetNumber;
			 off <= maxoff;
			 off = OffsetNumberNext(off))
		{
			ItemId		iid = PageGetItemId(page, off);
			IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);

			if (GistTupleIsInvalid(idxtuple))
				ereport(LOG,
						(errmsg("index \"%s\" contains an inner tuple marked as invalid",
								RelationGetRelationName(rel)),
						 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
						 errhint("Please REINDEX it.")));
		}

		/*
		 * Remember the block number of this page, so that we can revisit it
		 * later in gistvacuum_delete_empty_pages(), when we search for
		 * parents of empty leaf pages.
		 */
		if (blkno == orig_blkno)
			intset_add_member(stats->internal_page_set, blkno);
	}

	UnlockReleaseBuffer(buffer);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the todelete[] array).  A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != InvalidBlockNumber)
	{
		blkno = recurse_to;
		goto restart;
	}
}

/*
 * Scan all internal pages, and try to delete their empty child pages.
 */
static void
gistvacuum_delete_empty_pages(IndexVacuumInfo *info, GistBulkDeleteResult *stats)
{
	Relation	rel = info->index;
	BlockNumber empty_pages_remaining;
	uint64		blkno;

	/*
	 * Rescan all inner pages to find those that have empty child pages.
	 */
	empty_pages_remaining = intset_num_entries(stats->empty_leaf_set);
	intset_begin_iterate(stats->internal_page_set);
	while (empty_pages_remaining > 0 &&
		   intset_iterate_next(stats->internal_page_set, &blkno))
	{
		Buffer		buffer;
		Page		page;
		OffsetNumber off,
					maxoff;
		OffsetNumber todelete[MaxOffsetNumber];
		BlockNumber leafs_to_delete[MaxOffsetNumber];
		int			ntodelete;
		int			deleted;

		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber) blkno,
									RBM_NORMAL, info->strategy);

		LockBuffer(buffer, GIST_SHARE);
		page = (Page) BufferGetPage(buffer);

		if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
		{
			/*
			 * This page was an internal page earlier, but now it's something
			 * else.  Shouldn't happen...
			 */
			Assert(false);
			UnlockReleaseBuffer(buffer);
			continue;
		}

		/*
		 * Scan all the downlinks, and see if any of them point to empty leaf
		 * pages.
		 */
		maxoff = PageGetMaxOffsetNumber(page);
		ntodelete = 0;
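		/*
		 * Note that the loop below collects at most maxoff - 1 downlinks, so
		 * at least one downlink always remains on the parent; the deletion
		 * loop further down likewise refuses to remove the last one.
		 */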
		for (off = FirstOffsetNumber;
			 off <= maxoff && ntodelete < maxoff - 1;
			 off = OffsetNumberNext(off))
		{
			ItemId		iid = PageGetItemId(page, off);
			IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);
			BlockNumber leafblk;

			leafblk = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
			if (intset_is_member(stats->empty_leaf_set, leafblk))
			{
				leafs_to_delete[ntodelete] = leafblk;
				todelete[ntodelete++] = off;
			}
		}

		/*
		 * In order to avoid deadlock, child page must be locked before
		 * parent, so we must release the lock on the parent, lock the child,
		 * and then re-acquire the lock on the parent.  (And we wouldn't want
		 * to do I/O, while holding a lock, anyway.)
		 *
		 * At the instant that we're not holding a lock on the parent, the
		 * downlink might get moved by a concurrent insert, so we must
		 * re-check that it still points to the same child page after we have
		 * acquired both locks.  Also, another backend might have inserted a
		 * tuple to the page, so that it is no longer empty.  gistdeletepage()
		 * re-checks all these conditions.
		 */
		LockBuffer(buffer, GIST_UNLOCK);

		deleted = 0;
		for (int i = 0; i < ntodelete; i++)
		{
			Buffer		leafbuf;

			/*
			 * Don't remove the last downlink from the parent.  That would
			 * confuse the insertion code.
			 */
			if (PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
				break;

			leafbuf = ReadBufferExtended(rel, MAIN_FORKNUM, leafs_to_delete[i],
										 RBM_NORMAL, info->strategy);
			LockBuffer(leafbuf, GIST_EXCLUSIVE);
			gistcheckpage(rel, leafbuf);

			LockBuffer(buffer, GIST_EXCLUSIVE);
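			/*
			 * Each downlink already removed from this parent page shifted
			 * the line pointers that follow it down by one, so adjust the
			 * recorded offset by the number of deletions done so far.
			 */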
			if (gistdeletepage(info, stats,
							   buffer, todelete[i] - deleted,
							   leafbuf))
				deleted++;
			LockBuffer(buffer, GIST_UNLOCK);

			UnlockReleaseBuffer(leafbuf);
		}

		ReleaseBuffer(buffer);

		/* update stats */
		stats->stats.pages_removed += deleted;

		/*
		 * We can stop the scan as soon as we have seen the downlinks, even if
		 * we were not able to remove them all.
		 */
		empty_pages_remaining -= ntodelete;
	}
}

/*
 * gistdeletepage takes a leaf page, and its parent, and tries to delete the
 * leaf.  Both pages must be locked.
 *
 * Even if the page was empty when we first saw it, a concurrent inserter might
 * have added a tuple to it since.  Similarly, the downlink might have moved.
 * We re-check all the conditions, to make sure the page is still deletable,
 * before modifying anything.
 *
 * Returns true if the page was deleted, and false if a concurrent update
 * prevented it.
 */
static bool
gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
			   Buffer parentBuffer, OffsetNumber downlink,
			   Buffer leafBuffer)
{
	Page		parentPage = BufferGetPage(parentBuffer);
	Page		leafPage = BufferGetPage(leafBuffer);
	ItemId		iid;
	IndexTuple	idxtuple;
	XLogRecPtr	recptr;
	FullTransactionId txid;

	/*
	 * Check that the leaf is still empty and deletable.
	 */
	if (!GistPageIsLeaf(leafPage))
	{
		/* a leaf page should never become a non-leaf page */
		Assert(false);
		return false;
	}

	if (GistFollowRight(leafPage))
		return false;			/* don't mess with a concurrent page split */

	if (PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber)
		return false;			/* not empty anymore */

	/*
	 * Ok, the leaf is deletable.  Is the downlink in the parent page still
	 * valid?  It might have been moved by a concurrent insert.  We could try
	 * to re-find it by scanning the page again, possibly moving right if the
	 * page was split.  But for now, let's keep it simple and just give up.
	 * The next VACUUM will pick it up.
	 */
	if (PageIsNew(parentPage) || GistPageIsDeleted(parentPage) ||
		GistPageIsLeaf(parentPage))
	{
		/* shouldn't happen, internal pages are never deleted */
		Assert(false);
		return false;
	}

	if (PageGetMaxOffsetNumber(parentPage) < downlink ||
		PageGetMaxOffsetNumber(parentPage) <= FirstOffsetNumber)
		return false;

	iid = PageGetItemId(parentPage, downlink);
	idxtuple = (IndexTuple) PageGetItem(parentPage, iid);
	if (BufferGetBlockNumber(leafBuffer) !=
		ItemPointerGetBlockNumber(&(idxtuple->t_tid)))
		return false;

	/*
	 * All good, proceed with the deletion.
	 *
	 * The page cannot be immediately recycled, because in-progress scans that
	 * saw the downlink might still visit it.  Mark the page with the current
	 * next-XID counter, so that we know when it can be recycled.  Once that
	 * XID becomes older than GlobalXmin, we know that all scans that are
	 * currently in progress must have ended.  (That's much more conservative
	 * than needed, but let's keep it safe and simple.)
	 */
	txid = ReadNextFullTransactionId();

	START_CRIT_SECTION();

	/* mark the page as deleted */
	MarkBufferDirty(leafBuffer);
	GistPageSetDeleted(leafPage, txid);
	stats->stats.pages_deleted++;

	/* remove the downlink from the parent */
	MarkBufferDirty(parentBuffer);
	PageIndexTupleDelete(parentPage, downlink);

	if (RelationNeedsWAL(info->index))
		recptr = gistXLogPageDelete(leafBuffer, txid, parentBuffer, downlink);
	else
		recptr = gistGetFakeLSN(info->index);
	PageSetLSN(parentPage, recptr);
	PageSetLSN(leafPage, recptr);

	END_CRIT_SECTION();

	return true;
}