/*-------------------------------------------------------------------------
 *
 * spgvacuum.c
 *	  vacuum for SP-GiST
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/spgist/spgvacuum.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/genam.h"
#include "access/spgist_private.h"
#include "access/spgxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "catalog/storage_xlog.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "utils/snapmgr.h"


/* Entry in pending-list of TIDs we need to revisit */
typedef struct spgVacPendingItem
{
    ItemPointerData tid;            /* redirection target to visit */
    bool        done;               /* have we dealt with this? */
    struct spgVacPendingItem *next; /* list link */
} spgVacPendingItem;

/* Local state for vacuum operations */
typedef struct spgBulkDeleteState
{
    /* Parameters passed in to spgvacuumscan */
    IndexVacuumInfo *info;
    IndexBulkDeleteResult *stats;
    IndexBulkDeleteCallback callback;
    void       *callback_state;

    /* Additional working state */
    SpGistState spgstate;           /* for SPGiST operations that need one */
    spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */
    TransactionId myXmin;           /* for detecting newly-added redirects */
    BlockNumber lastFilledBlock;    /* last non-deletable block */
} spgBulkDeleteState;


/*
 * Add TID to pendingList, but only if not already present.
 *
 * Note that new items are always appended at the end of the list; this
 * ensures that scans of the list don't miss items added during the scan.
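 *
 * The list normally stays short, because spgvacuumscan drains it (via
 * spgprocesspending) after each page of the main scan, so the linear
 * duplicate search below is cheap in practice.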
 */
static void
spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid)
{
    spgVacPendingItem *pitem;
    spgVacPendingItem **listLink;

    /* search the list for pre-existing entry */
    listLink = &bds->pendingList;
    while (*listLink != NULL)
    {
        pitem = *listLink;
        if (ItemPointerEquals(tid, &pitem->tid))
            return;             /* already in list, do nothing */
        listLink = &pitem->next;
    }
    /* not there, so append new entry */
    pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem));
    pitem->tid = *tid;
    pitem->done = false;
    pitem->next = NULL;
    *listLink = pitem;
}

/*
 * Clear pendingList
 */
static void
spgClearPendingList(spgBulkDeleteState *bds)
{
    spgVacPendingItem *pitem;
    spgVacPendingItem *nitem;

    for (pitem = bds->pendingList; pitem != NULL; pitem = nitem)
    {
        nitem = pitem->next;
        /* All items in list should have been dealt with */
        Assert(pitem->done);
        pfree(pitem);
    }
    bds->pendingList = NULL;
}

/*
 * Vacuum a regular (non-root) leaf page
 *
 * We must delete tuples that are targeted for deletion by the VACUUM,
 * but not move any tuples that are referenced by outside links; we assume
 * those are the ones that are heads of chains.
 *
 * If we find a REDIRECT that was made by a concurrently-running transaction,
 * we must add its target TID to pendingList.  (We don't try to visit the
 * target immediately, first because we don't want VACUUM locking more than
 * one buffer at a time, and second because the duplicate-filtering logic
 * in spgAddPendingTID is useful to ensure we can't get caught in an infinite
 * loop in the face of continuous concurrent insertions.)
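 *
 * ("Concurrently-running" is judged by comparing the redirect's XID against
 * bds->myXmin, the xmin of the snapshot captured at the start of
 * spgvacuumscan; see the SPGIST_REDIRECT case below.)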
 *
 * If forPending is true, we are examining the page as a consequence of
 * chasing a redirect link, not as part of the normal sequential scan.
 * We still vacuum the page normally, but we don't increment the stats
 * about live tuples; else we'd double-count those tuples, since the page
 * has been or will be visited in the sequential scan as well.
 */
static void
vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
               bool forPending)
{
    Page        page = BufferGetPage(buffer);
    spgxlogVacuumLeaf xlrec;
    OffsetNumber toDead[MaxIndexTuplesPerPage];
    OffsetNumber toPlaceholder[MaxIndexTuplesPerPage];
    OffsetNumber moveSrc[MaxIndexTuplesPerPage];
    OffsetNumber moveDest[MaxIndexTuplesPerPage];
    OffsetNumber chainSrc[MaxIndexTuplesPerPage];
    OffsetNumber chainDest[MaxIndexTuplesPerPage];
    OffsetNumber predecessor[MaxIndexTuplesPerPage + 1];
    bool        deletable[MaxIndexTuplesPerPage + 1];
    int         nDeletable;
    OffsetNumber i,
                max = PageGetMaxOffsetNumber(page);

    memset(predecessor, 0, sizeof(predecessor));
    memset(deletable, 0, sizeof(deletable));
    nDeletable = 0;

    /* Scan page, identify tuples to delete, accumulate stats */
    for (i = FirstOffsetNumber; i <= max; i++)
    {
        SpGistLeafTuple lt;

        lt = (SpGistLeafTuple) PageGetItem(page,
                                           PageGetItemId(page, i));
        if (lt->tupstate == SPGIST_LIVE)
        {
            Assert(ItemPointerIsValid(&lt->heapPtr));

            if (bds->callback(&lt->heapPtr, bds->callback_state))
            {
                bds->stats->tuples_removed += 1;
                deletable[i] = true;
                nDeletable++;
            }
            else
            {
                if (!forPending)
                    bds->stats->num_index_tuples += 1;
            }

            /* Form predecessor map, too */
            if (lt->nextOffset != InvalidOffsetNumber)
            {
                /* paranoia about corrupted chain links */
                if (lt->nextOffset < FirstOffsetNumber ||
                    lt->nextOffset > max ||
                    predecessor[lt->nextOffset] != InvalidOffsetNumber)
                    elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"",
                         BufferGetBlockNumber(buffer),
                         RelationGetRelationName(index));
                predecessor[lt->nextOffset] = i;
            }
        }
        else if (lt->tupstate == SPGIST_REDIRECT)
        {
            SpGistDeadTuple dt = (SpGistDeadTuple) lt;

            Assert(dt->nextOffset == InvalidOffsetNumber);
            Assert(ItemPointerIsValid(&dt->pointer));

            /*
             * Add target TID to pending list if the redirection could have
             * happened since VACUUM started.
             *
             * Note: we could make a tighter test by seeing if the xid is
             * "running" according to the active snapshot; but snapmgr.c
             * doesn't currently export a suitable API, and it's not entirely
             * clear that a tighter test is worth the cycles anyway.
             */
            if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
                spgAddPendingTID(bds, &dt->pointer);
        }
        else
        {
            Assert(lt->nextOffset == InvalidOffsetNumber);
        }
    }

    if (nDeletable == 0)
        return;                 /* nothing more to do */

    /*----------
     * Figure out exactly what we have to do.  We do this separately from
     * actually modifying the page, mainly so that we have a representation
     * that can be dumped into WAL and then the replay code can do exactly
     * the same thing.  The output of this step consists of six arrays
     * describing four kinds of operations, to be performed in this order:
     *
     * toDead[]: tuple numbers to be replaced with DEAD tuples
     * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples
     * moveSrc[]: tuple numbers that need to be relocated to another offset
     * (replacing the tuple there) and then replaced with PLACEHOLDER tuples
     * moveDest[]: new locations for moveSrc tuples
     * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates
     * chainDest[]: new values of nextOffset for chainSrc members
     *
     * It's easiest to figure out what we have to do by processing tuple
     * chains, so we iterate over all the tuples (not just the deletable
     * ones!) to identify chain heads, then chase down each chain and make
     * work item entries for deletable tuples within the chain.
     *----------
     */
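    /*----------
     * For example, if a chain runs 2 -> 5 -> 7 -> 8 and the tuples at
     * offsets 2 and 7 are deletable, the pass below produces
     *    moveSrc = {5}, moveDest = {2}    (first survivor becomes the head)
     *    toPlaceholder = {7}
     *    chainSrc = {2}, chainDest = {8}  (relink across the removed tuple)
     * so that after the updates the live chain is 2 -> 8 and offsets 5 and 7
     * hold placeholder tuples.
     *----------
     */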
    xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0;

    for (i = FirstOffsetNumber; i <= max; i++)
    {
        SpGistLeafTuple head;
        bool        interveningDeletable;
        OffsetNumber prevLive;
        OffsetNumber j;

        head = (SpGistLeafTuple) PageGetItem(page,
                                             PageGetItemId(page, i));
        if (head->tupstate != SPGIST_LIVE)
            continue;           /* can't be a chain member */
        if (predecessor[i] != 0)
            continue;           /* not a chain head */

        /* initialize ... */
        interveningDeletable = false;
        prevLive = deletable[i] ? InvalidOffsetNumber : i;

        /* scan down the chain ... */
        j = head->nextOffset;
        while (j != InvalidOffsetNumber)
        {
            SpGistLeafTuple lt;

            lt = (SpGistLeafTuple) PageGetItem(page,
                                               PageGetItemId(page, j));
            if (lt->tupstate != SPGIST_LIVE)
            {
                /* all tuples in chain should be live */
                elog(ERROR, "unexpected SPGiST tuple state: %d",
                     lt->tupstate);
            }

            if (deletable[j])
            {
                /* This tuple should be replaced by a placeholder */
                toPlaceholder[xlrec.nPlaceholder] = j;
                xlrec.nPlaceholder++;
                /* previous live tuple's chain link will need an update */
                interveningDeletable = true;
            }
            else if (prevLive == InvalidOffsetNumber)
            {
                /*
                 * This is the first live tuple in the chain.  It has to move
                 * to the head position.
                 */
                moveSrc[xlrec.nMove] = j;
                moveDest[xlrec.nMove] = i;
                xlrec.nMove++;
                /* Chain updates will be applied after the move */
                prevLive = i;
                interveningDeletable = false;
            }
            else
            {
                /*
                 * Second or later live tuple.  Arrange to re-chain it to the
                 * previous live one, if there was a gap.
                 */
                if (interveningDeletable)
                {
                    chainSrc[xlrec.nChain] = prevLive;
                    chainDest[xlrec.nChain] = j;
                    xlrec.nChain++;
                }
                prevLive = j;
                interveningDeletable = false;
            }

            j = lt->nextOffset;
        }

        if (prevLive == InvalidOffsetNumber)
        {
            /* The chain is entirely removable, so we need a DEAD tuple */
            toDead[xlrec.nDead] = i;
            xlrec.nDead++;
        }
        else if (interveningDeletable)
        {
            /* One or more deletions at end of chain, so close it off */
            chainSrc[xlrec.nChain] = prevLive;
            chainDest[xlrec.nChain] = InvalidOffsetNumber;
            xlrec.nChain++;
        }
    }

    /* sanity check ... */
    if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove)
        elog(ERROR, "inconsistent counts of deletable tuples");

    /* Do the updates */
    START_CRIT_SECTION();

    spgPageIndexMultiDelete(&bds->spgstate, page,
                            toDead, xlrec.nDead,
                            SPGIST_DEAD, SPGIST_DEAD,
                            InvalidBlockNumber, InvalidOffsetNumber);

    spgPageIndexMultiDelete(&bds->spgstate, page,
                            toPlaceholder, xlrec.nPlaceholder,
                            SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
                            InvalidBlockNumber, InvalidOffsetNumber);

    /*
     * We implement the move step by swapping the line pointers of the source
     * and target tuples, then replacing the newly-source tuples with
     * placeholders.  This is perhaps unduly friendly with the page data
     * representation, but it's fast and doesn't risk page overflow when a
     * tuple to be relocated is large.
     */
    for (i = 0; i < xlrec.nMove; i++)
    {
        ItemId      idSrc = PageGetItemId(page, moveSrc[i]);
        ItemId      idDest = PageGetItemId(page, moveDest[i]);
        ItemIdData  tmp;

        tmp = *idSrc;
        *idSrc = *idDest;
        *idDest = tmp;
    }

    spgPageIndexMultiDelete(&bds->spgstate, page,
                            moveSrc, xlrec.nMove,
                            SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
                            InvalidBlockNumber, InvalidOffsetNumber);

    for (i = 0; i < xlrec.nChain; i++)
    {
        SpGistLeafTuple lt;

        lt = (SpGistLeafTuple) PageGetItem(page,
                                           PageGetItemId(page, chainSrc[i]));
        Assert(lt->tupstate == SPGIST_LIVE);
        lt->nextOffset = chainDest[i];
    }

    MarkBufferDirty(buffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();

        STORE_STATE(&bds->spgstate, xlrec.stateSrc);

        XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumLeaf);
        /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
        XLogRegisterData((char *) toDead, sizeof(OffsetNumber) * xlrec.nDead);
        XLogRegisterData((char *) toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder);
        XLogRegisterData((char *) moveSrc, sizeof(OffsetNumber) * xlrec.nMove);
        XLogRegisterData((char *) moveDest, sizeof(OffsetNumber) * xlrec.nMove);
        XLogRegisterData((char *) chainSrc, sizeof(OffsetNumber) * xlrec.nChain);
        XLogRegisterData((char *) chainDest, sizeof(OffsetNumber) * xlrec.nChain);

        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

        recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Vacuum a root page when it is also a leaf
 *
 * On the root, we just delete any dead leaf tuples; no fancy business
 */
static void
vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer)
{
    Page        page = BufferGetPage(buffer);
    spgxlogVacuumRoot xlrec;
    OffsetNumber toDelete[MaxIndexTuplesPerPage];
    OffsetNumber i,
                max = PageGetMaxOffsetNumber(page);

    xlrec.nDelete = 0;

    /* Scan page, identify tuples to delete, accumulate stats */
    for (i = FirstOffsetNumber; i <= max; i++)
    {
        SpGistLeafTuple lt;

        lt = (SpGistLeafTuple) PageGetItem(page,
                                           PageGetItemId(page, i));
        if (lt->tupstate == SPGIST_LIVE)
        {
            Assert(ItemPointerIsValid(&lt->heapPtr));

            if (bds->callback(&lt->heapPtr, bds->callback_state))
            {
                bds->stats->tuples_removed += 1;
                toDelete[xlrec.nDelete] = i;
                xlrec.nDelete++;
            }
            else
            {
                bds->stats->num_index_tuples += 1;
            }
        }
        else
        {
            /* all tuples on root should be live */
            elog(ERROR, "unexpected SPGiST tuple state: %d",
                 lt->tupstate);
        }
    }

    if (xlrec.nDelete == 0)
        return;                 /* nothing more to do */

    /* Do the update */
    START_CRIT_SECTION();

    /* The tuple numbers are in order, so we can use PageIndexMultiDelete */
    PageIndexMultiDelete(page, toDelete, xlrec.nDelete);

    MarkBufferDirty(buffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();

        /* Prepare WAL record */
        STORE_STATE(&bds->spgstate, xlrec.stateSrc);

        XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRoot);
        /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
        XLogRegisterData((char *) toDelete,
                         sizeof(OffsetNumber) * xlrec.nDelete);

        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

        recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Clean up redirect and placeholder tuples on the given page
 *
 * Redirect tuples can be marked placeholder once they're old enough.
 * Placeholder tuples can be removed if it won't change the offsets of
 * non-placeholder ones.
 *
 * Unlike the routines above, this works on both leaf and inner pages.
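 *
 * ("Old enough" is tested against RecentGlobalXmin below: once the XID that
 * created a redirect precedes the xmin of every running transaction, no
 * concurrent index scan can still need to follow that link.)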
 */
static void
vacuumRedirectAndPlaceholder(Relation index, Buffer buffer)
{
    Page        page = BufferGetPage(buffer);
    SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
    OffsetNumber i,
                max = PageGetMaxOffsetNumber(page),
                firstPlaceholder = InvalidOffsetNumber;
    bool        hasNonPlaceholder = false;
    bool        hasUpdate = false;
    OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage];
    OffsetNumber itemnos[MaxIndexTuplesPerPage];
    spgxlogVacuumRedirect xlrec;

    xlrec.nToPlaceholder = 0;
    xlrec.newestRedirectXid = InvalidTransactionId;

    START_CRIT_SECTION();

    /*
     * Scan backwards to convert old redirection tuples to placeholder tuples,
     * and identify location of last non-placeholder tuple while at it.
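     * Scanning backwards lets us find the start of the trailing run of
     * placeholders (the only part of the page we can physically shorten, see
     * below) and lets us stop early once every redirect has been seen and a
     * non-placeholder tuple has been found.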
     */
    for (i = max;
         i >= FirstOffsetNumber &&
         (opaque->nRedirection > 0 || !hasNonPlaceholder);
         i--)
    {
        SpGistDeadTuple dt;

        dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i));

        if (dt->tupstate == SPGIST_REDIRECT &&
            TransactionIdPrecedes(dt->xid, RecentGlobalXmin))
        {
            dt->tupstate = SPGIST_PLACEHOLDER;
            Assert(opaque->nRedirection > 0);
            opaque->nRedirection--;
            opaque->nPlaceholder++;

            /* remember newest XID among the removed redirects */
            if (!TransactionIdIsValid(xlrec.newestRedirectXid) ||
                TransactionIdPrecedes(xlrec.newestRedirectXid, dt->xid))
                xlrec.newestRedirectXid = dt->xid;

            ItemPointerSetInvalid(&dt->pointer);

            itemToPlaceholder[xlrec.nToPlaceholder] = i;
            xlrec.nToPlaceholder++;

            hasUpdate = true;
        }

        if (dt->tupstate == SPGIST_PLACEHOLDER)
        {
            if (!hasNonPlaceholder)
                firstPlaceholder = i;
        }
        else
        {
            hasNonPlaceholder = true;
        }
    }

    /*
     * Any placeholder tuples at the end of page can safely be removed.  We
     * can't remove ones before the last non-placeholder, though, because we
     * can't alter the offset numbers of non-placeholder tuples.
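     *
     * For example, if max is 10 and offsets 7..10 all hold placeholders while
     * offset 6 does not, firstPlaceholder is 7 and we delete exactly items
     * 7..10, reducing nPlaceholder by 4.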
     */
    if (firstPlaceholder != InvalidOffsetNumber)
    {
        /*
         * We do not store this array to rdata because it's easy to recreate.
         */
        for (i = firstPlaceholder; i <= max; i++)
            itemnos[i - firstPlaceholder] = i;

        i = max - firstPlaceholder + 1;
        Assert(opaque->nPlaceholder >= i);
        opaque->nPlaceholder -= i;

        /* The array is surely sorted, so can use PageIndexMultiDelete */
        PageIndexMultiDelete(page, itemnos, i);

        hasUpdate = true;
    }

    xlrec.firstPlaceholder = firstPlaceholder;

    if (hasUpdate)
        MarkBufferDirty(buffer);

    if (hasUpdate && RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();

        XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRedirect);
        XLogRegisterData((char *) itemToPlaceholder,
                         sizeof(OffsetNumber) * xlrec.nToPlaceholder);

        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

        recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Process one page during a bulkdelete scan
 */
static void
spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
{
    Relation    index = bds->info->index;
    Buffer      buffer;
    Page        page;

    /* call vacuum_delay_point while not holding any buffer lock */
    vacuum_delay_point();

    buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
                                RBM_NORMAL, bds->info->strategy);
    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    page = (Page) BufferGetPage(buffer);

    if (PageIsNew(page))
    {
        /*
         * We found an all-zero page, which could happen if the database
         * crashed just after extending the file.  Recycle it.
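         * (The actual recycling happens below, where the page is reported to
         * the free space map like any other empty non-root page.)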
         */
    }
    else if (PageIsEmpty(page))
    {
        /* nothing to do */
    }
    else if (SpGistPageIsLeaf(page))
    {
        if (SpGistBlockIsRoot(blkno))
        {
            vacuumLeafRoot(bds, index, buffer);
            /* no need for vacuumRedirectAndPlaceholder */
        }
        else
        {
            vacuumLeafPage(bds, index, buffer, false);
            vacuumRedirectAndPlaceholder(index, buffer);
        }
    }
    else
    {
        /* inner page */
        vacuumRedirectAndPlaceholder(index, buffer);
    }

    /*
     * The root pages must never be deleted, nor marked as available in FSM,
     * because we don't want them ever returned by a search for a place to put
     * a new tuple.  Otherwise, check for empty page, and make sure the FSM
     * knows about it.
     */
    if (!SpGistBlockIsRoot(blkno))
    {
        if (PageIsNew(page) || PageIsEmpty(page))
        {
            RecordFreeIndexPage(index, blkno);
            bds->stats->pages_deleted++;
        }
        else
        {
            SpGistSetLastUsedPage(index, buffer);
            bds->lastFilledBlock = blkno;
        }
    }

    UnlockReleaseBuffer(buffer);
}

/*
 * Process the pending-TID list between pages of the main scan
 */
static void
spgprocesspending(spgBulkDeleteState *bds)
{
    Relation    index = bds->info->index;
    spgVacPendingItem *pitem;
    spgVacPendingItem *nitem;
    BlockNumber blkno;
    Buffer      buffer;
    Page        page;

    for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next)
    {
        if (pitem->done)
            continue;           /* ignore already-done items */

        /* call vacuum_delay_point while not holding any buffer lock */
        vacuum_delay_point();

        /* examine the referenced page */
        blkno = ItemPointerGetBlockNumber(&pitem->tid);
        buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
                                    RBM_NORMAL, bds->info->strategy);
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        page = (Page) BufferGetPage(buffer);

        if (PageIsNew(page) || SpGistPageIsDeleted(page))
        {
            /* Probably shouldn't happen, but ignore it */
        }
        else if (SpGistPageIsLeaf(page))
        {
            if (SpGistBlockIsRoot(blkno))
            {
                /* this should definitely not happen */
                elog(ERROR, "redirection leads to root page of index \"%s\"",
                     RelationGetRelationName(index));
            }

            /* deal with any deletable tuples */
            vacuumLeafPage(bds, index, buffer, true);
            /* might as well do this while we are here */
            vacuumRedirectAndPlaceholder(index, buffer);

            SpGistSetLastUsedPage(index, buffer);

            /*
             * We can mark as done not only this item, but any later ones
             * pointing at the same page, since we vacuumed the whole page.
             */
            pitem->done = true;
            for (nitem = pitem->next; nitem != NULL; nitem = nitem->next)
            {
                if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
                    nitem->done = true;
            }
        }
        else
        {
            /*
             * On an inner page, visit the referenced inner tuple and add all
             * its downlinks to the pending list.  We might have pending items
             * for more than one inner tuple on the same page (in fact this is
             * pretty likely given the way space allocation works), so get
             * them all while we are here.
             */
            for (nitem = pitem; nitem != NULL; nitem = nitem->next)
            {
                if (nitem->done)
                    continue;
                if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
                {
                    OffsetNumber offset;
                    SpGistInnerTuple innerTuple;

                    offset = ItemPointerGetOffsetNumber(&nitem->tid);
                    innerTuple = (SpGistInnerTuple) PageGetItem(page,
                                                                PageGetItemId(page, offset));
                    if (innerTuple->tupstate == SPGIST_LIVE)
                    {
                        SpGistNodeTuple node;
                        int         i;

                        SGITITERATE(innerTuple, i, node)
                        {
                            if (ItemPointerIsValid(&node->t_tid))
                                spgAddPendingTID(bds, &node->t_tid);
                        }
                    }
                    else if (innerTuple->tupstate == SPGIST_REDIRECT)
                    {
                        /* transfer attention to redirect point */
                        spgAddPendingTID(bds,
                                         &((SpGistDeadTuple) innerTuple)->pointer);
                    }
                    else
                        elog(ERROR, "unexpected SPGiST tuple state: %d",
                             innerTuple->tupstate);

                    nitem->done = true;
                }
            }
        }

        UnlockReleaseBuffer(buffer);
    }

    spgClearPendingList(bds);
}

/*
 * Perform a bulkdelete scan
 */
static void
spgvacuumscan(spgBulkDeleteState *bds)
{
    Relation    index = bds->info->index;
    bool        needLock;
    BlockNumber num_pages,
                blkno;

    /* Finish setting up spgBulkDeleteState */
    initSpGistState(&bds->spgstate, index);
    bds->pendingList = NULL;
    bds->myXmin = GetActiveSnapshot()->xmin;
    bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;
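    /*
     * Starting lastFilledBlock at the last fixed block means that the
     * truncation logic below, even if it were enabled, could never consider
     * removing the metapage or root pages.
     */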

    /*
     * Reset counts that will be incremented during the scan; needed in case
     * of multiple scans during a single VACUUM command
     */
    bds->stats->estimated_count = false;
    bds->stats->num_index_tuples = 0;
    bds->stats->pages_deleted = 0;

    /* We can skip locking for new or temp relations */
    needLock = !RELATION_IS_LOCAL(index);

    /*
     * The outer loop iterates over all index pages except the metapage, in
     * physical order (we hope the kernel will cooperate in providing
     * read-ahead for speed).  It is critical that we visit all leaf pages,
     * including ones added after we start the scan, else we might fail to
     * delete some deletable tuples.  See more extensive comments about this
     * in btvacuumscan().
     */
    blkno = SPGIST_METAPAGE_BLKNO + 1;
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(index, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(index);
        if (needLock)
            UnlockRelationForExtension(index, ExclusiveLock);

        /* Quit if we've scanned the whole relation */
        if (blkno >= num_pages)
            break;
        /* Iterate over pages, then loop back to recheck length */
        for (; blkno < num_pages; blkno++)
        {
            spgvacuumpage(bds, blkno);
            /* empty the pending-list after each page */
            if (bds->pendingList != NULL)
                spgprocesspending(bds);
        }
    }

    /* Propagate local lastUsedPage cache to metablock */
    SpGistUpdateMetaPage(index);

    /*
     * If we found any empty pages (and recorded them in the FSM), then
     * forcibly update the upper-level FSM pages to ensure that searchers can
     * find them.  It's possible that the pages were also found during
     * previous scans and so this is a waste of time, but it's cheap enough
     * relative to scanning the index that it shouldn't matter much, and
     * making sure that free pages are available sooner not later seems
     * worthwhile.
     *
     * Note that if no empty pages exist, we don't bother vacuuming the FSM at
     * all.
     */
    if (bds->stats->pages_deleted > 0)
        IndexFreeSpaceMapVacuum(index);

    /*
     * Truncate index if possible
     *
     * XXX disabled because it's unsafe due to possible concurrent inserts.
     * We'd have to rescan the pages to make sure they're still empty, and it
     * doesn't seem worth it.  Note that btree doesn't do this either.
     *
     * Another reason not to truncate is that it could invalidate the cached
     * pages-with-freespace pointers in the metapage and other backends'
     * relation caches, that is leave them pointing to nonexistent pages.
     * Adding RelationGetNumberOfBlocks calls to protect the places that use
     * those pointers would be unduly expensive.
     */
#ifdef NOT_USED
    if (num_pages > bds->lastFilledBlock + 1)
    {
        BlockNumber lastBlock = num_pages - 1;

        num_pages = bds->lastFilledBlock + 1;
        RelationTruncate(index, num_pages);
        bds->stats->pages_removed += lastBlock - bds->lastFilledBlock;
        bds->stats->pages_deleted -= lastBlock - bds->lastFilledBlock;
    }
#endif

    /* Report final stats */
    bds->stats->num_pages = num_pages;
    bds->stats->pages_free = bds->stats->pages_deleted;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
              IndexBulkDeleteCallback callback, void *callback_state)
{
    spgBulkDeleteState bds;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    bds.info = info;
    bds.stats = stats;
    bds.callback = callback;
    bds.callback_state = callback_state;

    spgvacuumscan(&bds);

    return stats;
}

/* Dummy callback to delete no tuples during spgvacuumcleanup */
static bool
dummy_callback(ItemPointer itemptr, void *state)
{
    return false;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    spgBulkDeleteState bds;

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        return stats;

    /*
     * We don't need to scan the index if there was a preceding bulkdelete
     * pass.  Otherwise, make a pass that won't delete any live tuples, but
     * might still accomplish useful stuff with redirect/placeholder cleanup
     * and/or FSM housekeeping, and in any case will provide stats.
     */
    if (stats == NULL)
    {
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        bds.info = info;
        bds.stats = stats;
        bds.callback = dummy_callback;
        bds.callback_state = NULL;

        spgvacuumscan(&bds);
    }

    /*
     * It's quite possible for us to be fooled by concurrent tuple moves into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
    if (!info->estimated_count)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    return stats;
}