/*-------------------------------------------------------------------------
 *
 * pruneheap.c
 *	  heap page pruning and HOT-chain management code
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/pruneheap.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/* Working data for heap_page_prune and subroutines */
typedef struct
{
	TransactionId new_prune_xid;	/* new prune hint value for page */
	TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
	int			nredirected;	/* numbers of entries in arrays below */
	int			ndead;
	int			nunused;
	/* arrays that accumulate indexes of items to be changed */
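	/* (redirected[] stores its entries as from/to offset pairs, hence "* 2") */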
	OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
	OffsetNumber nowdead[MaxHeapTuplesPerPage];
	OffsetNumber nowunused[MaxHeapTuplesPerPage];
	/* marked[i] is true if item i is entered in one of the above arrays */
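	/* (entry 0 is unused, since offset numbers are 1-based; hence the "+ 1") */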
	bool		marked[MaxHeapTuplesPerPage + 1];
} PruneState;

/* Local functions */
static int	heap_prune_chain(Relation relation, Buffer buffer,
							 OffsetNumber rootoffnum,
							 TransactionId OldestXmin,
							 PruneState *prstate);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
static void heap_prune_record_redirect(PruneState *prstate,
									   OffsetNumber offnum, OffsetNumber rdoffnum);
static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);


/*
 * Optionally prune and repair fragmentation in the specified page.
 *
 * This is an opportunistic function.  It will perform housekeeping
 * only if the page heuristically looks like a candidate for pruning and we
 * can acquire buffer cleanup lock without blocking.
 *
 * Note: this is called quite often.  It's important that it fall out quickly
 * if there's not any use in pruning.
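 * (For example, heapam's sequential-scan and index-fetch paths call this
 * opportunistically for each heap page they are about to read.)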
 *
 * Caller must have pin on the buffer, and must *not* have a lock on it.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer)
{
	Page		page = BufferGetPage(buffer);
	Size		minfree;
	TransactionId OldestXmin;

	/*
	 * We can't write WAL in recovery mode, so there's no point trying to
	 * clean the page.  The master will likely issue a cleaning WAL record
	 * soon anyway, so this is no particular loss.
	 */
	if (RecoveryInProgress())
		return;

	/*
	 * Use the appropriate xmin horizon for this relation.  If it's a proper
	 * catalog relation or a user defined, additional, catalog relation, we
	 * need to use the horizon that includes slots, otherwise the data-only
	 * horizon can be used.  Note that the toast relations of user defined
	 * relations are *not* considered catalog relations.
	 *
	 * It is OK to apply the old snapshot limit before acquiring the cleanup
	 * lock because the worst that can happen is that we are not quite as
	 * aggressive about the cleanup (by however many transaction IDs are
	 * consumed between this point and acquiring the lock).  This allows us to
	 * save significant overhead in the case where the page is found not to be
	 * prunable.
	 */
	if (IsCatalogRelation(relation) ||
		RelationIsAccessibleInLogicalDecoding(relation))
		OldestXmin = RecentGlobalXmin;
	else
		OldestXmin =
			TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin,
												relation);

	Assert(TransactionIdIsValid(OldestXmin));

	/*
	 * Let's see if we really need pruning.
	 *
	 * Forget it if page is not hinted to contain something prunable that's
	 * older than OldestXmin.
	 */
	if (!PageIsPrunable(page, OldestXmin))
		return;

	/*
	 * We prune when a previous UPDATE failed to find enough space on the page
	 * for a new tuple version, or when free space falls below the relation's
	 * fill-factor target (but not less than 10%).
	 *
	 * Checking free space here is questionable since we aren't holding any
	 * lock on the buffer; in the worst case we could get a bogus answer.  It's
	 * unlikely to be *seriously* wrong, though, since reading either pd_lower
	 * or pd_upper is probably atomic.  Avoiding taking a lock seems more
	 * important than sometimes getting a wrong answer in what is after all
	 * just a heuristic estimate.
	 */
	minfree = RelationGetTargetPageFreeSpace(relation,
											 HEAP_DEFAULT_FILLFACTOR);
	minfree = Max(minfree, BLCKSZ / 10);
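	/*
	 * For illustration: with the default 8 kB block size, a heap using the
	 * default fillfactor (100) has a zero fill-factor target, so minfree is
	 * clamped to BLCKSZ / 10 = 819 bytes; a fillfactor of 70 would instead
	 * yield 8192 * 30 / 100 = 2457 bytes.
	 */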

	if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
	{
		/* OK, try to get exclusive buffer lock */
		if (!ConditionalLockBufferForCleanup(buffer))
			return;

		/*
		 * Now that we have buffer lock, get accurate information about the
		 * page's free space, and recheck the heuristic about whether to
		 * prune.  (We needn't recheck PageIsPrunable, since no one else could
		 * have pruned while we hold pin.)
		 */
		if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
		{
			TransactionId ignore = InvalidTransactionId;	/* return value not
															 * needed */

			/* OK to prune */
			(void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore);
		}

		/* And release buffer lock */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	}
}


/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 *
 * If report_stats is true then we send the number of reclaimed heap-only
 * tuples to pgstats.  (This must be false during vacuum, since vacuum will
 * send its own new total to pgstats, and we don't want this delta applied
 * on top of that.)
 *
 * Returns the number of tuples deleted from the page and sets
 * latestRemovedXid.
 */
int
heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
				bool report_stats, TransactionId *latestRemovedXid)
{
	int			ndeleted = 0;
	Page		page = BufferGetPage(buffer);
	OffsetNumber offnum,
				maxoff;
	PruneState	prstate;

	/*
	 * Our strategy is to scan the page and make lists of items to change,
	 * then apply the changes within a critical section.  This keeps as much
	 * logic as possible out of the critical section, and also ensures that
	 * WAL replay will work the same as the normal case.
	 *
	 * First, initialize the new pd_prune_xid value to zero (indicating no
	 * prunable tuples).  If we find any tuples which may soon become
	 * prunable, we will save the lowest relevant XID in new_prune_xid.  Also
	 * initialize the rest of our working state.
	 */
	prstate.new_prune_xid = InvalidTransactionId;
	prstate.latestRemovedXid = *latestRemovedXid;
	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
	memset(prstate.marked, 0, sizeof(prstate.marked));

	/* Scan the page */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		/* Ignore items already processed as part of an earlier chain */
		if (prstate.marked[offnum])
			continue;

		/* Nothing to do if slot is empty or already dead */
		itemid = PageGetItemId(page, offnum);
		if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
			continue;

		/* Process this item or chain of items */
		ndeleted += heap_prune_chain(relation, buffer, offnum,
									 OldestXmin,
									 &prstate);
	}

	/* Any error while applying the changes is critical */
	START_CRIT_SECTION();

	/* Have we found any prunable items? */
	if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
	{
		/*
		 * Apply the planned item changes, then repair page fragmentation, and
		 * update the page's hint bit about whether it has free line pointers.
		 */
		heap_page_prune_execute(buffer,
								prstate.redirected, prstate.nredirected,
								prstate.nowdead, prstate.ndead,
								prstate.nowunused, prstate.nunused);

		/*
		 * Update the page's pd_prune_xid field to either zero, or the lowest
		 * XID of any soon-prunable tuple.
		 */
		((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

		/*
		 * Also clear the "page is full" flag, since there's no point in
		 * repeating the prune/defrag process until something else happens to
		 * the page.
		 */
		PageClearFull(page);

		MarkBufferDirty(buffer);

		/*
		 * Emit a WAL HEAP_CLEAN record showing what we did
		 */
		if (RelationNeedsWAL(relation))
		{
			XLogRecPtr	recptr;

			recptr = log_heap_clean(relation, buffer,
									prstate.redirected, prstate.nredirected,
									prstate.nowdead, prstate.ndead,
									prstate.nowunused, prstate.nunused,
									prstate.latestRemovedXid);

			PageSetLSN(BufferGetPage(buffer), recptr);
		}
	}
	else
	{
		/*
		 * If we didn't prune anything, but have found a new value for the
		 * pd_prune_xid field, update it and mark the buffer dirty.  This is
		 * treated as a non-WAL-logged hint.
		 *
		 * Also clear the "page is full" flag if it is set, since there's no
		 * point in repeating the prune/defrag process until something else
		 * happens to the page.
		 */
		if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
			PageIsFull(page))
		{
			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
			PageClearFull(page);
			MarkBufferDirtyHint(buffer, true);
		}
	}

	END_CRIT_SECTION();

	/*
	 * If requested, report the number of tuples reclaimed to pgstats.  This
	 * is ndeleted minus ndead, because we don't want to count a now-DEAD root
	 * item as a deletion for this purpose.
	 */
	if (report_stats && ndeleted > prstate.ndead)
		pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);

	*latestRemovedXid = prstate.latestRemovedXid;

	/*
	 * XXX Should we update the FSM information of this page ?
	 *
	 * There are two schools of thought here.  We may not want to update FSM
	 * information so that the page is not used for unrelated UPDATEs/INSERTs
	 * and any free space in this page will remain available for further
	 * UPDATEs in *this* page, thus improving chances for doing HOT updates.
	 *
	 * But for a large table and where a page does not receive further UPDATEs
	 * for a long time, we might waste this space by not updating the FSM
	 * information.  The relation may get extended and fragmented further.
	 *
	 * One possibility is to leave "fillfactor" worth of space in this page
	 * and update FSM with the remaining space.
	 */

	return ndeleted;
}


/*
 * Prune specified line pointer or a HOT chain originating at line pointer.
 *
 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
 * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
 * chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
 * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
 * DEAD, the OldestXmin test is just too coarse to detect it.
 *
 * The root line pointer is redirected to the tuple immediately after the
 * latest DEAD tuple.  If all tuples in the chain are DEAD, the root line
 * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
 * tuple, which we treat as a chain of length 1.)
 *
 * OldestXmin is the cutoff XID used to identify dead tuples.
 *
 * We don't actually change the page here, except perhaps for hint-bit updates
 * caused by HeapTupleSatisfiesVacuum.  We just add entries to the arrays in
 * prstate showing the changes to be made.  Items to be redirected are added
 * to the redirected[] array (two entries per redirection); items to be set to
 * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
 * state are added to nowunused[].
 *
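 * For example (a hypothetical page state): if the root at offset 1 is a
 * normal tuple found DEAD, and its HOT chain continues with a RECENTLY_DEAD
 * tuple at offset 2, a DEAD tuple at offset 3, and a LIVE tuple at offset 4,
 * then offsets 2 and 3 are entered in nowunused[] and the pair (1, 4) is
 * entered in redirected[].
 *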
 * Returns the number of tuples (to be) deleted from the page.
 */
static int
heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
				 TransactionId OldestXmin,
				 PruneState *prstate)
{
	int			ndeleted = 0;
	Page		dp = (Page) BufferGetPage(buffer);
	TransactionId priorXmax = InvalidTransactionId;
	ItemId		rootlp;
	HeapTupleHeader htup;
	OffsetNumber latestdead = InvalidOffsetNumber,
				maxoff = PageGetMaxOffsetNumber(dp),
				offnum;
	OffsetNumber chainitems[MaxHeapTuplesPerPage];
	int			nchain = 0,
				i;
	HeapTupleData tup;

	tup.t_tableOid = RelationGetRelid(relation);

	rootlp = PageGetItemId(dp, rootoffnum);

	/*
	 * If it's a heap-only tuple, then it is not the start of a HOT chain.
	 */
	if (ItemIdIsNormal(rootlp))
	{
		htup = (HeapTupleHeader) PageGetItem(dp, rootlp);

		tup.t_data = htup;
		tup.t_len = ItemIdGetLength(rootlp);
		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum);

		if (HeapTupleHeaderIsHeapOnly(htup))
		{
			/*
			 * If the tuple is DEAD and doesn't chain to anything else, mark
			 * it unused immediately.  (If it does chain, we can only remove
			 * it as part of pruning its chain.)
			 *
			 * We need this primarily to handle aborted HOT updates, that is,
			 * XMIN_INVALID heap-only tuples.  Those might not be linked to by
			 * any chain, since the parent tuple might be re-updated before
			 * any pruning occurs.  So we have to be able to reap them
			 * separately from chain-pruning.  (Note that
			 * HeapTupleHeaderIsHotUpdated will never return true for an
			 * XMIN_INVALID tuple, so this code will work even when there were
			 * sequential updates within the aborted transaction.)
			 *
			 * Note that we might first arrive at a dead heap-only tuple
			 * either here or while following a chain below.  Whichever path
			 * gets there first will mark the tuple unused.
			 */
			if (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer)
				== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
			{
				heap_prune_record_unused(prstate, rootoffnum);
				HeapTupleHeaderAdvanceLatestRemovedXid(htup,
													   &prstate->latestRemovedXid);
				ndeleted++;
			}

			/* Nothing more to do */
			return ndeleted;
		}
	}

	/* Start from the root tuple */
	offnum = rootoffnum;

	/* while not end of the chain */
	for (;;)
	{
		ItemId		lp;
		bool		tupdead,
					recent_dead;

		/* Some sanity checks */
		if (offnum < FirstOffsetNumber || offnum > maxoff)
			break;

		/* If item is already processed, stop --- it must not be the same chain */
		if (prstate->marked[offnum])
			break;

		lp = PageGetItemId(dp, offnum);

		/* Unused item obviously isn't part of the chain */
		if (!ItemIdIsUsed(lp))
			break;

		/*
		 * If we are looking at the redirected root line pointer, jump to the
		 * first normal tuple in the chain.  If we find a redirect somewhere
		 * else, stop --- it must not be the same chain.
		 */
		if (ItemIdIsRedirected(lp))
		{
			if (nchain > 0)
				break;			/* not at start of chain */
			chainitems[nchain++] = offnum;
			offnum = ItemIdGetRedirect(rootlp);
			continue;
		}

		/*
		 * Likewise, a dead line pointer can't be part of the chain.  (We
		 * already eliminated the case of dead root tuple outside this
		 * function.)
		 */
		if (ItemIdIsDead(lp))
			break;

		Assert(ItemIdIsNormal(lp));
		htup = (HeapTupleHeader) PageGetItem(dp, lp);

		tup.t_data = htup;
		tup.t_len = ItemIdGetLength(lp);
		ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum);

		/*
		 * Check the tuple XMIN against prior XMAX, if any
		 */
		if (TransactionIdIsValid(priorXmax) &&
			!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
			break;

		/*
		 * OK, this tuple is indeed a member of the chain.
		 */
		chainitems[nchain++] = offnum;

		/*
		 * Check tuple's visibility status.
		 */
		tupdead = recent_dead = false;

		switch (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer))
		{
			case HEAPTUPLE_DEAD:
				tupdead = true;
				break;

			case HEAPTUPLE_RECENTLY_DEAD:
				recent_dead = true;

				/*
				 * This tuple may soon become DEAD.  Update the hint field so
				 * that the page is reconsidered for pruning in future.
				 */
				heap_prune_record_prunable(prstate,
										   HeapTupleHeaderGetUpdateXid(htup));
				break;

			case HEAPTUPLE_DELETE_IN_PROGRESS:

				/*
				 * This tuple may soon become DEAD.  Update the hint field so
				 * that the page is reconsidered for pruning in future.
				 */
				heap_prune_record_prunable(prstate,
										   HeapTupleHeaderGetUpdateXid(htup));
				break;

			case HEAPTUPLE_LIVE:
			case HEAPTUPLE_INSERT_IN_PROGRESS:

				/*
				 * If we wanted to optimize for aborts, we might consider
				 * marking the page prunable when we see INSERT_IN_PROGRESS.
				 * But we don't.  See related decisions about when to mark the
				 * page prunable in heapam.c.
				 */
				break;

			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
				break;
		}

		/*
		 * Remember the last DEAD tuple seen.  We will advance past
		 * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
		 * but we can't advance past anything else.  (XXX is it really worth
		 * continuing to scan beyond RECENTLY_DEAD?  The case where we will
		 * find another DEAD tuple is a fairly unusual corner case.)
		 */
		if (tupdead)
		{
			latestdead = offnum;
			HeapTupleHeaderAdvanceLatestRemovedXid(htup,
												   &prstate->latestRemovedXid);
		}
		else if (!recent_dead)
			break;

		/*
		 * If the tuple is not HOT-updated, then we are at the end of this
		 * HOT-update chain.
		 */
		if (!HeapTupleHeaderIsHotUpdated(htup))
			break;

		/* HOT implies it can't have moved to different partition */
		Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

		/*
		 * Advance to next chain member.
		 */
		Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
			   BufferGetBlockNumber(buffer));
		offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
		priorXmax = HeapTupleHeaderGetUpdateXid(htup);
	}

	/*
	 * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
	 * the DEAD tuples at the start of the chain are removed and the root line
	 * pointer is appropriately redirected.
	 */
	if (OffsetNumberIsValid(latestdead))
	{
		/*
		 * Mark as unused each intermediate item that we are able to remove
		 * from the chain.
		 *
		 * When the previous item is the last dead tuple seen, we are at the
		 * right candidate for redirection.
		 */
		for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
		{
			heap_prune_record_unused(prstate, chainitems[i]);
			ndeleted++;
		}

		/*
		 * If the root entry had been a normal tuple, we are deleting it, so
		 * count it in the result.  But changing a redirect (even to DEAD
		 * state) doesn't count.
		 */
		if (ItemIdIsNormal(rootlp))
			ndeleted++;

		/*
		 * If the DEAD tuple is at the end of the chain, the entire chain is
		 * dead and the root line pointer can be marked dead.  Otherwise just
		 * redirect the root to the correct chain member.
		 */
		if (i >= nchain)
			heap_prune_record_dead(prstate, rootoffnum);
		else
			heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
	}
	else if (nchain < 2 && ItemIdIsRedirected(rootlp))
	{
		/*
		 * We found a redirect item that doesn't point to a valid follow-on
		 * item.  This can happen if the loop in heap_page_prune caused us to
		 * visit the dead successor of a redirect item before visiting the
		 * redirect item.  We can clean up by setting the redirect item to
		 * DEAD state.
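		 * (For instance, the outer loop may already have found the heap-only
		 * successor DEAD and not HOT-updated, and marked it unused, so the
		 * chain walk above stopped at it immediately.)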
		 */
		heap_prune_record_dead(prstate, rootoffnum);
	}

	return ndeleted;
}

/* Record lowest soon-prunable XID */
static void
heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
{
	/*
	 * This should exactly match the PageSetPrunable macro.  We can't store
	 * directly into the page header yet, so we update working state.
	 */
	Assert(TransactionIdIsNormal(xid));
	if (!TransactionIdIsValid(prstate->new_prune_xid) ||
		TransactionIdPrecedes(xid, prstate->new_prune_xid))
		prstate->new_prune_xid = xid;
}

/* Record line pointer to be redirected */
static void
heap_prune_record_redirect(PruneState *prstate,
						   OffsetNumber offnum, OffsetNumber rdoffnum)
{
	Assert(prstate->nredirected < MaxHeapTuplesPerPage);
	prstate->redirected[prstate->nredirected * 2] = offnum;
	prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
	prstate->nredirected++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
	Assert(!prstate->marked[rdoffnum]);
	prstate->marked[rdoffnum] = true;
}

/* Record line pointer to be marked dead */
static void
heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
{
	Assert(prstate->ndead < MaxHeapTuplesPerPage);
	prstate->nowdead[prstate->ndead] = offnum;
	prstate->ndead++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
}

/* Record line pointer to be marked unused */
static void
heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
{
	Assert(prstate->nunused < MaxHeapTuplesPerPage);
	prstate->nowunused[prstate->nunused] = offnum;
	prstate->nunused++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
}


/*
 * Perform the actual page changes needed by heap_page_prune.
 * It is expected that the caller has suitable pin and lock on the
 * buffer, and is inside a critical section.
 *
 * This is split out because it is also used by heap_xlog_clean()
 * to replay the WAL record when needed after a crash.  Note that the
 * arguments are identical to those of log_heap_clean().
 */
void
heap_page_prune_execute(Buffer buffer,
						OffsetNumber *redirected, int nredirected,
						OffsetNumber *nowdead, int ndead,
						OffsetNumber *nowunused, int nunused)
{
	Page		page = (Page) BufferGetPage(buffer);
	OffsetNumber *offnum;
	int			i;

	/* Update all redirected line pointers */
	offnum = redirected;
	for (i = 0; i < nredirected; i++)
	{
		OffsetNumber fromoff = *offnum++;
		OffsetNumber tooff = *offnum++;
		ItemId		fromlp = PageGetItemId(page, fromoff);

		ItemIdSetRedirect(fromlp, tooff);
	}

	/* Update all now-dead line pointers */
	offnum = nowdead;
	for (i = 0; i < ndead; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

		ItemIdSetDead(lp);
	}

	/* Update all now-unused line pointers */
	offnum = nowunused;
	for (i = 0; i < nunused; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

		ItemIdSetUnused(lp);
	}

	/*
	 * Finally, repair any fragmentation, and update the page's hint bit about
	 * whether it has free line pointers.
	 */
	PageRepairFragmentation(page);
}


/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
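 * For example (a hypothetical layout): if the tuple at offset 2 is the root
 * of a HOT chain that continues through offsets 3 and 5, we set
 * root_offsets[1], root_offsets[2] and root_offsets[4] all to 2.
 *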
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * We zero out all unused entries.
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer.  Once pin is released, a tuple might be pruned
 * and reused by a completely unrelated tuple.
 */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
	OffsetNumber offnum,
				maxoff;

	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));

	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
	{
		ItemId		lp = PageGetItemId(page, offnum);
		HeapTupleHeader htup;
		OffsetNumber nextoffnum;
		TransactionId priorXmax;

		/* skip unused and dead items */
		if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
			continue;

		if (ItemIdIsNormal(lp))
		{
			htup = (HeapTupleHeader) PageGetItem(page, lp);

			/*
			 * Check if this tuple is part of a HOT-chain rooted at some other
			 * tuple.  If so, skip it for now; we'll process it when we find
			 * its root.
			 */
			if (HeapTupleHeaderIsHeapOnly(htup))
				continue;

			/*
			 * This is either a plain tuple or the root of a HOT-chain.
			 * Remember it in the mapping.
			 */
			root_offsets[offnum - 1] = offnum;

			/* If it's not the start of a HOT-chain, we're done with it */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				continue;

			/* Set up to scan the HOT-chain */
			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
		}
		else
		{
			/* Must be a redirect item.  We do not set its root_offsets entry */
			Assert(ItemIdIsRedirected(lp));
			/* Set up to scan the HOT-chain */
			nextoffnum = ItemIdGetRedirect(lp);
			priorXmax = InvalidTransactionId;
		}

		/*
		 * Now follow the HOT-chain and collect other tuples in the chain.
		 *
		 * Note: Even though this is a nested loop, the complexity of the
		 * function is O(N) because a tuple in the page should be visited not
		 * more than twice, once in the outer loop and once in HOT-chain
		 * chases.
		 */
		for (;;)
		{
			lp = PageGetItemId(page, nextoffnum);

			/* Check for broken chains */
			if (!ItemIdIsNormal(lp))
				break;

			htup = (HeapTupleHeader) PageGetItem(page, lp);

			if (TransactionIdIsValid(priorXmax) &&
				!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
				break;

			/* Remember the root line pointer for this item */
			root_offsets[nextoffnum - 1] = offnum;

			/* Advance to next chain member, if any */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				break;

			/* HOT implies it can't have moved to different partition */
			Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
		}
	}
}