1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * vacuumlazy.c |
4 | * Concurrent ("lazy") vacuuming. |
5 | * |
6 | * |
7 | * The major space usage for LAZY VACUUM is storage for the array of dead tuple |
8 | * TIDs. We want to ensure we can vacuum even the very largest relations with |
9 | * finite memory space usage. To do that, we set upper bounds on the number of |
10 | * tuples we will keep track of at once. |
11 | * |
12 | * We are willing to use at most maintenance_work_mem (or perhaps |
13 | * autovacuum_work_mem) memory space to keep track of dead tuples. We |
14 | * initially allocate an array of TIDs of that size, with an upper limit that |
15 | * depends on table size (this limit ensures we don't allocate a huge area |
16 | * uselessly for vacuuming small tables). If the array threatens to overflow, |
17 | * we suspend the heap scan phase and perform a pass of index cleanup and page |
18 | * compaction, then resume the heap scan with an empty TID array. |
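*
* As a rough illustration (the exact numbers depend on configuration and
* are not promised here): with maintenance_work_mem set to 64MB and
* 6-byte ItemPointerData entries, the array can hold on the order of
* 64 * 1024 * 1024 / 6, i.e. roughly 11 million dead-tuple TIDs per
* index-cleanup cycle.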
19 | * |
20 | * If we're processing a table with no indexes, we can just vacuum each page |
21 | * as we go; there's no need to save up multiple tuples to minimize the number |
22 | * of index scans performed. So we don't use maintenance_work_mem memory for |
23 | * the TID array, just enough to hold as many heap tuples as fit on one page. |
24 | * |
25 | * |
26 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
27 | * Portions Copyright (c) 1994, Regents of the University of California |
28 | * |
29 | * |
30 | * IDENTIFICATION |
31 | * src/backend/access/heap/vacuumlazy.c |
32 | * |
33 | *------------------------------------------------------------------------- |
34 | */ |
35 | #include "postgres.h" |
36 | |
37 | #include <math.h> |
38 | |
39 | #include "access/genam.h" |
40 | #include "access/heapam.h" |
41 | #include "access/heapam_xlog.h" |
42 | #include "access/htup_details.h" |
43 | #include "access/multixact.h" |
44 | #include "access/transam.h" |
45 | #include "access/visibilitymap.h" |
46 | #include "access/xlog.h" |
47 | #include "catalog/storage.h" |
48 | #include "commands/dbcommands.h" |
49 | #include "commands/progress.h" |
50 | #include "commands/vacuum.h" |
51 | #include "miscadmin.h" |
52 | #include "pgstat.h" |
53 | #include "portability/instr_time.h" |
54 | #include "postmaster/autovacuum.h" |
55 | #include "storage/bufmgr.h" |
56 | #include "storage/freespace.h" |
57 | #include "storage/lmgr.h" |
58 | #include "utils/lsyscache.h" |
59 | #include "utils/memutils.h" |
60 | #include "utils/pg_rusage.h" |
61 | #include "utils/timestamp.h" |
62 | |
63 | |
64 | /* |
65 | * Space/time tradeoff parameters: do these need to be user-tunable? |
66 | * |
67 | * To consider truncating the relation, we want there to be at least |
68 | * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever |
69 | * is less) potentially-freeable pages. |
70 | */ |
71 | #define REL_TRUNCATE_MINIMUM 1000 |
72 | #define REL_TRUNCATE_FRACTION 16 |
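
/*
* Illustrative arithmetic only: for a 64,000-page relation the threshold is
* Min(REL_TRUNCATE_MINIMUM, 64000 / REL_TRUNCATE_FRACTION) =
* Min(1000, 4000) = 1000 potentially-freeable pages at the end of the
* relation before truncation is considered.
*/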
73 | |
74 | /* |
75 | * Timing parameters for truncate locking heuristics. |
76 | * |
77 | * These were not exposed as user tunable GUC values because it didn't seem |
78 | * that the potential for improvement was great enough to merit the cost of |
79 | * supporting them. |
80 | */ |
81 | #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */ |
82 | #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */ |
83 | #define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */ |
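
/*
* A rough sketch of how these are used (assumed control flow; the
* authoritative logic is in lazy_truncate_heap and
* count_nondeletable_pages below).  While trying to get the
* AccessExclusiveLock needed for truncation, we poll rather than wait
* indefinitely:
*
*	while (!ConditionalLockRelation(onerel, AccessExclusiveLock))
*	{
*		pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);
*		if (elapsed_ms >= VACUUM_TRUNCATE_LOCK_TIMEOUT)
*			return;
*	}
*
* (elapsed_ms is an illustrative name only.)  Once the lock is held, the
* backward scan checks roughly every VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL
* ms whether another backend is waiting on the lock, and backs off if so.
*/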
84 | |
85 | /* |
86 | * When a table has no indexes, vacuum the FSM after every 8GB, approximately |
87 | * (it won't be exact because we only vacuum FSM after processing a heap page |
88 | * that has some removable tuples). When there are indexes, this is ignored, |
89 | * and we vacuum FSM after each index/heap cleaning pass. |
90 | */ |
91 | #define VACUUM_FSM_EVERY_PAGES \ |
92 | ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) |
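
/*
* With the default BLCKSZ of 8192, this works out to 1,048,576 heap blocks
* (8GB) between incremental FSM vacuum passes.
*/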
93 | |
94 | /* |
95 | * Guesstimation of number of dead tuples per page. This is used to |
96 | * provide an upper limit to memory allocated when vacuuming small |
97 | * tables. |
98 | */ |
99 | #define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage |
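
/*
* Sketch of the sizing arithmetic (the authoritative version is in
* lazy_space_alloc below): when indexes must be cleaned up, the dead-tuple
* array is sized roughly as
*
*	maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData);
*	maxtuples = Min(maxtuples, relblocks * LAZY_ALLOC_TUPLES);
*	maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
*
* where vac_work_mem stands for autovacuum_work_mem (for autovacuum
* workers, if set) or maintenance_work_mem, in kilobytes.  Without
* indexes, a single page's worth (MaxHeapTuplesPerPage) is enough.
*/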
100 | |
101 | /* |
* Before we consider skipping a page that's marked as clean in the
103 | * visibility map, we must've seen at least this many clean pages. |
104 | */ |
105 | #define SKIP_PAGES_THRESHOLD ((BlockNumber) 32) |
106 | |
107 | /* |
108 | * Size of the prefetch window for lazy vacuum backwards truncation scan. |
109 | * Needs to be a power of 2. |
110 | */ |
111 | #define PREFETCH_SIZE ((BlockNumber) 32) |
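
/*
* Being a power of 2 lets the backwards truncation scan find the start of
* the current prefetch window with a simple mask, along the lines of
*
*	prefetchStart = blkno & ~(PREFETCH_SIZE - 1);
*
* (an illustrative fragment; the actual use is in count_nondeletable_pages).
*/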
112 | |
113 | typedef struct LVRelStats |
114 | { |
115 | /* useindex = true means two-pass strategy; false means one-pass */ |
116 | bool useindex; |
117 | /* Overall statistics about rel */ |
118 | BlockNumber old_rel_pages; /* previous value of pg_class.relpages */ |
119 | BlockNumber rel_pages; /* total number of pages */ |
120 | BlockNumber scanned_pages; /* number of pages we examined */ |
121 | BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */ |
122 | BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */ |
123 | BlockNumber tupcount_pages; /* pages whose tuples we counted */ |
124 | double old_live_tuples; /* previous value of pg_class.reltuples */ |
125 | double new_rel_tuples; /* new estimated total # of tuples */ |
126 | double new_live_tuples; /* new estimated total # of live tuples */ |
127 | double new_dead_tuples; /* new estimated total # of dead tuples */ |
128 | BlockNumber pages_removed; |
129 | double tuples_deleted; |
130 | BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ |
131 | /* List of TIDs of tuples we intend to delete */ |
132 | /* NB: this list is ordered by TID address */ |
133 | int num_dead_tuples; /* current # of entries */ |
134 | int max_dead_tuples; /* # slots allocated in array */ |
135 | ItemPointer dead_tuples; /* array of ItemPointerData */ |
136 | int num_index_scans; |
137 | TransactionId latestRemovedXid; |
138 | bool lock_waiter_detected; |
139 | } LVRelStats; |
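
/*
* A sketch of how the dead_tuples array is consumed (the real definitions
* are in the helper functions declared below): lazy_record_dead_tuple()
* appends TIDs in heap order during the scan, and lazy_tid_reaped() is
* handed to index bulk-delete as the "is this TID dead?" callback, where
* the TID ordering allows a binary search, e.g.
*
*	res = (ItemPointer) bsearch((void *) itemptr,
*								(void *) vacrelstats->dead_tuples,
*								vacrelstats->num_dead_tuples,
*								sizeof(ItemPointerData),
*								vac_cmp_itemptr);
*	return (res != NULL);
*
* (an illustrative fragment, not a substitute for the real code.)
*/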
140 | |
141 | |
142 | /* A few variables that don't seem worth passing around as parameters */ |
143 | static int elevel = -1; |
144 | |
145 | static TransactionId OldestXmin; |
146 | static TransactionId FreezeLimit; |
147 | static MultiXactId MultiXactCutoff; |
148 | |
149 | static BufferAccessStrategy vac_strategy; |
150 | |
151 | |
152 | /* non-export function prototypes */ |
153 | static void lazy_scan_heap(Relation onerel, VacuumParams *params, |
154 | LVRelStats *vacrelstats, Relation *Irel, int nindexes, |
155 | bool aggressive); |
156 | static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats); |
157 | static bool lazy_check_needs_freeze(Buffer buf, bool *hastup); |
158 | static void lazy_vacuum_index(Relation indrel, |
159 | IndexBulkDeleteResult **stats, |
160 | LVRelStats *vacrelstats); |
161 | static void lazy_cleanup_index(Relation indrel, |
162 | IndexBulkDeleteResult *stats, |
163 | LVRelStats *vacrelstats); |
164 | static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, |
165 | int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer); |
166 | static bool should_attempt_truncation(VacuumParams *params, |
167 | LVRelStats *vacrelstats); |
168 | static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats); |
169 | static BlockNumber count_nondeletable_pages(Relation onerel, |
170 | LVRelStats *vacrelstats); |
171 | static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks); |
172 | static void lazy_record_dead_tuple(LVRelStats *vacrelstats, |
173 | ItemPointer itemptr); |
174 | static bool lazy_tid_reaped(ItemPointer itemptr, void *state); |
175 | static int vac_cmp_itemptr(const void *left, const void *right); |
176 | static bool heap_page_is_all_visible(Relation rel, Buffer buf, |
177 | TransactionId *visibility_cutoff_xid, bool *all_frozen); |
178 | |
179 | |
180 | /* |
181 | * heap_vacuum_rel() -- perform VACUUM for one heap relation |
182 | * |
183 | * This routine vacuums a single heap, cleans out its indexes, and |
184 | * updates its relpages and reltuples statistics. |
185 | * |
186 | * At entry, we have already established a transaction and opened |
187 | * and locked the relation. |
188 | */ |
189 | void |
190 | heap_vacuum_rel(Relation onerel, VacuumParams *params, |
191 | BufferAccessStrategy bstrategy) |
192 | { |
193 | LVRelStats *vacrelstats; |
194 | Relation *Irel; |
195 | int nindexes; |
196 | PGRUsage ru0; |
197 | TimestampTz starttime = 0; |
198 | long secs; |
199 | int usecs; |
200 | double read_rate, |
201 | write_rate; |
202 | bool aggressive; /* should we scan all unfrozen pages? */ |
203 | bool scanned_all_unfrozen; /* actually scanned all such pages? */ |
204 | TransactionId xidFullScanLimit; |
205 | MultiXactId mxactFullScanLimit; |
206 | BlockNumber new_rel_pages; |
207 | BlockNumber new_rel_allvisible; |
208 | double new_live_tuples; |
209 | TransactionId new_frozen_xid; |
210 | MultiXactId new_min_multi; |
211 | |
212 | Assert(params != NULL); |
213 | Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT); |
214 | Assert(params->truncate != VACOPT_TERNARY_DEFAULT); |
215 | |
216 | /* not every AM requires these to be valid, but heap does */ |
217 | Assert(TransactionIdIsNormal(onerel->rd_rel->relfrozenxid)); |
218 | Assert(MultiXactIdIsValid(onerel->rd_rel->relminmxid)); |
219 | |
220 | /* measure elapsed time iff autovacuum logging requires it */ |
221 | if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) |
222 | { |
223 | pg_rusage_init(&ru0); |
224 | starttime = GetCurrentTimestamp(); |
225 | } |
226 | |
227 | if (params->options & VACOPT_VERBOSE) |
228 | elevel = INFO; |
229 | else |
230 | elevel = DEBUG2; |
231 | |
232 | pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM, |
233 | RelationGetRelid(onerel)); |
234 | |
235 | vac_strategy = bstrategy; |
236 | |
237 | vacuum_set_xid_limits(onerel, |
238 | params->freeze_min_age, |
239 | params->freeze_table_age, |
240 | params->multixact_freeze_min_age, |
241 | params->multixact_freeze_table_age, |
242 | &OldestXmin, &FreezeLimit, &xidFullScanLimit, |
243 | &MultiXactCutoff, &mxactFullScanLimit); |
244 | |
245 | /* |
246 | * We request an aggressive scan if the table's frozen Xid is now older |
247 | * than or equal to the requested Xid full-table scan limit; or if the |
248 | * table's minimum MultiXactId is older than or equal to the requested |
249 | * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified. |
250 | */ |
251 | aggressive = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid, |
252 | xidFullScanLimit); |
253 | aggressive |= MultiXactIdPrecedesOrEquals(onerel->rd_rel->relminmxid, |
254 | mxactFullScanLimit); |
255 | if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) |
256 | aggressive = true; |
257 | |
258 | /* |
259 | * Normally the relfrozenxid for an anti-wraparound vacuum will be old |
260 | * enough to force an aggressive vacuum. However, a concurrent vacuum |
* might have already done this work, in which case the relfrozenxid in the
* relcache has since been updated.  If that happens, this vacuum is
* redundant, so skip it.
263 | */ |
264 | if (params->is_wraparound && !aggressive) |
265 | { |
266 | ereport(DEBUG1, |
267 | (errmsg("skipping redundant vacuum to prevent wraparound of table \"%s.%s.%s\"" , |
268 | get_database_name(MyDatabaseId), |
269 | get_namespace_name(RelationGetNamespace(onerel)), |
270 | RelationGetRelationName(onerel)))); |
271 | pgstat_progress_end_command(); |
272 | return; |
273 | } |
274 | |
275 | vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); |
276 | |
277 | vacrelstats->old_rel_pages = onerel->rd_rel->relpages; |
278 | vacrelstats->old_live_tuples = onerel->rd_rel->reltuples; |
279 | vacrelstats->num_index_scans = 0; |
280 | vacrelstats->pages_removed = 0; |
281 | vacrelstats->lock_waiter_detected = false; |
282 | |
283 | /* Open all indexes of the relation */ |
284 | vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); |
285 | vacrelstats->useindex = (nindexes > 0 && |
286 | params->index_cleanup == VACOPT_TERNARY_ENABLED); |
287 | |
288 | /* Do the vacuuming */ |
289 | lazy_scan_heap(onerel, params, vacrelstats, Irel, nindexes, aggressive); |
290 | |
291 | /* Done with indexes */ |
292 | vac_close_indexes(nindexes, Irel, NoLock); |
293 | |
294 | /* |
* Compute whether we actually scanned all the unfrozen pages. If we did,
296 | * we can adjust relfrozenxid and relminmxid. |
297 | * |
298 | * NB: We need to check this before truncating the relation, because that |
299 | * will change ->rel_pages. |
300 | */ |
301 | if ((vacrelstats->scanned_pages + vacrelstats->frozenskipped_pages) |
302 | < vacrelstats->rel_pages) |
303 | { |
304 | Assert(!aggressive); |
305 | scanned_all_unfrozen = false; |
306 | } |
307 | else |
308 | scanned_all_unfrozen = true; |
309 | |
310 | /* |
311 | * Optionally truncate the relation. |
312 | */ |
313 | if (should_attempt_truncation(params, vacrelstats)) |
314 | lazy_truncate_heap(onerel, vacrelstats); |
315 | |
316 | /* Report that we are now doing final cleanup */ |
317 | pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, |
318 | PROGRESS_VACUUM_PHASE_FINAL_CLEANUP); |
319 | |
320 | /* |
321 | * Update statistics in pg_class. |
322 | * |
323 | * A corner case here is that if we scanned no pages at all because every |
324 | * page is all-visible, we should not update relpages/reltuples, because |
325 | * we have no new information to contribute. In particular this keeps us |
326 | * from replacing relpages=reltuples=0 (which means "unknown tuple |
327 | * density") with nonzero relpages and reltuples=0 (which means "zero |
328 | * tuple density") unless there's some actual evidence for the latter. |
329 | * |
330 | * It's important that we use tupcount_pages and not scanned_pages for the |
331 | * check described above; scanned_pages counts pages where we could not |
332 | * get cleanup lock, and which were processed only for frozenxid purposes. |
333 | * |
334 | * We do update relallvisible even in the corner case, since if the table |
335 | * is all-visible we'd definitely like to know that. But clamp the value |
336 | * to be not more than what we're setting relpages to. |
337 | * |
338 | * Also, don't change relfrozenxid/relminmxid if we skipped any pages, |
339 | * since then we don't know for certain that all tuples have a newer xmin. |
340 | */ |
341 | new_rel_pages = vacrelstats->rel_pages; |
342 | new_live_tuples = vacrelstats->new_live_tuples; |
343 | if (vacrelstats->tupcount_pages == 0 && new_rel_pages > 0) |
344 | { |
345 | new_rel_pages = vacrelstats->old_rel_pages; |
346 | new_live_tuples = vacrelstats->old_live_tuples; |
347 | } |
348 | |
349 | visibilitymap_count(onerel, &new_rel_allvisible, NULL); |
350 | if (new_rel_allvisible > new_rel_pages) |
351 | new_rel_allvisible = new_rel_pages; |
352 | |
353 | new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId; |
354 | new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId; |
355 | |
356 | vac_update_relstats(onerel, |
357 | new_rel_pages, |
358 | new_live_tuples, |
359 | new_rel_allvisible, |
360 | nindexes > 0, |
361 | new_frozen_xid, |
362 | new_min_multi, |
363 | false); |
364 | |
365 | /* report results to the stats collector, too */ |
366 | pgstat_report_vacuum(RelationGetRelid(onerel), |
367 | onerel->rd_rel->relisshared, |
368 | new_live_tuples, |
369 | vacrelstats->new_dead_tuples); |
370 | pgstat_progress_end_command(); |
371 | |
372 | /* and log the action if appropriate */ |
373 | if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) |
374 | { |
375 | TimestampTz endtime = GetCurrentTimestamp(); |
376 | |
377 | if (params->log_min_duration == 0 || |
378 | TimestampDifferenceExceeds(starttime, endtime, |
379 | params->log_min_duration)) |
380 | { |
381 | StringInfoData buf; |
382 | char *msgfmt; |
383 | |
384 | TimestampDifference(starttime, endtime, &secs, &usecs); |
385 | |
386 | read_rate = 0; |
387 | write_rate = 0; |
388 | if ((secs > 0) || (usecs > 0)) |
389 | { |
390 | read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) / |
391 | (secs + usecs / 1000000.0); |
392 | write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) / |
393 | (secs + usecs / 1000000.0); |
394 | } |
395 | |
396 | /* |
397 | * This is pretty messy, but we split it up so that we can skip |
398 | * emitting individual parts of the message when not applicable. |
399 | */ |
400 | initStringInfo(&buf); |
401 | if (params->is_wraparound) |
402 | { |
403 | /* an anti-wraparound vacuum has to be aggressive */ |
404 | Assert(aggressive); |
405 | msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n" ); |
406 | } |
407 | else |
408 | { |
409 | if (aggressive) |
410 | msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n" ); |
411 | else |
412 | msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n" ); |
413 | } |
414 | appendStringInfo(&buf, msgfmt, |
415 | get_database_name(MyDatabaseId), |
416 | get_namespace_name(RelationGetNamespace(onerel)), |
417 | RelationGetRelationName(onerel), |
418 | vacrelstats->num_index_scans); |
419 | appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n" ), |
420 | vacrelstats->pages_removed, |
421 | vacrelstats->rel_pages, |
422 | vacrelstats->pinskipped_pages, |
423 | vacrelstats->frozenskipped_pages); |
424 | appendStringInfo(&buf, |
425 | _("tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable, oldest xmin: %u\n" ), |
426 | vacrelstats->tuples_deleted, |
427 | vacrelstats->new_rel_tuples, |
428 | vacrelstats->new_dead_tuples, |
429 | OldestXmin); |
430 | appendStringInfo(&buf, |
431 | _("buffer usage: %d hits, %d misses, %d dirtied\n" ), |
432 | VacuumPageHit, |
433 | VacuumPageMiss, |
434 | VacuumPageDirty); |
435 | appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n" ), |
436 | read_rate, write_rate); |
437 | appendStringInfo(&buf, _("system usage: %s" ), pg_rusage_show(&ru0)); |
438 | |
439 | ereport(LOG, |
440 | (errmsg_internal("%s" , buf.data))); |
441 | pfree(buf.data); |
442 | } |
443 | } |
444 | } |
445 | |
446 | /* |
447 | * For Hot Standby we need to know the highest transaction id that will |
448 | * be removed by any change. VACUUM proceeds in a number of passes so |
449 | * we need to consider how each pass operates. The first phase runs |
450 | * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it |
451 | * progresses - these will have a latestRemovedXid on each record. |
452 | * In some cases this removes all of the tuples to be removed, though |
453 | * often we have dead tuples with index pointers so we must remember them |
454 | * for removal in phase 3. Index records for those rows are removed |
455 | * in phase 2 and index blocks do not have MVCC information attached. |
456 | * So before we can allow removal of any index tuples we need to issue |
457 | * a WAL record containing the latestRemovedXid of rows that will be |
458 | * removed in phase three. This allows recovery queries to block at the |
459 | * correct place, i.e. before phase two, rather than during phase three |
460 | * which would be after the rows have become inaccessible. |
461 | */ |
462 | static void |
463 | vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats) |
464 | { |
465 | /* |
466 | * Skip this for relations for which no WAL is to be written, or if we're |
467 | * not trying to support archive recovery. |
468 | */ |
469 | if (!RelationNeedsWAL(rel) || !XLogIsNeeded()) |
470 | return; |
471 | |
472 | /* |
473 | * No need to write the record at all unless it contains a valid value |
474 | */ |
475 | if (TransactionIdIsValid(vacrelstats->latestRemovedXid)) |
476 | (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid); |
477 | } |
478 | |
479 | /* |
480 | * lazy_scan_heap() -- scan an open heap relation |
481 | * |
482 | * This routine prunes each page in the heap, which will among other |
483 | * things truncate dead tuples to dead line pointers, defragment the |
484 | * page, and set commit status bits (see heap_page_prune). It also builds |
485 | * lists of dead tuples and pages with free space, calculates statistics |
486 | * on the number of live tuples in the heap, and marks pages as |
487 | * all-visible if appropriate. When done, or when we run low on space for |
488 | * dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap |
489 | * to reclaim dead line pointers. |
490 | * |
491 | * If there are no indexes then we can reclaim line pointers on the fly; |
492 | * dead line pointers need only be retained until all index pointers that |
493 | * reference them have been killed. |
494 | */ |
495 | static void |
496 | lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, |
497 | Relation *Irel, int nindexes, bool aggressive) |
498 | { |
499 | BlockNumber nblocks, |
500 | blkno; |
501 | HeapTupleData tuple; |
502 | char *relname; |
503 | TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid; |
504 | TransactionId relminmxid = onerel->rd_rel->relminmxid; |
505 | BlockNumber empty_pages, |
506 | vacuumed_pages, |
507 | next_fsm_block_to_vacuum; |
508 | double num_tuples, /* total number of nonremovable tuples */ |
509 | live_tuples, /* live tuples (reltuples estimate) */ |
510 | tups_vacuumed, /* tuples cleaned up by vacuum */ |
511 | nkeep, /* dead-but-not-removable tuples */ |
512 | nunused; /* unused line pointers */ |
513 | IndexBulkDeleteResult **indstats; |
514 | int i; |
515 | PGRUsage ru0; |
516 | Buffer vmbuffer = InvalidBuffer; |
517 | BlockNumber next_unskippable_block; |
518 | bool skipping_blocks; |
519 | xl_heap_freeze_tuple *frozen; |
520 | StringInfoData buf; |
521 | const int initprog_index[] = { |
522 | PROGRESS_VACUUM_PHASE, |
523 | PROGRESS_VACUUM_TOTAL_HEAP_BLKS, |
524 | PROGRESS_VACUUM_MAX_DEAD_TUPLES |
525 | }; |
526 | int64 initprog_val[3]; |
527 | |
528 | pg_rusage_init(&ru0); |
529 | |
530 | relname = RelationGetRelationName(onerel); |
531 | if (aggressive) |
532 | ereport(elevel, |
533 | (errmsg("aggressively vacuuming \"%s.%s\"" , |
534 | get_namespace_name(RelationGetNamespace(onerel)), |
535 | relname))); |
536 | else |
537 | ereport(elevel, |
538 | (errmsg("vacuuming \"%s.%s\"" , |
539 | get_namespace_name(RelationGetNamespace(onerel)), |
540 | relname))); |
541 | |
542 | empty_pages = vacuumed_pages = 0; |
543 | next_fsm_block_to_vacuum = (BlockNumber) 0; |
544 | num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0; |
545 | |
546 | indstats = (IndexBulkDeleteResult **) |
547 | palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); |
548 | |
549 | nblocks = RelationGetNumberOfBlocks(onerel); |
550 | vacrelstats->rel_pages = nblocks; |
551 | vacrelstats->scanned_pages = 0; |
552 | vacrelstats->tupcount_pages = 0; |
553 | vacrelstats->nonempty_pages = 0; |
554 | vacrelstats->latestRemovedXid = InvalidTransactionId; |
555 | |
556 | lazy_space_alloc(vacrelstats, nblocks); |
557 | frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage); |
558 | |
559 | /* Report that we're scanning the heap, advertising total # of blocks */ |
560 | initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; |
561 | initprog_val[1] = nblocks; |
562 | initprog_val[2] = vacrelstats->max_dead_tuples; |
563 | pgstat_progress_update_multi_param(3, initprog_index, initprog_val); |
564 | |
565 | /* |
566 | * Except when aggressive is set, we want to skip pages that are |
567 | * all-visible according to the visibility map, but only when we can skip |
568 | * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading |
569 | * sequentially, the OS should be doing readahead for us, so there's no |
570 | * gain in skipping a page now and then; that's likely to disable |
571 | * readahead and so be counterproductive. Also, skipping even a single |
572 | * page means that we can't update relfrozenxid, so we only want to do it |
573 | * if we can skip a goodly number of pages. |
574 | * |
575 | * When aggressive is set, we can't skip pages just because they are |
576 | * all-visible, but we can still skip pages that are all-frozen, since |
577 | * such pages do not need freezing and do not affect the value that we can |
578 | * safely set for relfrozenxid or relminmxid. |
579 | * |
580 | * Before entering the main loop, establish the invariant that |
581 | * next_unskippable_block is the next block number >= blkno that we can't |
582 | * skip based on the visibility map, either all-visible for a regular scan |
583 | * or all-frozen for an aggressive scan. We set it to nblocks if there's |
584 | * no such block. We also set up the skipping_blocks flag correctly at |
585 | * this stage. |
586 | * |
587 | * Note: The value returned by visibilitymap_get_status could be slightly |
588 | * out-of-date, since we make this test before reading the corresponding |
589 | * heap page or locking the buffer. This is OK. If we mistakenly think |
590 | * that the page is all-visible or all-frozen when in fact the flag's just |
591 | * been cleared, we might fail to vacuum the page. It's easy to see that |
592 | * skipping a page when aggressive is not set is not a very big deal; we |
593 | * might leave some dead tuples lying around, but the next vacuum will |
594 | * find them. But even when aggressive *is* set, it's still OK if we miss |
595 | * a page whose all-frozen marking has just been cleared. Any new XIDs |
596 | * just added to that page are necessarily newer than the GlobalXmin we |
597 | * computed, so they'll have no effect on the value to which we can safely |
598 | * set relfrozenxid. A similar argument applies for MXIDs and relminmxid. |
599 | * |
600 | * We will scan the table's last page, at least to the extent of |
601 | * determining whether it has tuples or not, even if it should be skipped |
602 | * according to the above rules; except when we've already determined that |
603 | * it's not worth trying to truncate the table. This avoids having |
604 | * lazy_truncate_heap() take access-exclusive lock on the table to attempt |
605 | * a truncation that just fails immediately because there are tuples in |
606 | * the last page. This is worth avoiding mainly because such a lock must |
607 | * be replayed on any hot standby, where it can be disruptive. |
608 | */ |
609 | next_unskippable_block = 0; |
610 | if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0) |
611 | { |
612 | while (next_unskippable_block < nblocks) |
613 | { |
614 | uint8 vmstatus; |
615 | |
616 | vmstatus = visibilitymap_get_status(onerel, next_unskippable_block, |
617 | &vmbuffer); |
618 | if (aggressive) |
619 | { |
620 | if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0) |
621 | break; |
622 | } |
623 | else |
624 | { |
625 | if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0) |
626 | break; |
627 | } |
628 | vacuum_delay_point(); |
629 | next_unskippable_block++; |
630 | } |
631 | } |
632 | |
633 | if (next_unskippable_block >= SKIP_PAGES_THRESHOLD) |
634 | skipping_blocks = true; |
635 | else |
636 | skipping_blocks = false; |
637 | |
638 | for (blkno = 0; blkno < nblocks; blkno++) |
639 | { |
640 | Buffer buf; |
641 | Page page; |
642 | OffsetNumber offnum, |
643 | maxoff; |
644 | bool tupgone, |
645 | hastup; |
646 | int prev_dead_count; |
647 | int nfrozen; |
648 | Size freespace; |
649 | bool all_visible_according_to_vm = false; |
650 | bool all_visible; |
651 | bool all_frozen = true; /* provided all_visible is also true */ |
652 | bool has_dead_tuples; |
653 | TransactionId visibility_cutoff_xid = InvalidTransactionId; |
654 | |
655 | /* see note above about forcing scanning of last page */ |
656 | #define FORCE_CHECK_PAGE() \ |
657 | (blkno == nblocks - 1 && should_attempt_truncation(params, vacrelstats)) |
658 | |
659 | pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); |
660 | |
661 | if (blkno == next_unskippable_block) |
662 | { |
663 | /* Time to advance next_unskippable_block */ |
664 | next_unskippable_block++; |
665 | if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0) |
666 | { |
667 | while (next_unskippable_block < nblocks) |
668 | { |
669 | uint8 vmskipflags; |
670 | |
671 | vmskipflags = visibilitymap_get_status(onerel, |
672 | next_unskippable_block, |
673 | &vmbuffer); |
674 | if (aggressive) |
675 | { |
676 | if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0) |
677 | break; |
678 | } |
679 | else |
680 | { |
681 | if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0) |
682 | break; |
683 | } |
684 | vacuum_delay_point(); |
685 | next_unskippable_block++; |
686 | } |
687 | } |
688 | |
689 | /* |
690 | * We know we can't skip the current block. But set up |
691 | * skipping_blocks to do the right thing at the following blocks. |
692 | */ |
693 | if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD) |
694 | skipping_blocks = true; |
695 | else |
696 | skipping_blocks = false; |
697 | |
698 | /* |
699 | * Normally, the fact that we can't skip this block must mean that |
700 | * it's not all-visible. But in an aggressive vacuum we know only |
701 | * that it's not all-frozen, so it might still be all-visible. |
702 | */ |
703 | if (aggressive && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer)) |
704 | all_visible_according_to_vm = true; |
705 | } |
706 | else |
707 | { |
708 | /* |
709 | * The current block is potentially skippable; if we've seen a |
710 | * long enough run of skippable blocks to justify skipping it, and |
711 | * we're not forced to check it, then go ahead and skip. |
712 | * Otherwise, the page must be at least all-visible if not |
713 | * all-frozen, so we can set all_visible_according_to_vm = true. |
714 | */ |
715 | if (skipping_blocks && !FORCE_CHECK_PAGE()) |
716 | { |
717 | /* |
718 | * Tricky, tricky. If this is in aggressive vacuum, the page |
719 | * must have been all-frozen at the time we checked whether it |
720 | * was skippable, but it might not be any more. We must be |
721 | * careful to count it as a skipped all-frozen page in that |
722 | * case, or else we'll think we can't update relfrozenxid and |
723 | * relminmxid. If it's not an aggressive vacuum, we don't |
724 | * know whether it was all-frozen, so we have to recheck; but |
725 | * in this case an approximate answer is OK. |
726 | */ |
727 | if (aggressive || VM_ALL_FROZEN(onerel, blkno, &vmbuffer)) |
728 | vacrelstats->frozenskipped_pages++; |
729 | continue; |
730 | } |
731 | all_visible_according_to_vm = true; |
732 | } |
733 | |
734 | vacuum_delay_point(); |
735 | |
736 | /* |
737 | * If we are close to overrunning the available space for dead-tuple |
738 | * TIDs, pause and do a cycle of vacuuming before we tackle this page. |
739 | */ |
740 | if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && |
741 | vacrelstats->num_dead_tuples > 0) |
742 | { |
743 | const int hvp_index[] = { |
744 | PROGRESS_VACUUM_PHASE, |
745 | PROGRESS_VACUUM_NUM_INDEX_VACUUMS |
746 | }; |
747 | int64 hvp_val[2]; |
748 | |
749 | /* |
750 | * Before beginning index vacuuming, we release any pin we may |
751 | * hold on the visibility map page. This isn't necessary for |
752 | * correctness, but we do it anyway to avoid holding the pin |
753 | * across a lengthy, unrelated operation. |
754 | */ |
755 | if (BufferIsValid(vmbuffer)) |
756 | { |
757 | ReleaseBuffer(vmbuffer); |
758 | vmbuffer = InvalidBuffer; |
759 | } |
760 | |
761 | /* Log cleanup info before we touch indexes */ |
762 | vacuum_log_cleanup_info(onerel, vacrelstats); |
763 | |
764 | /* Report that we are now vacuuming indexes */ |
765 | pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, |
766 | PROGRESS_VACUUM_PHASE_VACUUM_INDEX); |
767 | |
768 | /* Remove index entries */ |
769 | for (i = 0; i < nindexes; i++) |
770 | lazy_vacuum_index(Irel[i], |
771 | &indstats[i], |
772 | vacrelstats); |
773 | |
774 | /* |
775 | * Report that we are now vacuuming the heap. We also increase |
776 | * the number of index scans here; note that by using |
777 | * pgstat_progress_update_multi_param we can update both |
778 | * parameters atomically. |
779 | */ |
780 | hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP; |
781 | hvp_val[1] = vacrelstats->num_index_scans + 1; |
782 | pgstat_progress_update_multi_param(2, hvp_index, hvp_val); |
783 | |
784 | /* Remove tuples from heap */ |
785 | lazy_vacuum_heap(onerel, vacrelstats); |
786 | |
787 | /* |
788 | * Forget the now-vacuumed tuples, and press on, but be careful |
789 | * not to reset latestRemovedXid since we want that value to be |
790 | * valid. |
791 | */ |
792 | vacrelstats->num_dead_tuples = 0; |
793 | vacrelstats->num_index_scans++; |
794 | |
795 | /* |
796 | * Vacuum the Free Space Map to make newly-freed space visible on |
797 | * upper-level FSM pages. Note we have not yet processed blkno. |
798 | */ |
799 | FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno); |
800 | next_fsm_block_to_vacuum = blkno; |
801 | |
802 | /* Report that we are once again scanning the heap */ |
803 | pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, |
804 | PROGRESS_VACUUM_PHASE_SCAN_HEAP); |
805 | } |
806 | |
807 | /* |
808 | * Pin the visibility map page in case we need to mark the page |
809 | * all-visible. In most cases this will be very cheap, because we'll |
810 | * already have the correct page pinned anyway. However, it's |
811 | * possible that (a) next_unskippable_block is covered by a different |
812 | * VM page than the current block or (b) we released our pin and did a |
813 | * cycle of index vacuuming. |
814 | * |
815 | */ |
816 | visibilitymap_pin(onerel, blkno, &vmbuffer); |
817 | |
818 | buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, |
819 | RBM_NORMAL, vac_strategy); |
820 | |
821 | /* We need buffer cleanup lock so that we can prune HOT chains. */ |
822 | if (!ConditionalLockBufferForCleanup(buf)) |
823 | { |
824 | /* |
825 | * If we're not performing an aggressive scan to guard against XID |
826 | * wraparound, and we don't want to forcibly check the page, then |
827 | * it's OK to skip vacuuming pages we get a lock conflict on. They |
828 | * will be dealt with in some future vacuum. |
829 | */ |
830 | if (!aggressive && !FORCE_CHECK_PAGE()) |
831 | { |
832 | ReleaseBuffer(buf); |
833 | vacrelstats->pinskipped_pages++; |
834 | continue; |
835 | } |
836 | |
837 | /* |
838 | * Read the page with share lock to see if any xids on it need to |
839 | * be frozen. If not we just skip the page, after updating our |
840 | * scan statistics. If there are some, we wait for cleanup lock. |
841 | * |
842 | * We could defer the lock request further by remembering the page |
843 | * and coming back to it later, or we could even register |
844 | * ourselves for multiple buffers and then service whichever one |
845 | * is received first. For now, this seems good enough. |
846 | * |
847 | * If we get here with aggressive false, then we're just forcibly |
848 | * checking the page, and so we don't want to insist on getting |
849 | * the lock; we only need to know if the page contains tuples, so |
850 | * that we can update nonempty_pages correctly. It's convenient |
851 | * to use lazy_check_needs_freeze() for both situations, though. |
852 | */ |
853 | LockBuffer(buf, BUFFER_LOCK_SHARE); |
854 | if (!lazy_check_needs_freeze(buf, &hastup)) |
855 | { |
856 | UnlockReleaseBuffer(buf); |
857 | vacrelstats->scanned_pages++; |
858 | vacrelstats->pinskipped_pages++; |
859 | if (hastup) |
860 | vacrelstats->nonempty_pages = blkno + 1; |
861 | continue; |
862 | } |
863 | if (!aggressive) |
864 | { |
865 | /* |
866 | * Here, we must not advance scanned_pages; that would amount |
867 | * to claiming that the page contains no freezable tuples. |
868 | */ |
869 | UnlockReleaseBuffer(buf); |
870 | vacrelstats->pinskipped_pages++; |
871 | if (hastup) |
872 | vacrelstats->nonempty_pages = blkno + 1; |
873 | continue; |
874 | } |
875 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
876 | LockBufferForCleanup(buf); |
877 | /* drop through to normal processing */ |
878 | } |
879 | |
880 | vacrelstats->scanned_pages++; |
881 | vacrelstats->tupcount_pages++; |
882 | |
883 | page = BufferGetPage(buf); |
884 | |
885 | if (PageIsNew(page)) |
886 | { |
887 | bool still_new; |
888 | |
889 | /* |
* All-zeroes pages can be left over either when a backend extends
* the relation by a single page but crashes before the newly
* initialized page has been written out, or when bulk-extending
893 | * the relation (which creates a number of empty pages at the tail |
894 | * end of the relation, but enters them into the FSM). |
895 | * |
896 | * Make sure these pages are in the FSM, to ensure they can be |
897 | * reused. Do that by testing if there's any space recorded for |
898 | * the page. If not, enter it. |
899 | * |
900 | * Note we do not enter the page into the visibilitymap. That has |
901 | * the downside that we repeatedly visit this page in subsequent |
* vacuums, but otherwise we might never discover the space on a
903 | * promoted standby. The harm of repeated checking ought to |
904 | * normally not be too bad - the space usually should be used at |
905 | * some point, otherwise there wouldn't be any regular vacuums. |
906 | */ |
907 | |
908 | /* |
* Check the FSM only after releasing the lock; the FSM is
* approximate, after all.
911 | */ |
912 | still_new = PageIsNew(page); |
913 | UnlockReleaseBuffer(buf); |
914 | |
915 | if (still_new) |
916 | { |
917 | empty_pages++; |
918 | |
919 | if (GetRecordedFreeSpace(onerel, blkno) == 0) |
920 | { |
921 | Size freespace; |
922 | |
923 | freespace = BufferGetPageSize(buf) - SizeOfPageHeaderData; |
924 | RecordPageWithFreeSpace(onerel, blkno, freespace); |
925 | } |
926 | } |
927 | continue; |
928 | } |
929 | |
930 | if (PageIsEmpty(page)) |
931 | { |
932 | empty_pages++; |
933 | freespace = PageGetHeapFreeSpace(page); |
934 | |
935 | /* |
936 | * Empty pages are always all-visible and all-frozen (note that |
937 | * the same is currently not true for new pages, see above). |
938 | */ |
939 | if (!PageIsAllVisible(page)) |
940 | { |
941 | START_CRIT_SECTION(); |
942 | |
943 | /* mark buffer dirty before writing a WAL record */ |
944 | MarkBufferDirty(buf); |
945 | |
946 | /* |
947 | * It's possible that another backend has extended the heap, |
948 | * initialized the page, and then failed to WAL-log the page |
949 | * due to an ERROR. Since heap extension is not WAL-logged, |
950 | * recovery might try to replay our record setting the page |
951 | * all-visible and find that the page isn't initialized, which |
952 | * will cause a PANIC. To prevent that, check whether the |
953 | * page has been previously WAL-logged, and if not, do that |
954 | * now. |
955 | */ |
956 | if (RelationNeedsWAL(onerel) && |
957 | PageGetLSN(page) == InvalidXLogRecPtr) |
958 | log_newpage_buffer(buf, true); |
959 | |
960 | PageSetAllVisible(page); |
961 | visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, |
962 | vmbuffer, InvalidTransactionId, |
963 | VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); |
964 | END_CRIT_SECTION(); |
965 | } |
966 | |
967 | UnlockReleaseBuffer(buf); |
968 | RecordPageWithFreeSpace(onerel, blkno, freespace); |
969 | continue; |
970 | } |
971 | |
972 | /* |
973 | * Prune all HOT-update chains in this page. |
974 | * |
975 | * We count tuples removed by the pruning step as removed by VACUUM. |
976 | */ |
977 | tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, |
978 | &vacrelstats->latestRemovedXid); |
979 | |
980 | /* |
981 | * Now scan the page to collect vacuumable items and check for tuples |
982 | * requiring freezing. |
983 | */ |
984 | all_visible = true; |
985 | has_dead_tuples = false; |
986 | nfrozen = 0; |
987 | hastup = false; |
988 | prev_dead_count = vacrelstats->num_dead_tuples; |
989 | maxoff = PageGetMaxOffsetNumber(page); |
990 | |
991 | /* |
992 | * Note: If you change anything in the loop below, also look at |
993 | * heap_page_is_all_visible to see if that needs to be changed. |
994 | */ |
995 | for (offnum = FirstOffsetNumber; |
996 | offnum <= maxoff; |
997 | offnum = OffsetNumberNext(offnum)) |
998 | { |
999 | ItemId itemid; |
1000 | |
1001 | itemid = PageGetItemId(page, offnum); |
1002 | |
1003 | /* Unused items require no processing, but we count 'em */ |
1004 | if (!ItemIdIsUsed(itemid)) |
1005 | { |
1006 | nunused += 1; |
1007 | continue; |
1008 | } |
1009 | |
1010 | /* Redirect items mustn't be touched */ |
1011 | if (ItemIdIsRedirected(itemid)) |
1012 | { |
1013 | hastup = true; /* this page won't be truncatable */ |
1014 | continue; |
1015 | } |
1016 | |
1017 | ItemPointerSet(&(tuple.t_self), blkno, offnum); |
1018 | |
1019 | /* |
1020 | * DEAD line pointers are to be vacuumed normally; but we don't |
1021 | * count them in tups_vacuumed, else we'd be double-counting (at |
1022 | * least in the common case where heap_page_prune() just freed up |
1023 | * a non-HOT tuple). |
1024 | */ |
1025 | if (ItemIdIsDead(itemid)) |
1026 | { |
1027 | lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); |
1028 | all_visible = false; |
1029 | continue; |
1030 | } |
1031 | |
1032 | Assert(ItemIdIsNormal(itemid)); |
1033 | |
1034 | tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); |
1035 | tuple.t_len = ItemIdGetLength(itemid); |
1036 | tuple.t_tableOid = RelationGetRelid(onerel); |
1037 | |
1038 | tupgone = false; |
1039 | |
1040 | /* |
1041 | * The criteria for counting a tuple as live in this block need to |
1042 | * match what analyze.c's acquire_sample_rows() does, otherwise |
1043 | * VACUUM and ANALYZE may produce wildly different reltuples |
1044 | * values, e.g. when there are many recently-dead tuples. |
1045 | * |
1046 | * The logic here is a bit simpler than acquire_sample_rows(), as |
1047 | * VACUUM can't run inside a transaction block, which makes some |
1048 | * cases impossible (e.g. in-progress insert from the same |
1049 | * transaction). |
1050 | */ |
1051 | switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) |
1052 | { |
1053 | case HEAPTUPLE_DEAD: |
1054 | |
1055 | /* |
1056 | * Ordinarily, DEAD tuples would have been removed by |
1057 | * heap_page_prune(), but it's possible that the tuple |
1058 | * state changed since heap_page_prune() looked. In |
1059 | * particular an INSERT_IN_PROGRESS tuple could have |
1060 | * changed to DEAD if the inserter aborted. So this |
1061 | * cannot be considered an error condition. |
1062 | * |
1063 | * If the tuple is HOT-updated then it must only be |
1064 | * removed by a prune operation; so we keep it just as if |
1065 | * it were RECENTLY_DEAD. Also, if it's a heap-only |
1066 | * tuple, we choose to keep it, because it'll be a lot |
1067 | * cheaper to get rid of it in the next pruning pass than |
1068 | * to treat it like an indexed tuple. Finally, if index |
1069 | * cleanup is disabled, the second heap pass will not |
1070 | * execute, and the tuple will not get removed, so we must |
1071 | * treat it like any other dead tuple that we choose to |
1072 | * keep. |
1073 | * |
1074 | * If this were to happen for a tuple that actually needed |
1075 | * to be deleted, we'd be in trouble, because it'd |
1076 | * possibly leave a tuple below the relation's xmin |
1077 | * horizon alive. heap_prepare_freeze_tuple() is prepared |
1078 | * to detect that case and abort the transaction, |
1079 | * preventing corruption. |
1080 | */ |
1081 | if (HeapTupleIsHotUpdated(&tuple) || |
1082 | HeapTupleIsHeapOnly(&tuple) || |
1083 | params->index_cleanup == VACOPT_TERNARY_DISABLED) |
1084 | nkeep += 1; |
1085 | else |
1086 | tupgone = true; /* we can delete the tuple */ |
1087 | all_visible = false; |
1088 | break; |
1089 | case HEAPTUPLE_LIVE: |
1090 | |
1091 | /* |
1092 | * Count it as live. Not only is this natural, but it's |
1093 | * also what acquire_sample_rows() does. |
1094 | */ |
1095 | live_tuples += 1; |
1096 | |
1097 | /* |
1098 | * Is the tuple definitely visible to all transactions? |
1099 | * |
1100 | * NB: Like with per-tuple hint bits, we can't set the |
1101 | * PD_ALL_VISIBLE flag if the inserter committed |
1102 | * asynchronously. See SetHintBits for more info. Check |
1103 | * that the tuple is hinted xmin-committed because of |
1104 | * that. |
1105 | */ |
1106 | if (all_visible) |
1107 | { |
1108 | TransactionId xmin; |
1109 | |
1110 | if (!HeapTupleHeaderXminCommitted(tuple.t_data)) |
1111 | { |
1112 | all_visible = false; |
1113 | break; |
1114 | } |
1115 | |
1116 | /* |
1117 | * The inserter definitely committed. But is it old |
1118 | * enough that everyone sees it as committed? |
1119 | */ |
1120 | xmin = HeapTupleHeaderGetXmin(tuple.t_data); |
1121 | if (!TransactionIdPrecedes(xmin, OldestXmin)) |
1122 | { |
1123 | all_visible = false; |
1124 | break; |
1125 | } |
1126 | |
1127 | /* Track newest xmin on page. */ |
1128 | if (TransactionIdFollows(xmin, visibility_cutoff_xid)) |
1129 | visibility_cutoff_xid = xmin; |
1130 | } |
1131 | break; |
1132 | case HEAPTUPLE_RECENTLY_DEAD: |
1133 | |
1134 | /* |
1135 | * If tuple is recently deleted then we must not remove it |
1136 | * from relation. |
1137 | */ |
1138 | nkeep += 1; |
1139 | all_visible = false; |
1140 | break; |
1141 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
1142 | |
1143 | /* |
1144 | * This is an expected case during concurrent vacuum. |
1145 | * |
1146 | * We do not count these rows as live, because we expect |
1147 | * the inserting transaction to update the counters at |
1148 | * commit, and we assume that will happen only after we |
1149 | * report our results. This assumption is a bit shaky, |
1150 | * but it is what acquire_sample_rows() does, so be |
1151 | * consistent. |
1152 | */ |
1153 | all_visible = false; |
1154 | break; |
1155 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
1156 | /* This is an expected case during concurrent vacuum */ |
1157 | all_visible = false; |
1158 | |
1159 | /* |
1160 | * Count such rows as live. As above, we assume the |
1161 | * deleting transaction will commit and update the |
1162 | * counters after we report. |
1163 | */ |
1164 | live_tuples += 1; |
1165 | break; |
1166 | default: |
1167 | elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result" ); |
1168 | break; |
1169 | } |
1170 | |
1171 | if (tupgone) |
1172 | { |
1173 | lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); |
1174 | HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, |
1175 | &vacrelstats->latestRemovedXid); |
1176 | tups_vacuumed += 1; |
1177 | has_dead_tuples = true; |
1178 | } |
1179 | else |
1180 | { |
1181 | bool tuple_totally_frozen; |
1182 | |
1183 | num_tuples += 1; |
1184 | hastup = true; |
1185 | |
1186 | /* |
1187 | * Each non-removable tuple must be checked to see if it needs |
1188 | * freezing. Note we already have exclusive buffer lock. |
1189 | */ |
1190 | if (heap_prepare_freeze_tuple(tuple.t_data, |
1191 | relfrozenxid, relminmxid, |
1192 | FreezeLimit, MultiXactCutoff, |
1193 | &frozen[nfrozen], |
1194 | &tuple_totally_frozen)) |
1195 | frozen[nfrozen++].offset = offnum; |
1196 | |
1197 | if (!tuple_totally_frozen) |
1198 | all_frozen = false; |
1199 | } |
1200 | } /* scan along page */ |
1201 | |
1202 | /* |
1203 | * If we froze any tuples, mark the buffer dirty, and write a WAL |
1204 | * record recording the changes. We must log the changes to be |
1205 | * crash-safe against future truncation of CLOG. |
1206 | */ |
1207 | if (nfrozen > 0) |
1208 | { |
1209 | START_CRIT_SECTION(); |
1210 | |
1211 | MarkBufferDirty(buf); |
1212 | |
1213 | /* execute collected freezes */ |
1214 | for (i = 0; i < nfrozen; i++) |
1215 | { |
1216 | ItemId itemid; |
1217 | HeapTupleHeader htup; |
1218 | |
1219 | itemid = PageGetItemId(page, frozen[i].offset); |
1220 | htup = (HeapTupleHeader) PageGetItem(page, itemid); |
1221 | |
1222 | heap_execute_freeze_tuple(htup, &frozen[i]); |
1223 | } |
1224 | |
1225 | /* Now WAL-log freezing if necessary */ |
1226 | if (RelationNeedsWAL(onerel)) |
1227 | { |
1228 | XLogRecPtr recptr; |
1229 | |
1230 | recptr = log_heap_freeze(onerel, buf, FreezeLimit, |
1231 | frozen, nfrozen); |
1232 | PageSetLSN(page, recptr); |
1233 | } |
1234 | |
1235 | END_CRIT_SECTION(); |
1236 | } |
1237 | |
1238 | /* |
1239 | * If there are no indexes we can vacuum the page right now instead of |
* doing a second scan.  When index cleanup is disabled we skip the second
* scan as well, but then we simply forget the dead tuples rather than
* vacuuming them.
1242 | */ |
1243 | if (!vacrelstats->useindex && vacrelstats->num_dead_tuples > 0) |
1244 | { |
1245 | if (nindexes == 0) |
1246 | { |
1247 | /* Remove tuples from heap if the table has no index */ |
1248 | lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer); |
1249 | vacuumed_pages++; |
1250 | has_dead_tuples = false; |
1251 | } |
1252 | else |
1253 | { |
1254 | /* |
1255 | * Here, we have indexes but index cleanup is disabled. |
1256 | * Instead of vacuuming the dead tuples on the heap, we just |
1257 | * forget them. |
1258 | * |
1259 | * Note that vacrelstats->dead_tuples could have tuples which |
1260 | * became dead after HOT-pruning but are not marked dead yet. |
1261 | * We do not process them because it's a very rare condition, |
1262 | * and the next vacuum will process them anyway. |
1263 | */ |
1264 | Assert(params->index_cleanup == VACOPT_TERNARY_DISABLED); |
1265 | } |
1266 | |
1267 | /* |
1268 | * Forget the now-vacuumed tuples, and press on, but be careful |
1269 | * not to reset latestRemovedXid since we want that value to be |
1270 | * valid. |
1271 | */ |
1272 | vacrelstats->num_dead_tuples = 0; |
1273 | |
1274 | /* |
1275 | * Periodically do incremental FSM vacuuming to make newly-freed |
1276 | * space visible on upper FSM pages. Note: although we've cleaned |
1277 | * the current block, we haven't yet updated its FSM entry (that |
1278 | * happens further down), so passing end == blkno is correct. |
1279 | */ |
1280 | if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) |
1281 | { |
1282 | FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, |
1283 | blkno); |
1284 | next_fsm_block_to_vacuum = blkno; |
1285 | } |
1286 | } |
1287 | |
1288 | freespace = PageGetHeapFreeSpace(page); |
1289 | |
1290 | /* mark page all-visible, if appropriate */ |
1291 | if (all_visible && !all_visible_according_to_vm) |
1292 | { |
1293 | uint8 flags = VISIBILITYMAP_ALL_VISIBLE; |
1294 | |
1295 | if (all_frozen) |
1296 | flags |= VISIBILITYMAP_ALL_FROZEN; |
1297 | |
1298 | /* |
1299 | * It should never be the case that the visibility map page is set |
1300 | * while the page-level bit is clear, but the reverse is allowed |
* (if checksums are not enabled). Regardless, set both bits
1302 | * so that we get back in sync. |
1303 | * |
1304 | * NB: If the heap page is all-visible but the VM bit is not set, |
1305 | * we don't need to dirty the heap page. However, if checksums |
1306 | * are enabled, we do need to make sure that the heap page is |
1307 | * dirtied before passing it to visibilitymap_set(), because it |
1308 | * may be logged. Given that this situation should only happen in |
1309 | * rare cases after a crash, it is not worth optimizing. |
1310 | */ |
1311 | PageSetAllVisible(page); |
1312 | MarkBufferDirty(buf); |
1313 | visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, |
1314 | vmbuffer, visibility_cutoff_xid, flags); |
1315 | } |
1316 | |
1317 | /* |
1318 | * As of PostgreSQL 9.2, the visibility map bit should never be set if |
1319 | * the page-level bit is clear. However, it's possible that the bit |
1320 | * got cleared after we checked it and before we took the buffer |
1321 | * content lock, so we must recheck before jumping to the conclusion |
1322 | * that something bad has happened. |
1323 | */ |
1324 | else if (all_visible_according_to_vm && !PageIsAllVisible(page) |
1325 | && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer)) |
1326 | { |
1327 | elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u" , |
1328 | relname, blkno); |
1329 | visibilitymap_clear(onerel, blkno, vmbuffer, |
1330 | VISIBILITYMAP_VALID_BITS); |
1331 | } |
1332 | |
1333 | /* |
1334 | * It's possible for the value returned by GetOldestXmin() to move |
1335 | * backwards, so it's not wrong for us to see tuples that appear to |
1336 | * not be visible to everyone yet, while PD_ALL_VISIBLE is already |
1337 | * set. The real safe xmin value never moves backwards, but |
1338 | * GetOldestXmin() is conservative and sometimes returns a value |
1339 | * that's unnecessarily small, so if we see that contradiction it just |
1340 | * means that the tuples that we think are not visible to everyone yet |
1341 | * actually are, and the PD_ALL_VISIBLE flag is correct. |
1342 | * |
1343 | * There should never be dead tuples on a page with PD_ALL_VISIBLE |
1344 | * set, however. |
1345 | */ |
1346 | else if (PageIsAllVisible(page) && has_dead_tuples) |
1347 | { |
1348 | elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u" , |
1349 | relname, blkno); |
1350 | PageClearAllVisible(page); |
1351 | MarkBufferDirty(buf); |
1352 | visibilitymap_clear(onerel, blkno, vmbuffer, |
1353 | VISIBILITYMAP_VALID_BITS); |
1354 | } |
1355 | |
1356 | /* |
* If the all-visible page turns out to be all-frozen but is not marked
* as such, mark it now.  Note that all_frozen is only valid
1359 | * if all_visible is true, so we must check both. |
1360 | */ |
1361 | else if (all_visible_according_to_vm && all_visible && all_frozen && |
1362 | !VM_ALL_FROZEN(onerel, blkno, &vmbuffer)) |
1363 | { |
1364 | /* |
1365 | * We can pass InvalidTransactionId as the cutoff XID here, |
1366 | * because setting the all-frozen bit doesn't cause recovery |
1367 | * conflicts. |
1368 | */ |
1369 | visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, |
1370 | vmbuffer, InvalidTransactionId, |
1371 | VISIBILITYMAP_ALL_FROZEN); |
1372 | } |
1373 | |
1374 | UnlockReleaseBuffer(buf); |
1375 | |
1376 | /* Remember the location of the last page with nonremovable tuples */ |
1377 | if (hastup) |
1378 | vacrelstats->nonempty_pages = blkno + 1; |
1379 | |
1380 | /* |
1381 | * If we remembered any tuples for deletion, then the page will be |
1382 | * visited again by lazy_vacuum_heap, which will compute and record |
1383 | * its post-compaction free space. If not, then we're done with this |
1384 | * page, so remember its free space as-is. (This path will always be |
1385 | * taken if there are no indexes.) |
1386 | */ |
1387 | if (vacrelstats->num_dead_tuples == prev_dead_count) |
1388 | RecordPageWithFreeSpace(onerel, blkno, freespace); |
1389 | } |
1390 | |
1391 | /* report that everything is scanned and vacuumed */ |
1392 | pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); |
1393 | |
1394 | pfree(frozen); |
1395 | |
1396 | /* save stats for use later */ |
1397 | vacrelstats->tuples_deleted = tups_vacuumed; |
1398 | vacrelstats->new_dead_tuples = nkeep; |
1399 | |
1400 | /* now we can compute the new value for pg_class.reltuples */ |
1401 | vacrelstats->new_live_tuples = vac_estimate_reltuples(onerel, |
1402 | nblocks, |
1403 | vacrelstats->tupcount_pages, |
1404 | live_tuples); |
1405 | |
1406 | /* also compute total number of surviving heap entries */ |
1407 | vacrelstats->new_rel_tuples = |
1408 | vacrelstats->new_live_tuples + vacrelstats->new_dead_tuples; |
1409 | |
1410 | /* |
1411 | * Release any remaining pin on visibility map page. |
1412 | */ |
1413 | if (BufferIsValid(vmbuffer)) |
1414 | { |
1415 | ReleaseBuffer(vmbuffer); |
1416 | vmbuffer = InvalidBuffer; |
1417 | } |
1418 | |
1419 | /* If any tuples need to be deleted, perform final vacuum cycle */ |
1420 | /* XXX put a threshold on min number of tuples here? */ |
1421 | if (vacrelstats->num_dead_tuples > 0) |
1422 | { |
1423 | const int hvp_index[] = { |
1424 | PROGRESS_VACUUM_PHASE, |
1425 | PROGRESS_VACUUM_NUM_INDEX_VACUUMS |
1426 | }; |
1427 | int64 hvp_val[2]; |
1428 | |
1429 | /* Log cleanup info before we touch indexes */ |
1430 | vacuum_log_cleanup_info(onerel, vacrelstats); |
1431 | |
1432 | /* Report that we are now vacuuming indexes */ |
1433 | pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, |
1434 | PROGRESS_VACUUM_PHASE_VACUUM_INDEX); |
1435 | |
1436 | /* Remove index entries */ |
1437 | for (i = 0; i < nindexes; i++) |
1438 | lazy_vacuum_index(Irel[i], |
1439 | &indstats[i], |
1440 | vacrelstats); |
1441 | |
1442 | /* Report that we are now vacuuming the heap */ |
1443 | hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP; |
1444 | hvp_val[1] = vacrelstats->num_index_scans + 1; |
1445 | pgstat_progress_update_multi_param(2, hvp_index, hvp_val); |
1446 | |
		/* Remove tuples from heap */
		lazy_vacuum_heap(onerel, vacrelstats);
1451 | vacrelstats->num_index_scans++; |
1452 | } |
1453 | |
1454 | /* |
1455 | * Vacuum the remainder of the Free Space Map. We must do this whether or |
1456 | * not there were indexes. |
1457 | */ |
1458 | if (blkno > next_fsm_block_to_vacuum) |
1459 | FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno); |
1460 | |
1461 | /* report all blocks vacuumed; and that we're cleaning up */ |
1462 | pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); |
1463 | pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, |
1464 | PROGRESS_VACUUM_PHASE_INDEX_CLEANUP); |
1465 | |
1466 | /* Do post-vacuum cleanup and statistics update for each index */ |
1467 | if (vacrelstats->useindex) |
1468 | { |
1469 | for (i = 0; i < nindexes; i++) |
1470 | lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); |
1471 | } |
1472 | |
1473 | /* If no indexes, make log report that lazy_vacuum_heap would've made */ |
1474 | if (vacuumed_pages) |
1475 | ereport(elevel, |
				(errmsg("\"%s\": removed %.0f row versions in %u pages",
1477 | RelationGetRelationName(onerel), |
1478 | tups_vacuumed, vacuumed_pages))); |
1479 | |
1480 | /* |
1481 | * This is pretty messy, but we split it up so that we can skip emitting |
1482 | * individual parts of the message when not applicable. |
1483 | */ |
1484 | initStringInfo(&buf); |
	appendStringInfo(&buf,
					 _("%.0f dead row versions cannot be removed yet, oldest xmin: %u\n"),
					 nkeep, OldestXmin);
	appendStringInfo(&buf, _("There were %.0f unused item identifiers.\n"),
					 nunused);
	appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ",
									"Skipped %u pages due to buffer pins, ",
									vacrelstats->pinskipped_pages),
					 vacrelstats->pinskipped_pages);
	appendStringInfo(&buf, ngettext("%u frozen page.\n",
									"%u frozen pages.\n",
									vacrelstats->frozenskipped_pages),
					 vacrelstats->frozenskipped_pages);
	appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
									"%u pages are entirely empty.\n",
									empty_pages),
					 empty_pages);
	appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0));

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples,
					vacrelstats->scanned_pages, nblocks),
			 errdetail_internal("%s", buf.data)));
1510 | pfree(buf.data); |
1511 | } |
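
/*
 * A minimal sketch of the multi-parameter progress reporting used in the
 * final vacuum cycle above: the phase and the index-vacuum pass counter are
 * published in a single update rather than with two separate calls.  This
 * is illustrative only and kept under #if 0; the helper name is
 * hypothetical.
 */
#if 0
static void
example_report_heap_vacuum_phase(int num_index_scans)
{
	const int	prog_index[] = {
		PROGRESS_VACUUM_PHASE,
		PROGRESS_VACUUM_NUM_INDEX_VACUUMS
	};
	int64		prog_val[2];

	/* publish the new phase and the pass number together */
	prog_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
	prog_val[1] = num_index_scans + 1;
	pgstat_progress_update_multi_param(2, prog_index, prog_val);
}
#endif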
1512 | |
1513 | |
1514 | /* |
1515 | * lazy_vacuum_heap() -- second pass over the heap |
1516 | * |
1517 | * This routine marks dead tuples as unused and compacts out free |
1518 | * space on their pages. Pages not having dead tuples recorded from |
1519 | * lazy_scan_heap are not visited at all. |
1520 | * |
1521 | * Note: the reason for doing this as a second pass is we cannot remove |
1522 | * the tuples until we've removed their index entries, and we want to |
1523 | * process index entry removal in batches as large as possible. |
1524 | */ |
1525 | static void |
1526 | lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) |
1527 | { |
1528 | int tupindex; |
1529 | int npages; |
1530 | PGRUsage ru0; |
1531 | Buffer vmbuffer = InvalidBuffer; |
1532 | |
1533 | pg_rusage_init(&ru0); |
1534 | npages = 0; |
1535 | |
1536 | tupindex = 0; |
1537 | while (tupindex < vacrelstats->num_dead_tuples) |
1538 | { |
1539 | BlockNumber tblk; |
1540 | Buffer buf; |
1541 | Page page; |
1542 | Size freespace; |
1543 | |
1544 | vacuum_delay_point(); |
1545 | |
1546 | tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); |
1547 | buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, |
1548 | vac_strategy); |
1549 | if (!ConditionalLockBufferForCleanup(buf)) |
1550 | { |
1551 | ReleaseBuffer(buf); |
1552 | ++tupindex; |
1553 | continue; |
1554 | } |
1555 | tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats, |
1556 | &vmbuffer); |
1557 | |
1558 | /* Now that we've compacted the page, record its available space */ |
1559 | page = BufferGetPage(buf); |
1560 | freespace = PageGetHeapFreeSpace(page); |
1561 | |
1562 | UnlockReleaseBuffer(buf); |
1563 | RecordPageWithFreeSpace(onerel, tblk, freespace); |
1564 | npages++; |
1565 | } |
1566 | |
1567 | if (BufferIsValid(vmbuffer)) |
1568 | { |
1569 | ReleaseBuffer(vmbuffer); |
1570 | vmbuffer = InvalidBuffer; |
1571 | } |
1572 | |
	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail_internal("%s", pg_rusage_show(&ru0))));
1578 | } |
1579 | |
1580 | /* |
1581 | * lazy_vacuum_page() -- free dead tuples on a page |
1582 | * and repair its fragmentation. |
1583 | * |
1584 | * Caller must hold pin and buffer cleanup lock on the buffer. |
1585 | * |
1586 | * tupindex is the index in vacrelstats->dead_tuples of the first dead |
1587 | * tuple for this page. We assume the rest follow sequentially. |
1588 | * The return value is the first tupindex after the tuples of this page. |
1589 | */ |
1590 | static int |
1591 | lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, |
1592 | int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer) |
1593 | { |
1594 | Page page = BufferGetPage(buffer); |
1595 | OffsetNumber unused[MaxOffsetNumber]; |
1596 | int uncnt = 0; |
1597 | TransactionId visibility_cutoff_xid; |
1598 | bool all_frozen; |
1599 | |
1600 | pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); |
1601 | |
1602 | START_CRIT_SECTION(); |
1603 | |
1604 | for (; tupindex < vacrelstats->num_dead_tuples; tupindex++) |
1605 | { |
1606 | BlockNumber tblk; |
1607 | OffsetNumber toff; |
1608 | ItemId itemid; |
1609 | |
1610 | tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); |
1611 | if (tblk != blkno) |
1612 | break; /* past end of tuples for this block */ |
1613 | toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]); |
1614 | itemid = PageGetItemId(page, toff); |
1615 | ItemIdSetUnused(itemid); |
1616 | unused[uncnt++] = toff; |
1617 | } |
1618 | |
1619 | PageRepairFragmentation(page); |
1620 | |
1621 | /* |
1622 | * Mark buffer dirty before we write WAL. |
1623 | */ |
1624 | MarkBufferDirty(buffer); |
1625 | |
1626 | /* XLOG stuff */ |
1627 | if (RelationNeedsWAL(onerel)) |
1628 | { |
1629 | XLogRecPtr recptr; |
1630 | |
1631 | recptr = log_heap_clean(onerel, buffer, |
1632 | NULL, 0, NULL, 0, |
1633 | unused, uncnt, |
1634 | vacrelstats->latestRemovedXid); |
1635 | PageSetLSN(page, recptr); |
1636 | } |
1637 | |
1638 | /* |
1639 | * End critical section, so we safely can do visibility tests (which |
1640 | * possibly need to perform IO and allocate memory!). If we crash now the |
1641 | * page (including the corresponding vm bit) might not be marked all |
1642 | * visible, but that's fine. A later vacuum will fix that. |
1643 | */ |
1644 | END_CRIT_SECTION(); |
1645 | |
1646 | /* |
1647 | * Now that we have removed the dead tuples from the page, once again |
1648 | * check if the page has become all-visible. The page is already marked |
1649 | * dirty, exclusively locked, and, if needed, a full page image has been |
1650 | * emitted in the log_heap_clean() above. |
1651 | */ |
1652 | if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid, |
1653 | &all_frozen)) |
1654 | PageSetAllVisible(page); |
1655 | |
1656 | /* |
1657 | * All the changes to the heap page have been done. If the all-visible |
1658 | * flag is now set, also set the VM all-visible bit (and, if possible, the |
1659 | * all-frozen bit) unless this has already been done previously. |
1660 | */ |
1661 | if (PageIsAllVisible(page)) |
1662 | { |
1663 | uint8 vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer); |
1664 | uint8 flags = 0; |
1665 | |
		/* Determine which VM bits still need to be set */
1667 | if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0) |
1668 | flags |= VISIBILITYMAP_ALL_VISIBLE; |
1669 | if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen) |
1670 | flags |= VISIBILITYMAP_ALL_FROZEN; |
1671 | |
1672 | Assert(BufferIsValid(*vmbuffer)); |
1673 | if (flags != 0) |
1674 | visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, |
1675 | *vmbuffer, visibility_cutoff_xid, flags); |
1676 | } |
1677 | |
1678 | return tupindex; |
1679 | } |
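
/*
 * An illustrative sketch, kept under #if 0, of the visibility-map flag
 * computation at the end of lazy_vacuum_page() above: only bits not already
 * set in the map are added, and the all-frozen bit is added only when the
 * page was found to be all-frozen.  The helper name and its inputs are
 * hypothetical.
 */
#if 0
static uint8
example_vm_bits_to_set(uint8 vm_status, bool all_frozen)
{
	uint8		flags = 0;

	/* all-visible bit missing from the map: set it */
	if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
		flags |= VISIBILITYMAP_ALL_VISIBLE;

	/* all-frozen bit missing and the page qualifies: set it too */
	if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
		flags |= VISIBILITYMAP_ALL_FROZEN;

	/*
	 * With vm_status == VISIBILITYMAP_ALL_VISIBLE and all_frozen == true,
	 * only VISIBILITYMAP_ALL_FROZEN is returned.
	 */
	return flags;
}
#endif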
1680 | |
1681 | /* |
1682 | * lazy_check_needs_freeze() -- scan page to see if any tuples |
1683 | * need to be cleaned to avoid wraparound |
1684 | * |
1685 | * Returns true if the page needs to be vacuumed using cleanup lock. |
1686 | * Also returns a flag indicating whether page contains any tuples at all. |
1687 | */ |
1688 | static bool |
1689 | lazy_check_needs_freeze(Buffer buf, bool *hastup) |
1690 | { |
1691 | Page page = BufferGetPage(buf); |
1692 | OffsetNumber offnum, |
1693 | maxoff; |
	HeapTupleHeader tupleheader;
1695 | |
1696 | *hastup = false; |
1697 | |
1698 | /* |
1699 | * New and empty pages, obviously, don't contain tuples. We could make |
1700 | * sure that the page is registered in the FSM, but it doesn't seem worth |
1701 | * waiting for a cleanup lock just for that, especially because it's |
1702 | * likely that the pin holder will do so. |
1703 | */ |
1704 | if (PageIsNew(page) || PageIsEmpty(page)) |
1705 | return false; |
1706 | |
1707 | maxoff = PageGetMaxOffsetNumber(page); |
1708 | for (offnum = FirstOffsetNumber; |
1709 | offnum <= maxoff; |
1710 | offnum = OffsetNumberNext(offnum)) |
1711 | { |
1712 | ItemId itemid; |
1713 | |
1714 | itemid = PageGetItemId(page, offnum); |
1715 | |
1716 | /* this should match hastup test in count_nondeletable_pages() */ |
1717 | if (ItemIdIsUsed(itemid)) |
1718 | *hastup = true; |
1719 | |
1720 | /* dead and redirect items never need freezing */ |
1721 | if (!ItemIdIsNormal(itemid)) |
1722 | continue; |
1723 | |
1724 | tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); |
1725 | |
1726 | if (heap_tuple_needs_freeze(tupleheader, FreezeLimit, |
1727 | MultiXactCutoff, buf)) |
1728 | return true; |
1729 | } /* scan along page */ |
1730 | |
1731 | return false; |
1732 | } |
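
/*
 * A small sketch, kept under #if 0, of the line-pointer classification that
 * lazy_check_needs_freeze() relies on: any used item keeps the page, but
 * only normal items carry a tuple header that can require freezing; dead
 * and redirect items are skipped.  The helper name is hypothetical.
 */
#if 0
static bool
example_item_may_need_freeze(Page page, OffsetNumber offnum, Buffer buf)
{
	ItemId		itemid = PageGetItemId(page, offnum);

	/* unused slots have nothing to freeze */
	if (!ItemIdIsUsed(itemid))
		return false;

	/* dead and redirect items have no tuple header to examine */
	if (!ItemIdIsNormal(itemid))
		return false;

	/* only a normal item can carry an old enough xmin or xmax */
	return heap_tuple_needs_freeze((HeapTupleHeader) PageGetItem(page, itemid),
								   FreezeLimit, MultiXactCutoff, buf);
}
#endif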
1733 | |
1734 | |
1735 | /* |
1736 | * lazy_vacuum_index() -- vacuum one index relation. |
1737 | * |
1738 | * Delete all the index entries pointing to tuples listed in |
1739 | * vacrelstats->dead_tuples, and update running statistics. |
1740 | */ |
1741 | static void |
1742 | lazy_vacuum_index(Relation indrel, |
1743 | IndexBulkDeleteResult **stats, |
1744 | LVRelStats *vacrelstats) |
1745 | { |
1746 | IndexVacuumInfo ivinfo; |
1747 | PGRUsage ru0; |
1748 | |
1749 | pg_rusage_init(&ru0); |
1750 | |
1751 | ivinfo.index = indrel; |
1752 | ivinfo.analyze_only = false; |
1753 | ivinfo.report_progress = false; |
1754 | ivinfo.estimated_count = true; |
1755 | ivinfo.message_level = elevel; |
1756 | /* We can only provide an approximate value of num_heap_tuples here */ |
1757 | ivinfo.num_heap_tuples = vacrelstats->old_live_tuples; |
1758 | ivinfo.strategy = vac_strategy; |
1759 | |
1760 | /* Do bulk deletion */ |
1761 | *stats = index_bulk_delete(&ivinfo, *stats, |
1762 | lazy_tid_reaped, (void *) vacrelstats); |
1763 | |
	ereport(elevel,
			(errmsg("scanned index \"%s\" to remove %d row versions",
					RelationGetRelationName(indrel),
					vacrelstats->num_dead_tuples),
			 errdetail_internal("%s", pg_rusage_show(&ru0))));
1769 | } |
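
/*
 * A minimal sketch, kept under #if 0, of the IndexBulkDeleteCallback
 * contract used above: index_bulk_delete() invokes the callback once per
 * index tuple, passing the heap TID that tuple points to, and removes the
 * index entry whenever the callback returns true.  lazy_tid_reaped() below
 * implements the callback; the helper name here is hypothetical.
 */
#if 0
static void
example_reap_decision(ItemPointer index_heap_tid, LVRelStats *vacrelstats)
{
	if (lazy_tid_reaped(index_heap_tid, (void *) vacrelstats))
	{
		/* the entry points at a recorded dead heap tuple: delete it */
	}
	else
	{
		/* the pointed-to tuple survives: keep the index entry */
	}
}
#endif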
1770 | |
1771 | /* |
1772 | * lazy_cleanup_index() -- do post-vacuum cleanup for one index relation. |
1773 | */ |
1774 | static void |
1775 | lazy_cleanup_index(Relation indrel, |
1776 | IndexBulkDeleteResult *stats, |
1777 | LVRelStats *vacrelstats) |
1778 | { |
1779 | IndexVacuumInfo ivinfo; |
1780 | PGRUsage ru0; |
1781 | |
1782 | pg_rusage_init(&ru0); |
1783 | |
1784 | ivinfo.index = indrel; |
1785 | ivinfo.analyze_only = false; |
1786 | ivinfo.report_progress = false; |
1787 | ivinfo.estimated_count = (vacrelstats->tupcount_pages < vacrelstats->rel_pages); |
1788 | ivinfo.message_level = elevel; |
1789 | |
1790 | /* |
1791 | * Now we can provide a better estimate of total number of surviving |
1792 | * tuples (we assume indexes are more interested in that than in the |
1793 | * number of nominally live tuples). |
1794 | */ |
1795 | ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples; |
1796 | ivinfo.strategy = vac_strategy; |
1797 | |
1798 | stats = index_vacuum_cleanup(&ivinfo, stats); |
1799 | |
1800 | if (!stats) |
1801 | return; |
1802 | |
1803 | /* |
1804 | * Now update statistics in pg_class, but only if the index says the count |
1805 | * is accurate. |
1806 | */ |
1807 | if (!stats->estimated_count) |
1808 | vac_update_relstats(indrel, |
1809 | stats->num_pages, |
1810 | stats->num_index_tuples, |
1811 | 0, |
1812 | false, |
1813 | InvalidTransactionId, |
1814 | InvalidMultiXactId, |
1815 | false); |
1816 | |
	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
					   "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));
1828 | |
1829 | pfree(stats); |
1830 | } |
1831 | |
1832 | /* |
1833 | * should_attempt_truncation - should we attempt to truncate the heap? |
1834 | * |
1835 | * Don't even think about it unless we have a shot at releasing a goodly |
1836 | * number of pages. Otherwise, the time taken isn't worth it. |
1837 | * |
1838 | * Also don't attempt it if we are doing early pruning/vacuuming, because a |
1839 | * scan which cannot find a truncated heap page cannot determine that the |
1840 | * snapshot is too old to read that page. We might be able to get away with |
1841 | * truncating all except one of the pages, setting its LSN to (at least) the |
1842 | * maximum of the truncated range if we also treated an index leaf tuple |
1843 | * pointing to a missing heap page as something to trigger the "snapshot too |
1844 | * old" error, but that seems fragile and seems like it deserves its own patch |
1845 | * if we consider it. |
1846 | * |
1847 | * This is split out so that we can test whether truncation is going to be |
1848 | * called for before we actually do it. If you change the logic here, be |
1849 | * careful to depend only on fields that lazy_scan_heap updates on-the-fly. |
1850 | */ |
1851 | static bool |
1852 | should_attempt_truncation(VacuumParams *params, LVRelStats *vacrelstats) |
1853 | { |
1854 | BlockNumber possibly_freeable; |
1855 | |
1856 | if (params->truncate == VACOPT_TERNARY_DISABLED) |
1857 | return false; |
1858 | |
1859 | possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; |
1860 | if (possibly_freeable > 0 && |
1861 | (possibly_freeable >= REL_TRUNCATE_MINIMUM || |
1862 | possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) && |
1863 | old_snapshot_threshold < 0) |
1864 | return true; |
1865 | else |
1866 | return false; |
1867 | } |
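
/*
 * A worked example, kept under #if 0, of the truncation threshold above
 * with hypothetical numbers: a 100000-page table whose last nonempty page
 * is 98000 has 2000 possibly-freeable pages, which passes the
 * REL_TRUNCATE_MINIMUM test even though it falls short of rel_pages /
 * REL_TRUNCATE_FRACTION (6250).
 */
#if 0
static bool
example_truncation_threshold(void)
{
	BlockNumber rel_pages = 100000; /* hypothetical relation size */
	BlockNumber nonempty_pages = 98000; /* hypothetical last nonempty + 1 */
	BlockNumber possibly_freeable = rel_pages - nonempty_pages; /* 2000 */

	return possibly_freeable > 0 &&
		(possibly_freeable >= REL_TRUNCATE_MINIMUM ||	/* 2000 >= 1000 */
		 possibly_freeable >= rel_pages / REL_TRUNCATE_FRACTION);
}
#endif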
1868 | |
1869 | /* |
1870 | * lazy_truncate_heap - try to truncate off any empty pages at the end |
1871 | */ |
1872 | static void |
1873 | lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) |
1874 | { |
1875 | BlockNumber old_rel_pages = vacrelstats->rel_pages; |
1876 | BlockNumber new_rel_pages; |
1877 | PGRUsage ru0; |
1878 | int lock_retry; |
1879 | |
1880 | pg_rusage_init(&ru0); |
1881 | |
1882 | /* Report that we are now truncating */ |
1883 | pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, |
1884 | PROGRESS_VACUUM_PHASE_TRUNCATE); |
1885 | |
1886 | /* |
1887 | * Loop until no more truncating can be done. |
1888 | */ |
1889 | do |
1890 | { |
1891 | /* |
1892 | * We need full exclusive lock on the relation in order to do |
1893 | * truncation. If we can't get it, give up rather than waiting --- we |
1894 | * don't want to block other backends, and we don't want to deadlock |
1895 | * (which is quite possible considering we already hold a lower-grade |
1896 | * lock). |
1897 | */ |
1898 | vacrelstats->lock_waiter_detected = false; |
1899 | lock_retry = 0; |
1900 | while (true) |
1901 | { |
1902 | if (ConditionalLockRelation(onerel, AccessExclusiveLock)) |
1903 | break; |
1904 | |
1905 | /* |
1906 | * Check for interrupts while trying to (re-)acquire the exclusive |
1907 | * lock. |
1908 | */ |
1909 | CHECK_FOR_INTERRUPTS(); |
1910 | |
1911 | if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT / |
1912 | VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL)) |
1913 | { |
1914 | /* |
1915 | * We failed to establish the lock in the specified number of |
1916 | * retries. This means we give up truncating. |
1917 | */ |
1918 | vacrelstats->lock_waiter_detected = true; |
				ereport(elevel,
						(errmsg("\"%s\": stopping truncate due to conflicting lock request",
								RelationGetRelationName(onerel))));
1922 | return; |
1923 | } |
1924 | |
1925 | pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L); |
1926 | } |
1927 | |
1928 | /* |
1929 | * Now that we have exclusive lock, look to see if the rel has grown |
1930 | * whilst we were vacuuming with non-exclusive lock. If so, give up; |
1931 | * the newly added pages presumably contain non-deletable tuples. |
1932 | */ |
1933 | new_rel_pages = RelationGetNumberOfBlocks(onerel); |
1934 | if (new_rel_pages != old_rel_pages) |
1935 | { |
1936 | /* |
1937 | * Note: we intentionally don't update vacrelstats->rel_pages with |
1938 | * the new rel size here. If we did, it would amount to assuming |
1939 | * that the new pages are empty, which is unlikely. Leaving the |
1940 | * numbers alone amounts to assuming that the new pages have the |
1941 | * same tuple density as existing ones, which is less unlikely. |
1942 | */ |
1943 | UnlockRelation(onerel, AccessExclusiveLock); |
1944 | return; |
1945 | } |
1946 | |
1947 | /* |
1948 | * Scan backwards from the end to verify that the end pages actually |
1949 | * contain no tuples. This is *necessary*, not optional, because |
1950 | * other backends could have added tuples to these pages whilst we |
1951 | * were vacuuming. |
1952 | */ |
1953 | new_rel_pages = count_nondeletable_pages(onerel, vacrelstats); |
1954 | |
1955 | if (new_rel_pages >= old_rel_pages) |
1956 | { |
1957 | /* can't do anything after all */ |
1958 | UnlockRelation(onerel, AccessExclusiveLock); |
1959 | return; |
1960 | } |
1961 | |
1962 | /* |
1963 | * Okay to truncate. |
1964 | */ |
1965 | RelationTruncate(onerel, new_rel_pages); |
1966 | |
1967 | /* |
1968 | * We can release the exclusive lock as soon as we have truncated. |
1969 | * Other backends can't safely access the relation until they have |
1970 | * processed the smgr invalidation that smgrtruncate sent out ... but |
1971 | * that should happen as part of standard invalidation processing once |
1972 | * they acquire lock on the relation. |
1973 | */ |
1974 | UnlockRelation(onerel, AccessExclusiveLock); |
1975 | |
1976 | /* |
1977 | * Update statistics. Here, it *is* correct to adjust rel_pages |
1978 | * without also touching reltuples, since the tuple count wasn't |
1979 | * changed by the truncation. |
1980 | */ |
1981 | vacrelstats->pages_removed += old_rel_pages - new_rel_pages; |
1982 | vacrelstats->rel_pages = new_rel_pages; |
1983 | |
		ereport(elevel,
				(errmsg("\"%s\": truncated %u to %u pages",
						RelationGetRelationName(onerel),
						old_rel_pages, new_rel_pages),
				 errdetail_internal("%s",
									pg_rusage_show(&ru0))));
1990 | old_rel_pages = new_rel_pages; |
1991 | } while (new_rel_pages > vacrelstats->nonempty_pages && |
1992 | vacrelstats->lock_waiter_detected); |
1993 | } |
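
/*
 * A small worked example, kept under #if 0, of the lock-retry bound used
 * above: dividing the timeout by the wait interval gives the number of
 * failed ConditionalLockRelation() attempts after which truncation is
 * abandoned; with the values defined near the top of this file that is
 * 5000 / 50 = 100 attempts, i.e. roughly five seconds of waiting.
 */
#if 0
static int
example_max_lock_retries(void)
{
	/* each failed attempt sleeps VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL ms */
	return VACUUM_TRUNCATE_LOCK_TIMEOUT / VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL;
}
#endif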
1994 | |
1995 | /* |
1996 | * Rescan end pages to verify that they are (still) empty of tuples. |
1997 | * |
1998 | * Returns number of nondeletable pages (last nonempty page + 1). |
1999 | */ |
2000 | static BlockNumber |
2001 | count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) |
2002 | { |
2003 | BlockNumber blkno; |
2004 | BlockNumber prefetchedUntil; |
2005 | instr_time starttime; |
2006 | |
2007 | /* Initialize the starttime if we check for conflicting lock requests */ |
2008 | INSTR_TIME_SET_CURRENT(starttime); |
2009 | |
2010 | /* |
2011 | * Start checking blocks at what we believe relation end to be and move |
2012 | * backwards. (Strange coding of loop control is needed because blkno is |
2013 | * unsigned.) To make the scan faster, we prefetch a few blocks at a time |
2014 | * in forward direction, so that OS-level readahead can kick in. |
2015 | */ |
2016 | blkno = vacrelstats->rel_pages; |
	StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0,
					 "prefetch size must be power of 2");
2019 | prefetchedUntil = InvalidBlockNumber; |
2020 | while (blkno > vacrelstats->nonempty_pages) |
2021 | { |
2022 | Buffer buf; |
2023 | Page page; |
2024 | OffsetNumber offnum, |
2025 | maxoff; |
2026 | bool hastup; |
2027 | |
2028 | /* |
2029 | * Check if another process requests a lock on our relation. We are |
2030 | * holding an AccessExclusiveLock here, so they will be waiting. We |
2031 | * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we |
2032 | * only check if that interval has elapsed once every 32 blocks to |
2033 | * keep the number of system calls and actual shared lock table |
2034 | * lookups to a minimum. |
2035 | */ |
2036 | if ((blkno % 32) == 0) |
2037 | { |
2038 | instr_time currenttime; |
2039 | instr_time elapsed; |
2040 | |
2041 | INSTR_TIME_SET_CURRENT(currenttime); |
2042 | elapsed = currenttime; |
2043 | INSTR_TIME_SUBTRACT(elapsed, starttime); |
2044 | if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000) |
2045 | >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) |
2046 | { |
2047 | if (LockHasWaitersRelation(onerel, AccessExclusiveLock)) |
2048 | { |
					ereport(elevel,
							(errmsg("\"%s\": suspending truncate due to conflicting lock request",
									RelationGetRelationName(onerel))));
2052 | |
2053 | vacrelstats->lock_waiter_detected = true; |
2054 | return blkno; |
2055 | } |
2056 | starttime = currenttime; |
2057 | } |
2058 | } |
2059 | |
2060 | /* |
2061 | * We don't insert a vacuum delay point here, because we have an |
2062 | * exclusive lock on the table which we want to hold for as short a |
2063 | * time as possible. We still need to check for interrupts however. |
2064 | */ |
2065 | CHECK_FOR_INTERRUPTS(); |
2066 | |
2067 | blkno--; |
2068 | |
2069 | /* If we haven't prefetched this lot yet, do so now. */ |
2070 | if (prefetchedUntil > blkno) |
2071 | { |
2072 | BlockNumber prefetchStart; |
2073 | BlockNumber pblkno; |
2074 | |
2075 | prefetchStart = blkno & ~(PREFETCH_SIZE - 1); |
2076 | for (pblkno = prefetchStart; pblkno <= blkno; pblkno++) |
2077 | { |
2078 | PrefetchBuffer(onerel, MAIN_FORKNUM, pblkno); |
2079 | CHECK_FOR_INTERRUPTS(); |
2080 | } |
2081 | prefetchedUntil = prefetchStart; |
2082 | } |
2083 | |
2084 | buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, |
2085 | RBM_NORMAL, vac_strategy); |
2086 | |
2087 | /* In this phase we only need shared access to the buffer */ |
2088 | LockBuffer(buf, BUFFER_LOCK_SHARE); |
2089 | |
2090 | page = BufferGetPage(buf); |
2091 | |
2092 | if (PageIsNew(page) || PageIsEmpty(page)) |
2093 | { |
2094 | UnlockReleaseBuffer(buf); |
2095 | continue; |
2096 | } |
2097 | |
2098 | hastup = false; |
2099 | maxoff = PageGetMaxOffsetNumber(page); |
2100 | for (offnum = FirstOffsetNumber; |
2101 | offnum <= maxoff; |
2102 | offnum = OffsetNumberNext(offnum)) |
2103 | { |
2104 | ItemId itemid; |
2105 | |
2106 | itemid = PageGetItemId(page, offnum); |
2107 | |
2108 | /* |
2109 | * Note: any non-unused item should be taken as a reason to keep |
2110 | * this page. We formerly thought that DEAD tuples could be |
2111 | * thrown away, but that's not so, because we'd not have cleaned |
2112 | * out their index entries. |
2113 | */ |
2114 | if (ItemIdIsUsed(itemid)) |
2115 | { |
2116 | hastup = true; |
2117 | break; /* can stop scanning */ |
2118 | } |
2119 | } /* scan along page */ |
2120 | |
2121 | UnlockReleaseBuffer(buf); |
2122 | |
2123 | /* Done scanning if we found a tuple here */ |
2124 | if (hastup) |
2125 | return blkno + 1; |
2126 | } |
2127 | |
2128 | /* |
2129 | * If we fall out of the loop, all the previously-thought-to-be-empty |
2130 | * pages still are; we need not bother to look at the last known-nonempty |
2131 | * page. |
2132 | */ |
2133 | return vacrelstats->nonempty_pages; |
2134 | } |
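
/*
 * A small sketch, kept under #if 0, of the prefetch-window arithmetic in
 * the loop above: because PREFETCH_SIZE is a power of two (the static
 * assertion above enforces this), masking with ~(PREFETCH_SIZE - 1) rounds
 * a block number down to the first block of its prefetch window; for
 * example, block 1000 maps to 992 if PREFETCH_SIZE is 32.  The helper name
 * is hypothetical.
 */
#if 0
static BlockNumber
example_prefetch_start(BlockNumber blkno)
{
	/* clear the low-order bits to find the window's first block */
	return blkno & ~(PREFETCH_SIZE - 1);
}
#endif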
2135 | |
2136 | /* |
2137 | * lazy_space_alloc - space allocation decisions for lazy vacuum |
2138 | * |
2139 | * See the comments at the head of this file for rationale. |
2140 | */ |
2141 | static void |
2142 | lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) |
2143 | { |
2144 | long maxtuples; |
2145 | int vac_work_mem = IsAutoVacuumWorkerProcess() && |
2146 | autovacuum_work_mem != -1 ? |
2147 | autovacuum_work_mem : maintenance_work_mem; |
2148 | |
2149 | if (vacrelstats->useindex) |
2150 | { |
2151 | maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData); |
2152 | maxtuples = Min(maxtuples, INT_MAX); |
2153 | maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData)); |
2154 | |
2155 | /* curious coding here to ensure the multiplication can't overflow */ |
2156 | if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks) |
2157 | maxtuples = relblocks * LAZY_ALLOC_TUPLES; |
2158 | |
2159 | /* stay sane if small maintenance_work_mem */ |
2160 | maxtuples = Max(maxtuples, MaxHeapTuplesPerPage); |
2161 | } |
2162 | else |
2163 | { |
2164 | maxtuples = MaxHeapTuplesPerPage; |
2165 | } |
2166 | |
2167 | vacrelstats->num_dead_tuples = 0; |
2168 | vacrelstats->max_dead_tuples = (int) maxtuples; |
2169 | vacrelstats->dead_tuples = (ItemPointer) |
2170 | palloc(maxtuples * sizeof(ItemPointerData)); |
2171 | } |
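
/*
 * A worked example, kept under #if 0, of the sizing arithmetic above using
 * hypothetical settings: with 64MB of vacuum work memory and a 6-byte
 * ItemPointerData, the TID array can hold roughly 11 million entries, and
 * a small table is clamped further to relblocks * LAZY_ALLOC_TUPLES slots.
 */
#if 0
static long
example_max_dead_tuples(long vac_work_mem_kb, BlockNumber relblocks)
{
	long		maxtuples;

	/* e.g. 65536kB * 1024 / 6 = 11184810 TIDs */
	maxtuples = (vac_work_mem_kb * 1024L) / sizeof(ItemPointerData);
	maxtuples = Min(maxtuples, INT_MAX);
	maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));

	/* don't allocate more slots than the table could possibly need */
	if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
		maxtuples = relblocks * LAZY_ALLOC_TUPLES;

	/* but always leave room for at least one page's worth of tuples */
	return Max(maxtuples, MaxHeapTuplesPerPage);
}
#endif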
2172 | |
2173 | /* |
2174 | * lazy_record_dead_tuple - remember one deletable tuple |
2175 | */ |
2176 | static void |
2177 | lazy_record_dead_tuple(LVRelStats *vacrelstats, |
2178 | ItemPointer itemptr) |
2179 | { |
2180 | /* |
2181 | * The array shouldn't overflow under normal behavior, but perhaps it |
2182 | * could if we are given a really small maintenance_work_mem. In that |
2183 | * case, just forget the last few tuples (we'll get 'em next time). |
2184 | */ |
2185 | if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples) |
2186 | { |
2187 | vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr; |
2188 | vacrelstats->num_dead_tuples++; |
2189 | pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, |
2190 | vacrelstats->num_dead_tuples); |
2191 | } |
2192 | } |
2193 | |
2194 | /* |
2195 | * lazy_tid_reaped() -- is a particular tid deletable? |
2196 | * |
2197 | * This has the right signature to be an IndexBulkDeleteCallback. |
2198 | * |
2199 | * Assumes dead_tuples array is in sorted order. |
2200 | */ |
2201 | static bool |
2202 | lazy_tid_reaped(ItemPointer itemptr, void *state) |
2203 | { |
2204 | LVRelStats *vacrelstats = (LVRelStats *) state; |
2205 | ItemPointer res; |
2206 | |
2207 | res = (ItemPointer) bsearch((void *) itemptr, |
2208 | (void *) vacrelstats->dead_tuples, |
2209 | vacrelstats->num_dead_tuples, |
2210 | sizeof(ItemPointerData), |
2211 | vac_cmp_itemptr); |
2212 | |
2213 | return (res != NULL); |
2214 | } |
2215 | |
2216 | /* |
2217 | * Comparator routines for use with qsort() and bsearch(). |
2218 | */ |
2219 | static int |
2220 | vac_cmp_itemptr(const void *left, const void *right) |
2221 | { |
2222 | BlockNumber lblk, |
2223 | rblk; |
2224 | OffsetNumber loff, |
2225 | roff; |
2226 | |
2227 | lblk = ItemPointerGetBlockNumber((ItemPointer) left); |
2228 | rblk = ItemPointerGetBlockNumber((ItemPointer) right); |
2229 | |
2230 | if (lblk < rblk) |
2231 | return -1; |
2232 | if (lblk > rblk) |
2233 | return 1; |
2234 | |
2235 | loff = ItemPointerGetOffsetNumber((ItemPointer) left); |
2236 | roff = ItemPointerGetOffsetNumber((ItemPointer) right); |
2237 | |
2238 | if (loff < roff) |
2239 | return -1; |
2240 | if (loff > roff) |
2241 | return 1; |
2242 | |
2243 | return 0; |
2244 | } |
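
/*
 * A minimal sketch, kept under #if 0, of how vac_cmp_itemptr() orders TIDs
 * (block number first, then offset) and how lazy_tid_reaped() uses it with
 * bsearch().  The three TIDs below are hypothetical.
 */
#if 0
static void
example_tid_lookup(void)
{
	ItemPointerData dead[3];
	ItemPointerData probe;
	ItemPointer found;

	/* already in the (block, offset) order that lazy_scan_heap produces */
	ItemPointerSet(&dead[0], 1, 2);
	ItemPointerSet(&dead[1], 1, 7);
	ItemPointerSet(&dead[2], 4, 1);

	ItemPointerSet(&probe, 1, 7);
	found = (ItemPointer) bsearch((void *) &probe, (void *) dead, 3,
								  sizeof(ItemPointerData), vac_cmp_itemptr);

	/* found points at dead[1]; probing for (2,1) would return NULL */
}
#endif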
2245 | |
2246 | /* |
2247 | * Check if every tuple in the given page is visible to all current and future |
2248 | * transactions. Also return the visibility_cutoff_xid which is the highest |
2249 | * xmin amongst the visible tuples. Set *all_frozen to true if every tuple |
2250 | * on this page is frozen. |
2251 | */ |
2252 | static bool |
2253 | heap_page_is_all_visible(Relation rel, Buffer buf, |
2254 | TransactionId *visibility_cutoff_xid, |
2255 | bool *all_frozen) |
2256 | { |
2257 | Page page = BufferGetPage(buf); |
2258 | BlockNumber blockno = BufferGetBlockNumber(buf); |
2259 | OffsetNumber offnum, |
2260 | maxoff; |
2261 | bool all_visible = true; |
2262 | |
2263 | *visibility_cutoff_xid = InvalidTransactionId; |
2264 | *all_frozen = true; |
2265 | |
2266 | /* |
2267 | * This is a stripped down version of the line pointer scan in |
2268 | * lazy_scan_heap(). So if you change anything here, also check that code. |
2269 | */ |
2270 | maxoff = PageGetMaxOffsetNumber(page); |
2271 | for (offnum = FirstOffsetNumber; |
2272 | offnum <= maxoff && all_visible; |
2273 | offnum = OffsetNumberNext(offnum)) |
2274 | { |
2275 | ItemId itemid; |
2276 | HeapTupleData tuple; |
2277 | |
2278 | itemid = PageGetItemId(page, offnum); |
2279 | |
2280 | /* Unused or redirect line pointers are of no interest */ |
2281 | if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid)) |
2282 | continue; |
2283 | |
2284 | ItemPointerSet(&(tuple.t_self), blockno, offnum); |
2285 | |
2286 | /* |
		 * Dead line pointers can have index pointers pointing to them, so
		 * they can't be treated as visible.
2289 | */ |
2290 | if (ItemIdIsDead(itemid)) |
2291 | { |
2292 | all_visible = false; |
2293 | *all_frozen = false; |
2294 | break; |
2295 | } |
2296 | |
2297 | Assert(ItemIdIsNormal(itemid)); |
2298 | |
2299 | tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); |
2300 | tuple.t_len = ItemIdGetLength(itemid); |
2301 | tuple.t_tableOid = RelationGetRelid(rel); |
2302 | |
2303 | switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) |
2304 | { |
2305 | case HEAPTUPLE_LIVE: |
2306 | { |
2307 | TransactionId xmin; |
2308 | |
2309 | /* Check comments in lazy_scan_heap. */ |
2310 | if (!HeapTupleHeaderXminCommitted(tuple.t_data)) |
2311 | { |
2312 | all_visible = false; |
2313 | *all_frozen = false; |
2314 | break; |
2315 | } |
2316 | |
2317 | /* |
2318 | * The inserter definitely committed. But is it old enough |
2319 | * that everyone sees it as committed? |
2320 | */ |
2321 | xmin = HeapTupleHeaderGetXmin(tuple.t_data); |
2322 | if (!TransactionIdPrecedes(xmin, OldestXmin)) |
2323 | { |
2324 | all_visible = false; |
2325 | *all_frozen = false; |
2326 | break; |
2327 | } |
2328 | |
2329 | /* Track newest xmin on page. */ |
2330 | if (TransactionIdFollows(xmin, *visibility_cutoff_xid)) |
2331 | *visibility_cutoff_xid = xmin; |
2332 | |
2333 | /* Check whether this tuple is already frozen or not */ |
2334 | if (all_visible && *all_frozen && |
2335 | heap_tuple_needs_eventual_freeze(tuple.t_data)) |
2336 | *all_frozen = false; |
2337 | } |
2338 | break; |
2339 | |
2340 | case HEAPTUPLE_DEAD: |
2341 | case HEAPTUPLE_RECENTLY_DEAD: |
2342 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
2343 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
2344 | { |
2345 | all_visible = false; |
2346 | *all_frozen = false; |
2347 | break; |
2348 | } |
2349 | default: |
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
2351 | break; |
2352 | } |
2353 | } /* scan along page */ |
2354 | |
2355 | return all_visible; |
2356 | } |
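
/*
 * A minimal sketch, kept under #if 0, of the xmin cutoff test applied to
 * live tuples above: a committed tuple counts as all-visible only when its
 * xmin precedes OldestXmin, and the newest qualifying xmin becomes the
 * page's visibility cutoff.  The XIDs below are hypothetical.
 */
#if 0
static void
example_visibility_cutoff(void)
{
	TransactionId xmin = 1000;	/* hypothetical committed xmin */
	TransactionId oldest_xmin = 1500;	/* hypothetical OldestXmin */
	TransactionId cutoff = InvalidTransactionId;

	if (TransactionIdPrecedes(xmin, oldest_xmin))
	{
		/* visible to every current and future snapshot */
		if (TransactionIdFollows(xmin, cutoff))
			cutoff = xmin;		/* cutoff becomes 1000 */
	}
}
#endif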
2357 | |