1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * nbtree.c |
4 | * Implementation of Lehman and Yao's btree management algorithm for |
5 | * Postgres. |
6 | * |
7 | * NOTES |
8 | * This file contains only the public interface routines. |
9 | * |
10 | * |
11 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
12 | * Portions Copyright (c) 1994, Regents of the University of California |
13 | * |
14 | * IDENTIFICATION |
15 | * src/backend/access/nbtree/nbtree.c |
16 | * |
17 | *------------------------------------------------------------------------- |
18 | */ |
19 | #include "postgres.h" |
20 | |
21 | #include "access/nbtree.h" |
22 | #include "access/nbtxlog.h" |
23 | #include "access/relscan.h" |
24 | #include "access/xlog.h" |
25 | #include "commands/progress.h" |
26 | #include "commands/vacuum.h" |
27 | #include "miscadmin.h" |
28 | #include "nodes/execnodes.h" |
29 | #include "pgstat.h" |
30 | #include "postmaster/autovacuum.h" |
31 | #include "storage/condition_variable.h" |
32 | #include "storage/indexfsm.h" |
33 | #include "storage/ipc.h" |
34 | #include "storage/lmgr.h" |
35 | #include "storage/smgr.h" |
36 | #include "utils/builtins.h" |
37 | #include "utils/index_selfuncs.h" |
38 | #include "utils/memutils.h" |
39 | |
40 | |
41 | /* Working state needed by btvacuumpage */ |
42 | typedef struct |
43 | { |
44 | IndexVacuumInfo *info; |
45 | IndexBulkDeleteResult *stats; |
46 | IndexBulkDeleteCallback callback; |
47 | void *callback_state; |
48 | BTCycleId cycleid; |
49 | BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */ |
50 | BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */ |
51 | BlockNumber totFreePages; /* true total # of free pages */ |
52 | TransactionId oldestBtpoXact; |
53 | MemoryContext pagedelcontext; |
54 | } BTVacState; |
55 | |
56 | /* |
57 | * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. |
58 | * |
59 | * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to |
60 | * a new page; others must wait. |
61 | * |
62 | * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan |
63 | * to a new page; some process can start doing that. |
64 | * |
65 | * BTPARALLEL_DONE indicates that the scan is complete (including error exit). |
66 | * We reach this state once for every distinct combination of array keys. |
67 | */ |
68 | typedef enum |
69 | { |
70 | BTPARALLEL_NOT_INITIALIZED, |
71 | BTPARALLEL_ADVANCING, |
72 | BTPARALLEL_IDLE, |
73 | BTPARALLEL_DONE |
74 | } BTPS_State; |
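
/*
 * A sketch (not an exhaustive enumeration) of how the shared state moves
 * between these values, as driven by _bt_parallel_seize(),
 * _bt_parallel_release(), _bt_parallel_done() and
 * _bt_parallel_advance_array_keys() below:
 *
 *	BTPARALLEL_NOT_INITIALIZED --seize--> BTPARALLEL_ADVANCING
 *	BTPARALLEL_ADVANCING --release--> BTPARALLEL_IDLE
 *	BTPARALLEL_IDLE --seize--> BTPARALLEL_ADVANCING
 *	any state except DONE --done--> BTPARALLEL_DONE
 *	BTPARALLEL_DONE --advance array keys--> BTPARALLEL_NOT_INITIALIZED
 */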
75 | |
76 | /* |
77 | * BTParallelScanDescData contains btree specific shared information required |
78 | * for parallel scan. |
79 | */ |
80 | typedef struct BTParallelScanDescData |
81 | { |
82 | BlockNumber btps_scanPage; /* latest or next page to be scanned */ |
	BTPS_State	btps_pageStatus;	/* indicates whether the next page is
									 * available for scan; see above for the
									 * possible states of a parallel scan */
86 | int btps_arrayKeyCount; /* count indicating number of array scan |
87 | * keys processed by parallel scan */ |
88 | slock_t btps_mutex; /* protects above variables */ |
89 | ConditionVariable btps_cv; /* used to synchronize parallel scan */ |
90 | } BTParallelScanDescData; |
91 | |
92 | typedef struct BTParallelScanDescData *BTParallelScanDesc; |
93 | |
94 | |
95 | static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
96 | IndexBulkDeleteCallback callback, void *callback_state, |
97 | BTCycleId cycleid, TransactionId *oldestBtpoXact); |
98 | static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, |
99 | BlockNumber orig_blkno); |
100 | |
101 | |
102 | /* |
103 | * Btree handler function: return IndexAmRoutine with access method parameters |
104 | * and callbacks. |
105 | */ |
106 | Datum |
107 | bthandler(PG_FUNCTION_ARGS) |
108 | { |
109 | IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); |
110 | |
111 | amroutine->amstrategies = BTMaxStrategyNumber; |
112 | amroutine->amsupport = BTNProcs; |
113 | amroutine->amcanorder = true; |
114 | amroutine->amcanorderbyop = false; |
115 | amroutine->amcanbackward = true; |
116 | amroutine->amcanunique = true; |
117 | amroutine->amcanmulticol = true; |
118 | amroutine->amoptionalkey = true; |
119 | amroutine->amsearcharray = true; |
120 | amroutine->amsearchnulls = true; |
121 | amroutine->amstorage = false; |
122 | amroutine->amclusterable = true; |
123 | amroutine->ampredlocks = true; |
124 | amroutine->amcanparallel = true; |
125 | amroutine->amcaninclude = true; |
126 | amroutine->amkeytype = InvalidOid; |
127 | |
128 | amroutine->ambuild = btbuild; |
129 | amroutine->ambuildempty = btbuildempty; |
130 | amroutine->aminsert = btinsert; |
131 | amroutine->ambulkdelete = btbulkdelete; |
132 | amroutine->amvacuumcleanup = btvacuumcleanup; |
133 | amroutine->amcanreturn = btcanreturn; |
134 | amroutine->amcostestimate = btcostestimate; |
135 | amroutine->amoptions = btoptions; |
136 | amroutine->amproperty = btproperty; |
137 | amroutine->ambuildphasename = btbuildphasename; |
138 | amroutine->amvalidate = btvalidate; |
139 | amroutine->ambeginscan = btbeginscan; |
140 | amroutine->amrescan = btrescan; |
141 | amroutine->amgettuple = btgettuple; |
142 | amroutine->amgetbitmap = btgetbitmap; |
143 | amroutine->amendscan = btendscan; |
144 | amroutine->ammarkpos = btmarkpos; |
145 | amroutine->amrestrpos = btrestrpos; |
146 | amroutine->amestimateparallelscan = btestimateparallelscan; |
147 | amroutine->aminitparallelscan = btinitparallelscan; |
148 | amroutine->amparallelrescan = btparallelrescan; |
149 | |
150 | PG_RETURN_POINTER(amroutine); |
151 | } |
152 | |
153 | /* |
154 | * btbuildempty() -- build an empty btree index in the initialization fork |
155 | */ |
156 | void |
157 | btbuildempty(Relation index) |
158 | { |
159 | Page metapage; |
160 | |
161 | /* Construct metapage. */ |
162 | metapage = (Page) palloc(BLCKSZ); |
163 | _bt_initmetapage(metapage, P_NONE, 0); |
164 | |
165 | /* |
166 | * Write the page and log it. It might seem that an immediate sync would |
167 | * be sufficient to guarantee that the file exists on disk, but recovery |
168 | * itself might remove it while replaying, for example, an |
169 | * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need |
170 | * this even when wal_level=minimal. |
171 | */ |
172 | PageSetChecksumInplace(metapage, BTREE_METAPAGE); |
173 | smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE, |
174 | (char *) metapage, true); |
175 | log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, |
176 | BTREE_METAPAGE, metapage, true); |
177 | |
178 | /* |
179 | * An immediate sync is required even if we xlog'd the page, because the |
180 | * write did not go through shared_buffers and therefore a concurrent |
181 | * checkpoint may have moved the redo pointer past our xlog record. |
182 | */ |
183 | smgrimmedsync(index->rd_smgr, INIT_FORKNUM); |
184 | } |
185 | |
186 | /* |
187 | * btinsert() -- insert an index tuple into a btree. |
188 | * |
189 | * Descend the tree recursively, find the appropriate location for our |
190 | * new tuple, and put it there. |
191 | */ |
192 | bool |
193 | btinsert(Relation rel, Datum *values, bool *isnull, |
194 | ItemPointer ht_ctid, Relation heapRel, |
195 | IndexUniqueCheck checkUnique, |
196 | IndexInfo *indexInfo) |
197 | { |
198 | bool result; |
199 | IndexTuple itup; |
200 | |
201 | /* generate an index tuple */ |
202 | itup = index_form_tuple(RelationGetDescr(rel), values, isnull); |
203 | itup->t_tid = *ht_ctid; |
204 | |
205 | result = _bt_doinsert(rel, itup, checkUnique, heapRel); |
206 | |
207 | pfree(itup); |
208 | |
209 | return result; |
210 | } |
211 | |
212 | /* |
213 | * btgettuple() -- Get the next tuple in the scan. |
214 | */ |
215 | bool |
216 | btgettuple(IndexScanDesc scan, ScanDirection dir) |
217 | { |
218 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
219 | bool res; |
220 | |
221 | /* btree indexes are never lossy */ |
222 | scan->xs_recheck = false; |
223 | |
224 | /* |
225 | * If we have any array keys, initialize them during first call for a |
226 | * scan. We can't do this in btrescan because we don't know the scan |
227 | * direction at that time. |
228 | */ |
229 | if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) |
230 | { |
231 | /* punt if we have any unsatisfiable array keys */ |
232 | if (so->numArrayKeys < 0) |
233 | return false; |
234 | |
235 | _bt_start_array_keys(scan, dir); |
236 | } |
237 | |
238 | /* This loop handles advancing to the next array elements, if any */ |
239 | do |
240 | { |
241 | /* |
242 | * If we've already initialized this scan, we can just advance it in |
243 | * the appropriate direction. If we haven't done so yet, we call |
244 | * _bt_first() to get the first item in the scan. |
245 | */ |
246 | if (!BTScanPosIsValid(so->currPos)) |
247 | res = _bt_first(scan, dir); |
248 | else |
249 | { |
250 | /* |
251 | * Check to see if we should kill the previously-fetched tuple. |
252 | */ |
253 | if (scan->kill_prior_tuple) |
254 | { |
255 | /* |
256 | * Yes, remember it for later. (We'll deal with all such |
257 | * tuples at once right before leaving the index page.) The |
258 | * test for numKilled overrun is not just paranoia: if the |
259 | * caller reverses direction in the indexscan then the same |
260 | * item might get entered multiple times. It's not worth |
261 | * trying to optimize that, so we don't detect it, but instead |
262 | * just forget any excess entries. |
263 | */ |
264 | if (so->killedItems == NULL) |
265 | so->killedItems = (int *) |
266 | palloc(MaxIndexTuplesPerPage * sizeof(int)); |
267 | if (so->numKilled < MaxIndexTuplesPerPage) |
268 | so->killedItems[so->numKilled++] = so->currPos.itemIndex; |
269 | } |
270 | |
271 | /* |
272 | * Now continue the scan. |
273 | */ |
274 | res = _bt_next(scan, dir); |
275 | } |
276 | |
277 | /* If we have a tuple, return it ... */ |
278 | if (res) |
279 | break; |
280 | /* ... otherwise see if we have more array keys to deal with */ |
281 | } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); |
282 | |
283 | return res; |
284 | } |
285 | |
286 | /* |
287 | * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap |
288 | */ |
289 | int64 |
290 | btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) |
291 | { |
292 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
293 | int64 ntids = 0; |
294 | ItemPointer heapTid; |
295 | |
296 | /* |
297 | * If we have any array keys, initialize them. |
298 | */ |
299 | if (so->numArrayKeys) |
300 | { |
301 | /* punt if we have any unsatisfiable array keys */ |
302 | if (so->numArrayKeys < 0) |
303 | return ntids; |
304 | |
305 | _bt_start_array_keys(scan, ForwardScanDirection); |
306 | } |
307 | |
308 | /* This loop handles advancing to the next array elements, if any */ |
309 | do |
310 | { |
311 | /* Fetch the first page & tuple */ |
312 | if (_bt_first(scan, ForwardScanDirection)) |
313 | { |
314 | /* Save tuple ID, and continue scanning */ |
315 | heapTid = &scan->xs_heaptid; |
316 | tbm_add_tuples(tbm, heapTid, 1, false); |
317 | ntids++; |
318 | |
319 | for (;;) |
320 | { |
321 | /* |
322 | * Advance to next tuple within page. This is the same as the |
323 | * easy case in _bt_next(). |
324 | */ |
325 | if (++so->currPos.itemIndex > so->currPos.lastItem) |
326 | { |
327 | /* let _bt_next do the heavy lifting */ |
328 | if (!_bt_next(scan, ForwardScanDirection)) |
329 | break; |
330 | } |
331 | |
332 | /* Save tuple ID, and continue scanning */ |
333 | heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; |
334 | tbm_add_tuples(tbm, heapTid, 1, false); |
335 | ntids++; |
336 | } |
337 | } |
338 | /* Now see if we have more array keys to deal with */ |
339 | } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); |
340 | |
341 | return ntids; |
342 | } |
343 | |
344 | /* |
345 | * btbeginscan() -- start a scan on a btree index |
346 | */ |
347 | IndexScanDesc |
348 | btbeginscan(Relation rel, int nkeys, int norderbys) |
349 | { |
350 | IndexScanDesc scan; |
351 | BTScanOpaque so; |
352 | |
353 | /* no order by operators allowed */ |
354 | Assert(norderbys == 0); |
355 | |
356 | /* get the scan */ |
357 | scan = RelationGetIndexScan(rel, nkeys, norderbys); |
358 | |
359 | /* allocate private workspace */ |
360 | so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); |
361 | BTScanPosInvalidate(so->currPos); |
362 | BTScanPosInvalidate(so->markPos); |
363 | if (scan->numberOfKeys > 0) |
364 | so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); |
365 | else |
366 | so->keyData = NULL; |
367 | |
368 | so->arrayKeyData = NULL; /* assume no array keys for now */ |
369 | so->numArrayKeys = 0; |
370 | so->arrayKeys = NULL; |
371 | so->arrayContext = NULL; |
372 | |
373 | so->killedItems = NULL; /* until needed */ |
374 | so->numKilled = 0; |
375 | |
376 | /* |
377 | * We don't know yet whether the scan will be index-only, so we do not |
378 | * allocate the tuple workspace arrays until btrescan. However, we set up |
379 | * scan->xs_itupdesc whether we'll need it or not, since that's so cheap. |
380 | */ |
381 | so->currTuples = so->markTuples = NULL; |
382 | |
383 | scan->xs_itupdesc = RelationGetDescr(rel); |
384 | |
385 | scan->opaque = so; |
386 | |
387 | return scan; |
388 | } |
389 | |
390 | /* |
391 | * btrescan() -- rescan an index relation |
392 | */ |
393 | void |
394 | btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
395 | ScanKey orderbys, int norderbys) |
396 | { |
397 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
398 | |
399 | /* we aren't holding any read locks, but gotta drop the pins */ |
400 | if (BTScanPosIsValid(so->currPos)) |
401 | { |
402 | /* Before leaving current page, deal with any killed items */ |
403 | if (so->numKilled > 0) |
404 | _bt_killitems(scan); |
405 | BTScanPosUnpinIfPinned(so->currPos); |
406 | BTScanPosInvalidate(so->currPos); |
407 | } |
408 | |
409 | so->markItemIndex = -1; |
410 | so->arrayKeyCount = 0; |
411 | BTScanPosUnpinIfPinned(so->markPos); |
412 | BTScanPosInvalidate(so->markPos); |
413 | |
414 | /* |
415 | * Allocate tuple workspace arrays, if needed for an index-only scan and |
416 | * not already done in a previous rescan call. To save on palloc |
417 | * overhead, both workspaces are allocated as one palloc block; only this |
418 | * function and btendscan know that. |
419 | * |
420 | * NOTE: this data structure also makes it safe to return data from a |
421 | * "name" column, even though btree name_ops uses an underlying storage |
422 | * datatype of cstring. The risk there is that "name" is supposed to be |
423 | * padded to NAMEDATALEN, but the actual index tuple is probably shorter. |
424 | * However, since we only return data out of tuples sitting in the |
425 | * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some |
426 | * data out of the markTuples array --- running off the end of memory for |
427 | * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats |
428 | * adding special-case treatment for name_ops elsewhere. |
429 | */ |
430 | if (scan->xs_want_itup && so->currTuples == NULL) |
431 | { |
432 | so->currTuples = (char *) palloc(BLCKSZ * 2); |
433 | so->markTuples = so->currTuples + BLCKSZ; |
434 | } |
435 | |
436 | /* |
437 | * Reset the scan keys. Note that keys ordering stuff moved to _bt_first. |
438 | * - vadim 05/05/97 |
439 | */ |
440 | if (scankey && scan->numberOfKeys > 0) |
441 | memmove(scan->keyData, |
442 | scankey, |
443 | scan->numberOfKeys * sizeof(ScanKeyData)); |
444 | so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ |
445 | |
446 | /* If any keys are SK_SEARCHARRAY type, set up array-key info */ |
447 | _bt_preprocess_array_keys(scan); |
448 | } |
449 | |
450 | /* |
451 | * btendscan() -- close down a scan |
452 | */ |
453 | void |
454 | btendscan(IndexScanDesc scan) |
455 | { |
456 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
457 | |
458 | /* we aren't holding any read locks, but gotta drop the pins */ |
459 | if (BTScanPosIsValid(so->currPos)) |
460 | { |
461 | /* Before leaving current page, deal with any killed items */ |
462 | if (so->numKilled > 0) |
463 | _bt_killitems(scan); |
464 | BTScanPosUnpinIfPinned(so->currPos); |
465 | } |
466 | |
467 | so->markItemIndex = -1; |
468 | BTScanPosUnpinIfPinned(so->markPos); |
469 | |
470 | /* No need to invalidate positions, the RAM is about to be freed. */ |
471 | |
472 | /* Release storage */ |
473 | if (so->keyData != NULL) |
474 | pfree(so->keyData); |
475 | /* so->arrayKeyData and so->arrayKeys are in arrayContext */ |
476 | if (so->arrayContext != NULL) |
477 | MemoryContextDelete(so->arrayContext); |
478 | if (so->killedItems != NULL) |
479 | pfree(so->killedItems); |
480 | if (so->currTuples != NULL) |
481 | pfree(so->currTuples); |
482 | /* so->markTuples should not be pfree'd, see btrescan */ |
483 | pfree(so); |
484 | } |
485 | |
486 | /* |
487 | * btmarkpos() -- save current scan position |
488 | */ |
489 | void |
490 | btmarkpos(IndexScanDesc scan) |
491 | { |
492 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
493 | |
494 | /* There may be an old mark with a pin (but no lock). */ |
495 | BTScanPosUnpinIfPinned(so->markPos); |
496 | |
497 | /* |
	 * Just record the current itemIndex.  If we later step to the next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
502 | */ |
503 | if (BTScanPosIsValid(so->currPos)) |
504 | so->markItemIndex = so->currPos.itemIndex; |
505 | else |
506 | { |
507 | BTScanPosInvalidate(so->markPos); |
508 | so->markItemIndex = -1; |
509 | } |
510 | |
511 | /* Also record the current positions of any array keys */ |
512 | if (so->numArrayKeys) |
513 | _bt_mark_array_keys(scan); |
514 | } |
515 | |
516 | /* |
517 | * btrestrpos() -- restore scan to last saved position |
518 | */ |
519 | void |
520 | btrestrpos(IndexScanDesc scan) |
521 | { |
522 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
523 | |
524 | /* Restore the marked positions of any array keys */ |
525 | if (so->numArrayKeys) |
526 | _bt_restore_array_keys(scan); |
527 | |
528 | if (so->markItemIndex >= 0) |
529 | { |
530 | /* |
531 | * The scan has never moved to a new page since the last mark. Just |
532 | * restore the itemIndex. |
533 | * |
534 | * NB: In this case we can't count on anything in so->markPos to be |
535 | * accurate. |
536 | */ |
537 | so->currPos.itemIndex = so->markItemIndex; |
538 | } |
539 | else |
540 | { |
541 | /* |
542 | * The scan moved to a new page after last mark or restore, and we are |
543 | * now restoring to the marked page. We aren't holding any read |
544 | * locks, but if we're still holding the pin for the current position, |
545 | * we must drop it. |
546 | */ |
547 | if (BTScanPosIsValid(so->currPos)) |
548 | { |
549 | /* Before leaving current page, deal with any killed items */ |
550 | if (so->numKilled > 0) |
551 | _bt_killitems(scan); |
552 | BTScanPosUnpinIfPinned(so->currPos); |
553 | } |
554 | |
555 | if (BTScanPosIsValid(so->markPos)) |
556 | { |
557 | /* bump pin on mark buffer for assignment to current buffer */ |
558 | if (BTScanPosIsPinned(so->markPos)) |
559 | IncrBufferRefCount(so->markPos.buf); |
560 | memcpy(&so->currPos, &so->markPos, |
561 | offsetof(BTScanPosData, items[1]) + |
562 | so->markPos.lastItem * sizeof(BTScanPosItem)); |
563 | if (so->currTuples) |
564 | memcpy(so->currTuples, so->markTuples, |
565 | so->markPos.nextTupleOffset); |
566 | } |
567 | else |
568 | BTScanPosInvalidate(so->currPos); |
569 | } |
570 | } |
571 | |
572 | /* |
573 | * btestimateparallelscan -- estimate storage for BTParallelScanDescData |
574 | */ |
575 | Size |
576 | btestimateparallelscan(void) |
577 | { |
578 | return sizeof(BTParallelScanDescData); |
579 | } |
580 | |
581 | /* |
582 | * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan |
583 | */ |
584 | void |
585 | btinitparallelscan(void *target) |
586 | { |
587 | BTParallelScanDesc bt_target = (BTParallelScanDesc) target; |
588 | |
589 | SpinLockInit(&bt_target->btps_mutex); |
590 | bt_target->btps_scanPage = InvalidBlockNumber; |
591 | bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; |
592 | bt_target->btps_arrayKeyCount = 0; |
593 | ConditionVariableInit(&bt_target->btps_cv); |
594 | } |
595 | |
596 | /* |
597 | * btparallelrescan() -- reset parallel scan |
598 | */ |
599 | void |
600 | btparallelrescan(IndexScanDesc scan) |
601 | { |
602 | BTParallelScanDesc btscan; |
603 | ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
604 | |
605 | Assert(parallel_scan); |
606 | |
607 | btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
608 | parallel_scan->ps_offset); |
609 | |
610 | /* |
611 | * In theory, we don't need to acquire the spinlock here, because there |
612 | * shouldn't be any other workers running at this point, but we do so for |
613 | * consistency. |
614 | */ |
615 | SpinLockAcquire(&btscan->btps_mutex); |
616 | btscan->btps_scanPage = InvalidBlockNumber; |
617 | btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; |
618 | btscan->btps_arrayKeyCount = 0; |
619 | SpinLockRelease(&btscan->btps_mutex); |
620 | } |
621 | |
622 | /* |
623 | * _bt_parallel_seize() -- Begin the process of advancing the scan to a new |
624 | * page. Other scans must wait until we call _bt_parallel_release() |
625 | * or _bt_parallel_done(). |
626 | * |
627 | * The return value is true if we successfully seized the scan and false |
628 | * if we did not. The latter case occurs if no pages remain for the current |
629 | * set of scankeys. |
630 | * |
631 | * If the return value is true, *pageno returns the next or current page |
632 | * of the scan (depending on the scan direction). An invalid block number |
633 | * means the scan hasn't yet started, and P_NONE means we've reached the end. |
634 | * The first time a participating process reaches the last page, it will return |
635 | * true and set *pageno to P_NONE; after that, further attempts to seize the |
636 | * scan will return false. |
637 | * |
638 | * Callers should ignore the value of pageno if the return value is false. |
639 | */ |
640 | bool |
641 | _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) |
642 | { |
643 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
644 | BTPS_State pageStatus; |
645 | bool exit_loop = false; |
646 | bool status = true; |
647 | ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
648 | BTParallelScanDesc btscan; |
649 | |
650 | *pageno = P_NONE; |
651 | |
652 | btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
653 | parallel_scan->ps_offset); |
654 | |
655 | while (1) |
656 | { |
657 | SpinLockAcquire(&btscan->btps_mutex); |
658 | pageStatus = btscan->btps_pageStatus; |
659 | |
660 | if (so->arrayKeyCount < btscan->btps_arrayKeyCount) |
661 | { |
662 | /* Parallel scan has already advanced to a new set of scankeys. */ |
663 | status = false; |
664 | } |
665 | else if (pageStatus == BTPARALLEL_DONE) |
666 | { |
667 | /* |
668 | * We're done with this set of scankeys. This may be the end, or |
669 | * there could be more sets to try. |
670 | */ |
671 | status = false; |
672 | } |
673 | else if (pageStatus != BTPARALLEL_ADVANCING) |
674 | { |
675 | /* |
676 | * We have successfully seized control of the scan for the purpose |
677 | * of advancing it to a new page! |
678 | */ |
679 | btscan->btps_pageStatus = BTPARALLEL_ADVANCING; |
680 | *pageno = btscan->btps_scanPage; |
681 | exit_loop = true; |
682 | } |
683 | SpinLockRelease(&btscan->btps_mutex); |
684 | if (exit_loop || !status) |
685 | break; |
686 | ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE); |
687 | } |
688 | ConditionVariableCancelSleep(); |
689 | |
690 | return status; |
691 | } |
692 | |
693 | /* |
694 | * _bt_parallel_release() -- Complete the process of advancing the scan to a |
695 | * new page. We now have the new value btps_scanPage; some other backend |
696 | * can now begin advancing the scan. |
697 | */ |
698 | void |
699 | _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) |
700 | { |
701 | ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
702 | BTParallelScanDesc btscan; |
703 | |
704 | btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
705 | parallel_scan->ps_offset); |
706 | |
707 | SpinLockAcquire(&btscan->btps_mutex); |
708 | btscan->btps_scanPage = scan_page; |
709 | btscan->btps_pageStatus = BTPARALLEL_IDLE; |
710 | SpinLockRelease(&btscan->btps_mutex); |
711 | ConditionVariableSignal(&btscan->btps_cv); |
712 | } |
713 | |
714 | /* |
715 | * _bt_parallel_done() -- Mark the parallel scan as complete. |
716 | * |
717 | * When there are no pages left to scan, this function should be called to |
718 | * notify other workers. Otherwise, they might wait forever for the scan to |
719 | * advance to the next page. |
720 | */ |
721 | void |
722 | _bt_parallel_done(IndexScanDesc scan) |
723 | { |
724 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
725 | ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
726 | BTParallelScanDesc btscan; |
727 | bool status_changed = false; |
728 | |
729 | /* Do nothing, for non-parallel scans */ |
730 | if (parallel_scan == NULL) |
731 | return; |
732 | |
733 | btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
734 | parallel_scan->ps_offset); |
735 | |
736 | /* |
737 | * Mark the parallel scan as done for this combination of scan keys, |
738 | * unless some other process already did so. See also |
739 | * _bt_advance_array_keys. |
740 | */ |
741 | SpinLockAcquire(&btscan->btps_mutex); |
742 | if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && |
743 | btscan->btps_pageStatus != BTPARALLEL_DONE) |
744 | { |
745 | btscan->btps_pageStatus = BTPARALLEL_DONE; |
746 | status_changed = true; |
747 | } |
748 | SpinLockRelease(&btscan->btps_mutex); |
749 | |
750 | /* wake up all the workers associated with this parallel scan */ |
751 | if (status_changed) |
752 | ConditionVariableBroadcast(&btscan->btps_cv); |
753 | } |
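
/*
 * A purely illustrative sketch of how a participating backend is expected to
 * drive the three routines above; the real callers live in nbtsearch.c and
 * additionally deal with scan direction, the initial descent to a leaf page,
 * and error cleanup, all of which are glossed over here:
 *
 *	BlockNumber blkno;
 *
 *	while (_bt_parallel_seize(scan, &blkno))
 *	{
 *		if (blkno == P_NONE)
 *		{
 *			_bt_parallel_done(scan);
 *			break;
 *		}
 *		if (blkno == InvalidBlockNumber)
 *			... descend from the root to locate the first leaf page ...
 *		else
 *			... read the leaf page at blkno and note its right sibling ...
 *		_bt_parallel_release(scan, next_blkno);
 *		... return any matching tuples from the page just read ...
 *	}
 */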
754 | |
755 | /* |
756 | * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array |
757 | * keys. |
758 | * |
759 | * Updates the count of array keys processed for both local and parallel |
760 | * scans. |
761 | */ |
762 | void |
763 | _bt_parallel_advance_array_keys(IndexScanDesc scan) |
764 | { |
765 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
766 | ParallelIndexScanDesc parallel_scan = scan->parallel_scan; |
767 | BTParallelScanDesc btscan; |
768 | |
769 | btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, |
770 | parallel_scan->ps_offset); |
771 | |
772 | so->arrayKeyCount++; |
773 | SpinLockAcquire(&btscan->btps_mutex); |
774 | if (btscan->btps_pageStatus == BTPARALLEL_DONE) |
775 | { |
776 | btscan->btps_scanPage = InvalidBlockNumber; |
777 | btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; |
778 | btscan->btps_arrayKeyCount++; |
779 | } |
780 | SpinLockRelease(&btscan->btps_mutex); |
781 | } |
782 | |
783 | /* |
784 | * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that |
785 | * btbulkdelete() wasn't called. |
786 | */ |
787 | static bool |
788 | _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) |
789 | { |
790 | Buffer metabuf; |
791 | Page metapg; |
792 | BTMetaPageData *metad; |
793 | bool result = false; |
794 | |
795 | metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ); |
796 | metapg = BufferGetPage(metabuf); |
797 | metad = BTPageGetMeta(metapg); |
798 | |
799 | if (metad->btm_version < BTREE_NOVAC_VERSION) |
800 | { |
801 | /* |
802 | * Do cleanup if metapage needs upgrade, because we don't have |
803 | * cleanup-related meta-information yet. |
804 | */ |
805 | result = true; |
806 | } |
807 | else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && |
808 | TransactionIdPrecedes(metad->btm_oldest_btpo_xact, |
809 | RecentGlobalXmin)) |
810 | { |
811 | /* |
812 | * If oldest btpo.xact in the deleted pages is older than |
813 | * RecentGlobalXmin, then at least one deleted page can be recycled. |
814 | */ |
815 | result = true; |
816 | } |
817 | else |
818 | { |
819 | StdRdOptions *relopts; |
820 | float8 cleanup_scale_factor; |
821 | float8 prev_num_heap_tuples; |
822 | |
		/*
		 * If the table has received enough insertions and no cleanup was
		 * performed, the index would appear to have stale statistics.  If a
		 * scale factor is set, we avoid that by performing cleanup once the
		 * number of inserted tuples exceeds a
		 * vacuum_cleanup_index_scale_factor fraction of the original tuple
		 * count.
		 */
830 | relopts = (StdRdOptions *) info->index->rd_options; |
831 | cleanup_scale_factor = (relopts && |
832 | relopts->vacuum_cleanup_index_scale_factor >= 0) |
833 | ? relopts->vacuum_cleanup_index_scale_factor |
834 | : vacuum_cleanup_index_scale_factor; |
835 | prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; |
836 | |
837 | if (cleanup_scale_factor <= 0 || |
838 | prev_num_heap_tuples <= 0 || |
839 | (info->num_heap_tuples - prev_num_heap_tuples) / |
840 | prev_num_heap_tuples >= cleanup_scale_factor) |
841 | result = true; |
842 | } |
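
	/*
	 * To give purely illustrative numbers for the scale-factor test above:
	 * with a factor of 0.1 and prev_num_heap_tuples = 1000000, cleanup is
	 * triggered once info->num_heap_tuples reaches 1100000, because
	 * (1100000 - 1000000) / 1000000 = 0.1 >= 0.1.
	 */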
843 | |
844 | _bt_relbuf(info->index, metabuf); |
845 | return result; |
846 | } |
847 | |
848 | /* |
849 | * Bulk deletion of all index entries pointing to a set of heap tuples. |
850 | * The set of target tuples is specified via a callback routine that tells |
851 | * whether any given heap tuple (identified by ItemPointer) is being deleted. |
852 | * |
853 | * Result: a palloc'd struct containing statistical info for VACUUM displays. |
854 | */ |
855 | IndexBulkDeleteResult * |
856 | btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
857 | IndexBulkDeleteCallback callback, void *callback_state) |
858 | { |
859 | Relation rel = info->index; |
860 | BTCycleId cycleid; |
861 | |
862 | /* allocate stats if first time through, else re-use existing struct */ |
863 | if (stats == NULL) |
864 | stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
865 | |
866 | /* Establish the vacuum cycle ID to use for this scan */ |
867 | /* The ENSURE stuff ensures we clean up shared memory on failure */ |
868 | PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); |
869 | { |
870 | TransactionId oldestBtpoXact; |
871 | |
872 | cycleid = _bt_start_vacuum(rel); |
873 | |
874 | btvacuumscan(info, stats, callback, callback_state, cycleid, |
875 | &oldestBtpoXact); |
876 | |
		/*
		 * Update cleanup-related information in the metapage.  This
		 * information is used only for cleanup, but keeping it up to date can
		 * avoid an unnecessary cleanup scan after a bulkdelete.
		 */
882 | _bt_update_meta_cleanup_info(info->index, oldestBtpoXact, |
883 | info->num_heap_tuples); |
884 | } |
885 | PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); |
886 | _bt_end_vacuum(rel); |
887 | |
888 | return stats; |
889 | } |
890 | |
891 | /* |
892 | * Post-VACUUM cleanup. |
893 | * |
894 | * Result: a palloc'd struct containing statistical info for VACUUM displays. |
895 | */ |
896 | IndexBulkDeleteResult * |
897 | btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) |
898 | { |
899 | /* No-op in ANALYZE ONLY mode */ |
900 | if (info->analyze_only) |
901 | return stats; |
902 | |
903 | /* |
904 | * If btbulkdelete was called, we need not do anything, just return the |
	 * stats from the latest btbulkdelete call.  If it wasn't called, we might
	 * still need to do a pass over the index to recycle any newly-recyclable
	 * pages and/or to obtain index statistics; _bt_vacuum_needs_cleanup
	 * determines whether either is needed.
909 | * |
910 | * Since we aren't going to actually delete any leaf items, there's no |
911 | * need to go through all the vacuum-cycle-ID pushups. |
912 | */ |
913 | if (stats == NULL) |
914 | { |
915 | TransactionId oldestBtpoXact; |
916 | |
917 | /* Check if we need a cleanup */ |
918 | if (!_bt_vacuum_needs_cleanup(info)) |
919 | return NULL; |
920 | |
921 | stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
922 | btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact); |
923 | |
924 | /* Update cleanup-related information in the metapage */ |
925 | _bt_update_meta_cleanup_info(info->index, oldestBtpoXact, |
926 | info->num_heap_tuples); |
927 | } |
928 | |
929 | /* |
930 | * It's quite possible for us to be fooled by concurrent page splits into |
931 | * double-counting some index tuples, so disbelieve any total that exceeds |
932 | * the underlying heap's count ... if we know that accurately. Otherwise |
933 | * this might just make matters worse. |
934 | */ |
935 | if (!info->estimated_count) |
936 | { |
937 | if (stats->num_index_tuples > info->num_heap_tuples) |
938 | stats->num_index_tuples = info->num_heap_tuples; |
939 | } |
940 | |
941 | return stats; |
942 | } |
943 | |
944 | /* |
945 | * btvacuumscan --- scan the index for VACUUMing purposes |
946 | * |
947 | * This combines the functions of looking for leaf tuples that are deletable |
948 | * according to the vacuum callback, looking for empty pages that can be |
949 | * deleted, and looking for old deleted pages that can be recycled. Both |
950 | * btbulkdelete and btvacuumcleanup invoke this (the latter only if no |
951 | * btbulkdelete call occurred). |
952 | * |
953 | * The caller is responsible for initially allocating/zeroing a stats struct |
954 | * and for obtaining a vacuum cycle ID if necessary. |
955 | */ |
956 | static void |
957 | btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
958 | IndexBulkDeleteCallback callback, void *callback_state, |
959 | BTCycleId cycleid, TransactionId *oldestBtpoXact) |
960 | { |
961 | Relation rel = info->index; |
962 | BTVacState vstate; |
963 | BlockNumber num_pages; |
964 | BlockNumber blkno; |
965 | bool needLock; |
966 | |
967 | /* |
968 | * Reset counts that will be incremented during the scan; needed in case |
969 | * of multiple scans during a single VACUUM command |
970 | */ |
971 | stats->estimated_count = false; |
972 | stats->num_index_tuples = 0; |
973 | stats->pages_deleted = 0; |
974 | |
975 | /* Set up info to pass down to btvacuumpage */ |
976 | vstate.info = info; |
977 | vstate.stats = stats; |
978 | vstate.callback = callback; |
979 | vstate.callback_state = callback_state; |
980 | vstate.cycleid = cycleid; |
981 | vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ |
982 | vstate.lastBlockLocked = BTREE_METAPAGE; |
983 | vstate.totFreePages = 0; |
984 | vstate.oldestBtpoXact = InvalidTransactionId; |
985 | |
986 | /* Create a temporary memory context to run _bt_pagedel in */ |
987 | vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, |
												  "_bt_pagedel",
989 | ALLOCSET_DEFAULT_SIZES); |
990 | |
991 | /* |
992 | * The outer loop iterates over all index pages except the metapage, in |
993 | * physical order (we hope the kernel will cooperate in providing |
994 | * read-ahead for speed). It is critical that we visit all leaf pages, |
995 | * including ones added after we start the scan, else we might fail to |
996 | * delete some deletable tuples. Hence, we must repeatedly check the |
997 | * relation length. We must acquire the relation-extension lock while |
998 | * doing so to avoid a race condition: if someone else is extending the |
999 | * relation, there is a window where bufmgr/smgr have created a new |
1000 | * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If |
1001 | * we manage to scan such a page here, we'll improperly assume it can be |
1002 | * recycled. Taking the lock synchronizes things enough to prevent a |
1003 | * problem: either num_pages won't include the new page, or _bt_getbuf |
1004 | * already has write lock on the buffer and it will be fully initialized |
1005 | * before we can examine it. (See also vacuumlazy.c, which has the same |
1006 | * issue.) Also, we need not worry if a page is added immediately after |
1007 | * we look; the page splitting code already has write-lock on the left |
1008 | * page before it adds a right page, so we must already have processed any |
1009 | * tuples due to be moved into such a page. |
1010 | * |
1011 | * We can skip locking for new or temp relations, however, since no one |
1012 | * else could be accessing them. |
1013 | */ |
1014 | needLock = !RELATION_IS_LOCAL(rel); |
1015 | |
1016 | blkno = BTREE_METAPAGE + 1; |
1017 | for (;;) |
1018 | { |
1019 | /* Get the current relation length */ |
1020 | if (needLock) |
1021 | LockRelationForExtension(rel, ExclusiveLock); |
1022 | num_pages = RelationGetNumberOfBlocks(rel); |
1023 | if (needLock) |
1024 | UnlockRelationForExtension(rel, ExclusiveLock); |
1025 | |
1026 | if (info->report_progress) |
1027 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, |
1028 | num_pages); |
1029 | |
1030 | /* Quit if we've scanned the whole relation */ |
1031 | if (blkno >= num_pages) |
1032 | break; |
1033 | /* Iterate over pages, then loop back to recheck length */ |
1034 | for (; blkno < num_pages; blkno++) |
1035 | { |
1036 | btvacuumpage(&vstate, blkno, blkno); |
1037 | if (info->report_progress) |
1038 | pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, |
1039 | blkno); |
1040 | } |
1041 | } |
1042 | |
1043 | /* |
1044 | * Check to see if we need to issue one final WAL record for this index, |
1045 | * which may be needed for correctness on a hot standby node when non-MVCC |
1046 | * index scans could take place. |
1047 | * |
1048 | * If the WAL is replayed in hot standby, the replay process needs to get |
1049 | * cleanup locks on all index leaf pages, just as we've been doing here. |
1050 | * However, we won't issue any WAL records about pages that have no items |
1051 | * to be deleted. For pages between pages we've vacuumed, the replay code |
1052 | * will take locks under the direction of the lastBlockVacuumed fields in |
1053 | * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one |
1054 | * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record |
1055 | * against the last leaf page in the index, if that one wasn't vacuumed. |
1056 | */ |
1057 | if (XLogStandbyInfoActive() && |
1058 | vstate.lastBlockVacuumed < vstate.lastBlockLocked) |
1059 | { |
1060 | Buffer buf; |
1061 | |
1062 | /* |
1063 | * The page should be valid, but we can't use _bt_getbuf() because we |
1064 | * want to use a nondefault buffer access strategy. Since we aren't |
1065 | * going to delete any items, getting cleanup lock again is probably |
1066 | * overkill, but for consistency do that anyway. |
1067 | */ |
1068 | buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked, |
1069 | RBM_NORMAL, info->strategy); |
1070 | LockBufferForCleanup(buf); |
1071 | _bt_checkpage(rel, buf); |
1072 | _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); |
1073 | _bt_relbuf(rel, buf); |
1074 | } |
1075 | |
1076 | MemoryContextDelete(vstate.pagedelcontext); |
1077 | |
1078 | /* |
1079 | * If we found any recyclable pages (and recorded them in the FSM), then |
1080 | * forcibly update the upper-level FSM pages to ensure that searchers can |
1081 | * find them. It's possible that the pages were also found during |
1082 | * previous scans and so this is a waste of time, but it's cheap enough |
1083 | * relative to scanning the index that it shouldn't matter much, and |
1084 | * making sure that free pages are available sooner not later seems |
1085 | * worthwhile. |
1086 | * |
1087 | * Note that if no recyclable pages exist, we don't bother vacuuming the |
1088 | * FSM at all. |
1089 | */ |
1090 | if (vstate.totFreePages > 0) |
1091 | IndexFreeSpaceMapVacuum(rel); |
1092 | |
1093 | /* update statistics */ |
1094 | stats->num_pages = num_pages; |
1095 | stats->pages_free = vstate.totFreePages; |
1096 | |
1097 | if (oldestBtpoXact) |
1098 | *oldestBtpoXact = vstate.oldestBtpoXact; |
1099 | } |
1100 | |
1101 | /* |
1102 | * btvacuumpage --- VACUUM one page |
1103 | * |
1104 | * This processes a single page for btvacuumscan(). In some cases we |
1105 | * must go back and re-examine previously-scanned pages; this routine |
1106 | * recurses when necessary to handle that case. |
1107 | * |
1108 | * blkno is the page to process. orig_blkno is the highest block number |
1109 | * reached by the outer btvacuumscan loop (the same as blkno, unless we |
1110 | * are recursing to re-examine a previous page). |
1111 | */ |
1112 | static void |
1113 | btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) |
1114 | { |
1115 | IndexVacuumInfo *info = vstate->info; |
1116 | IndexBulkDeleteResult *stats = vstate->stats; |
1117 | IndexBulkDeleteCallback callback = vstate->callback; |
1118 | void *callback_state = vstate->callback_state; |
1119 | Relation rel = info->index; |
1120 | bool delete_now; |
1121 | BlockNumber recurse_to; |
1122 | Buffer buf; |
1123 | Page page; |
1124 | BTPageOpaque opaque = NULL; |
1125 | |
1126 | restart: |
1127 | delete_now = false; |
1128 | recurse_to = P_NONE; |
1129 | |
1130 | /* call vacuum_delay_point while not holding any buffer lock */ |
1131 | vacuum_delay_point(); |
1132 | |
1133 | /* |
1134 | * We can't use _bt_getbuf() here because it always applies |
1135 | * _bt_checkpage(), which will barf on an all-zero page. We want to |
1136 | * recycle all-zero pages, not fail. Also, we want to use a nondefault |
1137 | * buffer access strategy. |
1138 | */ |
1139 | buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, |
1140 | info->strategy); |
1141 | LockBuffer(buf, BT_READ); |
1142 | page = BufferGetPage(buf); |
1143 | if (!PageIsNew(page)) |
1144 | { |
1145 | _bt_checkpage(rel, buf); |
1146 | opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
1147 | } |
1148 | |
1149 | /* |
1150 | * If we are recursing, the only case we want to do anything with is a |
1151 | * live leaf page having the current vacuum cycle ID. Any other state |
1152 | * implies we already saw the page (eg, deleted it as being empty). |
1153 | */ |
1154 | if (blkno != orig_blkno) |
1155 | { |
1156 | if (_bt_page_recyclable(page) || |
1157 | P_IGNORE(opaque) || |
1158 | !P_ISLEAF(opaque) || |
1159 | opaque->btpo_cycleid != vstate->cycleid) |
1160 | { |
1161 | _bt_relbuf(rel, buf); |
1162 | return; |
1163 | } |
1164 | } |
1165 | |
1166 | /* Page is valid, see what to do with it */ |
1167 | if (_bt_page_recyclable(page)) |
1168 | { |
1169 | /* Okay to recycle this page */ |
1170 | RecordFreeIndexPage(rel, blkno); |
1171 | vstate->totFreePages++; |
1172 | stats->pages_deleted++; |
1173 | } |
1174 | else if (P_ISDELETED(opaque)) |
1175 | { |
1176 | /* Already deleted, but can't recycle yet */ |
1177 | stats->pages_deleted++; |
1178 | |
1179 | /* Update the oldest btpo.xact */ |
1180 | if (!TransactionIdIsValid(vstate->oldestBtpoXact) || |
1181 | TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) |
1182 | vstate->oldestBtpoXact = opaque->btpo.xact; |
1183 | } |
1184 | else if (P_ISHALFDEAD(opaque)) |
1185 | { |
1186 | /* Half-dead, try to delete */ |
1187 | delete_now = true; |
1188 | } |
1189 | else if (P_ISLEAF(opaque)) |
1190 | { |
1191 | OffsetNumber deletable[MaxOffsetNumber]; |
1192 | int ndeletable; |
1193 | OffsetNumber offnum, |
1194 | minoff, |
1195 | maxoff; |
1196 | |
1197 | /* |
1198 | * Trade in the initial read lock for a super-exclusive write lock on |
1199 | * this page. We must get such a lock on every leaf page over the |
1200 | * course of the vacuum scan, whether or not it actually contains any |
1201 | * deletable tuples --- see nbtree/README. |
1202 | */ |
1203 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
1204 | LockBufferForCleanup(buf); |
1205 | |
1206 | /* |
1207 | * Remember highest leaf page number we've taken cleanup lock on; see |
1208 | * notes in btvacuumscan |
1209 | */ |
1210 | if (blkno > vstate->lastBlockLocked) |
1211 | vstate->lastBlockLocked = blkno; |
1212 | |
1213 | /* |
1214 | * Check whether we need to recurse back to earlier pages. What we |
1215 | * are concerned about is a page split that happened since we started |
1216 | * the vacuum scan. If the split moved some tuples to a lower page |
1217 | * then we might have missed 'em. If so, set up for tail recursion. |
1218 | * (Must do this before possibly clearing btpo_cycleid below!) |
1219 | */ |
1220 | if (vstate->cycleid != 0 && |
1221 | opaque->btpo_cycleid == vstate->cycleid && |
1222 | !(opaque->btpo_flags & BTP_SPLIT_END) && |
1223 | !P_RIGHTMOST(opaque) && |
1224 | opaque->btpo_next < orig_blkno) |
1225 | recurse_to = opaque->btpo_next; |
1226 | |
1227 | /* |
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
1230 | */ |
1231 | ndeletable = 0; |
1232 | minoff = P_FIRSTDATAKEY(opaque); |
1233 | maxoff = PageGetMaxOffsetNumber(page); |
1234 | if (callback) |
1235 | { |
1236 | for (offnum = minoff; |
1237 | offnum <= maxoff; |
1238 | offnum = OffsetNumberNext(offnum)) |
1239 | { |
1240 | IndexTuple itup; |
1241 | ItemPointer htup; |
1242 | |
1243 | itup = (IndexTuple) PageGetItem(page, |
1244 | PageGetItemId(page, offnum)); |
1245 | htup = &(itup->t_tid); |
1246 | |
1247 | /* |
1248 | * During Hot Standby we currently assume that |
1249 | * XLOG_BTREE_VACUUM records do not produce conflicts. That is |
1250 | * only true as long as the callback function depends only |
1251 | * upon whether the index tuple refers to heap tuples removed |
1252 | * in the initial heap scan. When vacuum starts it derives a |
1253 | * value of OldestXmin. Backends taking later snapshots could |
1254 | * have a RecentGlobalXmin with a later xid than the vacuum's |
1255 | * OldestXmin, so it is possible that row versions deleted |
1256 | * after OldestXmin could be marked as killed by other |
1257 | * backends. The callback function *could* look at the index |
1258 | * tuple state in isolation and decide to delete the index |
1259 | * tuple, though currently it does not. If it ever did, we |
1260 | * would need to reconsider whether XLOG_BTREE_VACUUM records |
1261 | * should cause conflicts. If they did cause conflicts they |
1262 | * would be fairly harsh conflicts, since we haven't yet |
1263 | * worked out a way to pass a useful value for |
1264 | * latestRemovedXid on the XLOG_BTREE_VACUUM records. This |
1265 | * applies to *any* type of index that marks index tuples as |
1266 | * killed. |
1267 | */ |
1268 | if (callback(htup, callback_state)) |
1269 | deletable[ndeletable++] = offnum; |
1270 | } |
1271 | } |
1272 | |
1273 | /* |
1274 | * Apply any needed deletes. We issue just one _bt_delitems_vacuum() |
1275 | * call per page, so as to minimize WAL traffic. |
1276 | */ |
1277 | if (ndeletable > 0) |
1278 | { |
1279 | /* |
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
			 * all the information the replay code needs to get a cleanup
			 * lock on every page between the previous lastBlockVacuumed and
			 * this page.  This ensures that WAL replay eventually locks all
			 * leaf pages, which is important should non-MVCC scans be
			 * requested.  This is currently unused on standbys, but we record
			 * it anyway, so that the WAL contains the required information.
1287 | * |
1288 | * Since we can visit leaf pages out-of-order when recursing, |
1289 | * replay might end up locking such pages an extra time, but it |
1290 | * doesn't seem worth the amount of bookkeeping it'd take to avoid |
1291 | * that. |
1292 | */ |
1293 | _bt_delitems_vacuum(rel, buf, deletable, ndeletable, |
1294 | vstate->lastBlockVacuumed); |
1295 | |
1296 | /* |
			 * Remember the highest leaf page number we've issued an
			 * XLOG_BTREE_VACUUM WAL record for.
1299 | */ |
1300 | if (blkno > vstate->lastBlockVacuumed) |
1301 | vstate->lastBlockVacuumed = blkno; |
1302 | |
1303 | stats->tuples_removed += ndeletable; |
1304 | /* must recompute maxoff */ |
1305 | maxoff = PageGetMaxOffsetNumber(page); |
1306 | } |
1307 | else |
1308 | { |
1309 | /* |
1310 | * If the page has been split during this vacuum cycle, it seems |
1311 | * worth expending a write to clear btpo_cycleid even if we don't |
1312 | * have any deletions to do. (If we do, _bt_delitems_vacuum takes |
1313 | * care of this.) This ensures we won't process the page again. |
1314 | * |
1315 | * We treat this like a hint-bit update because there's no need to |
1316 | * WAL-log it. |
1317 | */ |
1318 | if (vstate->cycleid != 0 && |
1319 | opaque->btpo_cycleid == vstate->cycleid) |
1320 | { |
1321 | opaque->btpo_cycleid = 0; |
1322 | MarkBufferDirtyHint(buf, true); |
1323 | } |
1324 | } |
1325 | |
1326 | /* |
1327 | * If it's now empty, try to delete; else count the live tuples. We |
1328 | * don't delete when recursing, though, to avoid putting entries into |
1329 | * freePages out-of-order (doesn't seem worth any extra code to handle |
1330 | * the case). |
1331 | */ |
1332 | if (minoff > maxoff) |
1333 | delete_now = (blkno == orig_blkno); |
1334 | else |
1335 | stats->num_index_tuples += maxoff - minoff + 1; |
1336 | } |
1337 | |
1338 | if (delete_now) |
1339 | { |
1340 | MemoryContext oldcontext; |
1341 | int ndel; |
1342 | |
1343 | /* Run pagedel in a temp context to avoid memory leakage */ |
1344 | MemoryContextReset(vstate->pagedelcontext); |
1345 | oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); |
1346 | |
1347 | ndel = _bt_pagedel(rel, buf); |
1348 | |
1349 | /* count only this page, else may double-count parent */ |
1350 | if (ndel) |
1351 | { |
1352 | stats->pages_deleted++; |
1353 | if (!TransactionIdIsValid(vstate->oldestBtpoXact) || |
1354 | TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) |
1355 | vstate->oldestBtpoXact = opaque->btpo.xact; |
1356 | } |
1357 | |
1358 | MemoryContextSwitchTo(oldcontext); |
1359 | /* pagedel released buffer, so we shouldn't */ |
1360 | } |
1361 | else |
1362 | _bt_relbuf(rel, buf); |
1363 | |
1364 | /* |
1365 | * This is really tail recursion, but if the compiler is too stupid to |
1366 | * optimize it as such, we'd eat an uncomfortably large amount of stack |
1367 | * space per recursion level (due to the deletable[] array). A failure is |
1368 | * improbable since the number of levels isn't likely to be large ... but |
1369 | * just in case, let's hand-optimize into a loop. |
1370 | */ |
1371 | if (recurse_to != P_NONE) |
1372 | { |
1373 | blkno = recurse_to; |
1374 | goto restart; |
1375 | } |
1376 | } |
1377 | |
1378 | /* |
1379 | * btcanreturn() -- Check whether btree indexes support index-only scans. |
1380 | * |
1381 | * btrees always do, so this is trivial. |
1382 | */ |
1383 | bool |
1384 | btcanreturn(Relation index, int attno) |
1385 | { |
1386 | return true; |
1387 | } |
1388 | |