1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * hash.c |
4 | * Implementation of Margo Seltzer's Hashing package for postgres. |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/access/hash/hash.c |
12 | * |
13 | * NOTES |
14 | * This file contains only the public interface routines. |
15 | * |
16 | *------------------------------------------------------------------------- |
17 | */ |
18 | |
19 | #include "postgres.h" |
20 | |
21 | #include "access/hash.h" |
22 | #include "access/hash_xlog.h" |
23 | #include "access/relscan.h" |
24 | #include "access/tableam.h" |
25 | #include "catalog/index.h" |
26 | #include "commands/progress.h" |
27 | #include "commands/vacuum.h" |
28 | #include "miscadmin.h" |
29 | #include "optimizer/plancat.h" |
30 | #include "pgstat.h" |
31 | #include "utils/builtins.h" |
32 | #include "utils/index_selfuncs.h" |
33 | #include "utils/rel.h" |
35 | |
36 | |
37 | /* Working state for hashbuild and its callback */ |
38 | typedef struct |
39 | { |
40 | HSpool *spool; /* NULL if not using spooling */ |
41 | double indtuples; /* # tuples accepted into index */ |
42 | Relation heapRel; /* heap relation descriptor */ |
43 | } HashBuildState; |
44 | |
45 | static void hashbuildCallback(Relation index, |
46 | HeapTuple htup, |
47 | Datum *values, |
48 | bool *isnull, |
49 | bool tupleIsAlive, |
50 | void *state); |
51 | |
52 | |
53 | /* |
54 | * Hash handler function: return IndexAmRoutine with access method parameters |
55 | * and callbacks. |
56 | */ |
57 | Datum |
58 | hashhandler(PG_FUNCTION_ARGS) |
59 | { |
60 | IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); |
61 | |
62 | amroutine->amstrategies = HTMaxStrategyNumber; |
63 | amroutine->amsupport = HASHNProcs; |
64 | amroutine->amcanorder = false; |
65 | amroutine->amcanorderbyop = false; |
66 | amroutine->amcanbackward = true; |
67 | amroutine->amcanunique = false; |
68 | amroutine->amcanmulticol = false; |
69 | amroutine->amoptionalkey = false; |
70 | amroutine->amsearcharray = false; |
71 | amroutine->amsearchnulls = false; |
72 | amroutine->amstorage = false; |
73 | amroutine->amclusterable = false; |
74 | amroutine->ampredlocks = true; |
75 | amroutine->amcanparallel = false; |
76 | amroutine->amcaninclude = false; |
77 | amroutine->amkeytype = INT4OID; |
78 | |
79 | amroutine->ambuild = hashbuild; |
80 | amroutine->ambuildempty = hashbuildempty; |
81 | amroutine->aminsert = hashinsert; |
82 | amroutine->ambulkdelete = hashbulkdelete; |
83 | amroutine->amvacuumcleanup = hashvacuumcleanup; |
84 | amroutine->amcanreturn = NULL; |
85 | amroutine->amcostestimate = hashcostestimate; |
86 | amroutine->amoptions = hashoptions; |
87 | amroutine->amproperty = NULL; |
88 | amroutine->ambuildphasename = NULL; |
89 | amroutine->amvalidate = hashvalidate; |
90 | amroutine->ambeginscan = hashbeginscan; |
91 | amroutine->amrescan = hashrescan; |
92 | amroutine->amgettuple = hashgettuple; |
93 | amroutine->amgetbitmap = hashgetbitmap; |
94 | amroutine->amendscan = hashendscan; |
95 | amroutine->ammarkpos = NULL; |
96 | amroutine->amrestrpos = NULL; |
97 | amroutine->amestimateparallelscan = NULL; |
98 | amroutine->aminitparallelscan = NULL; |
99 | amroutine->amparallelrescan = NULL; |
100 | |
101 | PG_RETURN_POINTER(amroutine); |
102 | } |
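
/*
 * A usage sketch, illustrative only: the built-in hash AM is registered via a
 * pg_am entry created at initdb time, but a handler such as this one can also
 * be attached to a new index access method with
 *
 *		CREATE ACCESS METHOD hash_demo TYPE INDEX HANDLER hashhandler;
 *
 * after which CREATE INDEX ... USING hash_demo routes index operations
 * through the callbacks filled in above.  ("hash_demo" is a hypothetical name
 * chosen for the example.)
 */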
103 | |
104 | /* |
105 | * hashbuild() -- build a new hash index. |
106 | */ |
107 | IndexBuildResult * |
108 | hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
109 | { |
110 | IndexBuildResult *result; |
111 | BlockNumber relpages; |
112 | double reltuples; |
113 | double allvisfrac; |
114 | uint32 num_buckets; |
115 | long sort_threshold; |
116 | HashBuildState buildstate; |
117 | |
118 | /* |
119 | * We expect to be called exactly once for any index relation. If that's |
120 | * not the case, big trouble's what we have. |
121 | */ |
122 | if (RelationGetNumberOfBlocks(index) != 0) |
		elog(ERROR, "index \"%s\" already contains data",
124 | RelationGetRelationName(index)); |
125 | |
126 | /* Estimate the number of rows currently present in the table */ |
127 | estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); |
128 | |
129 | /* Initialize the hash index metadata page and initial buckets */ |
130 | num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); |
131 | |
132 | /* |
133 | * If we just insert the tuples into the index in scan order, then |
134 | * (assuming their hash codes are pretty random) there will be no locality |
135 | * of access to the index, and if the index is bigger than available RAM |
136 | * then we'll thrash horribly. To prevent that scenario, we can sort the |
137 | * tuples by (expected) bucket number. However, such a sort is useless |
138 | * overhead when the index does fit in RAM. We choose to sort if the |
139 | * initial index size exceeds maintenance_work_mem, or the number of |
140 | * buffers usable for the index, whichever is less. (Limiting by the |
141 | * number of buffers should reduce thrashing between PG buffers and kernel |
142 | * buffers, which seems useful even if no physical I/O results. Limiting |
143 | * by maintenance_work_mem is useful to allow easy testing of the sort |
144 | * code path, and may be useful to DBAs as an additional control knob.) |
145 | * |
146 | * NOTE: this test will need adjustment if a bucket is ever different from |
147 | * one page. Also, "initial index size" accounting does not include the |
148 | * metapage, nor the first bitmap page. |
149 | */ |
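	/*
	 * Illustrative arithmetic, assuming stock settings: with the default
	 * maintenance_work_mem of 64MB and 8kB pages, the first limit works out
	 * to 64MB / 8kB = 8192 pages; with the default shared_buffers of 128MB,
	 * NBuffers is 16384, so the Min() below leaves sort_threshold at 8192 and
	 * a build that starts with 8192 or more buckets takes the sorted path.
	 */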
150 | sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ; |
151 | if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP) |
152 | sort_threshold = Min(sort_threshold, NBuffers); |
153 | else |
154 | sort_threshold = Min(sort_threshold, NLocBuffer); |
155 | |
156 | if (num_buckets >= (uint32) sort_threshold) |
157 | buildstate.spool = _h_spoolinit(heap, index, num_buckets); |
158 | else |
159 | buildstate.spool = NULL; |
160 | |
161 | /* prepare to build the index */ |
162 | buildstate.indtuples = 0; |
163 | buildstate.heapRel = heap; |
164 | |
165 | /* do the heap scan */ |
166 | reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
167 | hashbuildCallback, |
168 | (void *) &buildstate, NULL); |
169 | pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL, |
170 | buildstate.indtuples); |
171 | |
172 | if (buildstate.spool) |
173 | { |
174 | /* sort the tuples and insert them into the index */ |
175 | _h_indexbuild(buildstate.spool, buildstate.heapRel); |
176 | _h_spooldestroy(buildstate.spool); |
177 | } |
178 | |
179 | /* |
180 | * Return statistics |
181 | */ |
182 | result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
183 | |
184 | result->heap_tuples = reltuples; |
185 | result->index_tuples = buildstate.indtuples; |
186 | |
187 | return result; |
188 | } |
189 | |
190 | /* |
191 | * hashbuildempty() -- build an empty hash index in the initialization fork |
192 | */ |
193 | void |
194 | hashbuildempty(Relation index) |
195 | { |
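	/*
	 * This is used for unlogged indexes: the empty skeleton written to the
	 * init fork here is copied over the main fork when the server restarts
	 * after a crash.
	 */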
196 | _hash_init(index, 0, INIT_FORKNUM); |
197 | } |
198 | |
199 | /* |
200 | * Per-tuple callback for table_index_build_scan |
201 | */ |
202 | static void |
203 | hashbuildCallback(Relation index, |
204 | HeapTuple htup, |
205 | Datum *values, |
206 | bool *isnull, |
207 | bool tupleIsAlive, |
208 | void *state) |
209 | { |
210 | HashBuildState *buildstate = (HashBuildState *) state; |
211 | Datum index_values[1]; |
212 | bool index_isnull[1]; |
213 | IndexTuple itup; |
214 | |
215 | /* convert data to a hash key; on failure, do not insert anything */ |
216 | if (!_hash_convert_tuple(index, |
217 | values, isnull, |
218 | index_values, index_isnull)) |
219 | return; |
220 | |
221 | /* Either spool the tuple for sorting, or just put it into the index */ |
222 | if (buildstate->spool) |
223 | _h_spool(buildstate->spool, &htup->t_self, |
224 | index_values, index_isnull); |
225 | else |
226 | { |
227 | /* form an index tuple and point it at the heap tuple */ |
228 | itup = index_form_tuple(RelationGetDescr(index), |
229 | index_values, index_isnull); |
230 | itup->t_tid = htup->t_self; |
231 | _hash_doinsert(index, itup, buildstate->heapRel); |
232 | pfree(itup); |
233 | } |
234 | |
235 | buildstate->indtuples += 1; |
236 | } |
237 | |
238 | /* |
239 | * hashinsert() -- insert an index tuple into a hash table. |
240 | * |
241 | * Hash on the heap tuple's key, form an index tuple with hash code. |
242 | * Find the appropriate location for the new tuple, and put it there. |
243 | */ |
244 | bool |
245 | hashinsert(Relation rel, Datum *values, bool *isnull, |
246 | ItemPointer ht_ctid, Relation heapRel, |
247 | IndexUniqueCheck checkUnique, |
248 | IndexInfo *indexInfo) |
249 | { |
250 | Datum index_values[1]; |
251 | bool index_isnull[1]; |
252 | IndexTuple itup; |
253 | |
254 | /* convert data to a hash key; on failure, do not insert anything */ |
255 | if (!_hash_convert_tuple(rel, |
256 | values, isnull, |
257 | index_values, index_isnull)) |
258 | return false; |
259 | |
260 | /* form an index tuple and point it at the heap tuple */ |
261 | itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull); |
262 | itup->t_tid = *ht_ctid; |
263 | |
264 | _hash_doinsert(rel, itup, heapRel); |
265 | |
266 | pfree(itup); |
267 | |
268 | return false; |
269 | } |
270 | |
271 | |
272 | /* |
273 | * hashgettuple() -- Get the next tuple in the scan. |
274 | */ |
275 | bool |
276 | hashgettuple(IndexScanDesc scan, ScanDirection dir) |
277 | { |
278 | HashScanOpaque so = (HashScanOpaque) scan->opaque; |
279 | bool res; |
280 | |
281 | /* Hash indexes are always lossy since we store only the hash code */ |
282 | scan->xs_recheck = true; |
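	/*
	 * (Because only the hash code is stored, distinct keys can collide in the
	 * index; the executor must re-evaluate the original condition against
	 * each heap tuple we return.)
	 */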
283 | |
284 | /* |
285 | * If we've already initialized this scan, we can just advance it in the |
286 | * appropriate direction. If we haven't done so yet, we call a routine to |
287 | * get the first item in the scan. |
288 | */ |
289 | if (!HashScanPosIsValid(so->currPos)) |
290 | res = _hash_first(scan, dir); |
291 | else |
292 | { |
293 | /* |
294 | * Check to see if we should kill the previously-fetched tuple. |
295 | */ |
296 | if (scan->kill_prior_tuple) |
297 | { |
298 | /* |
299 | * Yes, so remember it for later. (We'll deal with all such tuples |
300 | * at once right after leaving the index page or at end of scan.) |
			 * If the caller reverses the indexscan direction, it is quite
			 * possible that the same item might get entered multiple times.
			 * But we don't detect that; instead, we just forget any excess
			 * entries.
305 | */ |
306 | if (so->killedItems == NULL) |
307 | so->killedItems = (int *) |
308 | palloc(MaxIndexTuplesPerPage * sizeof(int)); |
309 | |
310 | if (so->numKilled < MaxIndexTuplesPerPage) |
311 | so->killedItems[so->numKilled++] = so->currPos.itemIndex; |
312 | } |
313 | |
314 | /* |
315 | * Now continue the scan. |
316 | */ |
317 | res = _hash_next(scan, dir); |
318 | } |
319 | |
320 | return res; |
321 | } |
322 | |
323 | |
324 | /* |
325 | * hashgetbitmap() -- get all tuples at once |
326 | */ |
327 | int64 |
328 | hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) |
329 | { |
330 | HashScanOpaque so = (HashScanOpaque) scan->opaque; |
331 | bool res; |
332 | int64 ntids = 0; |
333 | HashScanPosItem *currItem; |
334 | |
335 | res = _hash_first(scan, ForwardScanDirection); |
336 | |
337 | while (res) |
338 | { |
339 | currItem = &so->currPos.items[so->currPos.itemIndex]; |
340 | |
341 | /* |
		 * _hash_first and _hash_next handle eliminating dead index entries
		 * whenever scan->ignore_killed_tuples is true.  Therefore, there's
		 * nothing to do here except add the results to the TIDBitmap.
345 | */ |
346 | tbm_add_tuples(tbm, &(currItem->heapTid), 1, true); |
347 | ntids++; |
348 | |
349 | res = _hash_next(scan, ForwardScanDirection); |
350 | } |
351 | |
352 | return ntids; |
353 | } |
354 | |
355 | |
356 | /* |
357 | * hashbeginscan() -- start a scan on a hash index |
358 | */ |
359 | IndexScanDesc |
360 | hashbeginscan(Relation rel, int nkeys, int norderbys) |
361 | { |
362 | IndexScanDesc scan; |
363 | HashScanOpaque so; |
364 | |
365 | /* no order by operators allowed */ |
366 | Assert(norderbys == 0); |
367 | |
368 | scan = RelationGetIndexScan(rel, nkeys, norderbys); |
369 | |
370 | so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); |
371 | HashScanPosInvalidate(so->currPos); |
372 | so->hashso_bucket_buf = InvalidBuffer; |
373 | so->hashso_split_bucket_buf = InvalidBuffer; |
374 | |
375 | so->hashso_buc_populated = false; |
376 | so->hashso_buc_split = false; |
377 | |
378 | so->killedItems = NULL; |
379 | so->numKilled = 0; |
380 | |
381 | scan->opaque = so; |
382 | |
383 | return scan; |
384 | } |
385 | |
386 | /* |
387 | * hashrescan() -- rescan an index relation |
388 | */ |
389 | void |
390 | hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
391 | ScanKey orderbys, int norderbys) |
392 | { |
393 | HashScanOpaque so = (HashScanOpaque) scan->opaque; |
394 | Relation rel = scan->indexRelation; |
395 | |
396 | if (HashScanPosIsValid(so->currPos)) |
397 | { |
398 | /* Before leaving current page, deal with any killed items */ |
399 | if (so->numKilled > 0) |
400 | _hash_kill_items(scan); |
401 | } |
402 | |
403 | _hash_dropscanbuf(rel, so); |
404 | |
405 | /* set position invalid (this will cause _hash_first call) */ |
406 | HashScanPosInvalidate(so->currPos); |
407 | |
408 | /* Update scan key, if a new one is given */ |
409 | if (scankey && scan->numberOfKeys > 0) |
410 | { |
411 | memmove(scan->keyData, |
412 | scankey, |
413 | scan->numberOfKeys * sizeof(ScanKeyData)); |
414 | } |
415 | |
416 | so->hashso_buc_populated = false; |
417 | so->hashso_buc_split = false; |
418 | } |
419 | |
420 | /* |
421 | * hashendscan() -- close down a scan |
422 | */ |
423 | void |
424 | hashendscan(IndexScanDesc scan) |
425 | { |
426 | HashScanOpaque so = (HashScanOpaque) scan->opaque; |
427 | Relation rel = scan->indexRelation; |
428 | |
429 | if (HashScanPosIsValid(so->currPos)) |
430 | { |
431 | /* Before leaving current page, deal with any killed items */ |
432 | if (so->numKilled > 0) |
433 | _hash_kill_items(scan); |
434 | } |
435 | |
436 | _hash_dropscanbuf(rel, so); |
437 | |
438 | if (so->killedItems != NULL) |
439 | pfree(so->killedItems); |
440 | pfree(so); |
441 | scan->opaque = NULL; |
442 | } |
443 | |
444 | /* |
445 | * Bulk deletion of all index entries pointing to a set of heap tuples. |
446 | * The set of target tuples is specified via a callback routine that tells |
447 | * whether any given heap tuple (identified by ItemPointer) is being deleted. |
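 * (Concretely, the callback has the IndexBulkDeleteCallback signature: it is
 * handed the heap TID stored in each index entry, plus callback_state, and
 * returns true if that heap tuple is being deleted.)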
448 | * |
 * This function also deletes the tuples that were moved by a split to
 * another bucket.
451 | * |
452 | * Result: a palloc'd struct containing statistical info for VACUUM displays. |
453 | */ |
454 | IndexBulkDeleteResult * |
455 | hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
456 | IndexBulkDeleteCallback callback, void *callback_state) |
457 | { |
458 | Relation rel = info->index; |
459 | double tuples_removed; |
460 | double num_index_tuples; |
461 | double orig_ntuples; |
462 | Bucket orig_maxbucket; |
463 | Bucket cur_maxbucket; |
464 | Bucket cur_bucket; |
465 | Buffer metabuf = InvalidBuffer; |
466 | HashMetaPage metap; |
467 | HashMetaPage cachedmetap; |
468 | |
469 | tuples_removed = 0; |
470 | num_index_tuples = 0; |
471 | |
472 | /* |
473 | * We need a copy of the metapage so that we can use its hashm_spares[] |
474 | * values to compute bucket page addresses, but a cached copy should be |
475 | * good enough. (If not, we'll detect that further down and refresh the |
476 | * cache as necessary.) |
477 | */ |
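	/*
	 * (hashm_spares[] tracks the number of overflow pages allocated before
	 * each splitpoint, which is what lets BUCKET_TO_BLKNO() map a bucket
	 * number to the bucket's primary block number without further metapage
	 * lookups.)
	 */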
478 | cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); |
479 | Assert(cachedmetap != NULL); |
480 | |
481 | orig_maxbucket = cachedmetap->hashm_maxbucket; |
482 | orig_ntuples = cachedmetap->hashm_ntuples; |
483 | |
484 | /* Scan the buckets that we know exist */ |
485 | cur_bucket = 0; |
486 | cur_maxbucket = orig_maxbucket; |
487 | |
488 | loop_top: |
489 | while (cur_bucket <= cur_maxbucket) |
490 | { |
491 | BlockNumber bucket_blkno; |
492 | BlockNumber blkno; |
493 | Buffer bucket_buf; |
494 | Buffer buf; |
495 | HashPageOpaque bucket_opaque; |
496 | Page page; |
497 | bool split_cleanup = false; |
498 | |
499 | /* Get address of bucket's start page */ |
500 | bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); |
501 | |
502 | blkno = bucket_blkno; |
503 | |
504 | /* |
		 * We need to acquire a cleanup lock on the primary bucket page to
		 * wait out concurrent scans before deleting the dead tuples.
507 | */ |
508 | buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); |
509 | LockBufferForCleanup(buf); |
510 | _hash_checkpage(rel, buf, LH_BUCKET_PAGE); |
511 | |
512 | page = BufferGetPage(buf); |
513 | bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); |
514 | |
515 | /* |
		 * If the bucket contains tuples that were moved by a split, then we
		 * need to delete such tuples.  We can't delete them while a split of
		 * the bucket is still in progress, as they are needed by scans.
519 | */ |
520 | if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && |
521 | H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) |
522 | { |
523 | split_cleanup = true; |
524 | |
525 | /* |
526 | * This bucket might have been split since we last held a lock on |
527 | * the metapage. If so, hashm_maxbucket, hashm_highmask and |
528 | * hashm_lowmask might be old enough to cause us to fail to remove |
529 | * tuples left behind by the most recent split. To prevent that, |
530 | * now that the primary page of the target bucket has been locked |
531 | * (and thus can't be further split), check whether we need to |
532 | * update our cached metapage data. |
533 | */ |
534 | Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber); |
535 | if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) |
536 | { |
537 | cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); |
538 | Assert(cachedmetap != NULL); |
539 | } |
540 | } |
541 | |
542 | bucket_buf = buf; |
543 | |
544 | hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, |
545 | cachedmetap->hashm_maxbucket, |
546 | cachedmetap->hashm_highmask, |
547 | cachedmetap->hashm_lowmask, &tuples_removed, |
548 | &num_index_tuples, split_cleanup, |
549 | callback, callback_state); |
550 | |
551 | _hash_dropbuf(rel, bucket_buf); |
552 | |
553 | /* Advance to next bucket */ |
554 | cur_bucket++; |
555 | } |
556 | |
557 | if (BufferIsInvalid(metabuf)) |
558 | metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); |
559 | |
560 | /* Write-lock metapage and check for split since we started */ |
561 | LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
562 | metap = HashPageGetMeta(BufferGetPage(metabuf)); |
563 | |
564 | if (cur_maxbucket != metap->hashm_maxbucket) |
565 | { |
566 | /* There's been a split, so process the additional bucket(s) */ |
567 | LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
568 | cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); |
569 | Assert(cachedmetap != NULL); |
570 | cur_maxbucket = cachedmetap->hashm_maxbucket; |
571 | goto loop_top; |
572 | } |
573 | |
574 | /* Okay, we're really done. Update tuple count in metapage. */ |
575 | START_CRIT_SECTION(); |
576 | |
577 | if (orig_maxbucket == metap->hashm_maxbucket && |
578 | orig_ntuples == metap->hashm_ntuples) |
579 | { |
580 | /* |
581 | * No one has split or inserted anything since start of scan, so |
582 | * believe our count as gospel. |
583 | */ |
584 | metap->hashm_ntuples = num_index_tuples; |
585 | } |
586 | else |
587 | { |
588 | /* |
589 | * Otherwise, our count is untrustworthy since we may have |
590 | * double-scanned tuples in split buckets. Proceed by dead-reckoning. |
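		 * For instance, if the metapage currently says 10000 tuples and this
		 * pass removed 1500, we record 8500 rather than trusting our possibly
		 * double-counted scan total.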
591 | * (Note: we still return estimated_count = false, because using this |
592 | * count is better than not updating reltuples at all.) |
593 | */ |
594 | if (metap->hashm_ntuples > tuples_removed) |
595 | metap->hashm_ntuples -= tuples_removed; |
596 | else |
597 | metap->hashm_ntuples = 0; |
598 | num_index_tuples = metap->hashm_ntuples; |
599 | } |
600 | |
601 | MarkBufferDirty(metabuf); |
602 | |
603 | /* XLOG stuff */ |
604 | if (RelationNeedsWAL(rel)) |
605 | { |
606 | xl_hash_update_meta_page xlrec; |
607 | XLogRecPtr recptr; |
608 | |
609 | xlrec.ntuples = metap->hashm_ntuples; |
610 | |
611 | XLogBeginInsert(); |
612 | XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage); |
613 | |
614 | XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); |
615 | |
616 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); |
617 | PageSetLSN(BufferGetPage(metabuf), recptr); |
618 | } |
619 | |
620 | END_CRIT_SECTION(); |
621 | |
622 | _hash_relbuf(rel, metabuf); |
623 | |
624 | /* return statistics */ |
625 | if (stats == NULL) |
626 | stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
627 | stats->estimated_count = false; |
628 | stats->num_index_tuples = num_index_tuples; |
629 | stats->tuples_removed += tuples_removed; |
630 | /* hashvacuumcleanup will fill in num_pages */ |
631 | |
632 | return stats; |
633 | } |
634 | |
635 | /* |
636 | * Post-VACUUM cleanup. |
637 | * |
638 | * Result: a palloc'd struct containing statistical info for VACUUM displays. |
639 | */ |
640 | IndexBulkDeleteResult * |
641 | hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) |
642 | { |
643 | Relation rel = info->index; |
644 | BlockNumber num_pages; |
645 | |
646 | /* If hashbulkdelete wasn't called, return NULL signifying no change */ |
647 | /* Note: this covers the analyze_only case too */ |
648 | if (stats == NULL) |
649 | return NULL; |
650 | |
651 | /* update statistics */ |
652 | num_pages = RelationGetNumberOfBlocks(rel); |
653 | stats->num_pages = num_pages; |
654 | |
655 | return stats; |
656 | } |
657 | |
658 | /* |
659 | * Helper function to perform deletion of index entries from a bucket. |
660 | * |
661 | * This function expects that the caller has acquired a cleanup lock on the |
662 | * primary bucket page, and will return with a write lock again held on the |
663 | * primary bucket page. The lock won't necessarily be held continuously, |
664 | * though, because we'll release it when visiting overflow pages. |
665 | * |
666 | * There can't be any concurrent scans in progress when we first enter this |
667 | * function because of the cleanup lock we hold on the primary bucket page, |
668 | * but as soon as we release that lock, there might be. If those scans got |
669 | * ahead of our cleanup scan, they might see a tuple before we kill it and |
670 | * wake up only after VACUUM has completed and the TID has been recycled for |
671 | * an unrelated tuple. To avoid that calamity, we prevent scans from passing |
672 | * our cleanup scan by locking the next page in the bucket chain before |
673 | * releasing the lock on the previous page. (This type of lock chaining is not |
674 | * ideal, so we might want to look for a better solution at some point.) |
675 | * |
676 | * We need to retain a pin on the primary bucket to ensure that no concurrent |
677 | * split can start. |
678 | */ |
679 | void |
680 | hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, |
681 | BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, |
682 | uint32 maxbucket, uint32 highmask, uint32 lowmask, |
683 | double *tuples_removed, double *num_index_tuples, |
684 | bool split_cleanup, |
685 | IndexBulkDeleteCallback callback, void *callback_state) |
686 | { |
687 | BlockNumber blkno; |
688 | Buffer buf; |
689 | Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; |
690 | bool bucket_dirty = false; |
691 | |
692 | blkno = bucket_blkno; |
693 | buf = bucket_buf; |
694 | |
695 | if (split_cleanup) |
696 | new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, |
697 | lowmask, maxbucket); |
698 | |
699 | /* Scan each page in bucket */ |
700 | for (;;) |
701 | { |
702 | HashPageOpaque opaque; |
703 | OffsetNumber offno; |
704 | OffsetNumber maxoffno; |
705 | Buffer next_buf; |
706 | Page page; |
707 | OffsetNumber deletable[MaxOffsetNumber]; |
708 | int ndeletable = 0; |
709 | bool retain_pin = false; |
710 | bool clear_dead_marking = false; |
711 | |
712 | vacuum_delay_point(); |
713 | |
714 | page = BufferGetPage(buf); |
715 | opaque = (HashPageOpaque) PageGetSpecialPointer(page); |
716 | |
717 | /* Scan each tuple in page */ |
718 | maxoffno = PageGetMaxOffsetNumber(page); |
719 | for (offno = FirstOffsetNumber; |
720 | offno <= maxoffno; |
721 | offno = OffsetNumberNext(offno)) |
722 | { |
723 | ItemPointer htup; |
724 | IndexTuple itup; |
725 | Bucket bucket; |
726 | bool kill_tuple = false; |
727 | |
728 | itup = (IndexTuple) PageGetItem(page, |
729 | PageGetItemId(page, offno)); |
730 | htup = &(itup->t_tid); |
731 | |
732 | /* |
			 * To remove the dead tuples, we strictly want to rely on the
			 * results of the callback function; see btvacuumpage for the
			 * detailed reason.
735 | */ |
736 | if (callback && callback(htup, callback_state)) |
737 | { |
738 | kill_tuple = true; |
739 | if (tuples_removed) |
740 | *tuples_removed += 1; |
741 | } |
742 | else if (split_cleanup) |
743 | { |
744 | /* delete the tuples that are moved by split. */ |
745 | bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), |
746 | maxbucket, |
747 | highmask, |
748 | lowmask); |
749 | /* mark the item for deletion */ |
750 | if (bucket != cur_bucket) |
751 | { |
752 | /* |
					 * We expect tuples to belong either to the current bucket
					 * or to new_bucket.  This is ensured because we don't
					 * allow further splits of a bucket that contains garbage.
					 * See comments in _hash_expandtable.
757 | */ |
758 | Assert(bucket == new_bucket); |
759 | kill_tuple = true; |
760 | } |
761 | } |
762 | |
763 | if (kill_tuple) |
764 | { |
765 | /* mark the item for deletion */ |
766 | deletable[ndeletable++] = offno; |
767 | } |
768 | else |
769 | { |
770 | /* we're keeping it, so count it */ |
771 | if (num_index_tuples) |
772 | *num_index_tuples += 1; |
773 | } |
774 | } |
775 | |
776 | /* retain the pin on primary bucket page till end of bucket scan */ |
777 | if (blkno == bucket_blkno) |
778 | retain_pin = true; |
779 | else |
780 | retain_pin = false; |
781 | |
782 | blkno = opaque->hasho_nextblkno; |
783 | |
784 | /* |
785 | * Apply deletions, advance to next page and write page if needed. |
786 | */ |
787 | if (ndeletable > 0) |
788 | { |
789 | /* No ereport(ERROR) until changes are logged */ |
790 | START_CRIT_SECTION(); |
791 | |
792 | PageIndexMultiDelete(page, deletable, ndeletable); |
793 | bucket_dirty = true; |
794 | |
795 | /* |
			 * Mark the page as clean if vacuum has removed DEAD tuples from
			 * the index page.  We do this by clearing the
			 * LH_PAGE_HAS_DEAD_TUPLES flag.
799 | */ |
800 | if (tuples_removed && *tuples_removed > 0 && |
801 | H_HAS_DEAD_TUPLES(opaque)) |
802 | { |
803 | opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; |
804 | clear_dead_marking = true; |
805 | } |
806 | |
807 | MarkBufferDirty(buf); |
808 | |
809 | /* XLOG stuff */ |
810 | if (RelationNeedsWAL(rel)) |
811 | { |
812 | xl_hash_delete xlrec; |
813 | XLogRecPtr recptr; |
814 | |
815 | xlrec.clear_dead_marking = clear_dead_marking; |
816 | xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; |
817 | |
818 | XLogBeginInsert(); |
819 | XLogRegisterData((char *) &xlrec, SizeOfHashDelete); |
820 | |
821 | /* |
822 | * bucket buffer needs to be registered to ensure that we can |
823 | * acquire a cleanup lock on it during replay. |
824 | */ |
825 | if (!xlrec.is_primary_bucket_page) |
826 | XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); |
827 | |
828 | XLogRegisterBuffer(1, buf, REGBUF_STANDARD); |
829 | XLogRegisterBufData(1, (char *) deletable, |
830 | ndeletable * sizeof(OffsetNumber)); |
831 | |
832 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); |
833 | PageSetLSN(BufferGetPage(buf), recptr); |
834 | } |
835 | |
836 | END_CRIT_SECTION(); |
837 | } |
838 | |
839 | /* bail out if there are no more pages to scan. */ |
840 | if (!BlockNumberIsValid(blkno)) |
841 | break; |
842 | |
843 | next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, |
844 | LH_OVERFLOW_PAGE, |
845 | bstrategy); |
846 | |
847 | /* |
		 * Release the lock on the previous page only after acquiring the
		 * lock on the next page.
850 | */ |
851 | if (retain_pin) |
852 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
853 | else |
854 | _hash_relbuf(rel, buf); |
855 | |
856 | buf = next_buf; |
857 | } |
858 | |
859 | /* |
	 * Lock the bucket page to clear the garbage flag and squeeze the bucket.
	 * If the current buffer is the same as the bucket buffer, then we already
	 * hold a lock on the bucket page.
863 | */ |
864 | if (buf != bucket_buf) |
865 | { |
866 | _hash_relbuf(rel, buf); |
867 | LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); |
868 | } |
869 | |
870 | /* |
	 * Clear the garbage flag from the bucket after deleting the tuples that
	 * were moved by a split.  We purposely clear the flag before squeezing
	 * the bucket, so that after a restart vacuum won't again try to delete
	 * the moved-by-split tuples.
875 | */ |
876 | if (split_cleanup) |
877 | { |
878 | HashPageOpaque bucket_opaque; |
879 | Page page; |
880 | |
881 | page = BufferGetPage(bucket_buf); |
882 | bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); |
883 | |
884 | /* No ereport(ERROR) until changes are logged */ |
885 | START_CRIT_SECTION(); |
886 | |
887 | bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; |
888 | MarkBufferDirty(bucket_buf); |
889 | |
890 | /* XLOG stuff */ |
891 | if (RelationNeedsWAL(rel)) |
892 | { |
893 | XLogRecPtr recptr; |
894 | |
895 | XLogBeginInsert(); |
896 | XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); |
897 | |
898 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); |
899 | PageSetLSN(page, recptr); |
900 | } |
901 | |
902 | END_CRIT_SECTION(); |
903 | } |
904 | |
905 | /* |
	 * If we have deleted anything, try to compact free space.  To squeeze the
	 * bucket we must hold a cleanup lock; otherwise the squeeze could disturb
	 * the ordering of tuples seen by a scan that started before it.
909 | */ |
910 | if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) |
911 | _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, |
912 | bstrategy); |
913 | else |
914 | LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); |
915 | } |
916 | |