/*-------------------------------------------------------------------------
 *
 * hash.c
 *    Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hash.c
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/plancat.h"
#include "pgstat.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"


/* Working state for hashbuild and its callback */
typedef struct
{
    HSpool     *spool;          /* NULL if not using spooling */
    double      indtuples;      /* # tuples accepted into index */
    Relation    heapRel;        /* heap relation descriptor */
} HashBuildState;

static void hashbuildCallback(Relation index,
                              HeapTuple htup,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);


/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = HTMaxStrategyNumber;
    amroutine->amsupport = HASHNProcs;
    amroutine->amcanorder = false;
    amroutine->amcanorderbyop = false;
    amroutine->amcanbackward = true;
    amroutine->amcanunique = false;
    amroutine->amcanmulticol = false;
    amroutine->amoptionalkey = false;
    amroutine->amsearcharray = false;
    amroutine->amsearchnulls = false;
    amroutine->amstorage = false;
    amroutine->amclusterable = false;
    amroutine->ampredlocks = true;
    amroutine->amcanparallel = false;
    amroutine->amcaninclude = false;
    amroutine->amkeytype = INT4OID;

    amroutine->ambuild = hashbuild;
    amroutine->ambuildempty = hashbuildempty;
    amroutine->aminsert = hashinsert;
    amroutine->ambulkdelete = hashbulkdelete;
    amroutine->amvacuumcleanup = hashvacuumcleanup;
    amroutine->amcanreturn = NULL;
    amroutine->amcostestimate = hashcostestimate;
    amroutine->amoptions = hashoptions;
    amroutine->amproperty = NULL;
    amroutine->ambuildphasename = NULL;
    amroutine->amvalidate = hashvalidate;
    amroutine->ambeginscan = hashbeginscan;
    amroutine->amrescan = hashrescan;
    amroutine->amgettuple = hashgettuple;
    amroutine->amgetbitmap = hashgetbitmap;
    amroutine->amendscan = hashendscan;
    amroutine->ammarkpos = NULL;
    amroutine->amrestrpos = NULL;
    amroutine->amestimateparallelscan = NULL;
    amroutine->aminitparallelscan = NULL;
    amroutine->amparallelrescan = NULL;

    PG_RETURN_POINTER(amroutine);
}
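
/*
 * For context: this handler is what gets invoked when an index is created
 * with this access method, e.g. (illustrative SQL, not part of this file):
 *
 *    CREATE INDEX test_hash_idx ON test_table USING hash (some_column);
 *
 * The IndexAmRoutine returned above tells the core system which of the
 * routines below to call for that index.
 */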

/*
 * hashbuild() -- build a new hash index.
 */
IndexBuildResult *
hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    BlockNumber relpages;
    double      reltuples;
    double      allvisfrac;
    uint32      num_buckets;
    long        sort_threshold;
    HashBuildState buildstate;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* Estimate the number of rows currently present in the table */
    estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

    /* Initialize the hash index metadata page and initial buckets */
    num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);

    /*
     * If we just insert the tuples into the index in scan order, then
     * (assuming their hash codes are pretty random) there will be no locality
     * of access to the index, and if the index is bigger than available RAM
     * then we'll thrash horribly.  To prevent that scenario, we can sort the
     * tuples by (expected) bucket number.  However, such a sort is useless
     * overhead when the index does fit in RAM.  We choose to sort if the
     * initial index size exceeds maintenance_work_mem, or the number of
     * buffers usable for the index, whichever is less.  (Limiting by the
     * number of buffers should reduce thrashing between PG buffers and kernel
     * buffers, which seems useful even if no physical I/O results.  Limiting
     * by maintenance_work_mem is useful to allow easy testing of the sort
     * code path, and may be useful to DBAs as an additional control knob.)
     *
     * NOTE: this test will need adjustment if a bucket is ever different from
     * one page.  Also, "initial index size" accounting does not include the
     * metapage, nor the first bitmap page.
     */
    sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
    if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
        sort_threshold = Min(sort_threshold, NBuffers);
    else
        sort_threshold = Min(sort_threshold, NLocBuffer);
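
    /*
     * Worked example (illustrative numbers only): with the default
     * maintenance_work_mem of 64MB (65536 kB) and BLCKSZ of 8192,
     * sort_threshold starts at 65536 * 1024 / 8192 = 8192 pages.  With the
     * default shared_buffers of 128MB, NBuffers is 16384, so the Min() above
     * leaves the threshold at 8192: an index starting with 8192 or more
     * buckets would take the sorted build path below.
     */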

    if (num_buckets >= (uint32) sort_threshold)
        buildstate.spool = _h_spoolinit(heap, index, num_buckets);
    else
        buildstate.spool = NULL;

    /* prepare to build the index */
    buildstate.indtuples = 0;
    buildstate.heapRel = heap;

    /* do the heap scan */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       hashbuildCallback,
                                       (void *) &buildstate, NULL);
    pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
                                 buildstate.indtuples);

    if (buildstate.spool)
    {
        /* sort the tuples and insert them into the index */
        _h_indexbuild(buildstate.spool, buildstate.heapRel);
        _h_spooldestroy(buildstate.spool);
    }

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    return result;
}

/*
 * hashbuildempty() -- build an empty hash index in the initialization fork
 *
 * (The init fork is what gets copied back over the main fork when an
 * unlogged index is reset after a crash.)
 */
void
hashbuildempty(Relation index)
{
    _hash_init(index, 0, INIT_FORKNUM);
}

/*
 * Per-tuple callback for table_index_build_scan
 */
static void
hashbuildCallback(Relation index,
                  HeapTuple htup,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    HashBuildState *buildstate = (HashBuildState *) state;
    Datum       index_values[1];
    bool        index_isnull[1];
    IndexTuple  itup;

    /*
     * Convert data to a hash key; on failure, do not insert anything.  (The
     * only failure case is a NULL key value, which hash indexes do not
     * store.)
     */
    if (!_hash_convert_tuple(index,
                             values, isnull,
                             index_values, index_isnull))
        return;

    /* Either spool the tuple for sorting, or just put it into the index */
    if (buildstate->spool)
        _h_spool(buildstate->spool, &htup->t_self,
                 index_values, index_isnull);
    else
    {
        /* form an index tuple and point it at the heap tuple */
        itup = index_form_tuple(RelationGetDescr(index),
                                index_values, index_isnull);
        itup->t_tid = htup->t_self;
        _hash_doinsert(index, itup, buildstate->heapRel);
        pfree(itup);
    }

    buildstate->indtuples += 1;
}

/*
 * hashinsert() -- insert an index tuple into a hash table.
 *
 * Hash on the heap tuple's key, form an index tuple with the hash code.
 * Find the appropriate location for the new tuple, and put it there.
 */
bool
hashinsert(Relation rel, Datum *values, bool *isnull,
           ItemPointer ht_ctid, Relation heapRel,
           IndexUniqueCheck checkUnique,
           IndexInfo *indexInfo)
{
    Datum       index_values[1];
    bool        index_isnull[1];
    IndexTuple  itup;

    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(rel,
                             values, isnull,
                             index_values, index_isnull))
        return false;

    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
    itup->t_tid = *ht_ctid;

    _hash_doinsert(rel, itup, heapRel);

    pfree(itup);

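    /*
     * aminsert's boolean result is consulted only for deferred uniqueness
     * checks (UNIQUE_CHECK_PARTIAL), and hash indexes never support
     * uniqueness (amcanunique is false in hashhandler), so the value
     * returned here is never examined; we simply return false.
     */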
    return false;
}


/*
 * hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;

    /* Hash indexes are always lossy since we store only the hash code */
    scan->xs_recheck = true;

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction.  If we haven't done so yet, we call a routine to
     * get the first item in the scan.
     */
    if (!HashScanPosIsValid(so->currPos))
        res = _hash_first(scan, dir);
    else
    {
        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, so remember it for later.  (We'll deal with all such
             * tuples at once right after leaving the index page or at end
             * of scan.)  If the caller reverses the indexscan direction, it
             * is quite possible that the same item might get entered
             * multiple times.  But, we don't detect that; instead, we just
             * forget any excess entries.
             */
            if (so->killedItems == NULL)
                so->killedItems = (int *)
                    palloc(MaxIndexTuplesPerPage * sizeof(int));

            if (so->numKilled < MaxIndexTuplesPerPage)
                so->killedItems[so->numKilled++] = so->currPos.itemIndex;
        }

        /*
         * Now continue the scan.
         */
        res = _hash_next(scan, dir);
    }

    return res;
}


/*
 * hashgetbitmap() -- get all tuples at once
 */
int64
hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;
    int64       ntids = 0;
    HashScanPosItem *currItem;

    res = _hash_first(scan, ForwardScanDirection);

    while (res)
    {
        currItem = &so->currPos.items[so->currPos.itemIndex];

        /*
         * _hash_first and _hash_next already eliminate dead index entries
         * whenever scan->ignore_killed_tuples is true.  Therefore, there's
         * nothing to do here except add the results to the TIDBitmap; the
         * "recheck" flag must be true because the index stores only hash
         * codes, not the indexed values themselves.
         */
        tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
        ntids++;

        res = _hash_next(scan, ForwardScanDirection);
    }

    return ntids;
}


/*
 * hashbeginscan() -- start a scan on a hash index
 */
IndexScanDesc
hashbeginscan(Relation rel, int nkeys, int norderbys)
{
    IndexScanDesc scan;
    HashScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
    HashScanPosInvalidate(so->currPos);
    so->hashso_bucket_buf = InvalidBuffer;
    so->hashso_split_bucket_buf = InvalidBuffer;

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;

    so->killedItems = NULL;
    so->numKilled = 0;

    scan->opaque = so;

    return scan;
}

/*
 * hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
           ScanKey orderbys, int norderbys)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    if (HashScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _hash_kill_items(scan);
    }

    _hash_dropscanbuf(rel, so);

    /* set position invalid (this will cause _hash_first call) */
    HashScanPosInvalidate(so->currPos);

    /* Update scan key, if a new one is given */
    if (scankey && scan->numberOfKeys > 0)
    {
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
    }

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;
}

/*
 * hashendscan() -- close down a scan
 */
void
hashendscan(IndexScanDesc scan)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    if (HashScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _hash_kill_items(scan);
    }

    _hash_dropscanbuf(rel, so);

    if (so->killedItems != NULL)
        pfree(so->killedItems);
    pfree(so);
    scan->opaque = NULL;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes tuples that were moved to another bucket by a
 * split.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    rel = info->index;
    double      tuples_removed;
    double      num_index_tuples;
    double      orig_ntuples;
    Bucket      orig_maxbucket;
    Bucket      cur_maxbucket;
    Bucket      cur_bucket;
    Buffer      metabuf = InvalidBuffer;
    HashMetaPage metap;
    HashMetaPage cachedmetap;

    tuples_removed = 0;
    num_index_tuples = 0;

    /*
     * We need a copy of the metapage so that we can use its hashm_spares[]
     * values to compute bucket page addresses, but a cached copy should be
     * good enough.  (If not, we'll detect that further down and refresh the
     * cache as necessary.)
     */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
    Assert(cachedmetap != NULL);

    orig_maxbucket = cachedmetap->hashm_maxbucket;
    orig_ntuples = cachedmetap->hashm_ntuples;

    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

loop_top:
    while (cur_bucket <= cur_maxbucket)
    {
        BlockNumber bucket_blkno;
        BlockNumber blkno;
        Buffer      bucket_buf;
        Buffer      buf;
        HashPageOpaque bucket_opaque;
        Page        page;
        bool        split_cleanup = false;

        /*
         * Get address of bucket's start page.  (BUCKET_TO_BLKNO uses the
         * cached hashm_spares[] values to account for the overflow pages
         * allocated before this bucket's split point.)
         */
        bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

        blkno = bucket_blkno;

        /*
         * We need to acquire a cleanup lock on the primary bucket page to
         * wait out concurrent scans before deleting the dead tuples.
         */
        buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
        LockBufferForCleanup(buf);
        _hash_checkpage(rel, buf, LH_BUCKET_PAGE);

        page = BufferGetPage(buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /*
         * If the bucket contains tuples that have already been moved to
         * another bucket by a split, we need to delete those leftover
         * copies.  We can't delete them while the split of this bucket is
         * still in progress, though, because concurrent scans still need
         * them.
         */
        if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
            H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
        {
            split_cleanup = true;

            /*
             * This bucket might have been split since we last held a lock on
             * the metapage.  If so, hashm_maxbucket, hashm_highmask and
             * hashm_lowmask might be old enough to cause us to fail to remove
             * tuples left behind by the most recent split.  To prevent that,
             * now that the primary page of the target bucket has been locked
             * (and thus can't be further split), check whether we need to
             * update our cached metapage data.
             */
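            /*
             * (Primary bucket pages repurpose hasho_prevblkno: rather than a
             * real block number, it holds the metapage's hashm_maxbucket
             * value as of the last time this bucket was split, which is what
             * makes the staleness comparison below possible.)
             */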
            Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
            if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
            {
                cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
                Assert(cachedmetap != NULL);
            }
        }

        bucket_buf = buf;

        hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
                          cachedmetap->hashm_maxbucket,
                          cachedmetap->hashm_highmask,
                          cachedmetap->hashm_lowmask, &tuples_removed,
                          &num_index_tuples, split_cleanup,
                          callback, callback_state);

        _hash_dropbuf(rel, bucket_buf);

        /* Advance to next bucket */
        cur_bucket++;
    }

    if (BufferIsInvalid(metabuf))
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

    /* Write-lock metapage and check for split since we started */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
        Assert(cachedmetap != NULL);
        cur_maxbucket = cachedmetap->hashm_maxbucket;
        goto loop_top;
    }

    /* Okay, we're really done.  Update tuple count in metapage. */
    START_CRIT_SECTION();

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
         * (Note: we still return estimated_count = false, because using this
         * count is better than not updating reltuples at all.)
         */
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_update_meta_page xlrec;
        XLogRecPtr  recptr;

        xlrec.ntuples = metap->hashm_ntuples;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);

        XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    _hash_relbuf(rel, metabuf);

    /* return statistics */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;
    /* hashvacuumcleanup will fill in num_pages */

    return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    Relation    rel = info->index;
    BlockNumber num_pages;

    /* If hashbulkdelete wasn't called, return NULL signifying no change */
    /* Note: this covers the analyze_only case too */
    if (stats == NULL)
        return NULL;

    /* update statistics */
    num_pages = RelationGetNumberOfBlocks(rel);
    stats->num_pages = num_pages;

    return stats;
}

/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be.  If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
{
    BlockNumber blkno;
    Buffer      buf;
    Bucket      new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool        bucket_dirty = false;

    blkno = bucket_blkno;
    buf = bucket_buf;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                                                        lowmask, maxbucket);

    /* Scan each page in bucket */
    for (;;)
    {
        HashPageOpaque opaque;
        OffsetNumber offno;
        OffsetNumber maxoffno;
        Buffer      next_buf;
        Page        page;
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable = 0;
        bool        retain_pin = false;
        bool        clear_dead_marking = false;

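        /* check for interrupts and honor any cost-based vacuum delay */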
        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
             offno <= maxoffno;
             offno = OffsetNumberNext(offno))
        {
            ItemPointer htup;
            IndexTuple  itup;
            Bucket      bucket;
            bool        kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To identify the dead tuples, we rely strictly on the results
             * of the callback function; see btvacuumpage for the detailed
             * reasoning.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that were moved by a split */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask,
                                              lowmask);
                /* if the tuple now maps to another bucket, it was moved */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to either belong to the current
                     * bucket or new_bucket.  This is ensured because we
                     * don't allow further splits from a bucket that contains
                     * garbage.  See comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        retain_pin = (blkno == bucket_blkno);

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            /* No ereport(ERROR) until changes are logged */
            START_CRIT_SECTION();

            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;

            /*
             * If vacuum removed any DEAD tuples from the page, mark it clean
             * again by clearing the LH_PAGE_HAS_DEAD_TUPLES flag.
             */
            if (tuples_removed && *tuples_removed > 0 &&
                H_HAS_DEAD_TUPLES(opaque))
            {
                opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
                clear_dead_marking = true;
            }

            MarkBufferDirty(buf);

            /* XLOG stuff */
            if (RelationNeedsWAL(rel))
            {
                xl_hash_delete xlrec;
                XLogRecPtr  recptr;

                xlrec.clear_dead_marking = clear_dead_marking;
                xlrec.is_primary_bucket_page = (buf == bucket_buf);

                XLogBeginInsert();
                XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

                /*
                 * The bucket buffer must be registered to ensure that we can
                 * acquire a cleanup lock on it during replay.
                 */
                if (!xlrec.is_primary_bucket_page)
                    XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

                XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
                XLogRegisterBufData(1, (char *) deletable,
                                    ndeletable * sizeof(OffsetNumber));

                recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
                PageSetLSN(BufferGetPage(buf), recptr);
            }

            END_CRIT_SECTION();
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);

        /*
         * release the lock on previous page after acquiring the lock on next
         * page
         */
        if (retain_pin)
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;
    }

    /*
     * Lock the bucket page to clear the garbage flag and squeeze the bucket.
     * If the current buffer is the same as the bucket buffer, then we
     * already have a lock on the bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * Clear the garbage flag from the bucket after deleting the tuples that
     * were moved by a split.  We purposely clear the flag before squeezing
     * the bucket, so that after a restart, vacuum won't again try to delete
     * the moved-by-split tuples.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;
        Page        page;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
        MarkBufferDirty(bucket_buf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
            PageSetLSN(page, recptr);
        }

        END_CRIT_SECTION();
    }

    /*
     * If we have deleted anything, try to compact free space.  To squeeze
     * the bucket we must hold a cleanup lock; otherwise we could change the
     * order in which tuples are returned to a concurrent scan that started
     * before us.
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}