/*-------------------------------------------------------------------------
 *
 * hashutil.c
 *	  Utility code for Postgres hash implementation.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashutil.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "storage/buf_internals.h"

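/*
 * Compute the bucket that a split of old_bucket populates: OR in the bit
 * just above lowmask, i.e. the most significant bit of the table half that
 * the split is creating.
 */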
#define CALC_NEW_BUCKET(old_bucket, lowmask) \
			old_bucket | (lowmask + 1)

/*
 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
 */
bool
_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
{
	/*
	 * Currently, we can't check any of the scan conditions since we do not
	 * have the original index entry value to supply to the sk_func. Always
	 * return true; we expect that hashgettuple already set the recheck flag
	 * to make the main indexscan code do it.
	 */
#ifdef NOT_USED
	TupleDesc	tupdesc = RelationGetDescr(scan->indexRelation);
	ScanKey		key = scan->keyData;
	int			scanKeySize = scan->numberOfKeys;

	while (scanKeySize > 0)
	{
		Datum		datum;
		bool		isNull;
		Datum		test;

		datum = index_getattr(itup,
							  key->sk_attno,
							  tupdesc,
							  &isNull);

		/* assume sk_func is strict */
		if (isNull)
			return false;
		if (key->sk_flags & SK_ISNULL)
			return false;

		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
								 datum, key->sk_argument);

		if (!DatumGetBool(test))
			return false;

		key++;
		scanKeySize--;
	}
#endif

	return true;
}

/*
 * _hash_datum2hashkey -- given a Datum, call the index's hash function
 *
 * The Datum is assumed to be of the index's column type, so we can use the
 * "primary" hash function that's tracked for us by the generic index code.
 */
uint32
_hash_datum2hashkey(Relation rel, Datum key)
{
	FmgrInfo   *procinfo;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC);
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
}

/*
 * _hash_datum2hashkey_type -- given a Datum of a specified type,
 *			hash it in a fashion compatible with this index
 *
 * This is much more expensive than _hash_datum2hashkey, so use it only in
 * cross-type situations.
 */
uint32
_hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
{
	RegProcedure hash_proc;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
								  keytype,
								  keytype,
								  HASHSTANDARD_PROC);
	if (!RegProcedureIsValid(hash_proc))
		elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
			 HASHSTANDARD_PROC, keytype, keytype,
			 RelationGetRelationName(rel));
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
}

/*
 * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
 */
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
					 uint32 highmask, uint32 lowmask)
{
	Bucket		bucket;

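	/*
	 * highmask spans the doubled table that splits are currently filling.
	 * If the resulting bucket number has not been created yet (it exceeds
	 * maxbucket), the key still belongs in the not-yet-split bucket in the
	 * lower half, so mask with lowmask instead.
	 */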
	bucket = hashkey & highmask;
	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}

/*
 * _hash_log2 -- returns ceil(lg2(num))
 */
uint32
_hash_log2(uint32 num)
{
	uint32		i,
				limit;

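	/* count how many doublings of limit (a power of two) are needed to reach num */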
	limit = 1;
	for (i = 0; limit < num; limit <<= 1, i++)
		;
	return i;
}

/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the
 *					   bucket
 */
uint32
_hash_spareindex(uint32 num_bucket)
{
	uint32		splitpoint_group;
	uint32		splitpoint_phases;

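	/*
	 * Buckets are allocated in splitpoint groups, each of which doubles the
	 * table size.  The first few groups are allocated in a single phase;
	 * larger groups are carved into several equal-sized phases, and the
	 * spare index advances once per phase.
	 */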
	splitpoint_group = _hash_log2(num_bucket);

	if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return splitpoint_group;

	/* account for single-phase groups */
	splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

	/* account for multi-phase groups before splitpoint_group */
	splitpoint_phases +=
		((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for phases within current group */
	splitpoint_phases +=
		(((num_bucket - 1) >>
		  (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
		 HASH_SPLITPOINT_PHASE_MASK);	/* to 0-based value. */

	return splitpoint_phases;
}

/*
 *	_hash_get_totalbuckets -- returns total number of buckets allocated till
 *							the given splitpoint phase.
 */
uint32
_hash_get_totalbuckets(uint32 splitpoint_phase)
{
	uint32		splitpoint_group;
	uint32		total_buckets;
	uint32		phases_within_splitpoint_group;

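	/* single-phase splitpoints allocate their entire group (a power of two) at once */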
	if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return (1 << splitpoint_phase);

	/* get splitpoint's group */
	splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
	splitpoint_group +=
		((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for buckets before splitpoint_group */
	total_buckets = (1 << (splitpoint_group - 1));

	/* account for buckets within splitpoint_group */
	phases_within_splitpoint_group =
		(((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
		  HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
	total_buckets +=
		(((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
		 phases_within_splitpoint_group);

	return total_buckets;
}

/*
 * _hash_checkpage -- sanity checks on the format of all hash pages
 *
 * If flags is not zero, it is a bitwise OR of the acceptable page types
 * (values of hasho_flag & LH_PAGE_TYPE).
 */
void
_hash_checkpage(Relation rel, Buffer buf, int flags)
{
	Page		page = BufferGetPage(buf);

	/*
	 * ReadBuffer verifies that every newly-read page passes
	 * PageHeaderIsValid, which means it either contains a reasonably sane
	 * page header or is all-zero.  We have to defend against the all-zero
	 * case, however.
	 */
	if (PageIsNew(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains unexpected zero page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	/*
	 * Additionally check that the special area looks sane.
	 */
	if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData)))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains corrupted page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	if (flags)
	{
		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		if ((opaque->hasho_flag & flags) == 0)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" contains corrupted page at block %u",
							RelationGetRelationName(rel),
							BufferGetBlockNumber(buf)),
					 errhint("Please REINDEX it.")));
	}

	/*
	 * When checking the metapage, also verify magic number and version.
	 */
	if (flags == LH_META_PAGE)
	{
		HashMetaPage metap = HashPageGetMeta(page);

		if (metap->hashm_magic != HASH_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" is not a hash index",
							RelationGetRelationName(rel))));

		if (metap->hashm_version != HASH_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has wrong hash version",
							RelationGetRelationName(rel)),
					 errhint("Please REINDEX it.")));
	}
}

bytea *
hashoptions(Datum reloptions, bool validate)
{
	return default_reloptions(reloptions, validate, RELOPT_KIND_HASH);
}

/*
 * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
 */
uint32
_hash_get_indextuple_hashkey(IndexTuple itup)
{
	char	   *attp;

	/*
	 * We assume the hash key is the first attribute and can't be null, so
	 * this can be done crudely but very cheaply ...
	 */
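	/* the stored key is the uint32 hash value itself, at the start of the tuple's data area */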
	attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
	return *((uint32 *) attp);
}

/*
 * _hash_convert_tuple - convert raw index data to hash key
 *
 * Inputs: values and isnull arrays for the user data column(s)
 * Outputs: values and isnull arrays for the index tuple, suitable for
 *		passing to index_form_tuple().
 *
 * Returns true if successful, false if not (because there are null values).
 * On a false result, the given data need not be indexed.
 *
 * Note: callers know that the index-column arrays are always of length 1.
 * In principle, there could be more than one input column, though we do not
 * currently support that.
 */
bool
_hash_convert_tuple(Relation index,
					Datum *user_values, bool *user_isnull,
					Datum *index_values, bool *index_isnull)
{
	uint32		hashkey;

	/*
	 * We do not insert null values into hash indexes.  This is okay because
	 * the only supported search operator is '=', and we assume it is strict.
	 */
	if (user_isnull[0])
		return false;

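	/* store the 32-bit hash of the user datum as the index tuple's only column */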
	hashkey = _hash_datum2hashkey(index, user_values[0]);
	index_values[0] = UInt32GetDatum(hashkey);
	index_isnull[0] = false;
	return true;
}

/*
 * _hash_binsearch - Return the offset number in the page where the
 *					 specified hash value should be sought or inserted.
 *
 * We use binary search, relying on the assumption that the existing entries
 * are ordered by hash key.
 *
 * Returns the offset of the first index entry having hashkey >= hash_value,
 * or the page's max offset plus one if hash_value is greater than all
 * existing hash keys in the page.  This is the appropriate place to start
 * a search, or to insert a new item.
 */
OffsetNumber
_hash_binsearch(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page) + 1;
	lower = FirstOffsetNumber;

	while (upper > lower)
	{
		OffsetNumber off;
		IndexTuple	itup;
		uint32		hashkey;

		off = (upper + lower) / 2;
		Assert(OffsetNumberIsValid(off));
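		/* off rounds down, so lower <= off < upper; each branch strictly narrows the range */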

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey < hash_value)
			lower = off + 1;
		else
			upper = off;
	}

	return lower;
}

/*
 * _hash_binsearch_last
 *
 * Same as above, except that if there are multiple matching items in the
 * page, we return the offset of the last one instead of the first one,
 * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
 * This is handy for starting a new page in a backwards scan.
 */
OffsetNumber
_hash_binsearch_last(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page);
	lower = FirstOffsetNumber - 1;

	while (upper > lower)
	{
		IndexTuple	itup;
		OffsetNumber off;
		uint32		hashkey;

		off = (upper + lower + 1) / 2;
		Assert(OffsetNumberIsValid(off));
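		/* off rounds up, so lower < off <= upper; each branch strictly narrows the range */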

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey > hash_value)
			upper = off - 1;
		else
			lower = off;
	}

	return lower;
}

/*
 *	_hash_get_oldblock_from_newbucket() -- get the block number of a bucket
 *			from which current (new) bucket is being split.
 */
BlockNumber
_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
	Bucket		old_bucket;
	uint32		mask;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	/*
	 * To get the old bucket from the current bucket, we need a mask to modulo
	 * into lower half of table.  This mask is stored in meta page as
	 * hashm_lowmask, but here we can't rely on the same, because we need a
	 * value of lowmask that was prevalent at the time when bucket split was
	 * started.  Masking the most significant bit of new bucket would give us
	 * old bucket.
	 */
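	/* fls() returns the 1-based position of the most significant set bit */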
	mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
	old_bucket = new_bucket & mask;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	blkno = BUCKET_TO_BLKNO(metap, old_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 *	_hash_get_newblock_from_oldbucket() -- get the block number of a bucket
 *			that will be generated after split from old bucket.
 *
 * This is used to find the new bucket from old bucket based on current table
 * half.  It is mainly required to finish the incomplete splits where we are
 * sure that not more than one bucket could have split in progress from old
 * bucket.
 */
BlockNumber
_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
	Bucket		new_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
													metap->hashm_lowmask,
													metap->hashm_maxbucket);
	blkno = BUCKET_TO_BLKNO(metap, new_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 *	_hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
 *			generated after split from current (old) bucket.
 *
 * This is used to find the new bucket from old bucket.  New bucket can be
 * obtained by OR'ing old bucket with most significant bit of current table
 * half (lowmask passed in this function can be used to identify msb of
 * current table half).  There could be multiple buckets that could have
 * been split from current bucket.  We need the first such bucket that exists.
 * Caller must ensure that no more than one split has happened from old
 * bucket.
 */
Bucket
_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
								   uint32 lowmask, uint32 maxbucket)
{
	Bucket		new_bucket;

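	/*
	 * Try the split target in the current table half first; if that bucket
	 * does not exist yet, the split in question must have used the previous
	 * (smaller) table half, so retry with the corresponding lowmask.
	 */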
	new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	if (new_bucket > maxbucket)
	{
		lowmask = lowmask >> 1;
		new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	}

	return new_bucket;
}

/*
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * The caller does not have a lock on the page and may or may not have the
 * page pinned in a buffer.  Note that read-lock is sufficient for setting
 * LP_DEAD status (which is only a hint).
 *
 * The caller must have pin on bucket buffer, but may or may not have pin
 * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.
 *
 * There are never any scans active in a bucket at the time VACUUM begins,
 * because VACUUM takes a cleanup lock on the primary bucket page and scans
 * hold a pin.  A scan can begin after VACUUM leaves the primary bucket page
 * but before it finishes the entire bucket, but it can never pass VACUUM,
 * because VACUUM always locks the next page before releasing the lock on
 * the previous one.  Therefore, we don't have to worry about accidentally
 * killing a TID that has been reused for an unrelated tuple.
 */
void
_hash_kill_items(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	BlockNumber blkno;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber offnum,
				maxoff;
	int			numKilled = so->numKilled;
	int			i;
	bool		killedsomething = false;
	bool		havePin = false;

	Assert(so->numKilled > 0);
	Assert(so->killedItems != NULL);
	Assert(HashScanPosIsValid(so->currPos));

	/*
	 * Always reset the scan state, so we don't look for same items on other
	 * pages.
	 */
	so->numKilled = 0;

	blkno = so->currPos.currPage;
	if (HashScanPosIsPinned(so->currPos))
	{
		/*
		 * We already have pin on this buffer, so, all we need to do is
		 * acquire lock on it.
		 */
		havePin = true;
		buf = so->currPos.buf;
		LockBuffer(buf, BUFFER_LOCK_SHARE);
	}
	else
		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);

	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	for (i = 0; i < numKilled; i++)
	{
		int			itemIndex = so->killedItems[i];
		HashScanPosItem *currItem = &so->currPos.items[itemIndex];

		offnum = currItem->indexOffset;

		Assert(itemIndex >= so->currPos.firstItem &&
			   itemIndex <= so->currPos.lastItem);

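		/*
		 * The tuple may no longer be at exactly the remembered offset if
		 * other items have since been added to the page, so scan forward
		 * from there and confirm the match by heap TID.
		 */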
		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, mark as dirty hint. Whenever
	 * we mark anything LP_DEAD, we also set the page's
	 * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
		MarkBufferDirtyHint(buf, true);
	}

	if (so->hashso_bucket_buf == so->currPos.buf ||
		havePin)
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, buf);
}