/*-------------------------------------------------------------------------
 *
 * hashutil.c
 *    Utility code for Postgres hash implementation.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hashutil.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "storage/buf_internals.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"

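/*
 * Each splitpoint doubles the table: the new bucket produced by splitting
 * old_bucket is old_bucket with the next-higher bit set.  For example, with
 * lowmask = 0x3 (a table half of 4 buckets), splitting bucket 1 (binary 001)
 * yields bucket 5 (binary 101).
 */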
#define CALC_NEW_BUCKET(old_bucket, lowmask) \
    old_bucket | (lowmask + 1)

/*
 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
 */
bool
_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
{
    /*
     * Currently, we can't check any of the scan conditions since we do not
     * have the original index entry value to supply to the sk_func.  Always
     * return true; we expect that hashgettuple already set the recheck flag
     * to make the main indexscan code do it.
     */
#ifdef NOT_USED
    TupleDesc   tupdesc = RelationGetDescr(scan->indexRelation);
    ScanKey     key = scan->keyData;
    int         scanKeySize = scan->numberOfKeys;

    while (scanKeySize > 0)
    {
        Datum       datum;
        bool        isNull;
        Datum       test;

        datum = index_getattr(itup,
                              key->sk_attno,
                              tupdesc,
                              &isNull);

        /* assume sk_func is strict */
        if (isNull)
            return false;
        if (key->sk_flags & SK_ISNULL)
            return false;

        test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
                                 datum, key->sk_argument);

        if (!DatumGetBool(test))
            return false;

        key++;
        scanKeySize--;
    }
#endif

    return true;
}

/*
 * _hash_datum2hashkey -- given a Datum, call the index's hash function
 *
 * The Datum is assumed to be of the index's column type, so we can use the
 * "primary" hash function that's tracked for us by the generic index code.
 */
uint32
_hash_datum2hashkey(Relation rel, Datum key)
{
    FmgrInfo   *procinfo;
    Oid         collation;

    /* XXX assumes index has only one attribute */
    procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC);
    collation = rel->rd_indcollation[0];

    return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
}

/*
 * _hash_datum2hashkey_type -- given a Datum of a specified type,
 *            hash it in a fashion compatible with this index
 *
 * This is much more expensive than _hash_datum2hashkey, so use it only in
 * cross-type situations.
 */
uint32
_hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
{
    RegProcedure hash_proc;
    Oid         collation;

    /* XXX assumes index has only one attribute */
    hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
                                  keytype,
                                  keytype,
                                  HASHSTANDARD_PROC);
    if (!RegProcedureIsValid(hash_proc))
        elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
             HASHSTANDARD_PROC, keytype, keytype,
             RelationGetRelationName(rel));
    collation = rel->rd_indcollation[0];

    return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
}

/*
 * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
 */
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                     uint32 highmask, uint32 lowmask)
{
    Bucket      bucket;

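    /*
     * Mask the hash key down to the current table size (highmask).  If that
     * picks a bucket that hasn't been created yet in this doubling cycle
     * (> maxbucket), the key still belongs to its pre-split image, so mask
     * down into the lower half of the table instead.
     */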
    bucket = hashkey & highmask;
    if (bucket > maxbucket)
        bucket = bucket & lowmask;

    return bucket;
}

/*
 * _hash_log2 -- returns ceil(lg2(num))
 */
uint32
_hash_log2(uint32 num)
{
    uint32      i,
                limit;

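    /* e.g., num = 1000 gives 10, since 2^10 = 1024 is the first power of two >= 1000 */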
    limit = 1;
    for (i = 0; limit < num; limit <<= 1, i++)
        ;
    return i;
}

/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the
 *                     bucket
 */
uint32
_hash_spareindex(uint32 num_bucket)
{
    uint32      splitpoint_group;
    uint32      splitpoint_phases;

    splitpoint_group = _hash_log2(num_bucket);

    if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
        return splitpoint_group;

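    /*
     * Groups beyond HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE are subdivided
     * into 2^HASH_SPLITPOINT_PHASE_BITS phases, so that large doublings
     * allocate buckets a phase at a time rather than all at once.
     */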
    /* account for single-phase groups */
    splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

    /* account for multi-phase groups before splitpoint_group */
    splitpoint_phases +=
        ((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
         HASH_SPLITPOINT_PHASE_BITS);

    /* account for phases within current group */
    splitpoint_phases +=
        (((num_bucket - 1) >>
          (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
         HASH_SPLITPOINT_PHASE_MASK);   /* to 0-based value. */

    return splitpoint_phases;
}

/*
 * _hash_get_totalbuckets -- returns the total number of buckets allocated
 *                up to and including the given splitpoint phase.
 */
uint32
_hash_get_totalbuckets(uint32 splitpoint_phase)
{
    uint32      splitpoint_group;
    uint32      total_buckets;
    uint32      phases_within_splitpoint_group;

    if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
        return (1 << splitpoint_phase);

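    /*
     * Worked example, with HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10 and
     * HASH_SPLITPOINT_PHASE_BITS = 2: phase 11 falls in group 10, whose
     * four phases each add (2^9 / 4) = 128 buckets; two of those phases
     * are included, so the result is 2^9 + 2 * 128 = 768.
     */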
    /* get splitpoint's group */
    splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
    splitpoint_group +=
        ((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
         HASH_SPLITPOINT_PHASE_BITS);

    /* account for buckets before splitpoint_group */
    total_buckets = (1 << (splitpoint_group - 1));

    /* account for buckets within splitpoint_group */
    phases_within_splitpoint_group =
        (((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
          HASH_SPLITPOINT_PHASE_MASK) + 1);     /* from 0-based to 1-based */
    total_buckets +=
        (((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
         phases_within_splitpoint_group);

    return total_buckets;
}

/*
 * _hash_checkpage -- sanity checks on the format of all hash pages
 *
 * If flags is not zero, it is a bitwise OR of the acceptable page types
 * (values of hasho_flag & LH_PAGE_TYPE).
 */
void
_hash_checkpage(Relation rel, Buffer buf, int flags)
{
    Page        page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    if (flags)
    {
        HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        if ((opaque->hasho_flag & flags) == 0)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("index \"%s\" contains corrupted page at block %u",
                            RelationGetRelationName(rel),
                            BufferGetBlockNumber(buf)),
                     errhint("Please REINDEX it.")));
    }

    /*
     * When checking the metapage, also verify magic number and version.
     */
    if (flags == LH_META_PAGE)
    {
        HashMetaPage metap = HashPageGetMeta(page);

        if (metap->hashm_magic != HASH_MAGIC)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("index \"%s\" is not a hash index",
                            RelationGetRelationName(rel))));

        if (metap->hashm_version != HASH_VERSION)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("index \"%s\" has wrong hash version",
                            RelationGetRelationName(rel)),
                     errhint("Please REINDEX it.")));
    }
}

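/*
 * hashoptions -- parse and validate relation options for a hash index
 */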
bytea *
hashoptions(Datum reloptions, bool validate)
{
    return default_reloptions(reloptions, validate, RELOPT_KIND_HASH);
}

/*
 * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
 */
uint32
_hash_get_indextuple_hashkey(IndexTuple itup)
{
    char       *attp;

    /*
     * We assume the hash key is the first attribute and can't be null, so
     * this can be done crudely but very cheaply ...
     */
    attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
    return *((uint32 *) attp);
}

/*
 * _hash_convert_tuple - convert raw index data to hash key
 *
 * Inputs: values and isnull arrays for the user data column(s)
 * Outputs: values and isnull arrays for the index tuple, suitable for
 * passing to index_form_tuple().
 *
 * Returns true if successful, false if not (because there are null values).
 * On a false result, the given data need not be indexed.
 *
 * Note: callers know that the index-column arrays are always of length 1.
 * In principle, there could be more than one input column, though we do not
 * currently support that.
 */
bool
_hash_convert_tuple(Relation index,
                    Datum *user_values, bool *user_isnull,
                    Datum *index_values, bool *index_isnull)
{
    uint32      hashkey;

    /*
     * We do not insert null values into hash indexes.  This is okay because
     * the only supported search operator is '=', and we assume it is strict.
     */
    if (user_isnull[0])
        return false;

    hashkey = _hash_datum2hashkey(index, user_values[0]);
    index_values[0] = UInt32GetDatum(hashkey);
    index_isnull[0] = false;
    return true;
}

/*
 * _hash_binsearch - Return the offset number in the page where the
 *                   specified hash value should be sought or inserted.
 *
 * We use binary search, relying on the assumption that the existing entries
 * are ordered by hash key.
 *
 * Returns the offset of the first index entry having hashkey >= hash_value,
 * or the page's max offset plus one if hash_value is greater than all
 * existing hash keys in the page.  This is the appropriate place to start
 * a search, or to insert a new item.
 */
OffsetNumber
_hash_binsearch(Page page, uint32 hash_value)
{
    OffsetNumber upper;
    OffsetNumber lower;

    /* Loop invariant: lower <= desired place <= upper */
    upper = PageGetMaxOffsetNumber(page) + 1;
    lower = FirstOffsetNumber;

    while (upper > lower)
    {
        OffsetNumber off;
        IndexTuple  itup;
        uint32      hashkey;

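        /* OffsetNumbers are uint16, so this sum cannot overflow */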
        off = (upper + lower) / 2;
        Assert(OffsetNumberIsValid(off));

        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
        hashkey = _hash_get_indextuple_hashkey(itup);
        if (hashkey < hash_value)
            lower = off + 1;
        else
            upper = off;
    }

    return lower;
}

/*
 * _hash_binsearch_last
 *
 * Same as above, except that if there are multiple matching items in the
 * page, we return the offset of the last one instead of the first one,
 * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
 * This is handy for starting a new page in a backwards scan.
 */
OffsetNumber
_hash_binsearch_last(Page page, uint32 hash_value)
{
    OffsetNumber upper;
    OffsetNumber lower;

    /* Loop invariant: lower <= desired place <= upper */
    upper = PageGetMaxOffsetNumber(page);
    lower = FirstOffsetNumber - 1;

    while (upper > lower)
    {
        IndexTuple  itup;
        OffsetNumber off;
        uint32      hashkey;

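        /* round the midpoint up, so the loop terminates when upper == lower + 1 */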
        off = (upper + lower + 1) / 2;
        Assert(OffsetNumberIsValid(off));

        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
        hashkey = _hash_get_indextuple_hashkey(itup);
        if (hashkey > hash_value)
            upper = off - 1;
        else
            lower = off;
    }

    return lower;
}

/*
 * _hash_get_oldblock_from_newbucket() -- get the block number of the bucket
 *            from which the current (new) bucket is being split.
 */
BlockNumber
_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
    Bucket      old_bucket;
    uint32      mask;
    Buffer      metabuf;
    HashMetaPage metap;
    BlockNumber blkno;

    /*
     * To get the old bucket from the new bucket, we need a mask that maps
     * the new bucket into the lower half of the table.  Such a mask is
     * stored in the metapage as hashm_lowmask, but we can't rely on it
     * here, because we need the value of lowmask that was in effect when
     * this bucket's split started.  Masking off the most significant bit
     * of the new bucket gives us the old bucket.
     */
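    /* e.g., new_bucket 5 (binary 101): fls() = 3, mask = 011, old_bucket = 1 */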
    mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
    old_bucket = new_bucket & mask;

    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    blkno = BUCKET_TO_BLKNO(metap, old_bucket);

    _hash_relbuf(rel, metabuf);

    return blkno;
}

/*
 * _hash_get_newblock_from_oldbucket() -- get the block number of the bucket
 *            that will be generated after a split of the old bucket.
 *
 * This is used to find the new bucket from the old bucket based on the
 * current table half.  It is mainly needed to finish incomplete splits,
 * where we can be sure that no more than one split from the old bucket
 * can be in progress.
 */
BlockNumber
_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
    Bucket      new_bucket;
    Buffer      metabuf;
    HashMetaPage metap;
    BlockNumber blkno;

    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
                                                    metap->hashm_lowmask,
                                                    metap->hashm_maxbucket);
    blkno = BUCKET_TO_BLKNO(metap, new_bucket);

    _hash_relbuf(rel, metabuf);

    return blkno;
}

/*
 * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
 *            generated after a split of the current (old) bucket.
 *
 * The new bucket is obtained by OR'ing the old bucket with the most
 * significant bit of the current table half (the lowmask passed to this
 * function identifies that bit).  Several buckets could have been split
 * from the current bucket over time; we need the first such bucket that
 * actually exists.  The caller must ensure that no more than one split
 * has happened from the old bucket.
 */
Bucket
_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
                                   uint32 lowmask, uint32 maxbucket)
{
    Bucket      new_bucket;

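    /*
     * If the bucket computed for the current table half doesn't exist yet
     * (> maxbucket), the split we're looking for must have happened in the
     * previous table half, so retry with the previous lowmask.
     */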
    new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
    if (new_bucket > maxbucket)
    {
        lowmask = lowmask >> 1;
        new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
    }

    return new_bucket;
}

/*
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * The caller does not have a lock on the page and may or may not have the
 * page pinned in a buffer.  Note that read-lock is sufficient for setting
 * LP_DEAD status (which is only a hint).
 *
 * The caller must have pin on bucket buffer, but may or may not have pin
 * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.
 *
 * There are never any scans active in a bucket at the time VACUUM begins,
 * because VACUUM takes a cleanup lock on the primary bucket page and scans
 * hold a pin.  A scan can begin after VACUUM leaves the primary bucket page
 * but before it finishes the entire bucket, but it can never pass VACUUM,
 * because VACUUM always locks the next page before releasing the lock on
 * the previous one.  Therefore, we don't have to worry about accidentally
 * killing a TID that has been reused for an unrelated tuple.
 */
void
_hash_kill_items(IndexScanDesc scan)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;
    BlockNumber blkno;
    Buffer      buf;
    Page        page;
    HashPageOpaque opaque;
    OffsetNumber offnum,
                maxoff;
    int         numKilled = so->numKilled;
    int         i;
    bool        killedsomething = false;
    bool        havePin = false;

    Assert(so->numKilled > 0);
    Assert(so->killedItems != NULL);
    Assert(HashScanPosIsValid(so->currPos));

    /*
     * Always reset the scan state, so we don't look for the same items on
     * other pages.
     */
    so->numKilled = 0;

    blkno = so->currPos.currPage;
    if (HashScanPosIsPinned(so->currPos))
    {
        /*
         * We already have pin on this buffer, so, all we need to do is
         * acquire lock on it.
         */
        havePin = true;
        buf = so->currPos.buf;
        LockBuffer(buf, BUFFER_LOCK_SHARE);
    }
    else
        buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);

    page = BufferGetPage(buf);
    opaque = (HashPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    for (i = 0; i < numKilled; i++)
    {
        int         itemIndex = so->killedItems[i];
        HashScanPosItem *currItem = &so->currPos.items[itemIndex];

        offnum = currItem->indexOffset;

        Assert(itemIndex >= so->currPos.firstItem &&
               itemIndex <= so->currPos.lastItem);

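        /*
         * The item may no longer be at its remembered offset if the page
         * contents have shifted, so search forward and match on heap TID.
         * Failing to find it is harmless, since LP_DEAD is only a hint.
         */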
        while (offnum <= maxoff)
        {
            ItemId      iid = PageGetItemId(page, offnum);
            IndexTuple  ituple = (IndexTuple) PageGetItem(page, iid);

            if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid))
            {
                /* found the item */
                ItemIdMarkDead(iid);
                killedsomething = true;
                break;          /* out of inner search loop */
            }
            offnum = OffsetNumberNext(offnum);
        }
    }

    /*
     * Since this can be redone later if needed, mark as dirty hint. Whenever
     * we mark anything LP_DEAD, we also set the page's
     * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
     */
    if (killedsomething)
    {
        opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
        MarkBufferDirtyHint(buf, true);
    }

    if (so->hashso_bucket_buf == so->currPos.buf ||
        havePin)
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
    else
        _hash_relbuf(rel, buf);
}