/*-------------------------------------------------------------------------
 *
 * hash.h
 *	  header file for postgres hash access method implementation
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/hash.h
 *
 * NOTES
 *		modeled after Margo Seltzer's hash implementation for unix.
 *
 *-------------------------------------------------------------------------
 */
#ifndef HASH_H
#define HASH_H

#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "fmgr.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/lockdefs.h"
#include "utils/hashutils.h"
#include "utils/hsearch.h"
#include "utils/relcache.h"

/*
 * Mapping from hash bucket number to physical block number of bucket's
 * starting page.  Beware of multiple evaluations of argument!
 */
typedef uint32 Bucket;

#define InvalidBucket	((Bucket) 0xFFFFFFFF)

#define BUCKET_TO_BLKNO(metap,B) \
	((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)
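
/*
 * Worked example (illustrative, not part of the original macro definition):
 * block 0 is always the metapage, so bucket 0 maps to block 1.  For B > 0 we
 * also skip the overflow pages allocated before B's splitpoint.  Assuming
 * (metap)->hashm_spares[_hash_spareindex(3+1)-1] == 7, then
 *
 *		BUCKET_TO_BLKNO(metap, 3) == (3 + 7) + 1 == 11
 *
 * Because B is evaluated more than once, do not pass an expression with
 * side effects, e.g. BUCKET_TO_BLKNO(metap, bucket++).
 */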

/*
 * Special space for hash index pages.
 *
 * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
 * Additional bits in the flag word are used for more transient purposes.
 *
 * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
 * However, we ensure that each used page type has a distinct bit so that
 * we can OR together page types for uses such as the allowable-page-types
 * argument of _hash_checkpage().
 */
#define LH_UNUSED_PAGE			(0)
#define LH_OVERFLOW_PAGE		(1 << 0)
#define LH_BUCKET_PAGE			(1 << 1)
#define LH_BITMAP_PAGE			(1 << 2)
#define LH_META_PAGE			(1 << 3)
#define LH_BUCKET_BEING_POPULATED	(1 << 4)
#define LH_BUCKET_BEING_SPLIT	(1 << 5)
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
#define LH_PAGE_HAS_DEAD_TUPLES	(1 << 7)

#define LH_PAGE_TYPE \
	(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
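
/*
 * Example (illustrative): to check whether a page is an overflow page while
 * ignoring the transient flag bits:
 *
 *		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 *
 *		if ((opaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE)
 *			...
 *
 * Callers of _hash_checkpage() can OR page types together, e.g.
 * LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, to accept either kind of page.
 */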

/*
 * In an overflow page, hasho_prevblkno stores the block number of the previous
 * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
 * hashm_maxbucket value as of the last time the bucket was split, or else as
 * of the time the bucket was created.  The latter convention is used to
 * determine whether a cached copy of the metapage is too stale to be used
 * without needing to lock or pin the metapage.
 *
 * hasho_nextblkno is always the block number of the next page in the
 * bucket chain, or InvalidBlockNumber if there are no more such pages.
 */
typedef struct HashPageOpaqueData
{
	BlockNumber hasho_prevblkno;	/* see above */
	BlockNumber hasho_nextblkno;	/* see above */
	Bucket		hasho_bucket;	/* bucket number this pg belongs to */
	uint16		hasho_flag;		/* page type code + flag bits, see above */
	uint16		hasho_page_id;	/* for identification of hash indexes */
} HashPageOpaqueData;

typedef HashPageOpaqueData *HashPageOpaque;

#define H_NEEDS_SPLIT_CLEANUP(opaque)	(((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
#define H_BUCKET_BEING_SPLIT(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
#define H_BUCKET_BEING_POPULATED(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
#define H_HAS_DEAD_TUPLES(opaque)	(((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)

/*
 * The page ID is for the convenience of pg_filedump and similar utilities,
 * which otherwise would have a hard time telling pages of different index
 * types apart.  It should be the last 2 bytes on the page.  This is more or
 * less "free" due to alignment considerations.
 */
#define HASHO_PAGE_ID	0xFF80

typedef struct HashScanPosItem	/* what we remember about each match */
{
	ItemPointerData heapTid;	/* TID of referenced heap item */
	OffsetNumber indexOffset;	/* index item's location within page */
} HashScanPosItem;

typedef struct HashScanPosData
{
	Buffer		buf;			/* if valid, the buffer is pinned */
	BlockNumber currPage;		/* current hash index page */
	BlockNumber nextPage;		/* next overflow page */
	BlockNumber prevPage;		/* prev overflow or bucket page */

	/*
	 * The items array is always ordered in index order (ie, increasing
	 * indexoffset).  When scanning backwards it is convenient to fill the
	 * array back-to-front, so we start at the last slot and fill downwards.
	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
	 * itemIndex is a cursor showing which entry was last returned to caller.
	 */
	int			firstItem;		/* first valid index in items[] */
	int			lastItem;		/* last valid index in items[] */
	int			itemIndex;		/* current index in items[] */

	HashScanPosItem items[MaxIndexTuplesPerPage];	/* MUST BE LAST */
} HashScanPosData;

#define HashScanPosIsPinned(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BufferIsValid((scanpos).buf) \
)

#define HashScanPosIsValid(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BlockNumberIsValid((scanpos).currPage) \
)

#define HashScanPosInvalidate(scanpos) \
	do { \
		(scanpos).buf = InvalidBuffer; \
		(scanpos).currPage = InvalidBlockNumber; \
		(scanpos).nextPage = InvalidBlockNumber; \
		(scanpos).prevPage = InvalidBlockNumber; \
		(scanpos).firstItem = 0; \
		(scanpos).lastItem = 0; \
		(scanpos).itemIndex = 0; \
	} while (0)
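
/*
 * Usage sketch (illustrative): the do { } while (0) wrapper lets the macro
 * behave as a single statement, with the caller supplying the semicolon, so
 * it composes safely even in unbraced if/else:
 *
 *		if (!HashScanPosIsValid(so->currPos))
 *			HashScanPosInvalidate(so->currPos);
 *		else
 *			...
 */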

/*
 * HashScanOpaqueData is private state for a hash index scan.
 */
typedef struct HashScanOpaqueData
{
	/* Hash value of the scan key, ie, the hash key we seek */
	uint32		hashso_sk_hash;

	/* remember the buffer associated with primary bucket */
	Buffer		hashso_bucket_buf;

	/*
	 * Remember the buffer associated with the primary bucket page of the
	 * bucket being split; it is needed while scanning the bucket that is
	 * being populated by the split operation.
	 */
	Buffer		hashso_split_bucket_buf;

	/* Whether scan starts on bucket being populated due to split */
	bool		hashso_buc_populated;

	/*
	 * Whether we are scanning the bucket being split.  This field is
	 * consulted only when hashso_buc_populated is true.
	 */
	bool		hashso_buc_split;

	/* info about killed items if any (killedItems is NULL if never used) */
	int		   *killedItems;	/* currPos.items indexes of killed items */
	int			numKilled;		/* number of currently stored items */

	/*
	 * Identify all the matching items on a page and save them in
	 * HashScanPosData
	 */
	HashScanPosData currPos;	/* current position data */
} HashScanOpaqueData;

typedef HashScanOpaqueData *HashScanOpaque;

/*
 * Definitions for metapage.
 */

#define HASH_METAPAGE	0		/* metapage is always block 0 */

#define HASH_MAGIC		0x6440640
#define HASH_VERSION	4

/*
 * spares[] holds the number of overflow pages currently allocated at or
 * before a certain splitpoint.  For example, if spares[3] = 7 then there are
 * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro).  The
 * value in spares[ovflpoint] increases as overflow pages are added at the
 * end of the index.  Once ovflpoint increases (ie, we have actually allocated
 * the bucket pages belonging to that splitpoint) the number of spares at the
 * prior splitpoint cannot change anymore.
 *
 * ovflpages that have been recycled for reuse can be found by looking at
 * bitmaps that are stored within ovflpages dedicated for the purpose.
 * The blknos of these bitmap pages are kept in mapp[]; nmaps is the
 * number of currently existing bitmaps.
 *
 * The limitation on the size of spares[] comes from the fact that there's
 * no point in having more than 2^32 buckets with only uint32 hashcodes.
 * (Note: HASH_MAX_SPLITPOINTS, the size of spares[], is sized to accommodate
 * the multi-phase allocation of buckets used for splitpoint groups after
 * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE.)
 *
 * There is no particular upper limit on the size of mapp[], other than
 * needing to fit into the metapage.  (With 8K block size, 1024 bitmaps
 * limit us to 256 GB of overflow space...).  For smaller block sizes we
 * cannot use 1024 bitmaps, since that would make the metapage data cross
 * the block-size boundary; hence we use BLCKSZ to determine the maximum
 * number of bitmaps.
 */
#define HASH_MAX_BITMAPS	Min(BLCKSZ / 8, 1024)
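
/*
 * Worked arithmetic (illustrative, assuming the default BLCKSZ of 8192): the
 * bitmap array on a bitmap page rounds down to the largest power of 2 that
 * fits, 4096 bytes = 2^15 bits, so each bitmap page tracks 32768 overflow
 * pages.  HASH_MAX_BITMAPS is then Min(8192 / 8, 1024) = 1024, giving
 * 1024 * 32768 * 8 KB = 256 GB of addressable overflow space, which is the
 * figure quoted above.
 */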

#define HASH_SPLITPOINT_PHASE_BITS	2
#define HASH_SPLITPOINT_PHASES_PER_GRP	(1 << HASH_SPLITPOINT_PHASE_BITS)
#define HASH_SPLITPOINT_PHASE_MASK	(HASH_SPLITPOINT_PHASES_PER_GRP - 1)
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE	10

/* defines max number of splitpoint phases a hash index can have */
#define HASH_MAX_SPLITPOINT_GROUP	32
#define HASH_MAX_SPLITPOINTS \
	(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
	  HASH_SPLITPOINT_PHASES_PER_GRP) + \
	 HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
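
/*
 * Illustrative arithmetic: the first 10 splitpoint groups are allocated in a
 * single phase and the remaining 32 - 10 = 22 groups in 4 phases each, so
 * HASH_MAX_SPLITPOINTS = (22 * 4) + 10 = 98 entries in hashm_spares[].
 */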

typedef struct HashMetaPageData
{
	uint32		hashm_magic;	/* magic no. for hash tables */
	uint32		hashm_version;	/* version ID */
	double		hashm_ntuples;	/* number of tuples stored in the table */
	uint16		hashm_ffactor;	/* target fill factor (tuples/bucket) */
	uint16		hashm_bsize;	/* index page size (bytes) */
	uint16		hashm_bmsize;	/* bitmap array size (bytes) - must be a
								 * power of 2 */
	uint16		hashm_bmshift;	/* log2(bitmap array size in BITS) */
	uint32		hashm_maxbucket;	/* ID of maximum bucket in use */
	uint32		hashm_highmask; /* mask to modulo into entire table */
	uint32		hashm_lowmask;	/* mask to modulo into lower half of table */
	uint32		hashm_ovflpoint;	/* splitpoint from which ovflpgs being
									 * allocated */
	uint32		hashm_firstfree;	/* lowest-number free ovflpage (bit#) */
	uint32		hashm_nmaps;	/* number of bitmap pages */
	RegProcedure hashm_procid;	/* hash function id from pg_proc */
	uint32		hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before
													 * each splitpoint */
	BlockNumber hashm_mapp[HASH_MAX_BITMAPS];	/* blknos of ovfl bitmaps */
} HashMetaPageData;

typedef HashMetaPageData *HashMetaPage;

/*
 * Maximum size of a hash index item (it's okay to have only one per page)
 */
#define HashMaxItemSize(page) \
	MAXALIGN_DOWN(PageGetPageSize(page) - \
				  SizeOfPageHeaderData - \
				  sizeof(ItemIdData) - \
				  MAXALIGN(sizeof(HashPageOpaqueData)))
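
/*
 * Worked arithmetic (illustrative, assuming 8K pages and MAXIMUM_ALIGNOF 8):
 * 8192 - 24 (page header) - 4 (one line pointer) - 16 (special space)
 * = 8148, rounded down to a MAXALIGN boundary = 8144 bytes per item.
 */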

#define INDEX_MOVED_BY_SPLIT_MASK	INDEX_AM_RESERVED_BIT

#define HASH_MIN_FILLFACTOR		10
#define HASH_DEFAULT_FILLFACTOR	75

/*
 * Constants
 */
#define BYTE_TO_BIT		3		/* 2^3 bits/byte */
#define ALL_SET			((uint32) ~0)

/*
 * Bitmap pages do not contain tuples.  They do contain the standard
 * page headers and trailers; however, everything in between is a
 * giant bit array.  The number of bits that fit on a page obviously
 * depends on the page size and the header/trailer overhead.  We require
 * the number of bits per page to be a power of 2.
 */
#define BMPGSZ_BYTE(metap)	((metap)->hashm_bmsize)
#define BMPGSZ_BIT(metap)	((metap)->hashm_bmsize << BYTE_TO_BIT)
#define BMPG_SHIFT(metap)	((metap)->hashm_bmshift)
#define BMPG_MASK(metap)	(BMPGSZ_BIT(metap) - 1)
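
/*
 * Example (illustrative): to locate overflow-page bit number "bitno" among
 * the bitmap pages, split it into a bitmap-page index and a bit offset:
 *
 *		uint32	bitmappage = bitno >> BMPG_SHIFT(metap);
 *		uint32	bitmapbit = bitno & BMPG_MASK(metap);
 *
 * metap->hashm_mapp[bitmappage] then gives the block number of the bitmap
 * page to inspect.
 */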

#define HashPageGetBitmap(page) \
	((uint32 *) PageGetContents(page))

#define HashGetMaxBitmapSize(page) \
	(PageGetPageSize((Page) page) - \
	 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))

#define HashPageGetMeta(page) \
	((HashMetaPage) PageGetContents(page))

/*
 * The number of bits in an ovflpage bitmap word.
 */
#define BITS_PER_MAP	32		/* Number of bits in uint32 */

/* Given the address of the beginning of a bit map, clear/set the nth bit */
#define CLRBIT(A, N)	((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
#define SETBIT(A, N)	((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
#define ISSET(A, N)		((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
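
/*
 * Example (illustrative): a set bit marks the corresponding overflow page as
 * in use; a clear bit marks it free for recycling.  Given a bitmap page that
 * is pinned and locked:
 *
 *		uint32 *freep = HashPageGetBitmap(mappage);
 *
 *		SETBIT(freep, bitmapbit);		mark overflow page in use
 *		CLRBIT(freep, bitmapbit);		mark overflow page free
 *
 * Like BUCKET_TO_BLKNO, these macros evaluate their arguments more than
 * once, so avoid side effects in the arguments.
 */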

/*
 * page-level and high-level locking modes (see README)
 */
#define HASH_READ		BUFFER_LOCK_SHARE
#define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE
#define HASH_NOLOCK		(-1)

/*
 * When a new operator class is declared, we require that the user supply
 * us with an amproc function for hashing a key of the new type, returning
 * a 32-bit hash value.  We call this the "standard" hash function.  We
 * also allow an optional "extended" hash function which accepts a salt and
 * returns a 64-bit hash value.  This is highly recommended but, for reasons
 * of backward compatibility, optional.
 *
 * When the salt is 0, the low 32 bits of the value returned by the extended
 * hash function should match the value that would have been returned by the
 * standard hash function.
 */
#define HASHSTANDARD_PROC	1
#define HASHEXTENDED_PROC	2
#define HASHNProcs			2
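
/*
 * Illustrative invariant (a sketch, using the built-in hash_any() family
 * from utils/hashutils.h): with a zero salt the two functions must agree on
 * the low 32 bits, i.e.
 *
 *		DatumGetUInt32(hash_any(key, len)) ==
 *			(uint32) DatumGetUInt64(hash_any_extended(key, len, 0))
 *
 * Opclass authors must preserve the same property between their
 * HASHSTANDARD_PROC and HASHEXTENDED_PROC support functions.
 */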


/* public routines */

extern IndexBuildResult *hashbuild(Relation heap, Relation index,
								   struct IndexInfo *indexInfo);
extern void hashbuildempty(Relation index);
extern bool hashinsert(Relation rel, Datum *values, bool *isnull,
					   ItemPointer ht_ctid, Relation heapRel,
					   IndexUniqueCheck checkUnique,
					   struct IndexInfo *indexInfo);
extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys);
extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
					   ScanKey orderbys, int norderbys);
extern void hashendscan(IndexScanDesc scan);
extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info,
											 IndexBulkDeleteResult *stats,
											 IndexBulkDeleteCallback callback,
											 void *callback_state);
extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info,
												IndexBulkDeleteResult *stats);
extern bytea *hashoptions(Datum reloptions, bool validate);
extern bool hashvalidate(Oid opclassoid);

/* private routines */

/* hashinsert.c */
extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
								   Size itemsize, IndexTuple itup);
extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
								OffsetNumber *itup_offsets, uint16 nitups);

/* hashovfl.c */
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
									  Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
									  Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
extern void _hash_squeezebucket(Relation rel,
								Bucket bucket, BlockNumber bucket_blkno,
								Buffer bucket_buf,
								BufferAccessStrategy bstrategy);
extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno);

/* hashpage.c */
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
						   int access, int flags);
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
												 BlockNumber blkno, int flags);
extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
										 bool force_refresh);
extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
											  int access,
											  HashMetaPage *cachedmetap);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket,
						  uint32 flag, bool initpage);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
							  ForkNumber forkNum);
extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
										 int access, int flags,
										 BufferAccessStrategy bstrategy);
extern void _hash_relbuf(Relation rel, Buffer buf);
extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
extern uint32 _hash_init(Relation rel, double num_tuples,
						 ForkNumber forkNum);
extern void _hash_init_metabuffer(Buffer buf, double num_tuples,
								  RegProcedure procid, uint16 ffactor, bool initpage);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
							   Bucket obucket, uint32 maxbucket, uint32 highmask,
							   uint32 lowmask);

/* hashsearch.c */
extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);

/* hashsort.c */
typedef struct HSpool HSpool;	/* opaque struct in hashsort.c */

extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
extern void _h_spooldestroy(HSpool *hspool);
extern void _h_spool(HSpool *hspool, ItemPointer self,
					 Datum *values, bool *isnull);
extern void _h_indexbuild(HSpool *hspool, Relation heapRel);

/* hashutil.c */
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
								   uint32 highmask, uint32 lowmask);
extern uint32 _hash_log2(uint32 num);
extern uint32 _hash_spareindex(uint32 num_bucket);
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern bool _hash_convert_tuple(Relation index,
								Datum *user_values, bool *user_isnull,
								Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
												 uint32 lowmask, uint32 maxbucket);
extern void _hash_kill_items(IndexScanDesc scan);

/* hash.c */
extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
							  Buffer bucket_buf, BlockNumber bucket_blkno,
							  BufferAccessStrategy bstrategy,
							  uint32 maxbucket, uint32 highmask, uint32 lowmask,
							  double *tuples_removed, double *num_index_tuples,
							  bool split_cleanup,
							  IndexBulkDeleteCallback callback, void *callback_state);

#endif							/* HASH_H */