/*-------------------------------------------------------------------------
 *
 * dynahash.c
 *    dynamic hash tables
 *
 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
 * shared memory. For shared hash tables, it is the caller's responsibility
 * to provide appropriate access interlocking. The simplest convention is
 * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
 * hash_seq_search) need only shared lock, but any update requires exclusive
 * lock. For heavily-used shared tables, the single-lock approach creates a
 * concurrency bottleneck, so we also support "partitioned" locking wherein
 * there are multiple LWLocks guarding distinct subsets of the table. To use
 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
 * to hash_create. This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
 * (A partitioned table uses multiple copies of those fields, guarded by
 * spinlocks, for additional concurrency.)
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition. We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
 * of the way calc_bucket() maps hash values to bucket numbers.
 *
 * For hash tables in shared memory, the memory allocator function should
 * match malloc's semantics of returning NULL on failure. For hash tables
 * in local memory, we typically use palloc() which will throw error on
 * failure. The code in this file has to cope with both cases.
 *
 * dynahash.c provides support for these types of lookup keys:
 *
 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
 * compared as though by strcmp(). This is the default behavior.
 *
 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
 * (Caller must ensure there are no undefined padding bits in the keys!)
 * This is selected by specifying HASH_BLOBS flag to hash_create.
 *
 * 3. More complex key behavior can be selected by specifying user-supplied
 * hashing, comparison, and/or key-copying functions. At least a hashing
 * function must be supplied; comparison defaults to memcmp() and key copying
 * to memcpy() when a user-defined hashing function is selected.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/utils/hash/dynahash.c
 *
 *-------------------------------------------------------------------------
 */
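
/*
 * As a concrete illustration of the partitioned-locking convention above,
 * here is a minimal sketch of how a caller might serialize access to one
 * partition. (Illustrative only: NUM_MY_PARTITIONS and my_partition_locks
 * are hypothetical caller-side objects, not part of dynahash.)
 *
 *     uint32 hashcode = get_hash_value(htab, &key);
 *     int    partition = hashcode % NUM_MY_PARTITIONS;  -- a power of 2, so
 *                                                          this keeps just the
 *                                                          low-order bits
 *
 *     LWLockAcquire(my_partition_locks[partition], LW_EXCLUSIVE);
 *     entry = hash_search_with_hash_value(htab, &key, hashcode,
 *                                         HASH_ENTER, &found);
 *     ... fill in entry ...
 *     LWLockRelease(my_partition_locks[partition]);
 */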

/*
 * Original comments:
 *
 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
 * Coded into C, with minor code improvements, and with hsearch(3) interface,
 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
 * also, hcreate/hdestroy routines added to simulate hsearch(3).
 *
 * These routines simulate hsearch(3) and family, with the important
 * difference that the hash table is dynamic - can grow indefinitely
 * beyond its original size (as supplied to hcreate()).
 *
 * Performance appears to be comparable to that of hsearch(3).
 * The 'source-code' options referred to in hsearch(3)'s 'man' page
 * are not implemented; otherwise functionality is identical.
 *
 * Compilation controls:
 * HASH_DEBUG controls some informative traces, mainly for debugging.
 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
 * when combined with HASH_DEBUG, these are displayed by hdestroy().
 *
 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
 * concatenation property, in probably unnecessary code 'optimization'.
 *
 * Modified margo@postgres.berkeley.edu February 1990
 *     added multiple table interface
 * Modified by sullivan@postgres.berkeley.edu April 1990
 *     changed ctl structure for shared memory
 */

#include "postgres.h"

#include <limits.h>

#include "access/xact.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"


/*
 * Constants
 *
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers. The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address. The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!). For non-shared hash
 * tables, the initial directory size can be left at the default.
 */
#define DEF_SEGSIZE         256
#define DEF_SEGSIZE_SHIFT   8   /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE         256
#define DEF_FFACTOR         1   /* default fill factor */

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS 32

/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;

/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking. Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
    slock_t     mutex;          /* spinlock for this freelist */
    long        nentries;       /* number of entries in associated buckets */
    HASHELEMENT *freeList;      /* chain of free elements */
} FreeListData;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct. For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
    /*
     * The freelist can become a point of contention in high-concurrency hash
     * tables, so we use an array of freelists, each with its own mutex and
     * nentries count, instead of just a single one. Although the freelists
     * normally operate independently, we will scavenge entries from freelists
     * other than a hashcode's default freelist when necessary.
     *
     * If the hash table is not partitioned, only freeList[0] is used and its
     * spinlock is not used at all; callers' locking is assumed sufficient.
     */
    FreeListData freeList[NUM_FREELISTS];

    /* These fields can change, but not in a partitioned table */
    /* Also, dsize can't change in a shared table, even if unpartitioned */
    long        dsize;          /* directory size */
    long        nsegs;          /* number of allocated segments (<= dsize) */
    uint32      max_bucket;     /* ID of maximum bucket in use */
    uint32      high_mask;      /* mask to modulo into entire table */
    uint32      low_mask;       /* mask to modulo into lower half of table */

    /* These fields are fixed at hashtable creation */
    Size        keysize;        /* hash key length in bytes */
    Size        entrysize;      /* total user element size in bytes */
    long        num_partitions; /* # partitions (must be power of 2), or 0 */
    long        ffactor;        /* target fill factor */
    long        max_dsize;      /* 'dsize' limit if directory is fixed size */
    long        ssize;          /* segment size --- must be power of 2 */
    int         sshift;         /* segment shift = log2(ssize) */
    int         nelem_alloc;    /* number of entries to allocate at once */

#ifdef HASH_STATISTICS

    /*
     * Count statistics here. NB: stats code doesn't bother with mutex, so
     * counts could be corrupted a bit in a partitioned table.
     */
    long        accesses;
    long        collisions;
#endif
};

#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

#define FREELIST_IDX(hctl, hashcode) \
    (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)

/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 */
struct HTAB
{
    HASHHDR    *hctl;           /* => shared control information */
    HASHSEGMENT *dir;           /* directory of segment starts */
    HashValueFunc hash;         /* hash function */
    HashCompareFunc match;      /* key comparison function */
    HashCopyFunc keycopy;       /* key copying function */
    HashAllocFunc alloc;        /* memory allocator */
    MemoryContext hcxt;         /* memory context if default allocator used */
    char       *tabname;        /* table name (for error messages) */
    bool        isshared;       /* true if table is in shared memory */
    bool        isfixed;        /* if true, don't enlarge */

    /* freezing a shared table isn't allowed, so we can keep state here */
    bool        frozen;         /* true = no more inserts allowed */

    /* We keep local copies of these fixed values to reduce contention */
    Size        keysize;        /* hash key length in bytes */
    long        ssize;          /* segment size --- must be power of 2 */
    int         sshift;         /* segment shift = log2(ssize) */
};

/*
 * Key (also entry) part of a HASHELEMENT
 */
#define ELEMENTKEY(helem)  (((char *) (helem)) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Obtain element pointer given pointer to key
 */
#define ELEMENT_FROM_KEY(key) \
    ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))

/*
 * Fast MOD arithmetic, assuming that y is a power of 2 !
 */
#define MOD(x,y)  ((x) & ((y)-1))
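
/*
 * For instance, MOD(13, 8) == (13 & 7) == 5, matching 13 % 8 --- which holds
 * only because 8 is a power of 2.
 */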

#ifdef HASH_STATISTICS
static long hash_accesses,
            hash_collisions,
            hash_expansions;
#endif

/*
 * Private function prototypes
 */
static void *DynaHashAlloc(Size size);
static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
static void hdefault(HTAB *hashp);
static int  choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
static void hash_corrupted(HTAB *hashp);
static long next_pow2_long(long num);
static int  next_pow2_int(long num);
static void register_seq_scan(HTAB *hashp);
static void deregister_seq_scan(HTAB *hashp);
static bool has_seq_scans(HTAB *hashp);


/*
 * memory allocation support
 */
static MemoryContext CurrentDynaHashCxt = NULL;

static void *
DynaHashAlloc(Size size)
{
    Assert(MemoryContextIsValid(CurrentDynaHashCxt));
    return MemoryContextAlloc(CurrentDynaHashCxt, size);
}


/*
 * HashCompareFunc for string keys
 *
 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 * bytes, so we can only compare that many ... hence strncmp is almost but
 * not quite the right thing.
 */
static int
string_compare(const char *key1, const char *key2, Size keysize)
{
    return strncmp(key1, key2, keysize - 1);
}


/************************** CREATE ROUTINES **********************/

/*
 * hash_create -- create a new dynamic hash table
 *
 * tabname: a name for the table (for debugging purposes)
 * nelem: maximum number of elements expected
 * *info: additional table parameters, as indicated by flags
 * flags: bitmask indicating which parameters to take from *info
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly. But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded. An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
HTAB *
hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
{
    HTAB       *hashp;
    HASHHDR    *hctl;

    /*
     * For shared hash tables, we have a local hash header (HTAB struct) that
     * we allocate in TopMemoryContext; all else is in shared memory.
     *
     * For non-shared hash tables, everything including the hash header is in
     * a memory context created specially for the hash table --- this makes
     * hash_destroy very simple. The memory context is made a child of either
     * a context specified by the caller, or TopMemoryContext if nothing is
     * specified.
     */
    if (flags & HASH_SHARED_MEM)
    {
        /* Set up to allocate the hash header */
        CurrentDynaHashCxt = TopMemoryContext;
    }
    else
    {
        /* Create the hash table's private memory context */
        if (flags & HASH_CONTEXT)
            CurrentDynaHashCxt = info->hcxt;
        else
            CurrentDynaHashCxt = TopMemoryContext;
        CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
                                                   "dynahash",
                                                   ALLOCSET_DEFAULT_SIZES);
    }

    /* Initialize the hash header, plus a copy of the table name */
    hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
    MemSet(hashp, 0, sizeof(HTAB));

    hashp->tabname = (char *) (hashp + 1);
    strcpy(hashp->tabname, tabname);

    /* If we have a private context, label it with hashtable's name */
    if (!(flags & HASH_SHARED_MEM))
        MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);

    /*
     * Select the appropriate hash function (see comments at head of file).
     */
    if (flags & HASH_FUNCTION)
        hashp->hash = info->hash;
    else if (flags & HASH_BLOBS)
    {
        /* We can optimize hashing for common key sizes */
        Assert(flags & HASH_ELEM);
        if (info->keysize == sizeof(uint32))
            hashp->hash = uint32_hash;
        else
            hashp->hash = tag_hash;
    }
    else
        hashp->hash = string_hash;  /* default hash function */

    /*
     * If you don't specify a match function, it defaults to string_compare if
     * you used string_hash (either explicitly or by default) and to memcmp
     * otherwise.
     *
     * Note: explicitly specifying string_hash is deprecated, because this
     * might not work for callers in loadable modules on some platforms due to
     * referencing a trampoline instead of the string_hash function proper.
     * Just let it default, eh?
     */
    if (flags & HASH_COMPARE)
        hashp->match = info->match;
    else if (hashp->hash == string_hash)
        hashp->match = (HashCompareFunc) string_compare;
    else
        hashp->match = memcmp;

    /*
     * Similarly, the key-copying function defaults to strlcpy or memcpy.
     */
    if (flags & HASH_KEYCOPY)
        hashp->keycopy = info->keycopy;
    else if (hashp->hash == string_hash)
        hashp->keycopy = (HashCopyFunc) strlcpy;
    else
        hashp->keycopy = memcpy;

    /* And select the entry allocation function, too. */
    if (flags & HASH_ALLOC)
        hashp->alloc = info->alloc;
    else
        hashp->alloc = DynaHashAlloc;

    if (flags & HASH_SHARED_MEM)
    {
        /*
         * ctl structure and directory are preallocated for shared memory
         * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
         * well.
         */
        hashp->hctl = info->hctl;
        hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
        hashp->hcxt = NULL;
        hashp->isshared = true;

        /* hash table already exists, we're just attaching to it */
        if (flags & HASH_ATTACH)
        {
            /* make local copies of some heavily-used values */
            hctl = hashp->hctl;
            hashp->keysize = hctl->keysize;
            hashp->ssize = hctl->ssize;
            hashp->sshift = hctl->sshift;

            return hashp;
        }
    }
    else
    {
        /* setup hash table defaults */
        hashp->hctl = NULL;
        hashp->dir = NULL;
        hashp->hcxt = CurrentDynaHashCxt;
        hashp->isshared = false;
    }

    if (!hashp->hctl)
    {
        hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
        if (!hashp->hctl)
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory")));
    }

    hashp->frozen = false;

    hdefault(hashp);

    hctl = hashp->hctl;

    if (flags & HASH_PARTITION)
    {
        /* Doesn't make sense to partition a local hash table */
        Assert(flags & HASH_SHARED_MEM);

        /*
         * The number of partitions had better be a power of 2. Also, it must
         * be less than INT_MAX (see init_htab()), so call the int version of
         * next_pow2.
         */
        Assert(info->num_partitions == next_pow2_int(info->num_partitions));

        hctl->num_partitions = info->num_partitions;
    }

    if (flags & HASH_SEGMENT)
    {
        hctl->ssize = info->ssize;
        hctl->sshift = my_log2(info->ssize);
        /* ssize had better be a power of 2 */
        Assert(hctl->ssize == (1L << hctl->sshift));
    }
    if (flags & HASH_FFACTOR)
        hctl->ffactor = info->ffactor;

    /*
     * SHM hash tables have fixed directory size passed by the caller.
     */
    if (flags & HASH_DIRSIZE)
    {
        hctl->max_dsize = info->max_dsize;
        hctl->dsize = info->dsize;
    }

    /*
     * hash table now allocates space for key and data but you have to say how
     * much space to allocate
     */
    if (flags & HASH_ELEM)
    {
        Assert(info->entrysize >= info->keysize);
        hctl->keysize = info->keysize;
        hctl->entrysize = info->entrysize;
    }

    /* make local copies of heavily-used constant fields */
    hashp->keysize = hctl->keysize;
    hashp->ssize = hctl->ssize;
    hashp->sshift = hctl->sshift;

    /* Build the hash directory structure */
    if (!init_htab(hashp, nelem))
        elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

    /*
     * For a shared hash table, preallocate the requested number of elements.
     * This reduces problems with run-time out-of-shared-memory conditions.
     *
     * For a non-shared hash table, preallocate the requested number of
     * elements if it's less than our chosen nelem_alloc. This avoids wasting
     * space if the caller correctly estimates a small table size.
     */
    if ((flags & HASH_SHARED_MEM) ||
        nelem < hctl->nelem_alloc)
    {
        int         i,
                    freelist_partitions,
                    nelem_alloc,
                    nelem_alloc_first;

        /*
         * If hash table is partitioned, give each freelist an equal share of
         * the initial allocation. Otherwise only freeList[0] is used.
         */
        if (IS_PARTITIONED(hashp->hctl))
            freelist_partitions = NUM_FREELISTS;
        else
            freelist_partitions = 1;

        nelem_alloc = nelem / freelist_partitions;
        if (nelem_alloc <= 0)
            nelem_alloc = 1;

        /*
         * Make sure we'll allocate all the requested elements; freeList[0]
         * gets the excess if the request isn't divisible by NUM_FREELISTS.
         */
        if (nelem_alloc * freelist_partitions < nelem)
            nelem_alloc_first =
                nelem - nelem_alloc * (freelist_partitions - 1);
        else
            nelem_alloc_first = nelem_alloc;

        for (i = 0; i < freelist_partitions; i++)
        {
            int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

            if (!element_alloc(hashp, temp, i))
                ereport(ERROR,
                        (errcode(ERRCODE_OUT_OF_MEMORY),
                         errmsg("out of memory")));
        }
    }

    if (flags & HASH_FIXED_SIZE)
        hashp->isfixed = true;
    return hashp;
}
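
/*
 * A minimal local-memory usage sketch (illustrative only; the entry struct,
 * table name, and size hint are hypothetical). dynahash expects the key to
 * be the first field of the entry, occupying exactly keysize bytes:
 *
 *     typedef struct
 *     {
 *         Oid     relid;          -- hash key; must be first
 *         int     usage_count;    -- caller's payload
 *     } RelUsageEntry;
 *
 *     HASHCTL ctl;
 *     HTAB   *htab;
 *
 *     MemSet(&ctl, 0, sizeof(ctl));
 *     ctl.keysize = sizeof(Oid);
 *     ctl.entrysize = sizeof(RelUsageEntry);
 *     htab = hash_create("rel usage", 128, &ctl, HASH_ELEM | HASH_BLOBS);
 */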

/*
 * Set default HASHHDR parameters.
 */
static void
hdefault(HTAB *hashp)
{
    HASHHDR    *hctl = hashp->hctl;

    MemSet(hctl, 0, sizeof(HASHHDR));

    hctl->dsize = DEF_DIRSIZE;
    hctl->nsegs = 0;

    /* rather pointless defaults for key & entry size */
    hctl->keysize = sizeof(char *);
    hctl->entrysize = 2 * sizeof(char *);

    hctl->num_partitions = 0;   /* not partitioned */

    hctl->ffactor = DEF_FFACTOR;

    /* table has no fixed maximum size */
    hctl->max_dsize = NO_MAX_DSIZE;

    hctl->ssize = DEF_SEGSIZE;
    hctl->sshift = DEF_SEGSIZE_SHIFT;

#ifdef HASH_STATISTICS
    hctl->accesses = hctl->collisions = 0;
#endif
}

/*
 * Given the user-specified entry size, choose nelem_alloc, ie, how many
 * elements to add to the hash table when we need more.
 */
static int
choose_nelem_alloc(Size entrysize)
{
    int         nelem_alloc;
    Size        elementSize;
    Size        allocSize;

    /* Each element has a HASHELEMENT header plus user data. */
    /* NB: this had better match element_alloc() */
    elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);

    /*
     * The idea here is to choose nelem_alloc at least 32, but round up so
     * that the allocation request will be a power of 2 or just less. This
     * makes little difference for hash tables in shared memory, but for hash
     * tables managed by palloc, the allocation request will be rounded up to
     * a power of 2 anyway. If we fail to take this into account, we'll waste
     * as much as half the allocated space.
     */
    allocSize = 32 * 4;         /* assume elementSize at least 8 */
    do
    {
        allocSize <<= 1;
        nelem_alloc = allocSize / elementSize;
    } while (nelem_alloc < 32);

    return nelem_alloc;
}
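
/*
 * Worked example of the rounding above (illustrative; exact numbers depend
 * on MAXALIGN and sizeof(HASHELEMENT), assumed here to be 8 and 16 as on a
 * typical 64-bit build): for entrysize = 40, elementSize = 16 + 40 = 56.
 * allocSize starts at 128 and is doubled before each division, so the loop
 * tries 256, 512, 1024, 2048, yielding nelem_alloc = 4, 9, 18, 36; it stops
 * at 36, whose request of 36 * 56 = 2016 bytes sits just under the
 * 2048-byte power of 2.
 */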

/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays
 */
static bool
init_htab(HTAB *hashp, long nelem)
{
    HASHHDR    *hctl = hashp->hctl;
    HASHSEGMENT *segp;
    int         nbuckets;
    int         nsegs;
    int         i;

    /*
     * initialize mutexes if it's a partitioned table
     */
    if (IS_PARTITIONED(hctl))
        for (i = 0; i < NUM_FREELISTS; i++)
            SpinLockInit(&(hctl->freeList[i].mutex));

    /*
     * Divide number of elements by the fill factor to determine a desired
     * number of buckets. Allocate space for the next greater power of two
     * number of buckets
     */
    nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1);

    /*
     * In a partitioned table, nbuckets must be at least equal to
     * num_partitions; were it less, keys with apparently different partition
     * numbers would map to the same bucket, breaking partition independence.
     * (Normally nbuckets will be much bigger; this is just a safety check.)
     */
    while (nbuckets < hctl->num_partitions)
        nbuckets <<= 1;

    hctl->max_bucket = hctl->low_mask = nbuckets - 1;
    hctl->high_mask = (nbuckets << 1) - 1;

    /*
     * Figure number of directory segments needed, round up to a power of 2
     */
    nsegs = (nbuckets - 1) / hctl->ssize + 1;
    nsegs = next_pow2_int(nsegs);

    /*
     * Make sure directory is big enough. If pre-allocated directory is too
     * small, choke (caller screwed up).
     */
    if (nsegs > hctl->dsize)
    {
        if (!(hashp->dir))
            hctl->dsize = nsegs;
        else
            return false;
    }

    /* Allocate a directory */
    if (!(hashp->dir))
    {
        CurrentDynaHashCxt = hashp->hcxt;
        hashp->dir = (HASHSEGMENT *)
            hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
        if (!hashp->dir)
            return false;
    }

    /* Allocate initial segments */
    for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
    {
        *segp = seg_alloc(hashp);
        if (*segp == NULL)
            return false;
    }

    /* Choose number of entries to allocate at a time */
    hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

#ifdef HASH_DEBUG
    fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
            "TABLE POINTER ", hashp,
            "DIRECTORY SIZE ", hctl->dsize,
            "SEGMENT SIZE ", hctl->ssize,
            "SEGMENT SHIFT ", hctl->sshift,
            "FILL FACTOR ", hctl->ffactor,
            "MAX BUCKET ", hctl->max_bucket,
            "HIGH MASK ", hctl->high_mask,
            "LOW MASK ", hctl->low_mask,
            "NSEGS ", hctl->nsegs);
#endif
    return true;
}

/*
 * Estimate the space needed for a hashtable containing the given number
 * of entries of given size.
 * NOTE: this is used to estimate the footprint of hashtables in shared
 * memory; therefore it does not count HTAB which is in local memory.
 * NB: assumes that all hash structure parameters have default values!
 */
Size
hash_estimate_size(long num_entries, Size entrysize)
{
    Size        size;
    long        nBuckets,
                nSegments,
                nDirEntries,
                nElementAllocs,
                elementSize,
                elementAllocCnt;

    /* estimate number of buckets wanted */
    nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
    /* # of segments needed for nBuckets */
    nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
    /* directory entries */
    nDirEntries = DEF_DIRSIZE;
    while (nDirEntries < nSegments)
        nDirEntries <<= 1;      /* dir_realloc doubles dsize at each call */

    /* fixed control info */
    size = MAXALIGN(sizeof(HASHHDR));   /* but not HTAB, per above */
    /* directory */
    size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
    /* segments */
    size = add_size(size, mul_size(nSegments,
                                   MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
    /* elements --- allocated in groups of choose_nelem_alloc() entries */
    elementAllocCnt = choose_nelem_alloc(entrysize);
    nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
    elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
    size = add_size(size,
                    mul_size(nElementAllocs,
                             mul_size(elementAllocCnt, elementSize)));

    return size;
}

/*
 * Select an appropriate directory size for a hashtable with the given
 * maximum number of entries.
 * This is only needed for hashtables in shared memory, whose directories
 * cannot be expanded dynamically.
 * NB: assumes that all hash structure parameters have default values!
 *
 * XXX this had better agree with the behavior of init_htab()...
 */
long
hash_select_dirsize(long num_entries)
{
    long        nBuckets,
                nSegments,
                nDirEntries;

    /* estimate number of buckets wanted */
    nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
    /* # of segments needed for nBuckets */
    nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
    /* directory entries */
    nDirEntries = DEF_DIRSIZE;
    while (nDirEntries < nSegments)
        nDirEntries <<= 1;      /* dir_realloc doubles dsize at each call */

    return nDirEntries;
}

/*
 * Compute the required initial memory allocation for a shared-memory
 * hashtable with the given parameters. We need space for the HASHHDR
 * and for the (non expansible) directory.
 */
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
    Assert(flags & HASH_DIRSIZE);
    Assert(info->dsize == info->max_dsize);
    return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}


/********************** DESTROY ROUTINES ************************/

void
hash_destroy(HTAB *hashp)
{
    if (hashp != NULL)
    {
        /* allocation method must be one we know how to free, too */
        Assert(hashp->alloc == DynaHashAlloc);
        /* so this hashtable must have its own context */
        Assert(hashp->hcxt != NULL);

        hash_stats("destroy", hashp);

        /*
         * Free everything by destroying the hash table's memory context.
         */
        MemoryContextDelete(hashp->hcxt);
    }
}

void
hash_stats(const char *where, HTAB *hashp)
{
#ifdef HASH_STATISTICS
    fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
            where, hashp->hctl->accesses, hashp->hctl->collisions);

    fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
            hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
            hashp->hctl->max_bucket, hashp->hctl->nsegs);
    fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
            where, hash_accesses, hash_collisions);
    fprintf(stderr, "hash_stats: total expansions %ld\n",
            hash_expansions);
#endif
}

/*******************************SEARCH ROUTINES *****************************/


/*
 * get_hash_value -- exported routine to calculate a key's hash value
 *
 * We export this because for partitioned tables, callers need to compute
 * the partition number (from the low-order bits of the hash value) before
 * searching.
 */
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
    return hashp->hash(keyPtr, hashp->keysize);
}

/* Convert a hash value to a bucket number */
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
{
    uint32      bucket;

    bucket = hash_val & hctl->high_mask;
    if (bucket > hctl->max_bucket)
        bucket = bucket & hctl->low_mask;

    return bucket;
}
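
/*
 * Worked example (illustrative): suppose the table has grown to 11 buckets,
 * so max_bucket = 10, low_mask = 7, high_mask = 15. For hash_val = 44,
 * 44 & 15 = 12, which exceeds max_bucket, so we re-mask: 12 & 7 = 4.
 * In a partitioned table the masks never change (no bucket splits), which
 * is why the low-order bits of a hash value identify a stable partition.
 */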

/*
 * hash_search -- look up key in table and perform action
 * hash_search_with_hash_value -- same, with key's hash value already computed
 *
 * action is one of:
 *     HASH_FIND: look up key in table
 *     HASH_ENTER: look up key in table, creating entry if not present
 *     HASH_ENTER_NULL: same, but return NULL if out of memory
 *     HASH_REMOVE: look up key in table, remove entry if present
 *
 * Return value is a pointer to the element found/entered/removed if any,
 * or NULL if no match was found. (NB: in the case of the REMOVE action,
 * the result is a dangling pointer that shouldn't be dereferenced!)
 *
 * HASH_ENTER will normally ereport a generic "out of memory" error if
 * it is unable to create a new entry. The HASH_ENTER_NULL operation is
 * the same except it will return NULL if out of memory. Note that
 * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
 * since palloc internally ereports on out-of-memory.
 *
 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
 * existing entry in the table, false otherwise. This is needed in the
 * HASH_ENTER case, but is redundant with the return value otherwise.
 *
 * For hash_search_with_hash_value, the hashvalue parameter must have been
 * calculated with get_hash_value().
 */
void *
hash_search(HTAB *hashp,
            const void *keyPtr,
            HASHACTION action,
            bool *foundPtr)
{
    return hash_search_with_hash_value(hashp,
                                       keyPtr,
                                       hashp->hash(keyPtr, hashp->keysize),
                                       action,
                                       foundPtr);
}
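
/*
 * Typical find-or-insert pattern, as a sketch (RelUsageEntry continues the
 * hypothetical example near hash_create above):
 *
 *     bool        found;
 *     RelUsageEntry *entry;
 *
 *     entry = (RelUsageEntry *) hash_search(htab, &relid,
 *                                           HASH_ENTER, &found);
 *     if (!found)
 *         entry->usage_count = 0;     -- new entry: caller must initialize
 *                                        all non-key fields itself
 *     entry->usage_count++;
 */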

void *
hash_search_with_hash_value(HTAB *hashp,
                            const void *keyPtr,
                            uint32 hashvalue,
                            HASHACTION action,
                            bool *foundPtr)
{
    HASHHDR    *hctl = hashp->hctl;
    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
    Size        keysize;
    uint32      bucket;
    long        segment_num;
    long        segment_ndx;
    HASHSEGMENT segp;
    HASHBUCKET  currBucket;
    HASHBUCKET *prevBucketPtr;
    HashCompareFunc match;

#ifdef HASH_STATISTICS
    hash_accesses++;
    hctl->accesses++;
#endif

    /*
     * If inserting, check if it is time to split a bucket.
     *
     * NOTE: failure to expand table is not a fatal error, it just means we
     * have to run at higher fill factor than we wanted. However, if we're
     * using the palloc allocator then it will throw error anyway on
     * out-of-memory, so we must do this before modifying the table.
     */
    if (action == HASH_ENTER || action == HASH_ENTER_NULL)
    {
        /*
         * Can't split if running in partitioned mode, nor if frozen, nor if
         * table is the subject of any active hash_seq_search scans. Strange
         * order of these tests is to try to check cheaper conditions first.
         */
        if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
            hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
            !has_seq_scans(hashp))
            (void) expand_table(hashp);
    }

    /*
     * Do the initial lookup
     */
    bucket = calc_bucket(hctl, hashvalue);

    segment_num = bucket >> hashp->sshift;
    segment_ndx = MOD(bucket, hashp->ssize);

    segp = hashp->dir[segment_num];

    if (segp == NULL)
        hash_corrupted(hashp);

    prevBucketPtr = &segp[segment_ndx];
    currBucket = *prevBucketPtr;

    /*
     * Follow collision chain looking for matching key
     */
    match = hashp->match;       /* save one fetch in inner loop */
    keysize = hashp->keysize;   /* ditto */

    while (currBucket != NULL)
    {
        if (currBucket->hashvalue == hashvalue &&
            match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
            break;
        prevBucketPtr = &(currBucket->link);
        currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
        hash_collisions++;
        hctl->collisions++;
#endif
    }

    if (foundPtr)
        *foundPtr = (bool) (currBucket != NULL);

    /*
     * OK, now what?
     */
    switch (action)
    {
        case HASH_FIND:
            if (currBucket != NULL)
                return (void *) ELEMENTKEY(currBucket);
            return NULL;

        case HASH_REMOVE:
            if (currBucket != NULL)
            {
                /* if partitioned, must lock to touch nentries and freeList */
                if (IS_PARTITIONED(hctl))
                    SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

                /* delete the record from the appropriate nentries counter. */
                Assert(hctl->freeList[freelist_idx].nentries > 0);
                hctl->freeList[freelist_idx].nentries--;

                /* remove record from hash bucket's chain. */
                *prevBucketPtr = currBucket->link;

                /* add the record to the appropriate freelist. */
                currBucket->link = hctl->freeList[freelist_idx].freeList;
                hctl->freeList[freelist_idx].freeList = currBucket;

                if (IS_PARTITIONED(hctl))
                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

                /*
                 * better hope the caller is synchronizing access to this
                 * element, because someone else is going to reuse it the next
                 * time something is added to the table
                 */
                return (void *) ELEMENTKEY(currBucket);
            }
            return NULL;

        case HASH_ENTER_NULL:
            /* ENTER_NULL does not work with palloc-based allocator */
            Assert(hashp->alloc != DynaHashAlloc);
            /* FALL THRU */

        case HASH_ENTER:
            /* Return existing element if found, else create one */
            if (currBucket != NULL)
                return (void *) ELEMENTKEY(currBucket);

            /* disallow inserts if frozen */
            if (hashp->frozen)
                elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                     hashp->tabname);

            currBucket = get_hash_entry(hashp, freelist_idx);
            if (currBucket == NULL)
            {
                /* out of memory */
                if (action == HASH_ENTER_NULL)
                    return NULL;
                /* report a generic message */
                if (hashp->isshared)
                    ereport(ERROR,
                            (errcode(ERRCODE_OUT_OF_MEMORY),
                             errmsg("out of shared memory")));
                else
                    ereport(ERROR,
                            (errcode(ERRCODE_OUT_OF_MEMORY),
                             errmsg("out of memory")));
            }

            /* link into hashbucket chain */
            *prevBucketPtr = currBucket;
            currBucket->link = NULL;

            /* copy key into record */
            currBucket->hashvalue = hashvalue;
            hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

            /*
             * Caller is expected to fill the data field on return. DO NOT
             * insert any code that could possibly throw error here, as doing
             * so would leave the table entry incomplete and hence corrupt the
             * caller's data structure.
             */

            return (void *) ELEMENTKEY(currBucket);
    }

    elog(ERROR, "unrecognized hash action code: %d", (int) action);

    return NULL;                /* keep compiler quiet */
}

/*
 * hash_update_hash_key -- change the hash key of an existing table entry
 *
 * This is equivalent to removing the entry, making a new entry, and copying
 * over its data, except that the entry never goes to the table's freelist.
 * Therefore this cannot suffer an out-of-memory failure, even if there are
 * other processes operating in other partitions of the hashtable.
 *
 * Returns true if successful, false if the requested new hash key is already
 * present. Throws error if the specified entry pointer isn't actually a
 * table member.
 *
 * NB: currently, there is no special case for old and new hash keys being
 * identical, which means we'll report false for that situation. This is
 * preferable for existing uses.
 *
 * NB: for a partitioned hashtable, caller must hold lock on both relevant
 * partitions, if the new hash key would belong to a different partition.
 */
bool
hash_update_hash_key(HTAB *hashp,
                     void *existingEntry,
                     const void *newKeyPtr)
{
    HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
    HASHHDR    *hctl = hashp->hctl;
    uint32      newhashvalue;
    Size        keysize;
    uint32      bucket;
    uint32      newbucket;
    long        segment_num;
    long        segment_ndx;
    HASHSEGMENT segp;
    HASHBUCKET  currBucket;
    HASHBUCKET *prevBucketPtr;
    HASHBUCKET *oldPrevPtr;
    HashCompareFunc match;

#ifdef HASH_STATISTICS
    hash_accesses++;
    hctl->accesses++;
#endif

    /* disallow updates if frozen */
    if (hashp->frozen)
        elog(ERROR, "cannot update in frozen hashtable \"%s\"",
             hashp->tabname);

    /*
     * Lookup the existing element using its saved hash value. We need to do
     * this to be able to unlink it from its hash chain, but as a side benefit
     * we can verify the validity of the passed existingEntry pointer.
     */
    bucket = calc_bucket(hctl, existingElement->hashvalue);

    segment_num = bucket >> hashp->sshift;
    segment_ndx = MOD(bucket, hashp->ssize);

    segp = hashp->dir[segment_num];

    if (segp == NULL)
        hash_corrupted(hashp);

    prevBucketPtr = &segp[segment_ndx];
    currBucket = *prevBucketPtr;

    while (currBucket != NULL)
    {
        if (currBucket == existingElement)
            break;
        prevBucketPtr = &(currBucket->link);
        currBucket = *prevBucketPtr;
    }

    if (currBucket == NULL)
        elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
             hashp->tabname);

    oldPrevPtr = prevBucketPtr;

    /*
     * Now perform the equivalent of a HASH_ENTER operation to locate the hash
     * chain we want to put the entry into.
     */
    newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);

    newbucket = calc_bucket(hctl, newhashvalue);

    segment_num = newbucket >> hashp->sshift;
    segment_ndx = MOD(newbucket, hashp->ssize);

    segp = hashp->dir[segment_num];

    if (segp == NULL)
        hash_corrupted(hashp);

    prevBucketPtr = &segp[segment_ndx];
    currBucket = *prevBucketPtr;

    /*
     * Follow collision chain looking for matching key
     */
    match = hashp->match;       /* save one fetch in inner loop */
    keysize = hashp->keysize;   /* ditto */

    while (currBucket != NULL)
    {
        if (currBucket->hashvalue == newhashvalue &&
            match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
            break;
        prevBucketPtr = &(currBucket->link);
        currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
        hash_collisions++;
        hctl->collisions++;
#endif
    }

    if (currBucket != NULL)
        return false;           /* collision with an existing entry */

    currBucket = existingElement;

    /*
     * If old and new hash values belong to the same bucket, we need not
     * change any chain links, and indeed should not since this simplistic
     * update will corrupt the list if currBucket is the last element. (We
     * cannot fall out earlier, however, since we need to scan the bucket to
     * check for duplicate keys.)
     */
    if (bucket != newbucket)
    {
        /* OK to remove record from old hash bucket's chain. */
        *oldPrevPtr = currBucket->link;

        /* link into new hashbucket chain */
        *prevBucketPtr = currBucket;
        currBucket->link = NULL;
    }

    /* copy new key into record */
    currBucket->hashvalue = newhashvalue;
    hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);

    /* rest of record is untouched */

    return true;
}

/*
 * Allocate a new hashtable entry if possible; return NULL if out of memory.
 * (Or, if the underlying space allocator throws error for out-of-memory,
 * we won't return at all.)
 */
static HASHBUCKET
get_hash_entry(HTAB *hashp, int freelist_idx)
{
    HASHHDR    *hctl = hashp->hctl;
    HASHBUCKET  newElement;

    for (;;)
    {
        /* if partitioned, must lock to touch nentries and freeList */
        if (IS_PARTITIONED(hctl))
            SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

        /* try to get an entry from the freelist */
        newElement = hctl->freeList[freelist_idx].freeList;

        if (newElement != NULL)
            break;

        if (IS_PARTITIONED(hctl))
            SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

        /*
         * No free elements in this freelist. In a partitioned table, there
         * might be entries in other freelists, but to reduce contention we
         * prefer to first try to get another chunk of buckets from the main
         * shmem allocator. If that fails, though, we *MUST* root through all
         * the other freelists before giving up. There are multiple callers
         * that assume that they can allocate every element in the initially
         * requested table size, or that deleting an element guarantees they
         * can insert a new element, even if shared memory is entirely full.
         * Failing because the needed element is in a different freelist is
         * not acceptable.
         */
        if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
        {
            int         borrow_from_idx;

            if (!IS_PARTITIONED(hctl))
                return NULL;    /* out of memory */

            /* try to borrow element from another freelist */
            borrow_from_idx = freelist_idx;
            for (;;)
            {
                borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
                if (borrow_from_idx == freelist_idx)
                    break;      /* examined all freelists, fail */

                SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
                newElement = hctl->freeList[borrow_from_idx].freeList;

                if (newElement != NULL)
                {
                    hctl->freeList[borrow_from_idx].freeList = newElement->link;
                    SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));

                    /* careful: count the new element in its proper freelist */
                    SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
                    hctl->freeList[freelist_idx].nentries++;
                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

                    return newElement;
                }

                SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
            }

            /* no elements available to borrow either, so out of memory */
            return NULL;
        }
    }

    /* remove entry from freelist, bump nentries */
    hctl->freeList[freelist_idx].freeList = newElement->link;
    hctl->freeList[freelist_idx].nentries++;

    if (IS_PARTITIONED(hctl))
        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

    return newElement;
}

/*
 * hash_get_num_entries -- get the number of entries in a hashtable
 */
long
hash_get_num_entries(HTAB *hashp)
{
    int         i;
    long        sum = hashp->hctl->freeList[0].nentries;

    /*
     * We currently don't bother with acquiring the mutexes; it's only
     * sensible to call this function if you've got lock on all partitions of
     * the table.
     */
    if (IS_PARTITIONED(hashp->hctl))
    {
        for (i = 1; i < NUM_FREELISTS; i++)
            sum += hashp->hctl->freeList[i].nentries;
    }

    return sum;
}

/*
 * hash_seq_init/_search/_term
 *     Sequentially search through hash table and return
 *     all the elements one by one, return NULL when no more.
 *
 * hash_seq_term should be called if and only if the scan is abandoned before
 * completion; if hash_seq_search returns NULL then it has already done the
 * end-of-scan cleanup.
 *
 * NOTE: caller may delete the returned element before continuing the scan.
 * However, deleting any other element while the scan is in progress is
 * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
 * if elements are added to the table while the scan is in progress, it is
 * unspecified whether they will be visited by the scan or not.
 *
 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
 * worry about hash_seq_term cleanup, if the hashtable is first locked against
 * further insertions by calling hash_freeze.
 *
 * NOTE: to use this with a partitioned hashtable, caller had better hold
 * at least shared lock on all partitions of the table throughout the scan!
 * We can cope with insertions or deletions by our own backend, but *not*
 * with concurrent insertions or deletions by another.
 */
void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
{
    status->hashp = hashp;
    status->curBucket = 0;
    status->curEntry = NULL;
    if (!hashp->frozen)
        register_seq_scan(hashp);
}
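
/*
 * The canonical scan loop, as a sketch (RelUsageEntry and process() are
 * hypothetical). No hash_seq_term() call is needed when the loop runs until
 * hash_seq_search returns NULL; call hash_seq_term only when abandoning the
 * scan early:
 *
 *     HASH_SEQ_STATUS status;
 *     RelUsageEntry *entry;
 *
 *     hash_seq_init(&status, htab);
 *     while ((entry = (RelUsageEntry *) hash_seq_search(&status)) != NULL)
 *         process(entry);
 */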

void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
    HTAB       *hashp;
    HASHHDR    *hctl;
    uint32      max_bucket;
    long        ssize;
    long        segment_num;
    long        segment_ndx;
    HASHSEGMENT segp;
    uint32      curBucket;
    HASHELEMENT *curElem;

    if ((curElem = status->curEntry) != NULL)
    {
        /* Continuing scan of curBucket... */
        status->curEntry = curElem->link;
        if (status->curEntry == NULL)   /* end of this bucket */
            ++status->curBucket;
        return (void *) ELEMENTKEY(curElem);
    }

    /*
     * Search for next nonempty bucket starting at curBucket.
     */
    curBucket = status->curBucket;
    hashp = status->hashp;
    hctl = hashp->hctl;
    ssize = hashp->ssize;
    max_bucket = hctl->max_bucket;

    if (curBucket > max_bucket)
    {
        hash_seq_term(status);
        return NULL;            /* search is done */
    }

    /*
     * first find the right segment in the table directory.
     */
    segment_num = curBucket >> hashp->sshift;
    segment_ndx = MOD(curBucket, ssize);

    segp = hashp->dir[segment_num];

    /*
     * Pick up the first item in this bucket's chain. If chain is not empty
     * we can begin searching it. Otherwise we have to advance to find the
     * next nonempty bucket. We try to optimize that case since searching a
     * near-empty hashtable has to iterate this loop a lot.
     */
    while ((curElem = segp[segment_ndx]) == NULL)
    {
        /* empty bucket, advance to next */
        if (++curBucket > max_bucket)
        {
            status->curBucket = curBucket;
            hash_seq_term(status);
            return NULL;        /* search is done */
        }
        if (++segment_ndx >= ssize)
        {
            segment_num++;
            segment_ndx = 0;
            segp = hashp->dir[segment_num];
        }
    }

    /* Begin scan of curBucket... */
    status->curEntry = curElem->link;
    if (status->curEntry == NULL)   /* end of this bucket */
        ++curBucket;
    status->curBucket = curBucket;
    return (void *) ELEMENTKEY(curElem);
}

void
hash_seq_term(HASH_SEQ_STATUS *status)
{
    if (!status->hashp->frozen)
        deregister_seq_scan(status->hashp);
}

/*
 * hash_freeze
 *     Freeze a hashtable against future insertions (deletions are
 *     still allowed)
 *
 * The reason for doing this is that by preventing any more bucket splits,
 * we no longer need to worry about registering hash_seq_search scans,
 * and thus caller need not be careful about ensuring hash_seq_term gets
 * called at the right times.
 *
 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
 * with active scans (since hash_seq_term would then do the wrong thing).
 */
void
hash_freeze(HTAB *hashp)
{
    if (hashp->isshared)
        elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
    if (!hashp->frozen && has_seq_scans(hashp))
        elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
             hashp->tabname);
    hashp->frozen = true;
}


/********************************* UTILITIES ************************/

/*
 * Expand the table by adding one more hash bucket.
 */
static bool
expand_table(HTAB *hashp)
{
    HASHHDR    *hctl = hashp->hctl;
    HASHSEGMENT old_seg,
                new_seg;
    long        old_bucket,
                new_bucket;
    long        new_segnum,
                new_segndx;
    long        old_segnum,
                old_segndx;
    HASHBUCKET *oldlink,
               *newlink;
    HASHBUCKET  currElement,
                nextElement;

    Assert(!IS_PARTITIONED(hctl));

#ifdef HASH_STATISTICS
    hash_expansions++;
#endif

    new_bucket = hctl->max_bucket + 1;
    new_segnum = new_bucket >> hashp->sshift;
    new_segndx = MOD(new_bucket, hashp->ssize);

    if (new_segnum >= hctl->nsegs)
    {
        /* Allocate new segment if necessary -- could fail if dir full */
        if (new_segnum >= hctl->dsize)
            if (!dir_realloc(hashp))
                return false;
        if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
            return false;
        hctl->nsegs++;
    }

    /* OK, we created a new bucket */
    hctl->max_bucket++;

    /*
     * *Before* changing masks, find old bucket corresponding to same hash
     * values; values in that bucket may need to be relocated to new bucket.
     * Note that new_bucket is certainly larger than low_mask at this point,
     * so we can skip the first step of the regular hash mask calc.
     */
    old_bucket = (new_bucket & hctl->low_mask);

    /*
     * If we crossed a power of 2, readjust masks.
     */
    if ((uint32) new_bucket > hctl->high_mask)
    {
        hctl->low_mask = hctl->high_mask;
        hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
    }

    /*
     * Relocate records to the new bucket. NOTE: because of the way the hash
     * masking is done in calc_bucket, only one old bucket can need to be
     * split at this point. With a different way of reducing the hash value,
     * that might not be true!
     */
    old_segnum = old_bucket >> hashp->sshift;
    old_segndx = MOD(old_bucket, hashp->ssize);

    old_seg = hashp->dir[old_segnum];
    new_seg = hashp->dir[new_segnum];

    oldlink = &old_seg[old_segndx];
    newlink = &new_seg[new_segndx];

    for (currElement = *oldlink;
         currElement != NULL;
         currElement = nextElement)
    {
        nextElement = currElement->link;
        if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
        {
            *oldlink = currElement;
            oldlink = &currElement->link;
        }
        else
        {
            *newlink = currElement;
            newlink = &currElement->link;
        }
    }
    /* don't forget to terminate the rebuilt hash chains... */
    *oldlink = NULL;
    *newlink = NULL;

    return true;
}


static bool
dir_realloc(HTAB *hashp)
{
    HASHSEGMENT *p;
    HASHSEGMENT *old_p;
    long        new_dsize;
    long        old_dirsize;
    long        new_dirsize;

    if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
        return false;

    /* Reallocate directory */
    new_dsize = hashp->hctl->dsize << 1;
    old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
    new_dirsize = new_dsize * sizeof(HASHSEGMENT);

    old_p = hashp->dir;
    CurrentDynaHashCxt = hashp->hcxt;
    p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);

    if (p != NULL)
    {
        memcpy(p, old_p, old_dirsize);
        MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
        hashp->dir = p;
        hashp->hctl->dsize = new_dsize;

        /* XXX assume the allocator is palloc, so we know how to free */
        Assert(hashp->alloc == DynaHashAlloc);
        pfree(old_p);

        return true;
    }

    return false;
}


static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
    HASHSEGMENT segp;

    CurrentDynaHashCxt = hashp->hcxt;
    segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

    if (!segp)
        return NULL;

    MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);

    return segp;
}

/*
 * allocate some new elements and link them into the indicated free list
 */
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
    HASHHDR    *hctl = hashp->hctl;
    Size        elementSize;
    HASHELEMENT *firstElement;
    HASHELEMENT *tmpElement;
    HASHELEMENT *prevElement;
    int         i;

    if (hashp->isfixed)
        return false;

    /* Each element has a HASHELEMENT header plus user data. */
    elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

    CurrentDynaHashCxt = hashp->hcxt;
    firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

    if (!firstElement)
        return false;

    /* prepare to link all the new entries into the freelist */
    prevElement = NULL;
    tmpElement = firstElement;
    for (i = 0; i < nelem; i++)
    {
        tmpElement->link = prevElement;
        prevElement = tmpElement;
        tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
    }

    /* if partitioned, must lock to touch freeList */
    if (IS_PARTITIONED(hctl))
        SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

    /* freelist could be nonempty if two backends did this concurrently */
    firstElement->link = hctl->freeList[freelist_idx].freeList;
    hctl->freeList[freelist_idx].freeList = prevElement;

    if (IS_PARTITIONED(hctl))
        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

    return true;
}

/* complain when we have detected a corrupted hashtable */
static void
hash_corrupted(HTAB *hashp)
{
    /*
     * If the corruption is in a shared hashtable, we'd better force a
     * systemwide restart. Otherwise, just shut down this one backend.
     */
    if (hashp->isshared)
        elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
    else
        elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
}

/* calculate ceil(log base 2) of num */
int
my_log2(long num)
{
    int         i;
    long        limit;

    /* guard against too-large input, which would put us into infinite loop */
    if (num > LONG_MAX / 2)
        num = LONG_MAX / 2;

    for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
        ;
    return i;
}
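
/*
 * For example, my_log2(1) == 0 and my_log2(1000) == 10, since 1024 is the
 * first power of 2 >= 1000.
 */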

/* calculate first power of 2 >= num, bounded to what will fit in a long */
static long
next_pow2_long(long num)
{
    /* my_log2's internal range check is sufficient */
    return 1L << my_log2(num);
}

/* calculate first power of 2 >= num, bounded to what will fit in an int */
static int
next_pow2_int(long num)
{
    if (num > INT_MAX / 2)
        num = INT_MAX / 2;
    return 1 << my_log2(num);
}


/************************* SEQ SCAN TRACKING ************************/

/*
 * We track active hash_seq_search scans here. The need for this mechanism
 * comes from the fact that a scan will get confused if a bucket split occurs
 * while it's in progress: it might visit entries twice, or even miss some
 * entirely (if it's partway through the same bucket that splits). Hence
 * we want to inhibit bucket splits if there are any active scans on the
 * table being inserted into. This is a fairly rare case in current usage,
 * so just postponing the split until the next insertion seems sufficient.
 *
 * Given present usages of the function, only a few scans are likely to be
 * open concurrently; so a finite-size stack of open scans seems sufficient,
 * and we don't worry that linear search is too slow. Note that we do
 * allow multiple scans of the same hashtable to be open concurrently.
 *
 * This mechanism can support concurrent scan and insertion in a shared
 * hashtable if it's the same backend doing both. It would fail otherwise,
 * but locking reasons seem to preclude any such scenario anyway, so we don't
 * worry.
 *
 * This arrangement is reasonably robust if a transient hashtable is deleted
 * without notifying us. The absolute worst case is we might inhibit splits
 * in another table created later at exactly the same address. We will give
 * a warning at transaction end for reference leaks, so any bugs leading to
 * lack of notification should be easy to catch.
 */

#define MAX_SEQ_SCANS 100

static HTAB *seq_scan_tables[MAX_SEQ_SCANS];    /* tables being scanned */
static int  seq_scan_level[MAX_SEQ_SCANS];      /* subtransaction nest level */
static int  num_seq_scans = 0;


/* Register a table as having an active hash_seq_search scan */
static void
register_seq_scan(HTAB *hashp)
{
    if (num_seq_scans >= MAX_SEQ_SCANS)
        elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
             hashp->tabname);
    seq_scan_tables[num_seq_scans] = hashp;
    seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
    num_seq_scans++;
}

/* Deregister an active scan */
static void
deregister_seq_scan(HTAB *hashp)
{
    int         i;

    /* Search backward since it's most likely at the stack top */
    for (i = num_seq_scans - 1; i >= 0; i--)
    {
        if (seq_scan_tables[i] == hashp)
        {
            seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
            seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
            num_seq_scans--;
            return;
        }
    }
    elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
         hashp->tabname);
}

/* Check if a table has any active scan */
static bool
has_seq_scans(HTAB *hashp)
{
    int         i;

    for (i = 0; i < num_seq_scans; i++)
    {
        if (seq_scan_tables[i] == hashp)
            return true;
    }
    return false;
}

/* Clean up any open scans at end of transaction */
void
AtEOXact_HashTables(bool isCommit)
{
    /*
     * During abort cleanup, open scans are expected; just silently clean 'em
     * out. An open scan at commit means someone forgot a hash_seq_term()
     * call, so complain.
     *
     * Note: it's tempting to try to print the tabname here, but refrain for
     * fear of touching deallocated memory. This isn't a user-facing message
     * anyway, so it needn't be pretty.
     */
    if (isCommit)
    {
        int         i;

        for (i = 0; i < num_seq_scans; i++)
        {
            elog(WARNING, "leaked hash_seq_search scan for hash table %p",
                 seq_scan_tables[i]);
        }
    }
    num_seq_scans = 0;
}

/* Clean up any open scans at end of subtransaction */
void
AtEOSubXact_HashTables(bool isCommit, int nestDepth)
{
    int         i;

    /*
     * Search backward to make cleanup easy. Note we must check all entries,
     * not only those at the end of the array, because deletion technique
     * doesn't keep them in order.
     */
    for (i = num_seq_scans - 1; i >= 0; i--)
    {
        if (seq_scan_level[i] >= nestDepth)
        {
            if (isCommit)
                elog(WARNING, "leaked hash_seq_search scan for hash table %p",
                     seq_scan_tables[i]);
            seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
            seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
            num_seq_scans--;
        }
    }
}