/*-------------------------------------------------------------------------
 *
 * dynahash.c
 *    dynamic hash tables
 *
 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
 * shared memory. For shared hash tables, it is the caller's responsibility
 * to provide appropriate access interlocking. The simplest convention is
 * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
 * hash_seq_search) need only shared lock, but any update requires exclusive
 * lock. For heavily-used shared tables, the single-lock approach creates a
 * concurrency bottleneck, so we also support "partitioned" locking wherein
 * there are multiple LWLocks guarding distinct subsets of the table. To use
 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
 * to hash_create. This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
 * (A partitioned table uses multiple copies of those fields, guarded by
 * spinlocks, for additional concurrency.)
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition. We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
 * of the way calc_bucket() maps hash values to bucket numbers.
 *
 * For hash tables in shared memory, the memory allocator function should
 * match malloc's semantics of returning NULL on failure. For hash tables
 * in local memory, we typically use palloc() which will throw error on
 * failure. The code in this file has to cope with both cases.
 *
 * dynahash.c provides support for these types of lookup keys:
 *
 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
 * compared as though by strcmp(). This is the default behavior.
 *
 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
 * (Caller must ensure there are no undefined padding bits in the keys!)
 * This is selected by specifying HASH_BLOBS flag to hash_create.
 *
 * 3. More complex key behavior can be selected by specifying user-supplied
 * hashing, comparison, and/or key-copying functions. At least a hashing
 * function must be supplied; comparison defaults to memcmp() and key copying
 * to memcpy() when a user-defined hashing function is selected.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/utils/hash/dynahash.c
 *
 *-------------------------------------------------------------------------
 */
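
/*
 * As a concrete illustration of the partitioned-locking convention above,
 * here is a minimal sketch of how a caller might serialize access to one
 * partition. (Illustrative only: NUM_MY_PARTITIONS and my_partition_locks
 * are hypothetical caller-side objects, not part of dynahash.)
 *
 *     uint32 hashcode = get_hash_value(htab, &key);
 *     int    partition = hashcode % NUM_MY_PARTITIONS;  -- a power of 2, so
 *                                                          this keeps just the
 *                                                          low-order bits
 *
 *     LWLockAcquire(my_partition_locks[partition], LW_EXCLUSIVE);
 *     entry = hash_search_with_hash_value(htab, &key, hashcode,
 *                                         HASH_ENTER, &found);
 *     ... fill in entry ...
 *     LWLockRelease(my_partition_locks[partition]);
 */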

/*
 * Original comments:
 *
 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
 * Coded into C, with minor code improvements, and with hsearch(3) interface,
 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
 * also, hcreate/hdestroy routines added to simulate hsearch(3).
 *
 * These routines simulate hsearch(3) and family, with the important
 * difference that the hash table is dynamic - can grow indefinitely
 * beyond its original size (as supplied to hcreate()).
 *
 * Performance appears to be comparable to that of hsearch(3).
 * The 'source-code' options referred to in hsearch(3)'s 'man' page
 * are not implemented; otherwise functionality is identical.
 *
 * Compilation controls:
 * HASH_DEBUG controls some informative traces, mainly for debugging.
 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
 * when combined with HASH_DEBUG, these are displayed by hdestroy().
 *
 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
 * concatenation property, in probably unnecessary code 'optimization'.
 *
 * Modified margo@postgres.berkeley.edu February 1990
 *     added multiple table interface
 * Modified by sullivan@postgres.berkeley.edu April 1990
 *     changed ctl structure for shared memory
 */

#include "postgres.h"

#include <limits.h>

#include "access/xact.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"


/*
 * Constants
 *
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers. The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address. The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!). For non-shared hash
 * tables, the initial directory size can be left at the default.
 */
#define DEF_SEGSIZE         256
#define DEF_SEGSIZE_SHIFT   8   /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE         256
#define DEF_FFACTOR         1   /* default fill factor */

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS 32

/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;

/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking. Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
    slock_t     mutex;          /* spinlock for this freelist */
    long        nentries;       /* number of entries in associated buckets */
    HASHELEMENT *freeList;      /* chain of free elements */
} FreeListData;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct. For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
    /*
     * The freelist can become a point of contention in high-concurrency hash
     * tables, so we use an array of freelists, each with its own mutex and
     * nentries count, instead of just a single one. Although the freelists
     * normally operate independently, we will scavenge entries from freelists
     * other than a hashcode's default freelist when necessary.
     *
     * If the hash table is not partitioned, only freeList[0] is used and its
     * spinlock is not used at all; callers' locking is assumed sufficient.
     */
    FreeListData freeList[NUM_FREELISTS];

    /* These fields can change, but not in a partitioned table */
    /* Also, dsize can't change in a shared table, even if unpartitioned */
    long        dsize;          /* directory size */
    long        nsegs;          /* number of allocated segments (<= dsize) */
    uint32      max_bucket;     /* ID of maximum bucket in use */
    uint32      high_mask;      /* mask to modulo into entire table */
    uint32      low_mask;       /* mask to modulo into lower half of table */

    /* These fields are fixed at hashtable creation */
    Size        keysize;        /* hash key length in bytes */
    Size        entrysize;      /* total user element size in bytes */
    long        num_partitions; /* # partitions (must be power of 2), or 0 */
    long        ffactor;        /* target fill factor */
    long        max_dsize;      /* 'dsize' limit if directory is fixed size */
    long        ssize;          /* segment size --- must be power of 2 */
    int         sshift;         /* segment shift = log2(ssize) */
    int         nelem_alloc;    /* number of entries to allocate at once */

#ifdef HASH_STATISTICS

    /*
     * Count statistics here. NB: stats code doesn't bother with mutex, so
     * counts could be corrupted a bit in a partitioned table.
     */
    long        accesses;
    long        collisions;
#endif
};

#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

#define FREELIST_IDX(hctl, hashcode) \
    (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)

/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 */
struct HTAB
{
    HASHHDR    *hctl;           /* => shared control information */
    HASHSEGMENT *dir;           /* directory of segment starts */
    HashValueFunc hash;         /* hash function */
    HashCompareFunc match;      /* key comparison function */
    HashCopyFunc keycopy;       /* key copying function */
    HashAllocFunc alloc;        /* memory allocator */
    MemoryContext hcxt;         /* memory context if default allocator used */
    char       *tabname;        /* table name (for error messages) */
    bool        isshared;       /* true if table is in shared memory */
    bool        isfixed;        /* if true, don't enlarge */

    /* freezing a shared table isn't allowed, so we can keep state here */
    bool        frozen;         /* true = no more inserts allowed */

    /* We keep local copies of these fixed values to reduce contention */
    Size        keysize;        /* hash key length in bytes */
    long        ssize;          /* segment size --- must be power of 2 */
    int         sshift;         /* segment shift = log2(ssize) */
};

/*
 * Key (also entry) part of a HASHELEMENT
 */
#define ELEMENTKEY(helem)  (((char *) (helem)) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Obtain element pointer given pointer to key
 */
#define ELEMENT_FROM_KEY(key) \
    ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))

/*
 * Fast MOD arithmetic, assuming that y is a power of 2 !
 */
#define MOD(x,y)  ((x) & ((y)-1))
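
/*
 * For instance, MOD(13, 8) == (13 & 7) == 5, matching 13 % 8 --- which holds
 * only because 8 is a power of 2.
 */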

#ifdef HASH_STATISTICS
static long hash_accesses,
            hash_collisions,
            hash_expansions;
#endif

/*
 * Private function prototypes
 */
static void *DynaHashAlloc(Size size);
static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
static void hdefault(HTAB *hashp);
static int  choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
static void hash_corrupted(HTAB *hashp);
static long next_pow2_long(long num);
static int  next_pow2_int(long num);
static void register_seq_scan(HTAB *hashp);
static void deregister_seq_scan(HTAB *hashp);
static bool has_seq_scans(HTAB *hashp);


/*
 * memory allocation support
 */
static MemoryContext CurrentDynaHashCxt = NULL;

static void *
DynaHashAlloc(Size size)
{
    Assert(MemoryContextIsValid(CurrentDynaHashCxt));
    return MemoryContextAlloc(CurrentDynaHashCxt, size);
}


/*
 * HashCompareFunc for string keys
 *
 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 * bytes, so we can only compare that many ... hence strncmp is almost but
 * not quite the right thing.
 */
static int
string_compare(const char *key1, const char *key2, Size keysize)
{
    return strncmp(key1, key2, keysize - 1);
}


/************************** CREATE ROUTINES **********************/

/*
 * hash_create -- create a new dynamic hash table
 *
 * tabname: a name for the table (for debugging purposes)
 * nelem: maximum number of elements expected
 * *info: additional table parameters, as indicated by flags
 * flags: bitmask indicating which parameters to take from *info
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly. But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded. An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
HTAB *
hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
{
    HTAB       *hashp;
    HASHHDR    *hctl;

    /*
     * For shared hash tables, we have a local hash header (HTAB struct) that
     * we allocate in TopMemoryContext; all else is in shared memory.
     *
     * For non-shared hash tables, everything including the hash header is in
     * a memory context created specially for the hash table --- this makes
     * hash_destroy very simple. The memory context is made a child of either
     * a context specified by the caller, or TopMemoryContext if nothing is
     * specified.
     */
    if (flags & HASH_SHARED_MEM)
    {
        /* Set up to allocate the hash header */
        CurrentDynaHashCxt = TopMemoryContext;
    }
    else
    {
        /* Create the hash table's private memory context */
        if (flags & HASH_CONTEXT)
            CurrentDynaHashCxt = info->hcxt;
        else
            CurrentDynaHashCxt = TopMemoryContext;
        CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
                                                   "dynahash",
                                                   ALLOCSET_DEFAULT_SIZES);
    }

    /* Initialize the hash header, plus a copy of the table name */
    hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
    MemSet(hashp, 0, sizeof(HTAB));

    hashp->tabname = (char *) (hashp + 1);
    strcpy(hashp->tabname, tabname);

    /* If we have a private context, label it with hashtable's name */
    if (!(flags & HASH_SHARED_MEM))
        MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);

    /*
     * Select the appropriate hash function (see comments at head of file).
     */
    if (flags & HASH_FUNCTION)
        hashp->hash = info->hash;
    else if (flags & HASH_BLOBS)
    {
        /* We can optimize hashing for common key sizes */
        Assert(flags & HASH_ELEM);
        if (info->keysize == sizeof(uint32))
            hashp->hash = uint32_hash;
        else
            hashp->hash = tag_hash;
    }
    else
        hashp->hash = string_hash;  /* default hash function */

    /*
     * If you don't specify a match function, it defaults to string_compare if
     * you used string_hash (either explicitly or by default) and to memcmp
     * otherwise.
     *
     * Note: explicitly specifying string_hash is deprecated, because this
     * might not work for callers in loadable modules on some platforms due to
     * referencing a trampoline instead of the string_hash function proper.
     * Just let it default, eh?
     */
    if (flags & HASH_COMPARE)
        hashp->match = info->match;
    else if (hashp->hash == string_hash)
        hashp->match = (HashCompareFunc) string_compare;
    else
        hashp->match = memcmp;

    /*
     * Similarly, the key-copying function defaults to strlcpy or memcpy.
     */
    if (flags & HASH_KEYCOPY)
        hashp->keycopy = info->keycopy;
    else if (hashp->hash == string_hash)
        hashp->keycopy = (HashCopyFunc) strlcpy;
    else
        hashp->keycopy = memcpy;

    /* And select the entry allocation function, too. */
    if (flags & HASH_ALLOC)
        hashp->alloc = info->alloc;
    else
        hashp->alloc = DynaHashAlloc;

    if (flags & HASH_SHARED_MEM)
    {
        /*
         * ctl structure and directory are preallocated for shared memory
         * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
         * well.
         */
        hashp->hctl = info->hctl;
        hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
        hashp->hcxt = NULL;
        hashp->isshared = true;

        /* hash table already exists, we're just attaching to it */
        if (flags & HASH_ATTACH)
        {
            /* make local copies of some heavily-used values */
            hctl = hashp->hctl;
            hashp->keysize = hctl->keysize;
            hashp->ssize = hctl->ssize;
            hashp->sshift = hctl->sshift;

            return hashp;
        }
    }
    else
    {
        /* setup hash table defaults */
        hashp->hctl = NULL;
        hashp->dir = NULL;
        hashp->hcxt = CurrentDynaHashCxt;
        hashp->isshared = false;
    }

    if (!hashp->hctl)
    {
        hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
        if (!hashp->hctl)
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory")));
    }

    hashp->frozen = false;

    hdefault(hashp);

    hctl = hashp->hctl;

    if (flags & HASH_PARTITION)
    {
        /* Doesn't make sense to partition a local hash table */
        Assert(flags & HASH_SHARED_MEM);

        /*
         * The number of partitions had better be a power of 2. Also, it must
         * be less than INT_MAX (see init_htab()), so call the int version of
         * next_pow2.
         */
        Assert(info->num_partitions == next_pow2_int(info->num_partitions));

        hctl->num_partitions = info->num_partitions;
    }

    if (flags & HASH_SEGMENT)
    {
        hctl->ssize = info->ssize;
        hctl->sshift = my_log2(info->ssize);
        /* ssize had better be a power of 2 */
        Assert(hctl->ssize == (1L << hctl->sshift));
    }
    if (flags & HASH_FFACTOR)
        hctl->ffactor = info->ffactor;

    /*
     * SHM hash tables have fixed directory size passed by the caller.
     */
    if (flags & HASH_DIRSIZE)
    {
        hctl->max_dsize = info->max_dsize;
        hctl->dsize = info->dsize;
    }

    /*
     * hash table now allocates space for key and data but you have to say how
     * much space to allocate
     */
    if (flags & HASH_ELEM)
    {
        Assert(info->entrysize >= info->keysize);
        hctl->keysize = info->keysize;
        hctl->entrysize = info->entrysize;
    }

    /* make local copies of heavily-used constant fields */
    hashp->keysize = hctl->keysize;
    hashp->ssize = hctl->ssize;
    hashp->sshift = hctl->sshift;

    /* Build the hash directory structure */
    if (!init_htab(hashp, nelem))
        elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

    /*
     * For a shared hash table, preallocate the requested number of elements.
     * This reduces problems with run-time out-of-shared-memory conditions.
     *
     * For a non-shared hash table, preallocate the requested number of
     * elements if it's less than our chosen nelem_alloc. This avoids wasting
     * space if the caller correctly estimates a small table size.
     */
    if ((flags & HASH_SHARED_MEM) ||
        nelem < hctl->nelem_alloc)
    {
        int         i,
                    freelist_partitions,
                    nelem_alloc,
                    nelem_alloc_first;

        /*
         * If hash table is partitioned, give each freelist an equal share of
         * the initial allocation. Otherwise only freeList[0] is used.
         */
        if (IS_PARTITIONED(hashp->hctl))
            freelist_partitions = NUM_FREELISTS;
        else
            freelist_partitions = 1;

        nelem_alloc = nelem / freelist_partitions;
        if (nelem_alloc <= 0)
            nelem_alloc = 1;

        /*
         * Make sure we'll allocate all the requested elements; freeList[0]
         * gets the excess if the request isn't divisible by NUM_FREELISTS.
         */
        if (nelem_alloc * freelist_partitions < nelem)
            nelem_alloc_first =
                nelem - nelem_alloc * (freelist_partitions - 1);
        else
            nelem_alloc_first = nelem_alloc;

        for (i = 0; i < freelist_partitions; i++)
        {
            int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

            if (!element_alloc(hashp, temp, i))
                ereport(ERROR,
                        (errcode(ERRCODE_OUT_OF_MEMORY),
                         errmsg("out of memory")));
        }
    }

    if (flags & HASH_FIXED_SIZE)
        hashp->isfixed = true;
    return hashp;
}
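
/*
 * A minimal local-memory usage sketch (illustrative only; the entry struct,
 * table name, and size hint are hypothetical). dynahash expects the key to
 * be the first field of the entry, occupying exactly keysize bytes:
 *
 *     typedef struct
 *     {
 *         Oid     relid;          -- hash key; must be first
 *         int     usage_count;    -- caller's payload
 *     } RelUsageEntry;
 *
 *     HASHCTL ctl;
 *     HTAB   *htab;
 *
 *     MemSet(&ctl, 0, sizeof(ctl));
 *     ctl.keysize = sizeof(Oid);
 *     ctl.entrysize = sizeof(RelUsageEntry);
 *     htab = hash_create("rel usage", 128, &ctl, HASH_ELEM | HASH_BLOBS);
 */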

/*
 * Set default HASHHDR parameters.
 */
static void
hdefault(HTAB *hashp)
{
    HASHHDR    *hctl = hashp->hctl;

    MemSet(hctl, 0, sizeof(HASHHDR));

    hctl->dsize = DEF_DIRSIZE;
    hctl->nsegs = 0;

    /* rather pointless defaults for key & entry size */
    hctl->keysize = sizeof(char *);
    hctl->entrysize = 2 * sizeof(char *);

    hctl->num_partitions = 0;   /* not partitioned */

    hctl->ffactor = DEF_FFACTOR;

    /* table has no fixed maximum size */
    hctl->max_dsize = NO_MAX_DSIZE;

    hctl->ssize = DEF_SEGSIZE;
    hctl->sshift = DEF_SEGSIZE_SHIFT;

#ifdef HASH_STATISTICS
    hctl->accesses = hctl->collisions = 0;
#endif
}

/*
 * Given the user-specified entry size, choose nelem_alloc, ie, how many
 * elements to add to the hash table when we need more.
 */
static int
choose_nelem_alloc(Size entrysize)
{
    int         nelem_alloc;
    Size        elementSize;
    Size        allocSize;

    /* Each element has a HASHELEMENT header plus user data. */
    /* NB: this had better match element_alloc() */
    elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);

    /*
     * The idea here is to choose nelem_alloc at least 32, but round up so
     * that the allocation request will be a power of 2 or just less. This
     * makes little difference for hash tables in shared memory, but for hash
     * tables managed by palloc, the allocation request will be rounded up to
     * a power of 2 anyway. If we fail to take this into account, we'll waste
     * as much as half the allocated space.
     */
    allocSize = 32 * 4;         /* assume elementSize at least 8 */
    do
    {
        allocSize <<= 1;
        nelem_alloc = allocSize / elementSize;
    } while (nelem_alloc < 32);

    return nelem_alloc;
}
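
/*
 * Worked example of the rounding above (illustrative; exact numbers depend
 * on MAXALIGN and sizeof(HASHELEMENT), assumed here to be 8 and 16 as on a
 * typical 64-bit build): for entrysize = 40, elementSize = 16 + 40 = 56.
 * allocSize starts at 128 and is doubled before each division, so the loop
 * tries 256, 512, 1024, 2048, yielding nelem_alloc = 4, 9, 18, 36; it stops
 * at 36, whose request of 36 * 56 = 2016 bytes sits just under the
 * 2048-byte power of 2.
 */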

/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays
 */
static bool
init_htab(HTAB *hashp, long nelem)
{
    HASHHDR    *hctl = hashp->hctl;
    HASHSEGMENT *segp;
    int         nbuckets;
    int         nsegs;
    int         i;

    /*
     * initialize mutexes if it's a partitioned table
     */
    if (IS_PARTITIONED(hctl))
        for (i = 0; i < NUM_FREELISTS; i++)
            SpinLockInit(&(hctl->freeList[i].mutex));

    /*
     * Divide number of elements by the fill factor to determine a desired
     * number of buckets. Allocate space for the next greater power of two
     * number of buckets
     */
    nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1);

    /*
     * In a partitioned table, nbuckets must be at least equal to
     * num_partitions; were it less, keys with apparently different partition
     * numbers would map to the same bucket, breaking partition independence.
     * (Normally nbuckets will be much bigger; this is just a safety check.)
     */
    while (nbuckets < hctl->num_partitions)
        nbuckets <<= 1;

    hctl->max_bucket = hctl->low_mask = nbuckets - 1;
    hctl->high_mask = (nbuckets << 1) - 1;

    /*
     * Figure number of directory segments needed, round up to a power of 2
     */
    nsegs = (nbuckets - 1) / hctl->ssize + 1;
    nsegs = next_pow2_int(nsegs);

    /*
     * Make sure directory is big enough. If pre-allocated directory is too
     * small, choke (caller screwed up).
     */
    if (nsegs > hctl->dsize)
    {
        if (!(hashp->dir))
            hctl->dsize = nsegs;
        else
            return false;
    }

    /* Allocate a directory */
    if (!(hashp->dir))
    {
        CurrentDynaHashCxt = hashp->hcxt;
        hashp->dir = (HASHSEGMENT *)
            hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
        if (!hashp->dir)
            return false;
    }

    /* Allocate initial segments */
    for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
    {
        *segp = seg_alloc(hashp);
        if (*segp == NULL)
            return false;
    }

    /* Choose number of entries to allocate at a time */
    hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

#ifdef HASH_DEBUG
    fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
            "TABLE POINTER ", hashp,
            "DIRECTORY SIZE ", hctl->dsize,
            "SEGMENT SIZE ", hctl->ssize,
            "SEGMENT SHIFT ", hctl->sshift,
            "FILL FACTOR ", hctl->ffactor,
            "MAX BUCKET ", hctl->max_bucket,
            "HIGH MASK ", hctl->high_mask,
            "LOW MASK ", hctl->low_mask,
            "NSEGS ", hctl->nsegs);
#endif
    return true;
}

/*
 * Estimate the space needed for a hashtable containing the given number
 * of entries of given size.
 * NOTE: this is used to estimate the footprint of hashtables in shared
 * memory; therefore it does not count HTAB which is in local memory.
 * NB: assumes that all hash structure parameters have default values!
 */
Size
hash_estimate_size(long num_entries, Size entrysize)
{
    Size        size;
    long        nBuckets,
                nSegments,
                nDirEntries,
                nElementAllocs,
                elementSize,
                elementAllocCnt;

    /* estimate number of buckets wanted */
    nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
    /* # of segments needed for nBuckets */
    nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
    /* directory entries */
    nDirEntries = DEF_DIRSIZE;
    while (nDirEntries < nSegments)
        nDirEntries <<= 1;      /* dir_realloc doubles dsize at each call */

    /* fixed control info */
    size = MAXALIGN(sizeof(HASHHDR));   /* but not HTAB, per above */
    /* directory */
    size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
    /* segments */
    size = add_size(size, mul_size(nSegments,
                                   MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
    /* elements --- allocated in groups of choose_nelem_alloc() entries */
    elementAllocCnt = choose_nelem_alloc(entrysize);
    nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
    elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
    size = add_size(size,
                    mul_size(nElementAllocs,
                             mul_size(elementAllocCnt, elementSize)));

    return size;
}

/*
 * Select an appropriate directory size for a hashtable with the given
 * maximum number of entries.
 * This is only needed for hashtables in shared memory, whose directories
 * cannot be expanded dynamically.
 * NB: assumes that all hash structure parameters have default values!
 *
 * XXX this had better agree with the behavior of init_htab()...
 */
long
hash_select_dirsize(long num_entries)
{
    long        nBuckets,
                nSegments,
                nDirEntries;

    /* estimate number of buckets wanted */
    nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
    /* # of segments needed for nBuckets */
    nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
    /* directory entries */
    nDirEntries = DEF_DIRSIZE;
    while (nDirEntries < nSegments)
        nDirEntries <<= 1;      /* dir_realloc doubles dsize at each call */

    return nDirEntries;
}

/*
 * Compute the required initial memory allocation for a shared-memory
 * hashtable with the given parameters. We need space for the HASHHDR
 * and for the (non expansible) directory.
 */
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
    Assert(flags & HASH_DIRSIZE);
    Assert(info->dsize == info->max_dsize);
    return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}


/********************** DESTROY ROUTINES ************************/

void
hash_destroy(HTAB *hashp)
{
    if (hashp != NULL)
    {
        /* allocation method must be one we know how to free, too */
        Assert(hashp->alloc == DynaHashAlloc);
        /* so this hashtable must have its own context */
        Assert(hashp->hcxt != NULL);

        hash_stats("destroy", hashp);

        /*
         * Free everything by destroying the hash table's memory context.
         */
        MemoryContextDelete(hashp->hcxt);
    }
}

void
hash_stats(const char *where, HTAB *hashp)
{
#ifdef HASH_STATISTICS
    fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
            where, hashp->hctl->accesses, hashp->hctl->collisions);

    fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
            hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
            hashp->hctl->max_bucket, hashp->hctl->nsegs);
    fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
            where, hash_accesses, hash_collisions);
    fprintf(stderr, "hash_stats: total expansions %ld\n",
            hash_expansions);
#endif
}

/*******************************SEARCH ROUTINES *****************************/


/*
 * get_hash_value -- exported routine to calculate a key's hash value
 *
 * We export this because for partitioned tables, callers need to compute
 * the partition number (from the low-order bits of the hash value) before
 * searching.
 */
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
    return hashp->hash(keyPtr, hashp->keysize);
}

/* Convert a hash value to a bucket number */
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
{
    uint32      bucket;

    bucket = hash_val & hctl->high_mask;
    if (bucket > hctl->max_bucket)
        bucket = bucket & hctl->low_mask;

    return bucket;
}
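
/*
 * Worked example (illustrative): suppose the table has grown to 11 buckets,
 * so max_bucket = 10, low_mask = 7, high_mask = 15. For hash_val = 44,
 * 44 & 15 = 12, which exceeds max_bucket, so we re-mask: 12 & 7 = 4.
 * In a partitioned table the masks never change (no bucket splits), which
 * is why the low-order bits of a hash value identify a stable partition.
 */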

/*
 * hash_search -- look up key in table and perform action
 * hash_search_with_hash_value -- same, with key's hash value already computed
 *
 * action is one of:
 *     HASH_FIND: look up key in table
 *     HASH_ENTER: look up key in table, creating entry if not present
 *     HASH_ENTER_NULL: same, but return NULL if out of memory
 *     HASH_REMOVE: look up key in table, remove entry if present
 *
 * Return value is a pointer to the element found/entered/removed if any,
 * or NULL if no match was found. (NB: in the case of the REMOVE action,
 * the result is a dangling pointer that shouldn't be dereferenced!)
 *
 * HASH_ENTER will normally ereport a generic "out of memory" error if
 * it is unable to create a new entry. The HASH_ENTER_NULL operation is
 * the same except it will return NULL if out of memory. Note that
 * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
 * since palloc internally ereports on out-of-memory.
 *
 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
 * existing entry in the table, false otherwise. This is needed in the
 * HASH_ENTER case, but is redundant with the return value otherwise.
 *
 * For hash_search_with_hash_value, the hashvalue parameter must have been
 * calculated with get_hash_value().
 */
void *
hash_search(HTAB *hashp,
            const void *keyPtr,
            HASHACTION action,
            bool *foundPtr)
{
    return hash_search_with_hash_value(hashp,
                                       keyPtr,
                                       hashp->hash(keyPtr, hashp->keysize),
                                       action,
                                       foundPtr);
}
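
/*
 * Typical find-or-insert pattern, as a sketch (RelUsageEntry continues the
 * hypothetical example near hash_create above):
 *
 *     bool        found;
 *     RelUsageEntry *entry;
 *
 *     entry = (RelUsageEntry *) hash_search(htab, &relid,
 *                                           HASH_ENTER, &found);
 *     if (!found)
 *         entry->usage_count = 0;     -- new entry: caller must initialize
 *                                        all non-key fields itself
 *     entry->usage_count++;
 */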

void *
hash_search_with_hash_value(HTAB *hashp,
                            const void *keyPtr,
                            uint32 hashvalue,
                            HASHACTION action,
                            bool *foundPtr)
{
    HASHHDR    *hctl = hashp->hctl;
    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
    Size        keysize;
    uint32      bucket;
    long        segment_num;
    long        segment_ndx;
    HASHSEGMENT segp;
    HASHBUCKET  currBucket;
    HASHBUCKET *prevBucketPtr;
    HashCompareFunc match;

#ifdef HASH_STATISTICS
    hash_accesses++;
    hctl->accesses++;
#endif

    /*
     * If inserting, check if it is time to split a bucket.
     *
     * NOTE: failure to expand table is not a fatal error, it just means we
     * have to run at higher fill factor than we wanted. However, if we're
     * using the palloc allocator then it will throw error anyway on
     * out-of-memory, so we must do this before modifying the table.
     */
    if (action == HASH_ENTER || action == HASH_ENTER_NULL)
    {
        /*
         * Can't split if running in partitioned mode, nor if frozen, nor if
         * table is the subject of any active hash_seq_search scans. Strange
         * order of these tests is to try to check cheaper conditions first.
         */
        if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
            hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
            !has_seq_scans(hashp))
            (void) expand_table(hashp);
    }

    /*
     * Do the initial lookup
     */
    bucket = calc_bucket(hctl, hashvalue);

    segment_num = bucket >> hashp->sshift;
    segment_ndx = MOD(bucket, hashp->ssize);

    segp = hashp->dir[segment_num];

    if (segp == NULL)
        hash_corrupted(hashp);

    prevBucketPtr = &segp[segment_ndx];
    currBucket = *prevBucketPtr;

    /*
     * Follow collision chain looking for matching key
     */
    match = hashp->match;       /* save one fetch in inner loop */
    keysize = hashp->keysize;   /* ditto */

    while (currBucket != NULL)
    {
        if (currBucket->hashvalue == hashvalue &&
            match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
            break;
        prevBucketPtr = &(currBucket->link);
        currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
        hash_collisions++;
        hctl->collisions++;
#endif
    }

    if (foundPtr)
        *foundPtr = (bool) (currBucket != NULL);

    /*
     * OK, now what?
     */
    switch (action)
    {
        case HASH_FIND:
            if (currBucket != NULL)
                return (void *) ELEMENTKEY(currBucket);
            return NULL;

        case HASH_REMOVE:
            if (currBucket != NULL)
            {
                /* if partitioned, must lock to touch nentries and freeList */
                if (IS_PARTITIONED(hctl))
                    SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

                /* delete the record from the appropriate nentries counter. */
                Assert(hctl->freeList[freelist_idx].nentries > 0);
                hctl->freeList[freelist_idx].nentries--;

                /* remove record from hash bucket's chain. */
                *prevBucketPtr = currBucket->link;

                /* add the record to the appropriate freelist. */
                currBucket->link = hctl->freeList[freelist_idx].freeList;
                hctl->freeList[freelist_idx].freeList = currBucket;

                if (IS_PARTITIONED(hctl))
                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

                /*
                 * better hope the caller is synchronizing access to this
                 * element, because someone else is going to reuse it the next
                 * time something is added to the table
                 */
                return (void *) ELEMENTKEY(currBucket);
            }
            return NULL;

        case HASH_ENTER_NULL:
            /* ENTER_NULL does not work with palloc-based allocator */
            Assert(hashp->alloc != DynaHashAlloc);
            /* FALL THRU */

        case HASH_ENTER:
            /* Return existing element if found, else create one */
            if (currBucket != NULL)
                return (void *) ELEMENTKEY(currBucket);

            /* disallow inserts if frozen */
            if (hashp->frozen)
                elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                     hashp->tabname);

            currBucket = get_hash_entry(hashp, freelist_idx);
            if (currBucket == NULL)
            {
                /* out of memory */
                if (action == HASH_ENTER_NULL)
                    return NULL;
                /* report a generic message */
                if (hashp->isshared)
                    ereport(ERROR,
                            (errcode(ERRCODE_OUT_OF_MEMORY),
                             errmsg("out of shared memory")));
                else
                    ereport(ERROR,
                            (errcode(ERRCODE_OUT_OF_MEMORY),
                             errmsg("out of memory")));
            }

            /* link into hashbucket chain */
            *prevBucketPtr = currBucket;
            currBucket->link = NULL;

            /* copy key into record */
            currBucket->hashvalue = hashvalue;
            hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

            /*
             * Caller is expected to fill the data field on return. DO NOT
             * insert any code that could possibly throw error here, as doing
             * so would leave the table entry incomplete and hence corrupt the
             * caller's data structure.
             */

            return (void *) ELEMENTKEY(currBucket);
    }

    elog(ERROR, "unrecognized hash action code: %d", (int) action);

    return NULL;                /* keep compiler quiet */
}

/*
 * hash_update_hash_key -- change the hash key of an existing table entry
 *
 * This is equivalent to removing the entry, making a new entry, and copying
 * over its data, except that the entry never goes to the table's freelist.
 * Therefore this cannot suffer an out-of-memory failure, even if there are
 * other processes operating in other partitions of the hashtable.
 *
 * Returns true if successful, false if the requested new hash key is already
 * present. Throws error if the specified entry pointer isn't actually a
 * table member.
 *
 * NB: currently, there is no special case for old and new hash keys being
 * identical, which means we'll report false for that situation. This is
 * preferable for existing uses.
 *
 * NB: for a partitioned hashtable, caller must hold lock on both relevant
 * partitions, if the new hash key would belong to a different partition.
 */
bool
hash_update_hash_key(HTAB *hashp,
                     void *existingEntry,
                     const void *newKeyPtr)
{
    HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
    HASHHDR    *hctl = hashp->hctl;
    uint32      newhashvalue;
    Size        keysize;
    uint32      bucket;
    uint32      newbucket;
    long        segment_num;
    long        segment_ndx;
    HASHSEGMENT segp;
    HASHBUCKET  currBucket;
    HASHBUCKET *prevBucketPtr;
    HASHBUCKET *oldPrevPtr;
    HashCompareFunc match;

#ifdef HASH_STATISTICS
    hash_accesses++;
    hctl->accesses++;
#endif

    /* disallow updates if frozen */
    if (hashp->frozen)
        elog(ERROR, "cannot update in frozen hashtable \"%s\"",
             hashp->tabname);

    /*
     * Lookup the existing element using its saved hash value. We need to do
     * this to be able to unlink it from its hash chain, but as a side benefit
     * we can verify the validity of the passed existingEntry pointer.
     */
    bucket = calc_bucket(hctl, existingElement->hashvalue);

    segment_num = bucket >> hashp->sshift;
    segment_ndx = MOD(bucket, hashp->ssize);

    segp = hashp->dir[segment_num];

    if (segp == NULL)
        hash_corrupted(hashp);

    prevBucketPtr = &segp[segment_ndx];
    currBucket = *prevBucketPtr;

    while (currBucket != NULL)
    {
        if (currBucket == existingElement)
            break;
        prevBucketPtr = &(currBucket->link);
        currBucket = *prevBucketPtr;
    }

    if (currBucket == NULL)
        elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
             hashp->tabname);

    oldPrevPtr = prevBucketPtr;

    /*
     * Now perform the equivalent of a HASH_ENTER operation to locate the hash
     * chain we want to put the entry into.
     */
    newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);

    newbucket = calc_bucket(hctl, newhashvalue);

    segment_num = newbucket >> hashp->sshift;
    segment_ndx = MOD(newbucket, hashp->ssize);

    segp = hashp->dir[segment_num];

    if (segp == NULL)
        hash_corrupted(hashp);

    prevBucketPtr = &segp[segment_ndx];
    currBucket = *prevBucketPtr;

    /*
     * Follow collision chain looking for matching key
     */
    match = hashp->match;       /* save one fetch in inner loop */
    keysize = hashp->keysize;   /* ditto */

    while (currBucket != NULL)
    {
        if (currBucket->hashvalue == newhashvalue &&
            match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
            break;
        prevBucketPtr = &(currBucket->link);
        currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
        hash_collisions++;
        hctl->collisions++;
#endif
    }

    if (currBucket != NULL)
        return false;           /* collision with an existing entry */

    currBucket = existingElement;

    /*
     * If old and new hash values belong to the same bucket, we need not
     * change any chain links, and indeed should not since this simplistic
     * update will corrupt the list if currBucket is the last element. (We
     * cannot fall out earlier, however, since we need to scan the bucket to
     * check for duplicate keys.)
     */
    if (bucket != newbucket)
    {
        /* OK to remove record from old hash bucket's chain. */
        *oldPrevPtr = currBucket->link;

        /* link into new hashbucket chain */
        *prevBucketPtr = currBucket;
        currBucket->link = NULL;
    }

    /* copy new key into record */
    currBucket->hashvalue = newhashvalue;
    hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);

    /* rest of record is untouched */

    return true;
}

/*
 * Allocate a new hashtable entry if possible; return NULL if out of memory.
 * (Or, if the underlying space allocator throws error for out-of-memory,
 * we won't return at all.)
 */
static HASHBUCKET
get_hash_entry(HTAB *hashp, int freelist_idx)
{
    HASHHDR    *hctl = hashp->hctl;
    HASHBUCKET  newElement;

    for (;;)
    {
        /* if partitioned, must lock to touch nentries and freeList */
        if (IS_PARTITIONED(hctl))
            SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

        /* try to get an entry from the freelist */
        newElement = hctl->freeList[freelist_idx].freeList;

        if (newElement != NULL)
            break;

        if (IS_PARTITIONED(hctl))
            SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

        /*
         * No free elements in this freelist. In a partitioned table, there
         * might be entries in other freelists, but to reduce contention we
         * prefer to first try to get another chunk of buckets from the main
         * shmem allocator. If that fails, though, we *MUST* root through all
         * the other freelists before giving up. There are multiple callers
         * that assume that they can allocate every element in the initially
         * requested table size, or that deleting an element guarantees they
         * can insert a new element, even if shared memory is entirely full.
         * Failing because the needed element is in a different freelist is
         * not acceptable.
         */
        if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
        {
            int         borrow_from_idx;

            if (!IS_PARTITIONED(hctl))
                return NULL;    /* out of memory */

            /* try to borrow element from another freelist */
            borrow_from_idx = freelist_idx;
            for (;;)
            {
                borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
                if (borrow_from_idx == freelist_idx)
                    break;      /* examined all freelists, fail */

                SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
                newElement = hctl->freeList[borrow_from_idx].freeList;

                if (newElement != NULL)
                {
                    hctl->freeList[borrow_from_idx].freeList = newElement->link;
                    SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));

                    /* careful: count the new element in its proper freelist */
                    SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
                    hctl->freeList[freelist_idx].nentries++;
                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

                    return newElement;
                }

                SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
            }

            /* no elements available to borrow either, so out of memory */
            return NULL;
        }
    }

    /* remove entry from freelist, bump nentries */
    hctl->freeList[freelist_idx].freeList = newElement->link;
    hctl->freeList[freelist_idx].nentries++;

    if (IS_PARTITIONED(hctl))
        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

    return newElement;
}

/*
 * hash_get_num_entries -- get the number of entries in a hashtable
 */
long
hash_get_num_entries(HTAB *hashp)
{
    int         i;
    long        sum = hashp->hctl->freeList[0].nentries;

    /*
     * We currently don't bother with acquiring the mutexes; it's only
     * sensible to call this function if you've got lock on all partitions of
     * the table.
     */
    if (IS_PARTITIONED(hashp->hctl))
    {
        for (i = 1; i < NUM_FREELISTS; i++)
            sum += hashp->hctl->freeList[i].nentries;
    }

    return sum;
}

/*
 * hash_seq_init/_search/_term
 *     Sequentially search through hash table and return
 *     all the elements one by one, return NULL when no more.
 *
 * hash_seq_term should be called if and only if the scan is abandoned before
 * completion; if hash_seq_search returns NULL then it has already done the
 * end-of-scan cleanup.
 *
 * NOTE: caller may delete the returned element before continuing the scan.
 * However, deleting any other element while the scan is in progress is
 * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
 * if elements are added to the table while the scan is in progress, it is
 * unspecified whether they will be visited by the scan or not.
 *
 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
 * worry about hash_seq_term cleanup, if the hashtable is first locked against
 * further insertions by calling hash_freeze.
 *
 * NOTE: to use this with a partitioned hashtable, caller had better hold
 * at least shared lock on all partitions of the table throughout the scan!
 * We can cope with insertions or deletions by our own backend, but *not*
 * with concurrent insertions or deletions by another.
 */
void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
{
    status->hashp = hashp;
    status->curBucket = 0;
    status->curEntry = NULL;
    if (!hashp->frozen)
        register_seq_scan(hashp);
}
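
/*
 * The canonical scan loop, as a sketch (RelUsageEntry and process() are
 * hypothetical). No hash_seq_term() call is needed when the loop runs until
 * hash_seq_search returns NULL; call hash_seq_term only when abandoning the
 * scan early:
 *
 *     HASH_SEQ_STATUS status;
 *     RelUsageEntry *entry;
 *
 *     hash_seq_init(&status, htab);
 *     while ((entry = (RelUsageEntry *) hash_seq_search(&status)) != NULL)
 *         process(entry);
 */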

void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
    HTAB       *hashp;
    HASHHDR    *hctl;
    uint32      max_bucket;
    long        ssize;
    long        segment_num;
    long        segment_ndx;
    HASHSEGMENT segp;
    uint32      curBucket;
    HASHELEMENT *curElem;

    if ((curElem = status->curEntry) != NULL)
    {
        /* Continuing scan of curBucket... */
        status->curEntry = curElem->link;
        if (status->curEntry == NULL)   /* end of this bucket */
            ++status->curBucket;
        return (void *) ELEMENTKEY(curElem);
    }

    /*
     * Search for next nonempty bucket starting at curBucket.
     */
    curBucket = status->curBucket;
    hashp = status->hashp;
    hctl = hashp->hctl;
    ssize = hashp->ssize;
    max_bucket = hctl->max_bucket;

    if (curBucket > max_bucket)
    {
        hash_seq_term(status);
        return NULL;            /* search is done */
    }

    /*
     * first find the right segment in the table directory.
     */
    segment_num = curBucket >> hashp->sshift;
    segment_ndx = MOD(curBucket, ssize);

    segp = hashp->dir[segment_num];

    /*
     * Pick up the first item in this bucket's chain. If chain is not empty
     * we can begin searching it. Otherwise we have to advance to find the
     * next nonempty bucket. We try to optimize that case since searching a
     * near-empty hashtable has to iterate this loop a lot.
     */
    while ((curElem = segp[segment_ndx]) == NULL)
    {
        /* empty bucket, advance to next */
        if (++curBucket > max_bucket)
        {
            status->curBucket = curBucket;
            hash_seq_term(status);
            return NULL;        /* search is done */
        }
        if (++segment_ndx >= ssize)
        {
            segment_num++;
            segment_ndx = 0;
            segp = hashp->dir[segment_num];
        }
    }

    /* Begin scan of curBucket... */
    status->curEntry = curElem->link;
    if (status->curEntry == NULL)   /* end of this bucket */
        ++curBucket;
    status->curBucket = curBucket;
    return (void *) ELEMENTKEY(curElem);
}

void
hash_seq_term(HASH_SEQ_STATUS *status)
{
    if (!status->hashp->frozen)
        deregister_seq_scan(status->hashp);
}

/*
 * hash_freeze
 *     Freeze a hashtable against future insertions (deletions are
 *     still allowed)
 *
 * The reason for doing this is that by preventing any more bucket splits,
 * we no longer need to worry about registering hash_seq_search scans,
 * and thus caller need not be careful about ensuring hash_seq_term gets
 * called at the right times.
 *
 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
 * with active scans (since hash_seq_term would then do the wrong thing).
 */
void
hash_freeze(HTAB *hashp)
{
    if (hashp->isshared)
        elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
    if (!hashp->frozen && has_seq_scans(hashp))
        elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
             hashp->tabname);
    hashp->frozen = true;
}


/********************************* UTILITIES ************************/

/*
 * Expand the table by adding one more hash bucket.
 */
static bool
expand_table(HTAB *hashp)
{
    HASHHDR    *hctl = hashp->hctl;
    HASHSEGMENT old_seg,
                new_seg;
    long        old_bucket,
                new_bucket;
    long        new_segnum,
                new_segndx;
    long        old_segnum,
                old_segndx;
    HASHBUCKET *oldlink,
               *newlink;
    HASHBUCKET  currElement,
                nextElement;

    Assert(!IS_PARTITIONED(hctl));

#ifdef HASH_STATISTICS
    hash_expansions++;
#endif

    new_bucket = hctl->max_bucket + 1;
    new_segnum = new_bucket >> hashp->sshift;
    new_segndx = MOD(new_bucket, hashp->ssize);

    if (new_segnum >= hctl->nsegs)
    {
        /* Allocate new segment if necessary -- could fail if dir full */
        if (new_segnum >= hctl->dsize)
            if (!dir_realloc(hashp))
                return false;
        if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
            return false;
        hctl->nsegs++;
    }

    /* OK, we created a new bucket */
    hctl->max_bucket++;

    /*
     * *Before* changing masks, find old bucket corresponding to same hash
     * values; values in that bucket may need to be relocated to new bucket.
     * Note that new_bucket is certainly larger than low_mask at this point,
     * so we can skip the first step of the regular hash mask calc.
     */
    old_bucket = (new_bucket & hctl->low_mask);

    /*
     * If we crossed a power of 2, readjust masks.
     */
    if ((uint32) new_bucket > hctl->high_mask)
    {
        hctl->low_mask = hctl->high_mask;
        hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
    }

    /*
     * Relocate records to the new bucket. NOTE: because of the way the hash
     * masking is done in calc_bucket, only one old bucket can need to be
     * split at this point. With a different way of reducing the hash value,
     * that might not be true!
     */
    old_segnum = old_bucket >> hashp->sshift;
    old_segndx = MOD(old_bucket, hashp->ssize);

    old_seg = hashp->dir[old_segnum];
    new_seg = hashp->dir[new_segnum];

    oldlink = &old_seg[old_segndx];
    newlink = &new_seg[new_segndx];

    for (currElement = *oldlink;
         currElement != NULL;
         currElement = nextElement)
    {
        nextElement = currElement->link;
        if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
        {
            *oldlink = currElement;
            oldlink = &currElement->link;
        }
        else
        {
            *newlink = currElement;
            newlink = &currElement->link;
        }
    }
    /* don't forget to terminate the rebuilt hash chains... */
    *oldlink = NULL;
    *newlink = NULL;

    return true;
}


static bool
dir_realloc(HTAB *hashp)
{
    HASHSEGMENT *p;
    HASHSEGMENT *old_p;
    long        new_dsize;
    long        old_dirsize;
    long        new_dirsize;

    if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
        return false;

    /* Reallocate directory */
    new_dsize = hashp->hctl->dsize << 1;
    old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
    new_dirsize = new_dsize * sizeof(HASHSEGMENT);

    old_p = hashp->dir;
    CurrentDynaHashCxt = hashp->hcxt;
    p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);

    if (p != NULL)
    {
        memcpy(p, old_p, old_dirsize);
        MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
        hashp->dir = p;
        hashp->hctl->dsize = new_dsize;

        /* XXX assume the allocator is palloc, so we know how to free */
        Assert(hashp->alloc == DynaHashAlloc);
        pfree(old_p);

        return true;
    }

    return false;
}


static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
    HASHSEGMENT segp;

    CurrentDynaHashCxt = hashp->hcxt;
    segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

    if (!segp)
        return NULL;

    MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);

    return segp;
}

/*
 * allocate some new elements and link them into the indicated free list
 */
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
    HASHHDR    *hctl = hashp->hctl;
    Size        elementSize;
    HASHELEMENT *firstElement;
    HASHELEMENT *tmpElement;
    HASHELEMENT *prevElement;
    int         i;

    if (hashp->isfixed)
        return false;

    /* Each element has a HASHELEMENT header plus user data. */
    elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

    CurrentDynaHashCxt = hashp->hcxt;
    firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

    if (!firstElement)
        return false;

    /* prepare to link all the new entries into the freelist */
    prevElement = NULL;
    tmpElement = firstElement;
    for (i = 0; i < nelem; i++)
    {
        tmpElement->link = prevElement;
        prevElement = tmpElement;
        tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
    }

    /* if partitioned, must lock to touch freeList */
    if (IS_PARTITIONED(hctl))
        SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

    /* freelist could be nonempty if two backends did this concurrently */
    firstElement->link = hctl->freeList[freelist_idx].freeList;
    hctl->freeList[freelist_idx].freeList = prevElement;

    if (IS_PARTITIONED(hctl))
        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

    return true;
}

/* complain when we have detected a corrupted hashtable */
static void
hash_corrupted(HTAB *hashp)
{
    /*
     * If the corruption is in a shared hashtable, we'd better force a
     * systemwide restart. Otherwise, just shut down this one backend.
     */
    if (hashp->isshared)
        elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
    else
        elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
}

/* calculate ceil(log base 2) of num */
int
my_log2(long num)
{
    int         i;
    long        limit;

    /* guard against too-large input, which would put us into infinite loop */
    if (num > LONG_MAX / 2)
        num = LONG_MAX / 2;

    for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
        ;
    return i;
}
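
/*
 * For example, my_log2(1) == 0 and my_log2(1000) == 10, since 1024 is the
 * first power of 2 >= 1000.
 */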

/* calculate first power of 2 >= num, bounded to what will fit in a long */
static long
next_pow2_long(long num)
{
    /* my_log2's internal range check is sufficient */
    return 1L << my_log2(num);
}

/* calculate first power of 2 >= num, bounded to what will fit in an int */
static int
next_pow2_int(long num)
{
    if (num > INT_MAX / 2)
        num = INT_MAX / 2;
    return 1 << my_log2(num);
}


/************************* SEQ SCAN TRACKING ************************/

/*
 * We track active hash_seq_search scans here. The need for this mechanism
 * comes from the fact that a scan will get confused if a bucket split occurs
 * while it's in progress: it might visit entries twice, or even miss some
 * entirely (if it's partway through the same bucket that splits). Hence
 * we want to inhibit bucket splits if there are any active scans on the
 * table being inserted into. This is a fairly rare case in current usage,
 * so just postponing the split until the next insertion seems sufficient.
 *
 * Given present usages of the function, only a few scans are likely to be
 * open concurrently; so a finite-size stack of open scans seems sufficient,
 * and we don't worry that linear search is too slow. Note that we do
 * allow multiple scans of the same hashtable to be open concurrently.
 *
 * This mechanism can support concurrent scan and insertion in a shared
 * hashtable if it's the same backend doing both. It would fail otherwise,
 * but locking reasons seem to preclude any such scenario anyway, so we don't
 * worry.
 *
 * This arrangement is reasonably robust if a transient hashtable is deleted
 * without notifying us. The absolute worst case is we might inhibit splits
 * in another table created later at exactly the same address. We will give
 * a warning at transaction end for reference leaks, so any bugs leading to
 * lack of notification should be easy to catch.
 */

#define MAX_SEQ_SCANS 100

static HTAB *seq_scan_tables[MAX_SEQ_SCANS];    /* tables being scanned */
static int  seq_scan_level[MAX_SEQ_SCANS];      /* subtransaction nest level */
static int  num_seq_scans = 0;


/* Register a table as having an active hash_seq_search scan */
static void
register_seq_scan(HTAB *hashp)
{
    if (num_seq_scans >= MAX_SEQ_SCANS)
        elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
             hashp->tabname);
    seq_scan_tables[num_seq_scans] = hashp;
    seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
    num_seq_scans++;
}

/* Deregister an active scan */
static void
deregister_seq_scan(HTAB *hashp)
{
    int         i;

    /* Search backward since it's most likely at the stack top */
    for (i = num_seq_scans - 1; i >= 0; i--)
    {
        if (seq_scan_tables[i] == hashp)
        {
            seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
            seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
            num_seq_scans--;
            return;
        }
    }
    elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
         hashp->tabname);
}

/* Check if a table has any active scan */
static bool
has_seq_scans(HTAB *hashp)
{
    int         i;

    for (i = 0; i < num_seq_scans; i++)
    {
        if (seq_scan_tables[i] == hashp)
            return true;
    }
    return false;
}

/* Clean up any open scans at end of transaction */
void
AtEOXact_HashTables(bool isCommit)
{
    /*
     * During abort cleanup, open scans are expected; just silently clean 'em
     * out. An open scan at commit means someone forgot a hash_seq_term()
     * call, so complain.
     *
     * Note: it's tempting to try to print the tabname here, but refrain for
     * fear of touching deallocated memory. This isn't a user-facing message
     * anyway, so it needn't be pretty.
     */
    if (isCommit)
    {
        int         i;

        for (i = 0; i < num_seq_scans; i++)
        {
            elog(WARNING, "leaked hash_seq_search scan for hash table %p",
                 seq_scan_tables[i]);
        }
    }
    num_seq_scans = 0;
}

/* Clean up any open scans at end of subtransaction */
void
AtEOSubXact_HashTables(bool isCommit, int nestDepth)
{
    int         i;

    /*
     * Search backward to make cleanup easy. Note we must check all entries,
     * not only those at the end of the array, because deletion technique
     * doesn't keep them in order.
     */
    for (i = num_seq_scans - 1; i >= 0; i--)
    {
        if (seq_scan_level[i] >= nestDepth)
        {
            if (isCommit)
                elog(WARNING, "leaked hash_seq_search scan for hash table %p",
                     seq_scan_tables[i]);
            seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
            seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
            num_seq_scans--;
        }
    }
}