1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * dynahash.c |
4 | * dynamic hash tables |
5 | * |
6 | * dynahash.c supports both local-to-a-backend hash tables and hash tables in |
7 | * shared memory. For shared hash tables, it is the caller's responsibility |
8 | * to provide appropriate access interlocking. The simplest convention is |
9 | * that a single LWLock protects the whole hash table. Searches (HASH_FIND or |
10 | * hash_seq_search) need only shared lock, but any update requires exclusive |
11 | * lock. For heavily-used shared tables, the single-lock approach creates a |
12 | * concurrency bottleneck, so we also support "partitioned" locking wherein |
13 | * there are multiple LWLocks guarding distinct subsets of the table. To use |
14 | * a hash table in partitioned mode, the HASH_PARTITION flag must be given |
15 | * to hash_create. This prevents any attempt to split buckets on-the-fly. |
16 | * Therefore, each hash bucket chain operates independently, and no fields |
17 | * of the hash header change after init except nentries and freeList. |
18 | * (A partitioned table uses multiple copies of those fields, guarded by |
19 | * spinlocks, for additional concurrency.) |
20 | * This lets any subset of the hash buckets be treated as a separately |
21 | * lockable partition. We expect callers to use the low-order bits of a |
22 | * lookup key's hash value as a partition number --- this will work because |
23 | * of the way calc_bucket() maps hash values to bucket numbers. |
24 | * |
25 | * For hash tables in shared memory, the memory allocator function should |
26 | * match malloc's semantics of returning NULL on failure. For hash tables |
27 | * in local memory, we typically use palloc() which will throw error on |
28 | * failure. The code in this file has to cope with both cases. |
29 | * |
30 | * dynahash.c provides support for these types of lookup keys: |
31 | * |
32 | * 1. Null-terminated C strings (truncated if necessary to fit in keysize), |
33 | * compared as though by strcmp(). This is the default behavior. |
34 | * |
35 | * 2. Arbitrary binary data of size keysize, compared as though by memcmp(). |
36 | * (Caller must ensure there are no undefined padding bits in the keys!) |
37 | * This is selected by specifying HASH_BLOBS flag to hash_create. |
38 | * |
39 | * 3. More complex key behavior can be selected by specifying user-supplied |
40 | * hashing, comparison, and/or key-copying functions. At least a hashing |
41 | * function must be supplied; comparison defaults to memcmp() and key copying |
42 | * to memcpy() when a user-defined hashing function is selected. |
43 | * |
44 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
45 | * Portions Copyright (c) 1994, Regents of the University of California |
46 | * |
47 | * |
48 | * IDENTIFICATION |
49 | * src/backend/utils/hash/dynahash.c |
50 | * |
51 | *------------------------------------------------------------------------- |
52 | */ |
53 | |
54 | /* |
55 | * Original comments: |
56 | * |
57 | * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson. |
58 | * Coded into C, with minor code improvements, and with hsearch(3) interface, |
59 | * by ejp@ausmelb.oz, Jul 26, 1988: 13:16; |
60 | * also, hcreate/hdestroy routines added to simulate hsearch(3). |
61 | * |
62 | * These routines simulate hsearch(3) and family, with the important |
63 | * difference that the hash table is dynamic - can grow indefinitely |
64 | * beyond its original size (as supplied to hcreate()). |
65 | * |
66 | * Performance appears to be comparable to that of hsearch(3). |
67 | * The 'source-code' options referred to in hsearch(3)'s 'man' page |
68 | * are not implemented; otherwise functionality is identical. |
69 | * |
70 | * Compilation controls: |
71 | * HASH_DEBUG controls some informative traces, mainly for debugging. |
72 | * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained; |
73 | * when combined with HASH_DEBUG, these are displayed by hdestroy(). |
74 | * |
75 | * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor |
76 | * concatenation property, in probably unnecessary code 'optimization'. |
77 | * |
78 | * Modified margo@postgres.berkeley.edu February 1990 |
79 | * added multiple table interface |
80 | * Modified by sullivan@postgres.berkeley.edu April 1990 |
81 | * changed ctl structure for shared memory |
82 | */ |
83 | |
84 | #include "postgres.h" |
85 | |
86 | #include <limits.h> |
87 | |
88 | #include "access/xact.h" |
89 | #include "storage/shmem.h" |
90 | #include "storage/spin.h" |
91 | #include "utils/dynahash.h" |
92 | #include "utils/memutils.h" |
93 | |
94 | |
95 | /* |
96 | * Constants |
97 | * |
98 | * A hash table has a top-level "directory", each of whose entries points |
99 | * to a "segment" of ssize bucket headers. The maximum number of hash |
100 | * buckets is thus dsize * ssize (but dsize may be expansible). Of course, |
101 | * the number of records in the table can be larger, but we don't want a |
102 | * whole lot of records per bucket or performance goes down. |
103 | * |
104 | * In a hash table allocated in shared memory, the directory cannot be |
105 | * expanded because it must stay at a fixed address. The directory size |
106 | * should be selected using hash_select_dirsize (and you'd better have |
107 | * a good idea of the maximum number of entries!). For non-shared hash |
108 | * tables, the initial directory size can be left at the default. |
109 | */ |
110 | #define DEF_SEGSIZE 256 |
111 | #define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */ |
112 | #define DEF_DIRSIZE 256 |
113 | #define DEF_FFACTOR 1 /* default fill factor */ |
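
/*
 * With the default directory and segment sizes, a table can address
 * DEF_DIRSIZE * DEF_SEGSIZE = 256 * 256 = 65536 buckets before dir_realloc()
 * must enlarge the directory; at DEF_FFACTOR 1 that corresponds to roughly
 * 64K entries at the target fill.
 */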
114 | |
115 | /* Number of freelists to be used for a partitioned hash table. */ |
116 | #define NUM_FREELISTS 32 |
117 | |
118 | /* A hash bucket is a linked list of HASHELEMENTs */ |
119 | typedef HASHELEMENT *HASHBUCKET; |
120 | |
121 | /* A hash segment is an array of bucket headers */ |
122 | typedef HASHBUCKET *HASHSEGMENT; |
123 | |
124 | /* |
125 | * Per-freelist data. |
126 | * |
127 | * In a partitioned hash table, each freelist is associated with a specific |
128 | * set of hashcodes, as determined by the FREELIST_IDX() macro below. |
129 | * nentries tracks the number of live hashtable entries having those hashcodes |
130 | * (NOT the number of entries in the freelist, as you might expect). |
131 | * |
132 | * The coverage of a freelist might be more or less than one partition, so it |
133 | * needs its own lock rather than relying on caller locking. Relying on that |
134 | * wouldn't work even if the coverage was the same, because of the occasional |
135 | * need to "borrow" entries from another freelist; see get_hash_entry(). |
136 | * |
137 | * Using an array of FreeListData instead of separate arrays of mutexes, |
138 | * nentries and freeLists helps to reduce sharing of cache lines between |
139 | * different mutexes. |
140 | */ |
141 | typedef struct |
142 | { |
143 | slock_t mutex; /* spinlock for this freelist */ |
144 | long nentries; /* number of entries in associated buckets */ |
145 | HASHELEMENT *freeList; /* chain of free elements */ |
146 | } FreeListData; |
147 | |
148 | /* |
149 | * Header structure for a hash table --- contains all changeable info |
150 | * |
151 | * In a shared-memory hash table, the HASHHDR is in shared memory, while |
152 | * each backend has a local HTAB struct. For a non-shared table, there isn't |
153 | * any functional difference between HASHHDR and HTAB, but we separate them |
154 | * anyway to share code between shared and non-shared tables. |
155 | */ |
156 | struct HASHHDR |
157 | { |
158 | /* |
159 | * The freelist can become a point of contention in high-concurrency hash |
160 | * tables, so we use an array of freelists, each with its own mutex and |
161 | * nentries count, instead of just a single one. Although the freelists |
162 | * normally operate independently, we will scavenge entries from freelists |
163 | * other than a hashcode's default freelist when necessary. |
164 | * |
165 | * If the hash table is not partitioned, only freeList[0] is used and its |
166 | * spinlock is not used at all; callers' locking is assumed sufficient. |
167 | */ |
168 | FreeListData freeList[NUM_FREELISTS]; |
169 | |
170 | /* These fields can change, but not in a partitioned table */ |
171 | /* Also, dsize can't change in a shared table, even if unpartitioned */ |
172 | long dsize; /* directory size */ |
173 | long nsegs; /* number of allocated segments (<= dsize) */ |
174 | uint32 max_bucket; /* ID of maximum bucket in use */ |
175 | uint32 high_mask; /* mask to modulo into entire table */ |
176 | uint32 low_mask; /* mask to modulo into lower half of table */ |
177 | |
178 | /* These fields are fixed at hashtable creation */ |
179 | Size keysize; /* hash key length in bytes */ |
180 | Size entrysize; /* total user element size in bytes */ |
181 | long num_partitions; /* # partitions (must be power of 2), or 0 */ |
182 | long ffactor; /* target fill factor */ |
183 | long max_dsize; /* 'dsize' limit if directory is fixed size */ |
184 | long ssize; /* segment size --- must be power of 2 */ |
185 | int sshift; /* segment shift = log2(ssize) */ |
186 | int nelem_alloc; /* number of entries to allocate at once */ |
187 | |
188 | #ifdef HASH_STATISTICS |
189 | |
190 | /* |
191 | * Count statistics here. NB: stats code doesn't bother with mutex, so |
192 | * counts could be corrupted a bit in a partitioned table. |
193 | */ |
194 | long accesses; |
195 | long collisions; |
196 | #endif |
197 | }; |
198 | |
199 | #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0) |
200 | |
201 | #define FREELIST_IDX(hctl, hashcode) \ |
202 | (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0) |
203 | |
204 | /* |
205 | * Top control structure for a hashtable --- in a shared table, each backend |
206 | * has its own copy (OK since no fields change at runtime) |
207 | */ |
208 | struct HTAB |
209 | { |
210 | HASHHDR *hctl; /* => shared control information */ |
211 | HASHSEGMENT *dir; /* directory of segment starts */ |
212 | HashValueFunc hash; /* hash function */ |
213 | HashCompareFunc match; /* key comparison function */ |
214 | HashCopyFunc keycopy; /* key copying function */ |
215 | HashAllocFunc alloc; /* memory allocator */ |
216 | MemoryContext hcxt; /* memory context if default allocator used */ |
217 | char *tabname; /* table name (for error messages) */ |
218 | bool isshared; /* true if table is in shared memory */ |
219 | bool isfixed; /* if true, don't enlarge */ |
220 | |
221 | /* freezing a shared table isn't allowed, so we can keep state here */ |
222 | bool frozen; /* true = no more inserts allowed */ |
223 | |
224 | /* We keep local copies of these fixed values to reduce contention */ |
225 | Size keysize; /* hash key length in bytes */ |
226 | long ssize; /* segment size --- must be power of 2 */ |
227 | int sshift; /* segment shift = log2(ssize) */ |
228 | }; |
229 | |
230 | /* |
231 | * Key (also entry) part of a HASHELEMENT |
232 | */ |
233 | #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT))) |
234 | |
235 | /* |
236 | * Obtain element pointer given pointer to key |
237 | */ |
238 | #define ELEMENT_FROM_KEY(key) \ |
239 | ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT)))) |
240 | |
241 | /* |
242 | * Fast MOD arithmetic, assuming that y is a power of 2 ! |
243 | */ |
244 | #define MOD(x,y) ((x) & ((y)-1)) |
245 | |
246 | #if HASH_STATISTICS |
247 | static long hash_accesses, |
248 | hash_collisions, |
249 | hash_expansions; |
250 | #endif |
251 | |
252 | /* |
253 | * Private function prototypes |
254 | */ |
255 | static void *DynaHashAlloc(Size size); |
256 | static HASHSEGMENT seg_alloc(HTAB *hashp); |
257 | static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx); |
258 | static bool dir_realloc(HTAB *hashp); |
259 | static bool expand_table(HTAB *hashp); |
260 | static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx); |
261 | static void hdefault(HTAB *hashp); |
262 | static int choose_nelem_alloc(Size entrysize); |
263 | static bool init_htab(HTAB *hashp, long nelem); |
264 | static void hash_corrupted(HTAB *hashp); |
265 | static long next_pow2_long(long num); |
266 | static int next_pow2_int(long num); |
267 | static void register_seq_scan(HTAB *hashp); |
268 | static void deregister_seq_scan(HTAB *hashp); |
269 | static bool has_seq_scans(HTAB *hashp); |
270 | |
271 | |
272 | /* |
273 | * memory allocation support |
274 | */ |
275 | static MemoryContext CurrentDynaHashCxt = NULL; |
276 | |
277 | static void * |
278 | DynaHashAlloc(Size size) |
279 | { |
280 | Assert(MemoryContextIsValid(CurrentDynaHashCxt)); |
281 | return MemoryContextAlloc(CurrentDynaHashCxt, size); |
282 | } |
283 | |
284 | |
285 | /* |
286 | * HashCompareFunc for string keys |
287 | * |
288 | * Because we copy keys with strlcpy(), they will be truncated at keysize-1 |
289 | * bytes, so we can only compare that many ... hence strncmp is almost but |
290 | * not quite the right thing. |
291 | */ |
292 | static int |
293 | string_compare(const char *key1, const char *key2, Size keysize) |
294 | { |
295 | return strncmp(key1, key2, keysize - 1); |
296 | } |
297 | |
298 | |
299 | /************************** CREATE ROUTINES **********************/ |
300 | |
301 | /* |
302 | * hash_create -- create a new dynamic hash table |
303 | * |
304 | * tabname: a name for the table (for debugging purposes) |
305 | * nelem: maximum number of elements expected |
306 | * *info: additional table parameters, as indicated by flags |
307 | * flags: bitmask indicating which parameters to take from *info |
308 | * |
309 | * Note: for a shared-memory hashtable, nelem needs to be a pretty good |
310 | * estimate, since we can't expand the table on the fly. But an unshared |
311 | * hashtable can be expanded on-the-fly, so it's better for nelem to be |
312 | * on the small side and let the table grow if it's exceeded. An overly |
313 | * large nelem will penalize hash_seq_search speed without buying much. |
314 | */ |
315 | HTAB * |
316 | hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) |
317 | { |
318 | HTAB *hashp; |
319 | HASHHDR *hctl; |
320 | |
321 | /* |
322 | * For shared hash tables, we have a local hash header (HTAB struct) that |
323 | * we allocate in TopMemoryContext; all else is in shared memory. |
324 | * |
325 | * For non-shared hash tables, everything including the hash header is in |
326 | * a memory context created specially for the hash table --- this makes |
327 | * hash_destroy very simple. The memory context is made a child of either |
328 | * a context specified by the caller, or TopMemoryContext if nothing is |
329 | * specified. |
330 | */ |
331 | if (flags & HASH_SHARED_MEM) |
332 | { |
333 | /* Set up to allocate the hash header */ |
334 | CurrentDynaHashCxt = TopMemoryContext; |
335 | } |
336 | else |
337 | { |
338 | /* Create the hash table's private memory context */ |
339 | if (flags & HASH_CONTEXT) |
340 | CurrentDynaHashCxt = info->hcxt; |
341 | else |
342 | CurrentDynaHashCxt = TopMemoryContext; |
		CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
												   "dynahash",
												   ALLOCSET_DEFAULT_SIZES);
346 | } |
347 | |
348 | /* Initialize the hash header, plus a copy of the table name */ |
349 | hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1); |
350 | MemSet(hashp, 0, sizeof(HTAB)); |
351 | |
352 | hashp->tabname = (char *) (hashp + 1); |
353 | strcpy(hashp->tabname, tabname); |
354 | |
355 | /* If we have a private context, label it with hashtable's name */ |
356 | if (!(flags & HASH_SHARED_MEM)) |
357 | MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname); |
358 | |
359 | /* |
360 | * Select the appropriate hash function (see comments at head of file). |
361 | */ |
362 | if (flags & HASH_FUNCTION) |
363 | hashp->hash = info->hash; |
364 | else if (flags & HASH_BLOBS) |
365 | { |
366 | /* We can optimize hashing for common key sizes */ |
367 | Assert(flags & HASH_ELEM); |
368 | if (info->keysize == sizeof(uint32)) |
369 | hashp->hash = uint32_hash; |
370 | else |
371 | hashp->hash = tag_hash; |
372 | } |
373 | else |
374 | hashp->hash = string_hash; /* default hash function */ |
375 | |
376 | /* |
377 | * If you don't specify a match function, it defaults to string_compare if |
378 | * you used string_hash (either explicitly or by default) and to memcmp |
379 | * otherwise. |
380 | * |
381 | * Note: explicitly specifying string_hash is deprecated, because this |
382 | * might not work for callers in loadable modules on some platforms due to |
383 | * referencing a trampoline instead of the string_hash function proper. |
384 | * Just let it default, eh? |
385 | */ |
386 | if (flags & HASH_COMPARE) |
387 | hashp->match = info->match; |
388 | else if (hashp->hash == string_hash) |
389 | hashp->match = (HashCompareFunc) string_compare; |
390 | else |
391 | hashp->match = memcmp; |
392 | |
393 | /* |
394 | * Similarly, the key-copying function defaults to strlcpy or memcpy. |
395 | */ |
396 | if (flags & HASH_KEYCOPY) |
397 | hashp->keycopy = info->keycopy; |
398 | else if (hashp->hash == string_hash) |
399 | hashp->keycopy = (HashCopyFunc) strlcpy; |
400 | else |
401 | hashp->keycopy = memcpy; |
402 | |
403 | /* And select the entry allocation function, too. */ |
404 | if (flags & HASH_ALLOC) |
405 | hashp->alloc = info->alloc; |
406 | else |
407 | hashp->alloc = DynaHashAlloc; |
408 | |
409 | if (flags & HASH_SHARED_MEM) |
410 | { |
411 | /* |
412 | * ctl structure and directory are preallocated for shared memory |
413 | * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as |
414 | * well. |
415 | */ |
416 | hashp->hctl = info->hctl; |
417 | hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR)); |
418 | hashp->hcxt = NULL; |
419 | hashp->isshared = true; |
420 | |
421 | /* hash table already exists, we're just attaching to it */ |
422 | if (flags & HASH_ATTACH) |
423 | { |
424 | /* make local copies of some heavily-used values */ |
425 | hctl = hashp->hctl; |
426 | hashp->keysize = hctl->keysize; |
427 | hashp->ssize = hctl->ssize; |
428 | hashp->sshift = hctl->sshift; |
429 | |
430 | return hashp; |
431 | } |
432 | } |
433 | else |
434 | { |
435 | /* setup hash table defaults */ |
436 | hashp->hctl = NULL; |
437 | hashp->dir = NULL; |
438 | hashp->hcxt = CurrentDynaHashCxt; |
439 | hashp->isshared = false; |
440 | } |
441 | |
442 | if (!hashp->hctl) |
443 | { |
444 | hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR)); |
445 | if (!hashp->hctl) |
446 | ereport(ERROR, |
447 | (errcode(ERRCODE_OUT_OF_MEMORY), |
					 errmsg("out of memory")));
449 | } |
450 | |
451 | hashp->frozen = false; |
452 | |
453 | hdefault(hashp); |
454 | |
455 | hctl = hashp->hctl; |
456 | |
457 | if (flags & HASH_PARTITION) |
458 | { |
459 | /* Doesn't make sense to partition a local hash table */ |
460 | Assert(flags & HASH_SHARED_MEM); |
461 | |
462 | /* |
463 | * The number of partitions had better be a power of 2. Also, it must |
464 | * be less than INT_MAX (see init_htab()), so call the int version of |
465 | * next_pow2. |
466 | */ |
467 | Assert(info->num_partitions == next_pow2_int(info->num_partitions)); |
468 | |
469 | hctl->num_partitions = info->num_partitions; |
470 | } |
471 | |
472 | if (flags & HASH_SEGMENT) |
473 | { |
474 | hctl->ssize = info->ssize; |
475 | hctl->sshift = my_log2(info->ssize); |
476 | /* ssize had better be a power of 2 */ |
477 | Assert(hctl->ssize == (1L << hctl->sshift)); |
478 | } |
479 | if (flags & HASH_FFACTOR) |
480 | hctl->ffactor = info->ffactor; |
481 | |
482 | /* |
483 | * SHM hash tables have fixed directory size passed by the caller. |
484 | */ |
485 | if (flags & HASH_DIRSIZE) |
486 | { |
487 | hctl->max_dsize = info->max_dsize; |
488 | hctl->dsize = info->dsize; |
489 | } |
490 | |
491 | /* |
492 | * hash table now allocates space for key and data but you have to say how |
493 | * much space to allocate |
494 | */ |
495 | if (flags & HASH_ELEM) |
496 | { |
497 | Assert(info->entrysize >= info->keysize); |
498 | hctl->keysize = info->keysize; |
499 | hctl->entrysize = info->entrysize; |
500 | } |
501 | |
502 | /* make local copies of heavily-used constant fields */ |
503 | hashp->keysize = hctl->keysize; |
504 | hashp->ssize = hctl->ssize; |
505 | hashp->sshift = hctl->sshift; |
506 | |
507 | /* Build the hash directory structure */ |
508 | if (!init_htab(hashp, nelem)) |
		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
510 | |
511 | /* |
512 | * For a shared hash table, preallocate the requested number of elements. |
513 | * This reduces problems with run-time out-of-shared-memory conditions. |
514 | * |
515 | * For a non-shared hash table, preallocate the requested number of |
516 | * elements if it's less than our chosen nelem_alloc. This avoids wasting |
517 | * space if the caller correctly estimates a small table size. |
518 | */ |
519 | if ((flags & HASH_SHARED_MEM) || |
520 | nelem < hctl->nelem_alloc) |
521 | { |
522 | int i, |
523 | freelist_partitions, |
524 | nelem_alloc, |
525 | nelem_alloc_first; |
526 | |
527 | /* |
528 | * If hash table is partitioned, give each freelist an equal share of |
529 | * the initial allocation. Otherwise only freeList[0] is used. |
530 | */ |
531 | if (IS_PARTITIONED(hashp->hctl)) |
532 | freelist_partitions = NUM_FREELISTS; |
533 | else |
534 | freelist_partitions = 1; |
535 | |
536 | nelem_alloc = nelem / freelist_partitions; |
537 | if (nelem_alloc <= 0) |
538 | nelem_alloc = 1; |
539 | |
540 | /* |
541 | * Make sure we'll allocate all the requested elements; freeList[0] |
542 | * gets the excess if the request isn't divisible by NUM_FREELISTS. |
543 | */ |
544 | if (nelem_alloc * freelist_partitions < nelem) |
545 | nelem_alloc_first = |
546 | nelem - nelem_alloc * (freelist_partitions - 1); |
547 | else |
548 | nelem_alloc_first = nelem_alloc; |
549 | |
550 | for (i = 0; i < freelist_partitions; i++) |
551 | { |
552 | int temp = (i == 0) ? nelem_alloc_first : nelem_alloc; |
553 | |
554 | if (!element_alloc(hashp, temp, i)) |
555 | ereport(ERROR, |
556 | (errcode(ERRCODE_OUT_OF_MEMORY), |
							 errmsg("out of memory")));
558 | } |
559 | } |
560 | |
561 | if (flags & HASH_FIXED_SIZE) |
562 | hashp->isfixed = true; |
563 | return hashp; |
564 | } |
565 | |
566 | /* |
567 | * Set default HASHHDR parameters. |
568 | */ |
569 | static void |
570 | hdefault(HTAB *hashp) |
571 | { |
572 | HASHHDR *hctl = hashp->hctl; |
573 | |
574 | MemSet(hctl, 0, sizeof(HASHHDR)); |
575 | |
576 | hctl->dsize = DEF_DIRSIZE; |
577 | hctl->nsegs = 0; |
578 | |
579 | /* rather pointless defaults for key & entry size */ |
580 | hctl->keysize = sizeof(char *); |
581 | hctl->entrysize = 2 * sizeof(char *); |
582 | |
583 | hctl->num_partitions = 0; /* not partitioned */ |
584 | |
585 | hctl->ffactor = DEF_FFACTOR; |
586 | |
587 | /* table has no fixed maximum size */ |
588 | hctl->max_dsize = NO_MAX_DSIZE; |
589 | |
590 | hctl->ssize = DEF_SEGSIZE; |
591 | hctl->sshift = DEF_SEGSIZE_SHIFT; |
592 | |
593 | #ifdef HASH_STATISTICS |
594 | hctl->accesses = hctl->collisions = 0; |
595 | #endif |
596 | } |
597 | |
598 | /* |
599 | * Given the user-specified entry size, choose nelem_alloc, ie, how many |
600 | * elements to add to the hash table when we need more. |
601 | */ |
602 | static int |
603 | choose_nelem_alloc(Size entrysize) |
604 | { |
605 | int nelem_alloc; |
606 | Size elementSize; |
607 | Size allocSize; |
608 | |
609 | /* Each element has a HASHELEMENT header plus user data. */ |
610 | /* NB: this had better match element_alloc() */ |
611 | elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize); |
612 | |
613 | /* |
614 | * The idea here is to choose nelem_alloc at least 32, but round up so |
615 | * that the allocation request will be a power of 2 or just less. This |
616 | * makes little difference for hash tables in shared memory, but for hash |
617 | * tables managed by palloc, the allocation request will be rounded up to |
618 | * a power of 2 anyway. If we fail to take this into account, we'll waste |
619 | * as much as half the allocated space. |
620 | */ |
621 | allocSize = 32 * 4; /* assume elementSize at least 8 */ |
622 | do |
623 | { |
624 | allocSize <<= 1; |
625 | nelem_alloc = allocSize / elementSize; |
626 | } while (nelem_alloc < 32); |
627 | |
628 | return nelem_alloc; |
629 | } |
630 | |
631 | /* |
632 | * Compute derived fields of hctl and build the initial directory/segment |
633 | * arrays |
634 | */ |
635 | static bool |
636 | init_htab(HTAB *hashp, long nelem) |
637 | { |
638 | HASHHDR *hctl = hashp->hctl; |
639 | HASHSEGMENT *segp; |
640 | int nbuckets; |
641 | int nsegs; |
642 | int i; |
643 | |
644 | /* |
645 | * initialize mutexes if it's a partitioned table |
646 | */ |
647 | if (IS_PARTITIONED(hctl)) |
648 | for (i = 0; i < NUM_FREELISTS; i++) |
649 | SpinLockInit(&(hctl->freeList[i].mutex)); |
650 | |
651 | /* |
652 | * Divide number of elements by the fill factor to determine a desired |
653 | * number of buckets. Allocate space for the next greater power of two |
654 | * number of buckets |
655 | */ |
656 | nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1); |
657 | |
658 | /* |
659 | * In a partitioned table, nbuckets must be at least equal to |
660 | * num_partitions; were it less, keys with apparently different partition |
661 | * numbers would map to the same bucket, breaking partition independence. |
662 | * (Normally nbuckets will be much bigger; this is just a safety check.) |
663 | */ |
664 | while (nbuckets < hctl->num_partitions) |
665 | nbuckets <<= 1; |
666 | |
667 | hctl->max_bucket = hctl->low_mask = nbuckets - 1; |
668 | hctl->high_mask = (nbuckets << 1) - 1; |
669 | |
670 | /* |
671 | * Figure number of directory segments needed, round up to a power of 2 |
672 | */ |
673 | nsegs = (nbuckets - 1) / hctl->ssize + 1; |
674 | nsegs = next_pow2_int(nsegs); |
675 | |
676 | /* |
677 | * Make sure directory is big enough. If pre-allocated directory is too |
678 | * small, choke (caller screwed up). |
679 | */ |
680 | if (nsegs > hctl->dsize) |
681 | { |
682 | if (!(hashp->dir)) |
683 | hctl->dsize = nsegs; |
684 | else |
685 | return false; |
686 | } |
687 | |
688 | /* Allocate a directory */ |
689 | if (!(hashp->dir)) |
690 | { |
691 | CurrentDynaHashCxt = hashp->hcxt; |
692 | hashp->dir = (HASHSEGMENT *) |
693 | hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT)); |
694 | if (!hashp->dir) |
695 | return false; |
696 | } |
697 | |
698 | /* Allocate initial segments */ |
699 | for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++) |
700 | { |
701 | *segp = seg_alloc(hashp); |
702 | if (*segp == NULL) |
703 | return false; |
704 | } |
705 | |
706 | /* Choose number of entries to allocate at a time */ |
707 | hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize); |
708 | |
709 | #if HASH_DEBUG |
	fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
			"TABLE POINTER ", hashp,
			"DIRECTORY SIZE ", hctl->dsize,
			"SEGMENT SIZE ", hctl->ssize,
			"SEGMENT SHIFT ", hctl->sshift,
			"FILL FACTOR ", hctl->ffactor,
			"MAX BUCKET ", hctl->max_bucket,
			"HIGH MASK ", hctl->high_mask,
			"LOW MASK ", hctl->low_mask,
			"NSEGS ", hctl->nsegs);
720 | #endif |
721 | return true; |
722 | } |
723 | |
724 | /* |
725 | * Estimate the space needed for a hashtable containing the given number |
726 | * of entries of given size. |
727 | * NOTE: this is used to estimate the footprint of hashtables in shared |
728 | * memory; therefore it does not count HTAB which is in local memory. |
729 | * NB: assumes that all hash structure parameters have default values! |
730 | */ |
731 | Size |
732 | hash_estimate_size(long num_entries, Size entrysize) |
733 | { |
734 | Size size; |
735 | long nBuckets, |
736 | nSegments, |
737 | nDirEntries, |
738 | nElementAllocs, |
739 | elementSize, |
740 | elementAllocCnt; |
741 | |
742 | /* estimate number of buckets wanted */ |
743 | nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1); |
744 | /* # of segments needed for nBuckets */ |
745 | nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1); |
746 | /* directory entries */ |
747 | nDirEntries = DEF_DIRSIZE; |
748 | while (nDirEntries < nSegments) |
749 | nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */ |
750 | |
751 | /* fixed control info */ |
752 | size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */ |
753 | /* directory */ |
754 | size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT))); |
755 | /* segments */ |
756 | size = add_size(size, mul_size(nSegments, |
757 | MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET)))); |
758 | /* elements --- allocated in groups of choose_nelem_alloc() entries */ |
759 | elementAllocCnt = choose_nelem_alloc(entrysize); |
760 | nElementAllocs = (num_entries - 1) / elementAllocCnt + 1; |
761 | elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize); |
762 | size = add_size(size, |
763 | mul_size(nElementAllocs, |
764 | mul_size(elementAllocCnt, elementSize))); |
765 | |
766 | return size; |
767 | } |
768 | |
769 | /* |
770 | * Select an appropriate directory size for a hashtable with the given |
771 | * maximum number of entries. |
772 | * This is only needed for hashtables in shared memory, whose directories |
773 | * cannot be expanded dynamically. |
774 | * NB: assumes that all hash structure parameters have default values! |
775 | * |
776 | * XXX this had better agree with the behavior of init_htab()... |
777 | */ |
778 | long |
779 | hash_select_dirsize(long num_entries) |
780 | { |
781 | long nBuckets, |
782 | nSegments, |
783 | nDirEntries; |
784 | |
785 | /* estimate number of buckets wanted */ |
786 | nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1); |
787 | /* # of segments needed for nBuckets */ |
788 | nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1); |
789 | /* directory entries */ |
790 | nDirEntries = DEF_DIRSIZE; |
791 | while (nDirEntries < nSegments) |
792 | nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */ |
793 | |
794 | return nDirEntries; |
795 | } |
796 | |
797 | /* |
798 | * Compute the required initial memory allocation for a shared-memory |
799 | * hashtable with the given parameters. We need space for the HASHHDR |
800 | * and for the (non expansible) directory. |
801 | */ |
802 | Size |
803 | hash_get_shared_size(HASHCTL *info, int flags) |
804 | { |
805 | Assert(flags & HASH_DIRSIZE); |
806 | Assert(info->dsize == info->max_dsize); |
807 | return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT); |
808 | } |
809 | |
810 | |
811 | /********************** DESTROY ROUTINES ************************/ |
812 | |
813 | void |
814 | hash_destroy(HTAB *hashp) |
815 | { |
816 | if (hashp != NULL) |
817 | { |
818 | /* allocation method must be one we know how to free, too */ |
819 | Assert(hashp->alloc == DynaHashAlloc); |
820 | /* so this hashtable must have its own context */ |
821 | Assert(hashp->hcxt != NULL); |
822 | |
		hash_stats("destroy", hashp);
824 | |
825 | /* |
826 | * Free everything by destroying the hash table's memory context. |
827 | */ |
828 | MemoryContextDelete(hashp->hcxt); |
829 | } |
830 | } |
831 | |
832 | void |
833 | hash_stats(const char *where, HTAB *hashp) |
834 | { |
835 | #if HASH_STATISTICS |
	fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
837 | where, hashp->hctl->accesses, hashp->hctl->collisions); |
838 | |
	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
840 | hash_get_num_entries(hashp), (long) hashp->hctl->keysize, |
841 | hashp->hctl->max_bucket, hashp->hctl->nsegs); |
	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
843 | where, hash_accesses, hash_collisions); |
	fprintf(stderr, "hash_stats: total expansions %ld\n",
845 | hash_expansions); |
846 | #endif |
847 | } |
848 | |
849 | /*******************************SEARCH ROUTINES *****************************/ |
850 | |
851 | |
852 | /* |
853 | * get_hash_value -- exported routine to calculate a key's hash value |
854 | * |
855 | * We export this because for partitioned tables, callers need to compute |
856 | * the partition number (from the low-order bits of the hash value) before |
857 | * searching. |
858 | */ |
859 | uint32 |
860 | get_hash_value(HTAB *hashp, const void *keyPtr) |
861 | { |
862 | return hashp->hash(keyPtr, hashp->keysize); |
863 | } |
864 | |
865 | /* Convert a hash value to a bucket number */ |
866 | static inline uint32 |
867 | calc_bucket(HASHHDR *hctl, uint32 hash_val) |
868 | { |
869 | uint32 bucket; |
870 | |
871 | bucket = hash_val & hctl->high_mask; |
872 | if (bucket > hctl->max_bucket) |
873 | bucket = bucket & hctl->low_mask; |
874 | |
875 | return bucket; |
876 | } |
877 | |
878 | /* |
879 | * hash_search -- look up key in table and perform action |
880 | * hash_search_with_hash_value -- same, with key's hash value already computed |
881 | * |
882 | * action is one of: |
883 | * HASH_FIND: look up key in table |
884 | * HASH_ENTER: look up key in table, creating entry if not present |
885 | * HASH_ENTER_NULL: same, but return NULL if out of memory |
886 | * HASH_REMOVE: look up key in table, remove entry if present |
887 | * |
888 | * Return value is a pointer to the element found/entered/removed if any, |
889 | * or NULL if no match was found. (NB: in the case of the REMOVE action, |
890 | * the result is a dangling pointer that shouldn't be dereferenced!) |
891 | * |
892 | * HASH_ENTER will normally ereport a generic "out of memory" error if |
893 | * it is unable to create a new entry. The HASH_ENTER_NULL operation is |
894 | * the same except it will return NULL if out of memory. Note that |
895 | * HASH_ENTER_NULL cannot be used with the default palloc-based allocator, |
896 | * since palloc internally ereports on out-of-memory. |
897 | * |
898 | * If foundPtr isn't NULL, then *foundPtr is set true if we found an |
899 | * existing entry in the table, false otherwise. This is needed in the |
900 | * HASH_ENTER case, but is redundant with the return value otherwise. |
901 | * |
902 | * For hash_search_with_hash_value, the hashvalue parameter must have been |
903 | * calculated with get_hash_value(). |
904 | */ |
905 | void * |
906 | hash_search(HTAB *hashp, |
907 | const void *keyPtr, |
908 | HASHACTION action, |
909 | bool *foundPtr) |
910 | { |
911 | return hash_search_with_hash_value(hashp, |
912 | keyPtr, |
913 | hashp->hash(keyPtr, hashp->keysize), |
914 | action, |
915 | foundPtr); |
916 | } |
917 | |
918 | void * |
919 | hash_search_with_hash_value(HTAB *hashp, |
920 | const void *keyPtr, |
921 | uint32 hashvalue, |
922 | HASHACTION action, |
923 | bool *foundPtr) |
924 | { |
925 | HASHHDR *hctl = hashp->hctl; |
926 | int freelist_idx = FREELIST_IDX(hctl, hashvalue); |
927 | Size keysize; |
928 | uint32 bucket; |
929 | long segment_num; |
930 | long segment_ndx; |
931 | HASHSEGMENT segp; |
932 | HASHBUCKET currBucket; |
933 | HASHBUCKET *prevBucketPtr; |
934 | HashCompareFunc match; |
935 | |
936 | #if HASH_STATISTICS |
937 | hash_accesses++; |
938 | hctl->accesses++; |
939 | #endif |
940 | |
941 | /* |
942 | * If inserting, check if it is time to split a bucket. |
943 | * |
944 | * NOTE: failure to expand table is not a fatal error, it just means we |
945 | * have to run at higher fill factor than we wanted. However, if we're |
946 | * using the palloc allocator then it will throw error anyway on |
947 | * out-of-memory, so we must do this before modifying the table. |
948 | */ |
949 | if (action == HASH_ENTER || action == HASH_ENTER_NULL) |
950 | { |
951 | /* |
952 | * Can't split if running in partitioned mode, nor if frozen, nor if |
953 | * table is the subject of any active hash_seq_search scans. Strange |
954 | * order of these tests is to try to check cheaper conditions first. |
955 | */ |
956 | if (!IS_PARTITIONED(hctl) && !hashp->frozen && |
957 | hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && |
958 | !has_seq_scans(hashp)) |
959 | (void) expand_table(hashp); |
960 | } |
961 | |
962 | /* |
963 | * Do the initial lookup |
964 | */ |
965 | bucket = calc_bucket(hctl, hashvalue); |
966 | |
967 | segment_num = bucket >> hashp->sshift; |
968 | segment_ndx = MOD(bucket, hashp->ssize); |
969 | |
970 | segp = hashp->dir[segment_num]; |
971 | |
972 | if (segp == NULL) |
973 | hash_corrupted(hashp); |
974 | |
975 | prevBucketPtr = &segp[segment_ndx]; |
976 | currBucket = *prevBucketPtr; |
977 | |
978 | /* |
979 | * Follow collision chain looking for matching key |
980 | */ |
981 | match = hashp->match; /* save one fetch in inner loop */ |
982 | keysize = hashp->keysize; /* ditto */ |
983 | |
984 | while (currBucket != NULL) |
985 | { |
986 | if (currBucket->hashvalue == hashvalue && |
987 | match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0) |
988 | break; |
989 | prevBucketPtr = &(currBucket->link); |
990 | currBucket = *prevBucketPtr; |
991 | #if HASH_STATISTICS |
992 | hash_collisions++; |
993 | hctl->collisions++; |
994 | #endif |
995 | } |
996 | |
997 | if (foundPtr) |
998 | *foundPtr = (bool) (currBucket != NULL); |
999 | |
1000 | /* |
1001 | * OK, now what? |
1002 | */ |
1003 | switch (action) |
1004 | { |
1005 | case HASH_FIND: |
1006 | if (currBucket != NULL) |
1007 | return (void *) ELEMENTKEY(currBucket); |
1008 | return NULL; |
1009 | |
1010 | case HASH_REMOVE: |
1011 | if (currBucket != NULL) |
1012 | { |
1013 | /* if partitioned, must lock to touch nentries and freeList */ |
1014 | if (IS_PARTITIONED(hctl)) |
1015 | SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex)); |
1016 | |
1017 | /* delete the record from the appropriate nentries counter. */ |
1018 | Assert(hctl->freeList[freelist_idx].nentries > 0); |
1019 | hctl->freeList[freelist_idx].nentries--; |
1020 | |
1021 | /* remove record from hash bucket's chain. */ |
1022 | *prevBucketPtr = currBucket->link; |
1023 | |
1024 | /* add the record to the appropriate freelist. */ |
1025 | currBucket->link = hctl->freeList[freelist_idx].freeList; |
1026 | hctl->freeList[freelist_idx].freeList = currBucket; |
1027 | |
1028 | if (IS_PARTITIONED(hctl)) |
1029 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
1030 | |
1031 | /* |
1032 | * better hope the caller is synchronizing access to this |
1033 | * element, because someone else is going to reuse it the next |
1034 | * time something is added to the table |
1035 | */ |
1036 | return (void *) ELEMENTKEY(currBucket); |
1037 | } |
1038 | return NULL; |
1039 | |
1040 | case HASH_ENTER_NULL: |
1041 | /* ENTER_NULL does not work with palloc-based allocator */ |
1042 | Assert(hashp->alloc != DynaHashAlloc); |
1043 | /* FALL THRU */ |
1044 | |
1045 | case HASH_ENTER: |
1046 | /* Return existing element if found, else create one */ |
1047 | if (currBucket != NULL) |
1048 | return (void *) ELEMENTKEY(currBucket); |
1049 | |
1050 | /* disallow inserts if frozen */ |
1051 | if (hashp->frozen) |
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
1053 | hashp->tabname); |
1054 | |
1055 | currBucket = get_hash_entry(hashp, freelist_idx); |
1056 | if (currBucket == NULL) |
1057 | { |
1058 | /* out of memory */ |
1059 | if (action == HASH_ENTER_NULL) |
1060 | return NULL; |
1061 | /* report a generic message */ |
1062 | if (hashp->isshared) |
1063 | ereport(ERROR, |
1064 | (errcode(ERRCODE_OUT_OF_MEMORY), |
							 errmsg("out of shared memory")));
1066 | else |
1067 | ereport(ERROR, |
1068 | (errcode(ERRCODE_OUT_OF_MEMORY), |
							 errmsg("out of memory")));
1070 | } |
1071 | |
1072 | /* link into hashbucket chain */ |
1073 | *prevBucketPtr = currBucket; |
1074 | currBucket->link = NULL; |
1075 | |
1076 | /* copy key into record */ |
1077 | currBucket->hashvalue = hashvalue; |
1078 | hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize); |
1079 | |
1080 | /* |
1081 | * Caller is expected to fill the data field on return. DO NOT |
1082 | * insert any code that could possibly throw error here, as doing |
1083 | * so would leave the table entry incomplete and hence corrupt the |
1084 | * caller's data structure. |
1085 | */ |
1086 | |
1087 | return (void *) ELEMENTKEY(currBucket); |
1088 | } |
1089 | |
	elog(ERROR, "unrecognized hash action code: %d", (int) action);
1091 | |
1092 | return NULL; /* keep compiler quiet */ |
1093 | } |
1094 | |
1095 | /* |
1096 | * hash_update_hash_key -- change the hash key of an existing table entry |
1097 | * |
1098 | * This is equivalent to removing the entry, making a new entry, and copying |
1099 | * over its data, except that the entry never goes to the table's freelist. |
1100 | * Therefore this cannot suffer an out-of-memory failure, even if there are |
1101 | * other processes operating in other partitions of the hashtable. |
1102 | * |
1103 | * Returns true if successful, false if the requested new hash key is already |
1104 | * present. Throws error if the specified entry pointer isn't actually a |
1105 | * table member. |
1106 | * |
1107 | * NB: currently, there is no special case for old and new hash keys being |
1108 | * identical, which means we'll report false for that situation. This is |
1109 | * preferable for existing uses. |
1110 | * |
1111 | * NB: for a partitioned hashtable, caller must hold lock on both relevant |
1112 | * partitions, if the new hash key would belong to a different partition. |
1113 | */ |
1114 | bool |
1115 | hash_update_hash_key(HTAB *hashp, |
1116 | void *existingEntry, |
1117 | const void *newKeyPtr) |
1118 | { |
1119 | HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry); |
1120 | HASHHDR *hctl = hashp->hctl; |
1121 | uint32 newhashvalue; |
1122 | Size keysize; |
1123 | uint32 bucket; |
1124 | uint32 newbucket; |
1125 | long segment_num; |
1126 | long segment_ndx; |
1127 | HASHSEGMENT segp; |
1128 | HASHBUCKET currBucket; |
1129 | HASHBUCKET *prevBucketPtr; |
1130 | HASHBUCKET *oldPrevPtr; |
1131 | HashCompareFunc match; |
1132 | |
1133 | #if HASH_STATISTICS |
1134 | hash_accesses++; |
1135 | hctl->accesses++; |
1136 | #endif |
1137 | |
1138 | /* disallow updates if frozen */ |
1139 | if (hashp->frozen) |
		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
1141 | hashp->tabname); |
1142 | |
1143 | /* |
1144 | * Lookup the existing element using its saved hash value. We need to do |
1145 | * this to be able to unlink it from its hash chain, but as a side benefit |
1146 | * we can verify the validity of the passed existingEntry pointer. |
1147 | */ |
1148 | bucket = calc_bucket(hctl, existingElement->hashvalue); |
1149 | |
1150 | segment_num = bucket >> hashp->sshift; |
1151 | segment_ndx = MOD(bucket, hashp->ssize); |
1152 | |
1153 | segp = hashp->dir[segment_num]; |
1154 | |
1155 | if (segp == NULL) |
1156 | hash_corrupted(hashp); |
1157 | |
1158 | prevBucketPtr = &segp[segment_ndx]; |
1159 | currBucket = *prevBucketPtr; |
1160 | |
1161 | while (currBucket != NULL) |
1162 | { |
1163 | if (currBucket == existingElement) |
1164 | break; |
1165 | prevBucketPtr = &(currBucket->link); |
1166 | currBucket = *prevBucketPtr; |
1167 | } |
1168 | |
1169 | if (currBucket == NULL) |
		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
1171 | hashp->tabname); |
1172 | |
1173 | oldPrevPtr = prevBucketPtr; |
1174 | |
1175 | /* |
1176 | * Now perform the equivalent of a HASH_ENTER operation to locate the hash |
1177 | * chain we want to put the entry into. |
1178 | */ |
1179 | newhashvalue = hashp->hash(newKeyPtr, hashp->keysize); |
1180 | |
1181 | newbucket = calc_bucket(hctl, newhashvalue); |
1182 | |
1183 | segment_num = newbucket >> hashp->sshift; |
1184 | segment_ndx = MOD(newbucket, hashp->ssize); |
1185 | |
1186 | segp = hashp->dir[segment_num]; |
1187 | |
1188 | if (segp == NULL) |
1189 | hash_corrupted(hashp); |
1190 | |
1191 | prevBucketPtr = &segp[segment_ndx]; |
1192 | currBucket = *prevBucketPtr; |
1193 | |
1194 | /* |
1195 | * Follow collision chain looking for matching key |
1196 | */ |
1197 | match = hashp->match; /* save one fetch in inner loop */ |
1198 | keysize = hashp->keysize; /* ditto */ |
1199 | |
1200 | while (currBucket != NULL) |
1201 | { |
1202 | if (currBucket->hashvalue == newhashvalue && |
1203 | match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0) |
1204 | break; |
1205 | prevBucketPtr = &(currBucket->link); |
1206 | currBucket = *prevBucketPtr; |
1207 | #if HASH_STATISTICS |
1208 | hash_collisions++; |
1209 | hctl->collisions++; |
1210 | #endif |
1211 | } |
1212 | |
1213 | if (currBucket != NULL) |
1214 | return false; /* collision with an existing entry */ |
1215 | |
1216 | currBucket = existingElement; |
1217 | |
1218 | /* |
1219 | * If old and new hash values belong to the same bucket, we need not |
1220 | * change any chain links, and indeed should not since this simplistic |
1221 | * update will corrupt the list if currBucket is the last element. (We |
1222 | * cannot fall out earlier, however, since we need to scan the bucket to |
1223 | * check for duplicate keys.) |
1224 | */ |
1225 | if (bucket != newbucket) |
1226 | { |
1227 | /* OK to remove record from old hash bucket's chain. */ |
1228 | *oldPrevPtr = currBucket->link; |
1229 | |
1230 | /* link into new hashbucket chain */ |
1231 | *prevBucketPtr = currBucket; |
1232 | currBucket->link = NULL; |
1233 | } |
1234 | |
1235 | /* copy new key into record */ |
1236 | currBucket->hashvalue = newhashvalue; |
1237 | hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize); |
1238 | |
1239 | /* rest of record is untouched */ |
1240 | |
1241 | return true; |
1242 | } |
1243 | |
1244 | /* |
1245 | * Allocate a new hashtable entry if possible; return NULL if out of memory. |
1246 | * (Or, if the underlying space allocator throws error for out-of-memory, |
1247 | * we won't return at all.) |
1248 | */ |
1249 | static HASHBUCKET |
1250 | get_hash_entry(HTAB *hashp, int freelist_idx) |
1251 | { |
1252 | HASHHDR *hctl = hashp->hctl; |
1253 | HASHBUCKET newElement; |
1254 | |
1255 | for (;;) |
1256 | { |
1257 | /* if partitioned, must lock to touch nentries and freeList */ |
1258 | if (IS_PARTITIONED(hctl)) |
1259 | SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); |
1260 | |
1261 | /* try to get an entry from the freelist */ |
1262 | newElement = hctl->freeList[freelist_idx].freeList; |
1263 | |
1264 | if (newElement != NULL) |
1265 | break; |
1266 | |
1267 | if (IS_PARTITIONED(hctl)) |
1268 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
1269 | |
1270 | /* |
1271 | * No free elements in this freelist. In a partitioned table, there |
1272 | * might be entries in other freelists, but to reduce contention we |
1273 | * prefer to first try to get another chunk of buckets from the main |
1274 | * shmem allocator. If that fails, though, we *MUST* root through all |
1275 | * the other freelists before giving up. There are multiple callers |
1276 | * that assume that they can allocate every element in the initially |
1277 | * requested table size, or that deleting an element guarantees they |
1278 | * can insert a new element, even if shared memory is entirely full. |
1279 | * Failing because the needed element is in a different freelist is |
1280 | * not acceptable. |
1281 | */ |
1282 | if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx)) |
1283 | { |
1284 | int borrow_from_idx; |
1285 | |
1286 | if (!IS_PARTITIONED(hctl)) |
1287 | return NULL; /* out of memory */ |
1288 | |
1289 | /* try to borrow element from another freelist */ |
1290 | borrow_from_idx = freelist_idx; |
1291 | for (;;) |
1292 | { |
1293 | borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS; |
1294 | if (borrow_from_idx == freelist_idx) |
1295 | break; /* examined all freelists, fail */ |
1296 | |
1297 | SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex)); |
1298 | newElement = hctl->freeList[borrow_from_idx].freeList; |
1299 | |
1300 | if (newElement != NULL) |
1301 | { |
1302 | hctl->freeList[borrow_from_idx].freeList = newElement->link; |
1303 | SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex)); |
1304 | |
1305 | /* careful: count the new element in its proper freelist */ |
1306 | SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); |
1307 | hctl->freeList[freelist_idx].nentries++; |
1308 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
1309 | |
1310 | return newElement; |
1311 | } |
1312 | |
1313 | SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex)); |
1314 | } |
1315 | |
1316 | /* no elements available to borrow either, so out of memory */ |
1317 | return NULL; |
1318 | } |
1319 | } |
1320 | |
1321 | /* remove entry from freelist, bump nentries */ |
1322 | hctl->freeList[freelist_idx].freeList = newElement->link; |
1323 | hctl->freeList[freelist_idx].nentries++; |
1324 | |
1325 | if (IS_PARTITIONED(hctl)) |
1326 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
1327 | |
1328 | return newElement; |
1329 | } |
1330 | |
1331 | /* |
1332 | * hash_get_num_entries -- get the number of entries in a hashtable |
1333 | */ |
1334 | long |
1335 | hash_get_num_entries(HTAB *hashp) |
1336 | { |
1337 | int i; |
1338 | long sum = hashp->hctl->freeList[0].nentries; |
1339 | |
1340 | /* |
1341 | * We currently don't bother with acquiring the mutexes; it's only |
1342 | * sensible to call this function if you've got lock on all partitions of |
1343 | * the table. |
1344 | */ |
1345 | if (IS_PARTITIONED(hashp->hctl)) |
1346 | { |
1347 | for (i = 1; i < NUM_FREELISTS; i++) |
1348 | sum += hashp->hctl->freeList[i].nentries; |
1349 | } |
1350 | |
1351 | return sum; |
1352 | } |
1353 | |
1354 | /* |
1355 | * hash_seq_init/_search/_term |
1356 | * Sequentially search through hash table and return |
1357 | * all the elements one by one, return NULL when no more. |
1358 | * |
1359 | * hash_seq_term should be called if and only if the scan is abandoned before |
1360 | * completion; if hash_seq_search returns NULL then it has already done the |
1361 | * end-of-scan cleanup. |
1362 | * |
1363 | * NOTE: caller may delete the returned element before continuing the scan. |
1364 | * However, deleting any other element while the scan is in progress is |
1365 | * UNDEFINED (it might be the one that curIndex is pointing at!). Also, |
1366 | * if elements are added to the table while the scan is in progress, it is |
1367 | * unspecified whether they will be visited by the scan or not. |
1368 | * |
1369 | * NOTE: it is possible to use hash_seq_init/hash_seq_search without any |
1370 | * worry about hash_seq_term cleanup, if the hashtable is first locked against |
1371 | * further insertions by calling hash_freeze. |
1372 | * |
1373 | * NOTE: to use this with a partitioned hashtable, caller had better hold |
1374 | * at least shared lock on all partitions of the table throughout the scan! |
1375 | * We can cope with insertions or deletions by our own backend, but *not* |
1376 | * with concurrent insertions or deletions by another. |
1377 | */ |
1378 | void |
1379 | hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) |
1380 | { |
1381 | status->hashp = hashp; |
1382 | status->curBucket = 0; |
1383 | status->curEntry = NULL; |
1384 | if (!hashp->frozen) |
1385 | register_seq_scan(hashp); |
1386 | } |
1387 | |
1388 | void * |
1389 | hash_seq_search(HASH_SEQ_STATUS *status) |
1390 | { |
1391 | HTAB *hashp; |
1392 | HASHHDR *hctl; |
1393 | uint32 max_bucket; |
1394 | long ssize; |
1395 | long segment_num; |
1396 | long segment_ndx; |
1397 | HASHSEGMENT segp; |
1398 | uint32 curBucket; |
1399 | HASHELEMENT *curElem; |
1400 | |
1401 | if ((curElem = status->curEntry) != NULL) |
1402 | { |
1403 | /* Continuing scan of curBucket... */ |
1404 | status->curEntry = curElem->link; |
1405 | if (status->curEntry == NULL) /* end of this bucket */ |
1406 | ++status->curBucket; |
1407 | return (void *) ELEMENTKEY(curElem); |
1408 | } |
1409 | |
1410 | /* |
1411 | * Search for next nonempty bucket starting at curBucket. |
1412 | */ |
1413 | curBucket = status->curBucket; |
1414 | hashp = status->hashp; |
1415 | hctl = hashp->hctl; |
1416 | ssize = hashp->ssize; |
1417 | max_bucket = hctl->max_bucket; |
1418 | |
1419 | if (curBucket > max_bucket) |
1420 | { |
1421 | hash_seq_term(status); |
1422 | return NULL; /* search is done */ |
1423 | } |
1424 | |
1425 | /* |
1426 | * first find the right segment in the table directory. |
1427 | */ |
1428 | segment_num = curBucket >> hashp->sshift; |
1429 | segment_ndx = MOD(curBucket, ssize); |
1430 | |
1431 | segp = hashp->dir[segment_num]; |
1432 | |
1433 | /* |
1434 | * Pick up the first item in this bucket's chain. If chain is not empty |
1435 | * we can begin searching it. Otherwise we have to advance to find the |
1436 | * next nonempty bucket. We try to optimize that case since searching a |
1437 | * near-empty hashtable has to iterate this loop a lot. |
1438 | */ |
1439 | while ((curElem = segp[segment_ndx]) == NULL) |
1440 | { |
1441 | /* empty bucket, advance to next */ |
1442 | if (++curBucket > max_bucket) |
1443 | { |
1444 | status->curBucket = curBucket; |
1445 | hash_seq_term(status); |
1446 | return NULL; /* search is done */ |
1447 | } |
1448 | if (++segment_ndx >= ssize) |
1449 | { |
1450 | segment_num++; |
1451 | segment_ndx = 0; |
1452 | segp = hashp->dir[segment_num]; |
1453 | } |
1454 | } |
1455 | |
1456 | /* Begin scan of curBucket... */ |
1457 | status->curEntry = curElem->link; |
1458 | if (status->curEntry == NULL) /* end of this bucket */ |
1459 | ++curBucket; |
1460 | status->curBucket = curBucket; |
1461 | return (void *) ELEMENTKEY(curElem); |
1462 | } |
1463 | |
1464 | void |
1465 | hash_seq_term(HASH_SEQ_STATUS *status) |
1466 | { |
1467 | if (!status->hashp->frozen) |
1468 | deregister_seq_scan(status->hashp); |
1469 | } |
1470 | |
1471 | /* |
1472 | * hash_freeze |
1473 | * Freeze a hashtable against future insertions (deletions are |
1474 | * still allowed) |
1475 | * |
1476 | * The reason for doing this is that by preventing any more bucket splits, |
1477 | * we no longer need to worry about registering hash_seq_search scans, |
1478 | * and thus caller need not be careful about ensuring hash_seq_term gets |
1479 | * called at the right times. |
1480 | * |
1481 | * Multiple calls to hash_freeze() are allowed, but you can't freeze a table |
1482 | * with active scans (since hash_seq_term would then do the wrong thing). |
1483 | */ |
1484 | void |
1485 | hash_freeze(HTAB *hashp) |
1486 | { |
1487 | if (hashp->isshared) |
		elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
1489 | if (!hashp->frozen && has_seq_scans(hashp)) |
		elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1491 | hashp->tabname); |
1492 | hashp->frozen = true; |
1493 | } |
1494 | |
1495 | |
1496 | /********************************* UTILITIES ************************/ |
1497 | |
1498 | /* |
1499 | * Expand the table by adding one more hash bucket. |
1500 | */ |
1501 | static bool |
1502 | expand_table(HTAB *hashp) |
1503 | { |
1504 | HASHHDR *hctl = hashp->hctl; |
1505 | HASHSEGMENT old_seg, |
1506 | new_seg; |
1507 | long old_bucket, |
1508 | new_bucket; |
1509 | long new_segnum, |
1510 | new_segndx; |
1511 | long old_segnum, |
1512 | old_segndx; |
1513 | HASHBUCKET *oldlink, |
1514 | *newlink; |
1515 | HASHBUCKET currElement, |
1516 | nextElement; |
1517 | |
1518 | Assert(!IS_PARTITIONED(hctl)); |
1519 | |
1520 | #ifdef HASH_STATISTICS |
1521 | hash_expansions++; |
1522 | #endif |
1523 | |
1524 | new_bucket = hctl->max_bucket + 1; |
1525 | new_segnum = new_bucket >> hashp->sshift; |
1526 | new_segndx = MOD(new_bucket, hashp->ssize); |
1527 | |
1528 | if (new_segnum >= hctl->nsegs) |
1529 | { |
1530 | /* Allocate new segment if necessary -- could fail if dir full */ |
1531 | if (new_segnum >= hctl->dsize) |
1532 | if (!dir_realloc(hashp)) |
1533 | return false; |
1534 | if (!(hashp->dir[new_segnum] = seg_alloc(hashp))) |
1535 | return false; |
1536 | hctl->nsegs++; |
1537 | } |
1538 | |
1539 | /* OK, we created a new bucket */ |
1540 | hctl->max_bucket++; |
1541 | |
1542 | /* |
1543 | * *Before* changing masks, find old bucket corresponding to same hash |
1544 | * values; values in that bucket may need to be relocated to new bucket. |
1545 | * Note that new_bucket is certainly larger than low_mask at this point, |
1546 | * so we can skip the first step of the regular hash mask calc. |
1547 | */ |
1548 | old_bucket = (new_bucket & hctl->low_mask); |
1549 | |
1550 | /* |
1551 | * If we crossed a power of 2, readjust masks. |
1552 | */ |
1553 | if ((uint32) new_bucket > hctl->high_mask) |
1554 | { |
1555 | hctl->low_mask = hctl->high_mask; |
1556 | hctl->high_mask = (uint32) new_bucket | hctl->low_mask; |
1557 | } |
1558 | |
1559 | /* |
1560 | * Relocate records to the new bucket. NOTE: because of the way the hash |
1561 | * masking is done in calc_bucket, only one old bucket can need to be |
1562 | * split at this point. With a different way of reducing the hash value, |
1563 | * that might not be true! |
1564 | */ |
1565 | old_segnum = old_bucket >> hashp->sshift; |
1566 | old_segndx = MOD(old_bucket, hashp->ssize); |
1567 | |
1568 | old_seg = hashp->dir[old_segnum]; |
1569 | new_seg = hashp->dir[new_segnum]; |
1570 | |
1571 | oldlink = &old_seg[old_segndx]; |
1572 | newlink = &new_seg[new_segndx]; |
1573 | |
1574 | for (currElement = *oldlink; |
1575 | currElement != NULL; |
1576 | currElement = nextElement) |
1577 | { |
1578 | nextElement = currElement->link; |
1579 | if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket) |
1580 | { |
1581 | *oldlink = currElement; |
1582 | oldlink = &currElement->link; |
1583 | } |
1584 | else |
1585 | { |
1586 | *newlink = currElement; |
1587 | newlink = &currElement->link; |
1588 | } |
1589 | } |
1590 | /* don't forget to terminate the rebuilt hash chains... */ |
1591 | *oldlink = NULL; |
1592 | *newlink = NULL; |
1593 | |
1594 | return true; |
1595 | } |
1596 | |
1597 | |
1598 | static bool |
1599 | dir_realloc(HTAB *hashp) |
1600 | { |
1601 | HASHSEGMENT *p; |
1602 | HASHSEGMENT *old_p; |
1603 | long new_dsize; |
1604 | long old_dirsize; |
1605 | long new_dirsize; |
1606 | |
1607 | if (hashp->hctl->max_dsize != NO_MAX_DSIZE) |
1608 | return false; |
1609 | |
1610 | /* Reallocate directory */ |
1611 | new_dsize = hashp->hctl->dsize << 1; |
1612 | old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT); |
1613 | new_dirsize = new_dsize * sizeof(HASHSEGMENT); |
1614 | |
1615 | old_p = hashp->dir; |
1616 | CurrentDynaHashCxt = hashp->hcxt; |
1617 | p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize); |
1618 | |
1619 | if (p != NULL) |
1620 | { |
1621 | memcpy(p, old_p, old_dirsize); |
1622 | MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize); |
1623 | hashp->dir = p; |
1624 | hashp->hctl->dsize = new_dsize; |
1625 | |
1626 | /* XXX assume the allocator is palloc, so we know how to free */ |
1627 | Assert(hashp->alloc == DynaHashAlloc); |
1628 | pfree(old_p); |
1629 | |
1630 | return true; |
1631 | } |
1632 | |
1633 | return false; |
1634 | } |
1635 | |
1636 | |
1637 | static HASHSEGMENT |
1638 | seg_alloc(HTAB *hashp) |
1639 | { |
1640 | HASHSEGMENT segp; |
1641 | |
1642 | CurrentDynaHashCxt = hashp->hcxt; |
1643 | segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize); |
1644 | |
1645 | if (!segp) |
1646 | return NULL; |
1647 | |
1648 | MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize); |
1649 | |
1650 | return segp; |
1651 | } |
1652 | |
1653 | /* |
1654 | * allocate some new elements and link them into the indicated free list |
1655 | */ |
1656 | static bool |
1657 | element_alloc(HTAB *hashp, int nelem, int freelist_idx) |
1658 | { |
1659 | HASHHDR *hctl = hashp->hctl; |
1660 | Size elementSize; |
1661 | HASHELEMENT *firstElement; |
1662 | HASHELEMENT *tmpElement; |
1663 | HASHELEMENT *prevElement; |
1664 | int i; |
1665 | |
1666 | if (hashp->isfixed) |
1667 | return false; |
1668 | |
1669 | /* Each element has a HASHELEMENT header plus user data. */ |
1670 | elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize); |
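	/*
	 * For example, on a typical 64-bit build where MAXALIGN rounds up to a
	 * multiple of 8 and sizeof(HASHELEMENT) is 16, an entrysize of 20 yields
	 * elementSize = 16 + 24 = 40 bytes per element (figures are illustrative
	 * and platform-dependent).
	 */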
1671 | |
1672 | CurrentDynaHashCxt = hashp->hcxt; |
1673 | firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize); |
1674 | |
1675 | if (!firstElement) |
1676 | return false; |
1677 | |
1678 | /* prepare to link all the new entries into the freelist */ |
1679 | prevElement = NULL; |
1680 | tmpElement = firstElement; |
1681 | for (i = 0; i < nelem; i++) |
1682 | { |
1683 | tmpElement->link = prevElement; |
1684 | prevElement = tmpElement; |
1685 | tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize); |
1686 | } |
1687 | |
1688 | /* if partitioned, must lock to touch freeList */ |
1689 | if (IS_PARTITIONED(hctl)) |
1690 | SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); |
1691 | |
1692 | /* freelist could be nonempty if two backends did this concurrently */ |
1693 | firstElement->link = hctl->freeList[freelist_idx].freeList; |
1694 | hctl->freeList[freelist_idx].freeList = prevElement; |
1695 | |
1696 | if (IS_PARTITIONED(hctl)) |
1697 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
1698 | |
1699 | return true; |
1700 | } |
1701 | |
1702 | /* complain when we have detected a corrupted hashtable */ |
1703 | static void |
1704 | hash_corrupted(HTAB *hashp) |
1705 | { |
1706 | /* |
1707 | * If the corruption is in a shared hashtable, we'd better force a |
1708 | * systemwide restart. Otherwise, just shut down this one backend. |
1709 | */ |
1710 | if (hashp->isshared) |
		elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1712 | else |
		elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1714 | } |
1715 | |
1716 | /* calculate ceil(log base 2) of num */ |
1717 | int |
1718 | my_log2(long num) |
1719 | { |
1720 | int i; |
1721 | long limit; |
1722 | |
	/* guard against too-large input, which would put us into an infinite loop */
1724 | if (num > LONG_MAX / 2) |
1725 | num = LONG_MAX / 2; |
1726 | |
1727 | for (i = 0, limit = 1; limit < num; i++, limit <<= 1) |
1728 | ; |
1729 | return i; |
1730 | } |
1731 | |
1732 | /* calculate first power of 2 >= num, bounded to what will fit in a long */ |
1733 | static long |
1734 | next_pow2_long(long num) |
1735 | { |
1736 | /* my_log2's internal range check is sufficient */ |
1737 | return 1L << my_log2(num); |
1738 | } |
1739 | |
1740 | /* calculate first power of 2 >= num, bounded to what will fit in an int */ |
1741 | static int |
1742 | next_pow2_int(long num) |
1743 | { |
1744 | if (num > INT_MAX / 2) |
1745 | num = INT_MAX / 2; |
1746 | return 1 << my_log2(num); |
1747 | } |
1748 | |
1749 | |
1750 | /************************* SEQ SCAN TRACKING ************************/ |
1751 | |
1752 | /* |
1753 | * We track active hash_seq_search scans here. The need for this mechanism |
1754 | * comes from the fact that a scan will get confused if a bucket split occurs |
1755 | * while it's in progress: it might visit entries twice, or even miss some |
1756 | * entirely (if it's partway through the same bucket that splits). Hence |
1757 | * we want to inhibit bucket splits if there are any active scans on the |
1758 | * table being inserted into. This is a fairly rare case in current usage, |
1759 | * so just postponing the split until the next insertion seems sufficient. |
1760 | * |
1761 | * Given present usages of the function, only a few scans are likely to be |
1762 | * open concurrently; so a finite-size stack of open scans seems sufficient, |
1763 | * and we don't worry that linear search is too slow. Note that we do |
1764 | * allow multiple scans of the same hashtable to be open concurrently. |
1765 | * |
1766 | * This mechanism can support concurrent scan and insertion in a shared |
1767 | * hashtable if it's the same backend doing both. It would fail otherwise, |
1768 | * but locking reasons seem to preclude any such scenario anyway, so we don't |
1769 | * worry. |
1770 | * |
1771 | * This arrangement is reasonably robust if a transient hashtable is deleted |
1772 | * without notifying us. The absolute worst case is we might inhibit splits |
1773 | * in another table created later at exactly the same address. We will give |
1774 | * a warning at transaction end for reference leaks, so any bugs leading to |
1775 | * lack of notification should be easy to catch. |
1776 | */ |
1777 | |
1778 | #define MAX_SEQ_SCANS 100 |
1779 | |
1780 | static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */ |
1781 | static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */ |
1782 | static int num_seq_scans = 0; |
1783 | |
1784 | |
1785 | /* Register a table as having an active hash_seq_search scan */ |
1786 | static void |
1787 | register_seq_scan(HTAB *hashp) |
1788 | { |
1789 | if (num_seq_scans >= MAX_SEQ_SCANS) |
1790 | elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"" , |
1791 | hashp->tabname); |
1792 | seq_scan_tables[num_seq_scans] = hashp; |
1793 | seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel(); |
1794 | num_seq_scans++; |
1795 | } |
1796 | |
1797 | /* Deregister an active scan */ |
1798 | static void |
1799 | deregister_seq_scan(HTAB *hashp) |
1800 | { |
1801 | int i; |
1802 | |
1803 | /* Search backward since it's most likely at the stack top */ |
1804 | for (i = num_seq_scans - 1; i >= 0; i--) |
1805 | { |
1806 | if (seq_scan_tables[i] == hashp) |
1807 | { |
1808 | seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; |
1809 | seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; |
1810 | num_seq_scans--; |
1811 | return; |
1812 | } |
1813 | } |
1814 | elog(ERROR, "no hash_seq_search scan for hash table \"%s\"" , |
1815 | hashp->tabname); |
1816 | } |
1817 | |
1818 | /* Check if a table has any active scan */ |
1819 | static bool |
1820 | has_seq_scans(HTAB *hashp) |
1821 | { |
1822 | int i; |
1823 | |
1824 | for (i = 0; i < num_seq_scans; i++) |
1825 | { |
1826 | if (seq_scan_tables[i] == hashp) |
1827 | return true; |
1828 | } |
1829 | return false; |
1830 | } |
1831 | |
1832 | /* Clean up any open scans at end of transaction */ |
1833 | void |
1834 | AtEOXact_HashTables(bool isCommit) |
1835 | { |
1836 | /* |
1837 | * During abort cleanup, open scans are expected; just silently clean 'em |
1838 | * out. An open scan at commit means someone forgot a hash_seq_term() |
1839 | * call, so complain. |
1840 | * |
1841 | * Note: it's tempting to try to print the tabname here, but refrain for |
1842 | * fear of touching deallocated memory. This isn't a user-facing message |
1843 | * anyway, so it needn't be pretty. |
1844 | */ |
1845 | if (isCommit) |
1846 | { |
1847 | int i; |
1848 | |
1849 | for (i = 0; i < num_seq_scans; i++) |
1850 | { |
1851 | elog(WARNING, "leaked hash_seq_search scan for hash table %p" , |
1852 | seq_scan_tables[i]); |
1853 | } |
1854 | } |
1855 | num_seq_scans = 0; |
1856 | } |
1857 | |
1858 | /* Clean up any open scans at end of subtransaction */ |
1859 | void |
1860 | AtEOSubXact_HashTables(bool isCommit, int nestDepth) |
1861 | { |
1862 | int i; |
1863 | |
1864 | /* |
1865 | * Search backward to make cleanup easy. Note we must check all entries, |
	 * not only those at the end of the array, because the deletion technique
1867 | * doesn't keep them in order. |
1868 | */ |
1869 | for (i = num_seq_scans - 1; i >= 0; i--) |
1870 | { |
1871 | if (seq_scan_level[i] >= nestDepth) |
1872 | { |
1873 | if (isCommit) |
1874 | elog(WARNING, "leaked hash_seq_search scan for hash table %p" , |
1875 | seq_scan_tables[i]); |
1876 | seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; |
1877 | seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; |
1878 | num_seq_scans--; |
1879 | } |
1880 | } |
1881 | } |
1882 | |