| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * dynahash.c |
| 4 | * dynamic hash tables |
| 5 | * |
| 6 | * dynahash.c supports both local-to-a-backend hash tables and hash tables in |
| 7 | * shared memory. For shared hash tables, it is the caller's responsibility |
| 8 | * to provide appropriate access interlocking. The simplest convention is |
| 9 | * that a single LWLock protects the whole hash table. Searches (HASH_FIND or |
| 10 | * hash_seq_search) need only shared lock, but any update requires exclusive |
| 11 | * lock. For heavily-used shared tables, the single-lock approach creates a |
| 12 | * concurrency bottleneck, so we also support "partitioned" locking wherein |
| 13 | * there are multiple LWLocks guarding distinct subsets of the table. To use |
| 14 | * a hash table in partitioned mode, the HASH_PARTITION flag must be given |
| 15 | * to hash_create. This prevents any attempt to split buckets on-the-fly. |
| 16 | * Therefore, each hash bucket chain operates independently, and no fields |
| 17 | * of the hash header change after init except nentries and freeList. |
| 18 | * (A partitioned table uses multiple copies of those fields, guarded by |
| 19 | * spinlocks, for additional concurrency.) |
| 20 | * This lets any subset of the hash buckets be treated as a separately |
| 21 | * lockable partition. We expect callers to use the low-order bits of a |
| 22 | * lookup key's hash value as a partition number --- this will work because |
| 23 | * of the way calc_bucket() maps hash values to bucket numbers. |
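 *
 * For example, a caller of a partitioned shared table might look up or
 * insert an entry along these lines (the partition-lock array and its
 * size here are hypothetical, not part of this module):
 *
 *		hashcode = get_hash_value(table, &key);
 *		partitionLock = MyPartitionLocks[hashcode % NUM_MY_PARTITIONS];
 *		LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 *		entry = hash_search_with_hash_value(table, &key, hashcode,
 *											HASH_ENTER, &found);
 *		...
 *		LWLockRelease(partitionLock);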
| 24 | * |
| 25 | * For hash tables in shared memory, the memory allocator function should |
| 26 | * match malloc's semantics of returning NULL on failure. For hash tables |
| 27 | * in local memory, we typically use palloc() which will throw error on |
| 28 | * failure. The code in this file has to cope with both cases. |
| 29 | * |
| 30 | * dynahash.c provides support for these types of lookup keys: |
| 31 | * |
| 32 | * 1. Null-terminated C strings (truncated if necessary to fit in keysize), |
| 33 | * compared as though by strcmp(). This is the default behavior. |
| 34 | * |
| 35 | * 2. Arbitrary binary data of size keysize, compared as though by memcmp(). |
| 36 | * (Caller must ensure there are no undefined padding bits in the keys!) |
| 37 | * This is selected by specifying HASH_BLOBS flag to hash_create. |
| 38 | * |
| 39 | * 3. More complex key behavior can be selected by specifying user-supplied |
| 40 | * hashing, comparison, and/or key-copying functions. At least a hashing |
| 41 | * function must be supplied; comparison defaults to memcmp() and key copying |
| 42 | * to memcpy() when a user-defined hashing function is selected. |
| 43 | * |
| 44 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 45 | * Portions Copyright (c) 1994, Regents of the University of California |
| 46 | * |
| 47 | * |
| 48 | * IDENTIFICATION |
| 49 | * src/backend/utils/hash/dynahash.c |
| 50 | * |
| 51 | *------------------------------------------------------------------------- |
| 52 | */ |
| 53 | |
| 54 | /* |
| 55 | * Original comments: |
| 56 | * |
| 57 | * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson. |
| 58 | * Coded into C, with minor code improvements, and with hsearch(3) interface, |
| 59 | * by ejp@ausmelb.oz, Jul 26, 1988: 13:16; |
| 60 | * also, hcreate/hdestroy routines added to simulate hsearch(3). |
| 61 | * |
| 62 | * These routines simulate hsearch(3) and family, with the important |
| 63 | * difference that the hash table is dynamic - can grow indefinitely |
| 64 | * beyond its original size (as supplied to hcreate()). |
| 65 | * |
| 66 | * Performance appears to be comparable to that of hsearch(3). |
| 67 | * The 'source-code' options referred to in hsearch(3)'s 'man' page |
| 68 | * are not implemented; otherwise functionality is identical. |
| 69 | * |
| 70 | * Compilation controls: |
| 71 | * HASH_DEBUG controls some informative traces, mainly for debugging. |
| 72 | * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained; |
| 73 | * when combined with HASH_DEBUG, these are displayed by hdestroy(). |
| 74 | * |
| 75 | * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor |
| 76 | * concatenation property, in probably unnecessary code 'optimization'. |
| 77 | * |
| 78 | * Modified margo@postgres.berkeley.edu February 1990 |
| 79 | * added multiple table interface |
| 80 | * Modified by sullivan@postgres.berkeley.edu April 1990 |
| 81 | * changed ctl structure for shared memory |
| 82 | */ |
| 83 | |
| 84 | #include "postgres.h" |
| 85 | |
| 86 | #include <limits.h> |
| 87 | |
| 88 | #include "access/xact.h" |
| 89 | #include "storage/shmem.h" |
| 90 | #include "storage/spin.h" |
| 91 | #include "utils/dynahash.h" |
| 92 | #include "utils/memutils.h" |
| 93 | |
| 94 | |
| 95 | /* |
| 96 | * Constants |
| 97 | * |
| 98 | * A hash table has a top-level "directory", each of whose entries points |
| 99 | * to a "segment" of ssize bucket headers. The maximum number of hash |
| 100 | * buckets is thus dsize * ssize (but dsize may be expansible). Of course, |
| 101 | * the number of records in the table can be larger, but we don't want a |
| 102 | * whole lot of records per bucket or performance goes down. |
| 103 | * |
| 104 | * In a hash table allocated in shared memory, the directory cannot be |
| 105 | * expanded because it must stay at a fixed address. The directory size |
| 106 | * should be selected using hash_select_dirsize (and you'd better have |
| 107 | * a good idea of the maximum number of entries!). For non-shared hash |
| 108 | * tables, the initial directory size can be left at the default. |
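 *
 * For example, with the defaults below (DEF_DIRSIZE and DEF_SEGSIZE both
 * 256), a table can reach 256 * 256 = 65536 buckets before the directory
 * itself would have to grow, which is possible only for non-shared tables.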
| 109 | */ |
| 110 | #define DEF_SEGSIZE 256 |
| 111 | #define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */ |
| 112 | #define DEF_DIRSIZE 256 |
| 113 | #define DEF_FFACTOR 1 /* default fill factor */ |
| 114 | |
| 115 | /* Number of freelists to be used for a partitioned hash table. */ |
| 116 | #define NUM_FREELISTS 32 |
| 117 | |
| 118 | /* A hash bucket is a linked list of HASHELEMENTs */ |
| 119 | typedef HASHELEMENT *HASHBUCKET; |
| 120 | |
| 121 | /* A hash segment is an array of bucket headers */ |
| 122 | typedef HASHBUCKET *HASHSEGMENT; |
| 123 | |
| 124 | /* |
| 125 | * Per-freelist data. |
| 126 | * |
| 127 | * In a partitioned hash table, each freelist is associated with a specific |
| 128 | * set of hashcodes, as determined by the FREELIST_IDX() macro below. |
| 129 | * nentries tracks the number of live hashtable entries having those hashcodes |
| 130 | * (NOT the number of entries in the freelist, as you might expect). |
| 131 | * |
| 132 | * The coverage of a freelist might be more or less than one partition, so it |
| 133 | * needs its own lock rather than relying on caller locking. Relying on that |
| 134 | * wouldn't work even if the coverage was the same, because of the occasional |
| 135 | * need to "borrow" entries from another freelist; see get_hash_entry(). |
| 136 | * |
| 137 | * Using an array of FreeListData instead of separate arrays of mutexes, |
| 138 | * nentries and freeLists helps to reduce sharing of cache lines between |
| 139 | * different mutexes. |
| 140 | */ |
| 141 | typedef struct |
| 142 | { |
| 143 | slock_t mutex; /* spinlock for this freelist */ |
| 144 | long nentries; /* number of entries in associated buckets */ |
| 145 | HASHELEMENT *freeList; /* chain of free elements */ |
| 146 | } FreeListData; |
| 147 | |
| 148 | /* |
| 149 | * Header structure for a hash table --- contains all changeable info |
| 150 | * |
| 151 | * In a shared-memory hash table, the HASHHDR is in shared memory, while |
| 152 | * each backend has a local HTAB struct. For a non-shared table, there isn't |
| 153 | * any functional difference between HASHHDR and HTAB, but we separate them |
| 154 | * anyway to share code between shared and non-shared tables. |
| 155 | */ |
| 156 | struct HASHHDR |
| 157 | { |
| 158 | /* |
| 159 | * The freelist can become a point of contention in high-concurrency hash |
| 160 | * tables, so we use an array of freelists, each with its own mutex and |
| 161 | * nentries count, instead of just a single one. Although the freelists |
| 162 | * normally operate independently, we will scavenge entries from freelists |
| 163 | * other than a hashcode's default freelist when necessary. |
| 164 | * |
| 165 | * If the hash table is not partitioned, only freeList[0] is used and its |
| 166 | * spinlock is not used at all; callers' locking is assumed sufficient. |
| 167 | */ |
| 168 | FreeListData freeList[NUM_FREELISTS]; |
| 169 | |
| 170 | /* These fields can change, but not in a partitioned table */ |
| 171 | /* Also, dsize can't change in a shared table, even if unpartitioned */ |
| 172 | long dsize; /* directory size */ |
| 173 | long nsegs; /* number of allocated segments (<= dsize) */ |
| 174 | uint32 max_bucket; /* ID of maximum bucket in use */ |
| 175 | uint32 high_mask; /* mask to modulo into entire table */ |
| 176 | uint32 low_mask; /* mask to modulo into lower half of table */ |
| 177 | |
| 178 | /* These fields are fixed at hashtable creation */ |
| 179 | Size keysize; /* hash key length in bytes */ |
| 180 | Size entrysize; /* total user element size in bytes */ |
| 181 | long num_partitions; /* # partitions (must be power of 2), or 0 */ |
| 182 | long ffactor; /* target fill factor */ |
| 183 | long max_dsize; /* 'dsize' limit if directory is fixed size */ |
| 184 | long ssize; /* segment size --- must be power of 2 */ |
| 185 | int sshift; /* segment shift = log2(ssize) */ |
| 186 | int nelem_alloc; /* number of entries to allocate at once */ |
| 187 | |
| 188 | #ifdef HASH_STATISTICS |
| 189 | |
| 190 | /* |
| 191 | * Count statistics here. NB: stats code doesn't bother with mutex, so |
| 192 | * counts could be corrupted a bit in a partitioned table. |
| 193 | */ |
| 194 | long accesses; |
| 195 | long collisions; |
| 196 | #endif |
| 197 | }; |
| 198 | |
| 199 | #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0) |
| 200 | |
| 201 | #define FREELIST_IDX(hctl, hashcode) \ |
| 202 | (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0) |
| 203 | |
| 204 | /* |
| 205 | * Top control structure for a hashtable --- in a shared table, each backend |
| 206 | * has its own copy (OK since no fields change at runtime) |
| 207 | */ |
| 208 | struct HTAB |
| 209 | { |
| 210 | HASHHDR *hctl; /* => shared control information */ |
| 211 | HASHSEGMENT *dir; /* directory of segment starts */ |
| 212 | HashValueFunc hash; /* hash function */ |
| 213 | HashCompareFunc match; /* key comparison function */ |
| 214 | HashCopyFunc keycopy; /* key copying function */ |
| 215 | HashAllocFunc alloc; /* memory allocator */ |
| 216 | MemoryContext hcxt; /* memory context if default allocator used */ |
| 217 | char *tabname; /* table name (for error messages) */ |
| 218 | bool isshared; /* true if table is in shared memory */ |
| 219 | bool isfixed; /* if true, don't enlarge */ |
| 220 | |
| 221 | /* freezing a shared table isn't allowed, so we can keep state here */ |
| 222 | bool frozen; /* true = no more inserts allowed */ |
| 223 | |
| 224 | /* We keep local copies of these fixed values to reduce contention */ |
| 225 | Size keysize; /* hash key length in bytes */ |
| 226 | long ssize; /* segment size --- must be power of 2 */ |
| 227 | int sshift; /* segment shift = log2(ssize) */ |
| 228 | }; |
| 229 | |
| 230 | /* |
| 231 | * Key (also entry) part of a HASHELEMENT |
| 232 | */ |
| 233 | #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT))) |
| 234 | |
| 235 | /* |
| 236 | * Obtain element pointer given pointer to key |
| 237 | */ |
| 238 | #define ELEMENT_FROM_KEY(key) \ |
| 239 | ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT)))) |
| 240 | |
| 241 | /* |
| 242 | * Fast MOD arithmetic, assuming that y is a power of 2 ! |
| 243 | */ |
| 244 | #define MOD(x,y) ((x) & ((y)-1)) |
| 245 | |
| 246 | #if HASH_STATISTICS |
| 247 | static long hash_accesses, |
| 248 | hash_collisions, |
| 249 | hash_expansions; |
| 250 | #endif |
| 251 | |
| 252 | /* |
| 253 | * Private function prototypes |
| 254 | */ |
| 255 | static void *DynaHashAlloc(Size size); |
| 256 | static HASHSEGMENT seg_alloc(HTAB *hashp); |
| 257 | static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx); |
| 258 | static bool dir_realloc(HTAB *hashp); |
| 259 | static bool expand_table(HTAB *hashp); |
| 260 | static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx); |
| 261 | static void hdefault(HTAB *hashp); |
| 262 | static int choose_nelem_alloc(Size entrysize); |
| 263 | static bool init_htab(HTAB *hashp, long nelem); |
| 264 | static void hash_corrupted(HTAB *hashp); |
| 265 | static long next_pow2_long(long num); |
| 266 | static int next_pow2_int(long num); |
| 267 | static void register_seq_scan(HTAB *hashp); |
| 268 | static void deregister_seq_scan(HTAB *hashp); |
| 269 | static bool has_seq_scans(HTAB *hashp); |
| 270 | |
| 271 | |
| 272 | /* |
| 273 | * memory allocation support |
| 274 | */ |
| 275 | static MemoryContext CurrentDynaHashCxt = NULL; |
| 276 | |
| 277 | static void * |
| 278 | DynaHashAlloc(Size size) |
| 279 | { |
| 280 | Assert(MemoryContextIsValid(CurrentDynaHashCxt)); |
| 281 | return MemoryContextAlloc(CurrentDynaHashCxt, size); |
| 282 | } |
| 283 | |
| 284 | |
| 285 | /* |
| 286 | * HashCompareFunc for string keys |
| 287 | * |
| 288 | * Because we copy keys with strlcpy(), they will be truncated at keysize-1 |
| 289 | * bytes, so we can only compare that many ... hence strncmp is almost but |
| 290 | * not quite the right thing. |
| 291 | */ |
| 292 | static int |
| 293 | string_compare(const char *key1, const char *key2, Size keysize) |
| 294 | { |
| 295 | return strncmp(key1, key2, keysize - 1); |
| 296 | } |
| 297 | |
| 298 | |
| 299 | /************************** CREATE ROUTINES **********************/ |
| 300 | |
| 301 | /* |
| 302 | * hash_create -- create a new dynamic hash table |
| 303 | * |
| 304 | * tabname: a name for the table (for debugging purposes) |
| 305 | * nelem: maximum number of elements expected |
| 306 | * *info: additional table parameters, as indicated by flags |
| 307 | * flags: bitmask indicating which parameters to take from *info |
| 308 | * |
| 309 | * Note: for a shared-memory hashtable, nelem needs to be a pretty good |
| 310 | * estimate, since we can't expand the table on the fly. But an unshared |
| 311 | * hashtable can be expanded on-the-fly, so it's better for nelem to be |
| 312 | * on the small side and let the table grow if it's exceeded. An overly |
| 313 | * large nelem will penalize hash_seq_search speed without buying much. |
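 *
 * A minimal caller creating a local table might look like this (MyHashEntry
 * is a hypothetical struct whose first field is the uint32 key):
 *
 *		HASHCTL		ctl;
 *
 *		MemSet(&ctl, 0, sizeof(ctl));
 *		ctl.keysize = sizeof(uint32);
 *		ctl.entrysize = sizeof(MyHashEntry);
 *		myhash = hash_create("My hash table", 128, &ctl,
 *							 HASH_ELEM | HASH_BLOBS);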
| 314 | */ |
| 315 | HTAB * |
| 316 | hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) |
| 317 | { |
| 318 | HTAB *hashp; |
| 319 | HASHHDR *hctl; |
| 320 | |
| 321 | /* |
| 322 | * For shared hash tables, we have a local hash header (HTAB struct) that |
| 323 | * we allocate in TopMemoryContext; all else is in shared memory. |
| 324 | * |
| 325 | * For non-shared hash tables, everything including the hash header is in |
| 326 | * a memory context created specially for the hash table --- this makes |
| 327 | * hash_destroy very simple. The memory context is made a child of either |
| 328 | * a context specified by the caller, or TopMemoryContext if nothing is |
| 329 | * specified. |
| 330 | */ |
| 331 | if (flags & HASH_SHARED_MEM) |
| 332 | { |
| 333 | /* Set up to allocate the hash header */ |
| 334 | CurrentDynaHashCxt = TopMemoryContext; |
| 335 | } |
| 336 | else |
| 337 | { |
| 338 | /* Create the hash table's private memory context */ |
| 339 | if (flags & HASH_CONTEXT) |
| 340 | CurrentDynaHashCxt = info->hcxt; |
| 341 | else |
| 342 | CurrentDynaHashCxt = TopMemoryContext; |
| 343 | CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt, |
| 344 | "dynahash" , |
| 345 | ALLOCSET_DEFAULT_SIZES); |
| 346 | } |
| 347 | |
| 348 | /* Initialize the hash header, plus a copy of the table name */ |
| 349 | hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1); |
| 350 | MemSet(hashp, 0, sizeof(HTAB)); |
| 351 | |
| 352 | hashp->tabname = (char *) (hashp + 1); |
| 353 | strcpy(hashp->tabname, tabname); |
| 354 | |
| 355 | /* If we have a private context, label it with hashtable's name */ |
| 356 | if (!(flags & HASH_SHARED_MEM)) |
| 357 | MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname); |
| 358 | |
| 359 | /* |
| 360 | * Select the appropriate hash function (see comments at head of file). |
| 361 | */ |
| 362 | if (flags & HASH_FUNCTION) |
| 363 | hashp->hash = info->hash; |
| 364 | else if (flags & HASH_BLOBS) |
| 365 | { |
| 366 | /* We can optimize hashing for common key sizes */ |
| 367 | Assert(flags & HASH_ELEM); |
| 368 | if (info->keysize == sizeof(uint32)) |
| 369 | hashp->hash = uint32_hash; |
| 370 | else |
| 371 | hashp->hash = tag_hash; |
| 372 | } |
| 373 | else |
| 374 | hashp->hash = string_hash; /* default hash function */ |
| 375 | |
| 376 | /* |
| 377 | * If you don't specify a match function, it defaults to string_compare if |
| 378 | * you used string_hash (either explicitly or by default) and to memcmp |
| 379 | * otherwise. |
| 380 | * |
| 381 | * Note: explicitly specifying string_hash is deprecated, because this |
| 382 | * might not work for callers in loadable modules on some platforms due to |
| 383 | * referencing a trampoline instead of the string_hash function proper. |
| 384 | * Just let it default, eh? |
| 385 | */ |
| 386 | if (flags & HASH_COMPARE) |
| 387 | hashp->match = info->match; |
| 388 | else if (hashp->hash == string_hash) |
| 389 | hashp->match = (HashCompareFunc) string_compare; |
| 390 | else |
| 391 | hashp->match = memcmp; |
| 392 | |
| 393 | /* |
| 394 | * Similarly, the key-copying function defaults to strlcpy or memcpy. |
| 395 | */ |
| 396 | if (flags & HASH_KEYCOPY) |
| 397 | hashp->keycopy = info->keycopy; |
| 398 | else if (hashp->hash == string_hash) |
| 399 | hashp->keycopy = (HashCopyFunc) strlcpy; |
| 400 | else |
| 401 | hashp->keycopy = memcpy; |
| 402 | |
| 403 | /* And select the entry allocation function, too. */ |
| 404 | if (flags & HASH_ALLOC) |
| 405 | hashp->alloc = info->alloc; |
| 406 | else |
| 407 | hashp->alloc = DynaHashAlloc; |
| 408 | |
| 409 | if (flags & HASH_SHARED_MEM) |
| 410 | { |
| 411 | /* |
| 412 | * ctl structure and directory are preallocated for shared memory |
| 413 | * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as |
| 414 | * well. |
| 415 | */ |
| 416 | hashp->hctl = info->hctl; |
| 417 | hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR)); |
| 418 | hashp->hcxt = NULL; |
| 419 | hashp->isshared = true; |
| 420 | |
| 421 | /* hash table already exists, we're just attaching to it */ |
| 422 | if (flags & HASH_ATTACH) |
| 423 | { |
| 424 | /* make local copies of some heavily-used values */ |
| 425 | hctl = hashp->hctl; |
| 426 | hashp->keysize = hctl->keysize; |
| 427 | hashp->ssize = hctl->ssize; |
| 428 | hashp->sshift = hctl->sshift; |
| 429 | |
| 430 | return hashp; |
| 431 | } |
| 432 | } |
| 433 | else |
| 434 | { |
| 435 | /* setup hash table defaults */ |
| 436 | hashp->hctl = NULL; |
| 437 | hashp->dir = NULL; |
| 438 | hashp->hcxt = CurrentDynaHashCxt; |
| 439 | hashp->isshared = false; |
| 440 | } |
| 441 | |
| 442 | if (!hashp->hctl) |
| 443 | { |
| 444 | hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR)); |
| 445 | if (!hashp->hctl) |
| 446 | ereport(ERROR, |
| 447 | (errcode(ERRCODE_OUT_OF_MEMORY), |
| 448 | errmsg("out of memory" ))); |
| 449 | } |
| 450 | |
| 451 | hashp->frozen = false; |
| 452 | |
| 453 | hdefault(hashp); |
| 454 | |
| 455 | hctl = hashp->hctl; |
| 456 | |
| 457 | if (flags & HASH_PARTITION) |
| 458 | { |
| 459 | /* Doesn't make sense to partition a local hash table */ |
| 460 | Assert(flags & HASH_SHARED_MEM); |
| 461 | |
| 462 | /* |
| 463 | * The number of partitions had better be a power of 2. Also, it must |
| 464 | * be less than INT_MAX (see init_htab()), so call the int version of |
| 465 | * next_pow2. |
| 466 | */ |
| 467 | Assert(info->num_partitions == next_pow2_int(info->num_partitions)); |
| 468 | |
| 469 | hctl->num_partitions = info->num_partitions; |
| 470 | } |
| 471 | |
| 472 | if (flags & HASH_SEGMENT) |
| 473 | { |
| 474 | hctl->ssize = info->ssize; |
| 475 | hctl->sshift = my_log2(info->ssize); |
| 476 | /* ssize had better be a power of 2 */ |
| 477 | Assert(hctl->ssize == (1L << hctl->sshift)); |
| 478 | } |
| 479 | if (flags & HASH_FFACTOR) |
| 480 | hctl->ffactor = info->ffactor; |
| 481 | |
| 482 | /* |
| 483 | * SHM hash tables have fixed directory size passed by the caller. |
| 484 | */ |
| 485 | if (flags & HASH_DIRSIZE) |
| 486 | { |
| 487 | hctl->max_dsize = info->max_dsize; |
| 488 | hctl->dsize = info->dsize; |
| 489 | } |
| 490 | |
| 491 | /* |
| 492 | * hash table now allocates space for key and data but you have to say how |
| 493 | * much space to allocate |
| 494 | */ |
| 495 | if (flags & HASH_ELEM) |
| 496 | { |
| 497 | Assert(info->entrysize >= info->keysize); |
| 498 | hctl->keysize = info->keysize; |
| 499 | hctl->entrysize = info->entrysize; |
| 500 | } |
| 501 | |
| 502 | /* make local copies of heavily-used constant fields */ |
| 503 | hashp->keysize = hctl->keysize; |
| 504 | hashp->ssize = hctl->ssize; |
| 505 | hashp->sshift = hctl->sshift; |
| 506 | |
| 507 | /* Build the hash directory structure */ |
| 508 | if (!init_htab(hashp, nelem)) |
| 509 | elog(ERROR, "failed to initialize hash table \"%s\"" , hashp->tabname); |
| 510 | |
| 511 | /* |
| 512 | * For a shared hash table, preallocate the requested number of elements. |
| 513 | * This reduces problems with run-time out-of-shared-memory conditions. |
| 514 | * |
| 515 | * For a non-shared hash table, preallocate the requested number of |
| 516 | * elements if it's less than our chosen nelem_alloc. This avoids wasting |
| 517 | * space if the caller correctly estimates a small table size. |
| 518 | */ |
| 519 | if ((flags & HASH_SHARED_MEM) || |
| 520 | nelem < hctl->nelem_alloc) |
| 521 | { |
| 522 | int i, |
| 523 | freelist_partitions, |
| 524 | nelem_alloc, |
| 525 | nelem_alloc_first; |
| 526 | |
| 527 | /* |
| 528 | * If hash table is partitioned, give each freelist an equal share of |
| 529 | * the initial allocation. Otherwise only freeList[0] is used. |
| 530 | */ |
| 531 | if (IS_PARTITIONED(hashp->hctl)) |
| 532 | freelist_partitions = NUM_FREELISTS; |
| 533 | else |
| 534 | freelist_partitions = 1; |
| 535 | |
| 536 | nelem_alloc = nelem / freelist_partitions; |
| 537 | if (nelem_alloc <= 0) |
| 538 | nelem_alloc = 1; |
| 539 | |
| 540 | /* |
| 541 | * Make sure we'll allocate all the requested elements; freeList[0] |
| 542 | * gets the excess if the request isn't divisible by NUM_FREELISTS. |
| 543 | */ |
| 544 | if (nelem_alloc * freelist_partitions < nelem) |
| 545 | nelem_alloc_first = |
| 546 | nelem - nelem_alloc * (freelist_partitions - 1); |
| 547 | else |
| 548 | nelem_alloc_first = nelem_alloc; |
| 549 | |
| 550 | for (i = 0; i < freelist_partitions; i++) |
| 551 | { |
| 552 | int temp = (i == 0) ? nelem_alloc_first : nelem_alloc; |
| 553 | |
| 554 | if (!element_alloc(hashp, temp, i)) |
| 555 | ereport(ERROR, |
| 556 | (errcode(ERRCODE_OUT_OF_MEMORY), |
| 557 | errmsg("out of memory" ))); |
| 558 | } |
| 559 | } |
| 560 | |
| 561 | if (flags & HASH_FIXED_SIZE) |
| 562 | hashp->isfixed = true; |
| 563 | return hashp; |
| 564 | } |
| 565 | |
| 566 | /* |
| 567 | * Set default HASHHDR parameters. |
| 568 | */ |
| 569 | static void |
| 570 | hdefault(HTAB *hashp) |
| 571 | { |
| 572 | HASHHDR *hctl = hashp->hctl; |
| 573 | |
| 574 | MemSet(hctl, 0, sizeof(HASHHDR)); |
| 575 | |
| 576 | hctl->dsize = DEF_DIRSIZE; |
| 577 | hctl->nsegs = 0; |
| 578 | |
| 579 | /* rather pointless defaults for key & entry size */ |
| 580 | hctl->keysize = sizeof(char *); |
| 581 | hctl->entrysize = 2 * sizeof(char *); |
| 582 | |
| 583 | hctl->num_partitions = 0; /* not partitioned */ |
| 584 | |
| 585 | hctl->ffactor = DEF_FFACTOR; |
| 586 | |
| 587 | /* table has no fixed maximum size */ |
| 588 | hctl->max_dsize = NO_MAX_DSIZE; |
| 589 | |
| 590 | hctl->ssize = DEF_SEGSIZE; |
| 591 | hctl->sshift = DEF_SEGSIZE_SHIFT; |
| 592 | |
| 593 | #ifdef HASH_STATISTICS |
| 594 | hctl->accesses = hctl->collisions = 0; |
| 595 | #endif |
| 596 | } |
| 597 | |
| 598 | /* |
| 599 | * Given the user-specified entry size, choose nelem_alloc, ie, how many |
| 600 | * elements to add to the hash table when we need more. |
| 601 | */ |
| 602 | static int |
| 603 | choose_nelem_alloc(Size entrysize) |
| 604 | { |
| 605 | int nelem_alloc; |
| 606 | Size elementSize; |
| 607 | Size allocSize; |
| 608 | |
| 609 | /* Each element has a HASHELEMENT header plus user data. */ |
| 610 | /* NB: this had better match element_alloc() */ |
| 611 | elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize); |
| 612 | |
| 613 | /* |
| 614 | * The idea here is to choose nelem_alloc at least 32, but round up so |
| 615 | * that the allocation request will be a power of 2 or just less. This |
| 616 | * makes little difference for hash tables in shared memory, but for hash |
| 617 | * tables managed by palloc, the allocation request will be rounded up to |
| 618 | * a power of 2 anyway. If we fail to take this into account, we'll waste |
| 619 | * as much as half the allocated space. |
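 *
 * For example, assuming MAXALIGN(sizeof(HASHELEMENT)) is 16 and entrysize
 * is 64 (an illustrative 64-bit case), elementSize is 80; the loop below
 * stops at allocSize = 4096 and yields nelem_alloc = 51, and 51 * 80 = 4080
 * is just under a power of 2.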
| 620 | */ |
| 621 | allocSize = 32 * 4; /* assume elementSize at least 8 */ |
| 622 | do |
| 623 | { |
| 624 | allocSize <<= 1; |
| 625 | nelem_alloc = allocSize / elementSize; |
| 626 | } while (nelem_alloc < 32); |
| 627 | |
| 628 | return nelem_alloc; |
| 629 | } |
| 630 | |
| 631 | /* |
| 632 | * Compute derived fields of hctl and build the initial directory/segment |
| 633 | * arrays |
| 634 | */ |
| 635 | static bool |
| 636 | init_htab(HTAB *hashp, long nelem) |
| 637 | { |
| 638 | HASHHDR *hctl = hashp->hctl; |
| 639 | HASHSEGMENT *segp; |
| 640 | int nbuckets; |
| 641 | int nsegs; |
| 642 | int i; |
| 643 | |
| 644 | /* |
| 645 | * initialize mutexes if it's a partitioned table |
| 646 | */ |
| 647 | if (IS_PARTITIONED(hctl)) |
| 648 | for (i = 0; i < NUM_FREELISTS; i++) |
| 649 | SpinLockInit(&(hctl->freeList[i].mutex)); |
| 650 | |
| 651 | /* |
| 652 | * Divide number of elements by the fill factor to determine a desired |
| 653 | * number of buckets. Allocate space for the next greater power of two |
| 654 | * number of buckets |
| 655 | */ |
| 656 | nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1); |
| 657 | |
| 658 | /* |
| 659 | * In a partitioned table, nbuckets must be at least equal to |
| 660 | * num_partitions; were it less, keys with apparently different partition |
| 661 | * numbers would map to the same bucket, breaking partition independence. |
| 662 | * (Normally nbuckets will be much bigger; this is just a safety check.) |
| 663 | */ |
| 664 | while (nbuckets < hctl->num_partitions) |
| 665 | nbuckets <<= 1; |
| 666 | |
| 667 | hctl->max_bucket = hctl->low_mask = nbuckets - 1; |
| 668 | hctl->high_mask = (nbuckets << 1) - 1; |
| 669 | |
| 670 | /* |
| 671 | * Figure number of directory segments needed, round up to a power of 2 |
| 672 | */ |
| 673 | nsegs = (nbuckets - 1) / hctl->ssize + 1; |
| 674 | nsegs = next_pow2_int(nsegs); |
| 675 | |
| 676 | /* |
| 677 | * Make sure directory is big enough. If pre-allocated directory is too |
| 678 | * small, choke (caller screwed up). |
| 679 | */ |
| 680 | if (nsegs > hctl->dsize) |
| 681 | { |
| 682 | if (!(hashp->dir)) |
| 683 | hctl->dsize = nsegs; |
| 684 | else |
| 685 | return false; |
| 686 | } |
| 687 | |
| 688 | /* Allocate a directory */ |
| 689 | if (!(hashp->dir)) |
| 690 | { |
| 691 | CurrentDynaHashCxt = hashp->hcxt; |
| 692 | hashp->dir = (HASHSEGMENT *) |
| 693 | hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT)); |
| 694 | if (!hashp->dir) |
| 695 | return false; |
| 696 | } |
| 697 | |
| 698 | /* Allocate initial segments */ |
| 699 | for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++) |
| 700 | { |
| 701 | *segp = seg_alloc(hashp); |
| 702 | if (*segp == NULL) |
| 703 | return false; |
| 704 | } |
| 705 | |
| 706 | /* Choose number of entries to allocate at a time */ |
| 707 | hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize); |
| 708 | |
| 709 | #if HASH_DEBUG |
| 710 | fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n" , |
| 711 | "TABLE POINTER " , hashp, |
| 712 | "DIRECTORY SIZE " , hctl->dsize, |
| 713 | "SEGMENT SIZE " , hctl->ssize, |
| 714 | "SEGMENT SHIFT " , hctl->sshift, |
| 715 | "FILL FACTOR " , hctl->ffactor, |
| 716 | "MAX BUCKET " , hctl->max_bucket, |
| 717 | "HIGH MASK " , hctl->high_mask, |
| 718 | "LOW MASK " , hctl->low_mask, |
| 719 | "NSEGS " , hctl->nsegs); |
| 720 | #endif |
| 721 | return true; |
| 722 | } |
| 723 | |
| 724 | /* |
| 725 | * Estimate the space needed for a hashtable containing the given number |
| 726 | * of entries of given size. |
| 727 | * NOTE: this is used to estimate the footprint of hashtables in shared |
| 728 | * memory; therefore it does not count HTAB which is in local memory. |
| 729 | * NB: assumes that all hash structure parameters have default values! |
| 730 | */ |
| 731 | Size |
| 732 | hash_estimate_size(long num_entries, Size entrysize) |
| 733 | { |
| 734 | Size size; |
| 735 | long nBuckets, |
| 736 | nSegments, |
| 737 | nDirEntries, |
| 738 | nElementAllocs, |
| 739 | elementSize, |
| 740 | elementAllocCnt; |
| 741 | |
| 742 | /* estimate number of buckets wanted */ |
| 743 | nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1); |
| 744 | /* # of segments needed for nBuckets */ |
| 745 | nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1); |
| 746 | /* directory entries */ |
| 747 | nDirEntries = DEF_DIRSIZE; |
| 748 | while (nDirEntries < nSegments) |
| 749 | nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */ |
| 750 | |
| 751 | /* fixed control info */ |
| 752 | size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */ |
| 753 | /* directory */ |
| 754 | size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT))); |
| 755 | /* segments */ |
| 756 | size = add_size(size, mul_size(nSegments, |
| 757 | MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET)))); |
| 758 | /* elements --- allocated in groups of choose_nelem_alloc() entries */ |
| 759 | elementAllocCnt = choose_nelem_alloc(entrysize); |
| 760 | nElementAllocs = (num_entries - 1) / elementAllocCnt + 1; |
| 761 | elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize); |
| 762 | size = add_size(size, |
| 763 | mul_size(nElementAllocs, |
| 764 | mul_size(elementAllocCnt, elementSize))); |
| 765 | |
| 766 | return size; |
| 767 | } |
| 768 | |
| 769 | /* |
| 770 | * Select an appropriate directory size for a hashtable with the given |
| 771 | * maximum number of entries. |
| 772 | * This is only needed for hashtables in shared memory, whose directories |
| 773 | * cannot be expanded dynamically. |
| 774 | * NB: assumes that all hash structure parameters have default values! |
| 775 | * |
| 776 | * XXX this had better agree with the behavior of init_htab()... |
| 777 | */ |
| 778 | long |
| 779 | hash_select_dirsize(long num_entries) |
| 780 | { |
| 781 | long nBuckets, |
| 782 | nSegments, |
| 783 | nDirEntries; |
| 784 | |
| 785 | /* estimate number of buckets wanted */ |
| 786 | nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1); |
| 787 | /* # of segments needed for nBuckets */ |
| 788 | nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1); |
| 789 | /* directory entries */ |
| 790 | nDirEntries = DEF_DIRSIZE; |
| 791 | while (nDirEntries < nSegments) |
| 792 | nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */ |
| 793 | |
| 794 | return nDirEntries; |
| 795 | } |
| 796 | |
| 797 | /* |
| 798 | * Compute the required initial memory allocation for a shared-memory |
| 799 | * hashtable with the given parameters. We need space for the HASHHDR |
| 800 | * and for the (non expansible) directory. |
| 801 | */ |
| 802 | Size |
| 803 | hash_get_shared_size(HASHCTL *info, int flags) |
| 804 | { |
| 805 | Assert(flags & HASH_DIRSIZE); |
| 806 | Assert(info->dsize == info->max_dsize); |
| 807 | return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT); |
| 808 | } |
| 809 | |
| 810 | |
| 811 | /********************** DESTROY ROUTINES ************************/ |
| 812 | |
| 813 | void |
| 814 | hash_destroy(HTAB *hashp) |
| 815 | { |
| 816 | if (hashp != NULL) |
| 817 | { |
| 818 | /* allocation method must be one we know how to free, too */ |
| 819 | Assert(hashp->alloc == DynaHashAlloc); |
| 820 | /* so this hashtable must have its own context */ |
| 821 | Assert(hashp->hcxt != NULL); |
| 822 | |
| 823 | hash_stats("destroy" , hashp); |
| 824 | |
| 825 | /* |
| 826 | * Free everything by destroying the hash table's memory context. |
| 827 | */ |
| 828 | MemoryContextDelete(hashp->hcxt); |
| 829 | } |
| 830 | } |
| 831 | |
| 832 | void |
| 833 | hash_stats(const char *where, HTAB *hashp) |
| 834 | { |
| 835 | #if HASH_STATISTICS |
| 836 | fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n" , |
| 837 | where, hashp->hctl->accesses, hashp->hctl->collisions); |
| 838 | |
| 839 | fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n" , |
| 840 | hash_get_num_entries(hashp), (long) hashp->hctl->keysize, |
| 841 | hashp->hctl->max_bucket, hashp->hctl->nsegs); |
| 842 | fprintf(stderr, "%s: total accesses %ld total collisions %ld\n" , |
| 843 | where, hash_accesses, hash_collisions); |
| 844 | fprintf(stderr, "hash_stats: total expansions %ld\n" , |
| 845 | hash_expansions); |
| 846 | #endif |
| 847 | } |
| 848 | |
| 849 | /*******************************SEARCH ROUTINES *****************************/ |
| 850 | |
| 851 | |
| 852 | /* |
| 853 | * get_hash_value -- exported routine to calculate a key's hash value |
| 854 | * |
| 855 | * We export this because for partitioned tables, callers need to compute |
| 856 | * the partition number (from the low-order bits of the hash value) before |
| 857 | * searching. |
| 858 | */ |
| 859 | uint32 |
| 860 | get_hash_value(HTAB *hashp, const void *keyPtr) |
| 861 | { |
| 862 | return hashp->hash(keyPtr, hashp->keysize); |
| 863 | } |
| 864 | |
| 865 | /* Convert a hash value to a bucket number */ |
| 866 | static inline uint32 |
| 867 | calc_bucket(HASHHDR *hctl, uint32 hash_val) |
| 868 | { |
| 869 | uint32 bucket; |
| 870 | |
| 871 | bucket = hash_val & hctl->high_mask; |
| 872 | if (bucket > hctl->max_bucket) |
| 873 | bucket = bucket & hctl->low_mask; |
| 874 | |
| 875 | return bucket; |
| 876 | } |
| 877 | |
| 878 | /* |
| 879 | * hash_search -- look up key in table and perform action |
| 880 | * hash_search_with_hash_value -- same, with key's hash value already computed |
| 881 | * |
| 882 | * action is one of: |
| 883 | * HASH_FIND: look up key in table |
| 884 | * HASH_ENTER: look up key in table, creating entry if not present |
| 885 | * HASH_ENTER_NULL: same, but return NULL if out of memory |
| 886 | * HASH_REMOVE: look up key in table, remove entry if present |
| 887 | * |
| 888 | * Return value is a pointer to the element found/entered/removed if any, |
| 889 | * or NULL if no match was found. (NB: in the case of the REMOVE action, |
| 890 | * the result is a dangling pointer that shouldn't be dereferenced!) |
| 891 | * |
| 892 | * HASH_ENTER will normally ereport a generic "out of memory" error if |
| 893 | * it is unable to create a new entry. The HASH_ENTER_NULL operation is |
| 894 | * the same except it will return NULL if out of memory. Note that |
| 895 | * HASH_ENTER_NULL cannot be used with the default palloc-based allocator, |
| 896 | * since palloc internally ereports on out-of-memory. |
| 897 | * |
| 898 | * If foundPtr isn't NULL, then *foundPtr is set true if we found an |
| 899 | * existing entry in the table, false otherwise. This is needed in the |
| 900 | * HASH_ENTER case, but is redundant with the return value otherwise. |
| 901 | * |
| 902 | * For hash_search_with_hash_value, the hashvalue parameter must have been |
| 903 | * calculated with get_hash_value(). |
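 *
 * A typical insert-or-find call looks like this (MyHashEntry is a
 * hypothetical entry struct beginning with the key):
 *
 *		bool		found;
 *		MyHashEntry *entry;
 *
 *		entry = (MyHashEntry *) hash_search(myhash, &key, HASH_ENTER, &found);
 *		if (!found)
 *			... initialize the non-key fields of *entry ...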
| 904 | */ |
| 905 | void * |
| 906 | hash_search(HTAB *hashp, |
| 907 | const void *keyPtr, |
| 908 | HASHACTION action, |
| 909 | bool *foundPtr) |
| 910 | { |
| 911 | return hash_search_with_hash_value(hashp, |
| 912 | keyPtr, |
| 913 | hashp->hash(keyPtr, hashp->keysize), |
| 914 | action, |
| 915 | foundPtr); |
| 916 | } |
| 917 | |
| 918 | void * |
| 919 | hash_search_with_hash_value(HTAB *hashp, |
| 920 | const void *keyPtr, |
| 921 | uint32 hashvalue, |
| 922 | HASHACTION action, |
| 923 | bool *foundPtr) |
| 924 | { |
| 925 | HASHHDR *hctl = hashp->hctl; |
| 926 | int freelist_idx = FREELIST_IDX(hctl, hashvalue); |
| 927 | Size keysize; |
| 928 | uint32 bucket; |
| 929 | long segment_num; |
| 930 | long segment_ndx; |
| 931 | HASHSEGMENT segp; |
| 932 | HASHBUCKET currBucket; |
| 933 | HASHBUCKET *prevBucketPtr; |
| 934 | HashCompareFunc match; |
| 935 | |
| 936 | #if HASH_STATISTICS |
| 937 | hash_accesses++; |
| 938 | hctl->accesses++; |
| 939 | #endif |
| 940 | |
| 941 | /* |
| 942 | * If inserting, check if it is time to split a bucket. |
| 943 | * |
| 944 | * NOTE: failure to expand table is not a fatal error, it just means we |
| 945 | * have to run at higher fill factor than we wanted. However, if we're |
| 946 | * using the palloc allocator then it will throw error anyway on |
| 947 | * out-of-memory, so we must do this before modifying the table. |
| 948 | */ |
| 949 | if (action == HASH_ENTER || action == HASH_ENTER_NULL) |
| 950 | { |
| 951 | /* |
| 952 | * Can't split if running in partitioned mode, nor if frozen, nor if |
| 953 | * table is the subject of any active hash_seq_search scans. Strange |
| 954 | * order of these tests is to try to check cheaper conditions first. |
| 955 | */ |
| 956 | if (!IS_PARTITIONED(hctl) && !hashp->frozen && |
| 957 | hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && |
| 958 | !has_seq_scans(hashp)) |
| 959 | (void) expand_table(hashp); |
| 960 | } |
| 961 | |
| 962 | /* |
| 963 | * Do the initial lookup |
| 964 | */ |
| 965 | bucket = calc_bucket(hctl, hashvalue); |
| 966 | |
| 967 | segment_num = bucket >> hashp->sshift; |
| 968 | segment_ndx = MOD(bucket, hashp->ssize); |
| 969 | |
| 970 | segp = hashp->dir[segment_num]; |
| 971 | |
| 972 | if (segp == NULL) |
| 973 | hash_corrupted(hashp); |
| 974 | |
| 975 | prevBucketPtr = &segp[segment_ndx]; |
| 976 | currBucket = *prevBucketPtr; |
| 977 | |
| 978 | /* |
| 979 | * Follow collision chain looking for matching key |
| 980 | */ |
| 981 | match = hashp->match; /* save one fetch in inner loop */ |
| 982 | keysize = hashp->keysize; /* ditto */ |
| 983 | |
| 984 | while (currBucket != NULL) |
| 985 | { |
| 986 | if (currBucket->hashvalue == hashvalue && |
| 987 | match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0) |
| 988 | break; |
| 989 | prevBucketPtr = &(currBucket->link); |
| 990 | currBucket = *prevBucketPtr; |
| 991 | #if HASH_STATISTICS |
| 992 | hash_collisions++; |
| 993 | hctl->collisions++; |
| 994 | #endif |
| 995 | } |
| 996 | |
| 997 | if (foundPtr) |
| 998 | *foundPtr = (bool) (currBucket != NULL); |
| 999 | |
| 1000 | /* |
| 1001 | * OK, now what? |
| 1002 | */ |
| 1003 | switch (action) |
| 1004 | { |
| 1005 | case HASH_FIND: |
| 1006 | if (currBucket != NULL) |
| 1007 | return (void *) ELEMENTKEY(currBucket); |
| 1008 | return NULL; |
| 1009 | |
| 1010 | case HASH_REMOVE: |
| 1011 | if (currBucket != NULL) |
| 1012 | { |
| 1013 | /* if partitioned, must lock to touch nentries and freeList */ |
| 1014 | if (IS_PARTITIONED(hctl)) |
| 1015 | SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex)); |
| 1016 | |
| 1017 | /* delete the record from the appropriate nentries counter. */ |
| 1018 | Assert(hctl->freeList[freelist_idx].nentries > 0); |
| 1019 | hctl->freeList[freelist_idx].nentries--; |
| 1020 | |
| 1021 | /* remove record from hash bucket's chain. */ |
| 1022 | *prevBucketPtr = currBucket->link; |
| 1023 | |
| 1024 | /* add the record to the appropriate freelist. */ |
| 1025 | currBucket->link = hctl->freeList[freelist_idx].freeList; |
| 1026 | hctl->freeList[freelist_idx].freeList = currBucket; |
| 1027 | |
| 1028 | if (IS_PARTITIONED(hctl)) |
| 1029 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
| 1030 | |
| 1031 | /* |
| 1032 | * better hope the caller is synchronizing access to this |
| 1033 | * element, because someone else is going to reuse it the next |
| 1034 | * time something is added to the table |
| 1035 | */ |
| 1036 | return (void *) ELEMENTKEY(currBucket); |
| 1037 | } |
| 1038 | return NULL; |
| 1039 | |
| 1040 | case HASH_ENTER_NULL: |
| 1041 | /* ENTER_NULL does not work with palloc-based allocator */ |
| 1042 | Assert(hashp->alloc != DynaHashAlloc); |
| 1043 | /* FALL THRU */ |
| 1044 | |
| 1045 | case HASH_ENTER: |
| 1046 | /* Return existing element if found, else create one */ |
| 1047 | if (currBucket != NULL) |
| 1048 | return (void *) ELEMENTKEY(currBucket); |
| 1049 | |
| 1050 | /* disallow inserts if frozen */ |
| 1051 | if (hashp->frozen) |
| 1052 | elog(ERROR, "cannot insert into frozen hashtable \"%s\"" , |
| 1053 | hashp->tabname); |
| 1054 | |
| 1055 | currBucket = get_hash_entry(hashp, freelist_idx); |
| 1056 | if (currBucket == NULL) |
| 1057 | { |
| 1058 | /* out of memory */ |
| 1059 | if (action == HASH_ENTER_NULL) |
| 1060 | return NULL; |
| 1061 | /* report a generic message */ |
| 1062 | if (hashp->isshared) |
| 1063 | ereport(ERROR, |
| 1064 | (errcode(ERRCODE_OUT_OF_MEMORY), |
| 1065 | errmsg("out of shared memory" ))); |
| 1066 | else |
| 1067 | ereport(ERROR, |
| 1068 | (errcode(ERRCODE_OUT_OF_MEMORY), |
| 1069 | errmsg("out of memory" ))); |
| 1070 | } |
| 1071 | |
| 1072 | /* link into hashbucket chain */ |
| 1073 | *prevBucketPtr = currBucket; |
| 1074 | currBucket->link = NULL; |
| 1075 | |
| 1076 | /* copy key into record */ |
| 1077 | currBucket->hashvalue = hashvalue; |
| 1078 | hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize); |
| 1079 | |
| 1080 | /* |
| 1081 | * Caller is expected to fill the data field on return. DO NOT |
| 1082 | * insert any code that could possibly throw error here, as doing |
| 1083 | * so would leave the table entry incomplete and hence corrupt the |
| 1084 | * caller's data structure. |
| 1085 | */ |
| 1086 | |
| 1087 | return (void *) ELEMENTKEY(currBucket); |
| 1088 | } |
| 1089 | |
| 1090 | elog(ERROR, "unrecognized hash action code: %d" , (int) action); |
| 1091 | |
| 1092 | return NULL; /* keep compiler quiet */ |
| 1093 | } |
| 1094 | |
| 1095 | /* |
| 1096 | * hash_update_hash_key -- change the hash key of an existing table entry |
| 1097 | * |
| 1098 | * This is equivalent to removing the entry, making a new entry, and copying |
| 1099 | * over its data, except that the entry never goes to the table's freelist. |
| 1100 | * Therefore this cannot suffer an out-of-memory failure, even if there are |
| 1101 | * other processes operating in other partitions of the hashtable. |
| 1102 | * |
| 1103 | * Returns true if successful, false if the requested new hash key is already |
| 1104 | * present. Throws error if the specified entry pointer isn't actually a |
| 1105 | * table member. |
| 1106 | * |
| 1107 | * NB: currently, there is no special case for old and new hash keys being |
| 1108 | * identical, which means we'll report false for that situation. This is |
| 1109 | * preferable for existing uses. |
| 1110 | * |
| 1111 | * NB: for a partitioned hashtable, caller must hold lock on both relevant |
| 1112 | * partitions, if the new hash key would belong to a different partition. |
| 1113 | */ |
| 1114 | bool |
| 1115 | hash_update_hash_key(HTAB *hashp, |
| 1116 | void *existingEntry, |
| 1117 | const void *newKeyPtr) |
| 1118 | { |
| 1119 | HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry); |
| 1120 | HASHHDR *hctl = hashp->hctl; |
| 1121 | uint32 newhashvalue; |
| 1122 | Size keysize; |
| 1123 | uint32 bucket; |
| 1124 | uint32 newbucket; |
| 1125 | long segment_num; |
| 1126 | long segment_ndx; |
| 1127 | HASHSEGMENT segp; |
| 1128 | HASHBUCKET currBucket; |
| 1129 | HASHBUCKET *prevBucketPtr; |
| 1130 | HASHBUCKET *oldPrevPtr; |
| 1131 | HashCompareFunc match; |
| 1132 | |
| 1133 | #if HASH_STATISTICS |
| 1134 | hash_accesses++; |
| 1135 | hctl->accesses++; |
| 1136 | #endif |
| 1137 | |
| 1138 | /* disallow updates if frozen */ |
| 1139 | if (hashp->frozen) |
| 1140 | elog(ERROR, "cannot update in frozen hashtable \"%s\"" , |
| 1141 | hashp->tabname); |
| 1142 | |
| 1143 | /* |
| 1144 | * Lookup the existing element using its saved hash value. We need to do |
| 1145 | * this to be able to unlink it from its hash chain, but as a side benefit |
| 1146 | * we can verify the validity of the passed existingEntry pointer. |
| 1147 | */ |
| 1148 | bucket = calc_bucket(hctl, existingElement->hashvalue); |
| 1149 | |
| 1150 | segment_num = bucket >> hashp->sshift; |
| 1151 | segment_ndx = MOD(bucket, hashp->ssize); |
| 1152 | |
| 1153 | segp = hashp->dir[segment_num]; |
| 1154 | |
| 1155 | if (segp == NULL) |
| 1156 | hash_corrupted(hashp); |
| 1157 | |
| 1158 | prevBucketPtr = &segp[segment_ndx]; |
| 1159 | currBucket = *prevBucketPtr; |
| 1160 | |
| 1161 | while (currBucket != NULL) |
| 1162 | { |
| 1163 | if (currBucket == existingElement) |
| 1164 | break; |
| 1165 | prevBucketPtr = &(currBucket->link); |
| 1166 | currBucket = *prevBucketPtr; |
| 1167 | } |
| 1168 | |
| 1169 | if (currBucket == NULL) |
| 1170 | elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"" , |
| 1171 | hashp->tabname); |
| 1172 | |
| 1173 | oldPrevPtr = prevBucketPtr; |
| 1174 | |
| 1175 | /* |
| 1176 | * Now perform the equivalent of a HASH_ENTER operation to locate the hash |
| 1177 | * chain we want to put the entry into. |
| 1178 | */ |
| 1179 | newhashvalue = hashp->hash(newKeyPtr, hashp->keysize); |
| 1180 | |
| 1181 | newbucket = calc_bucket(hctl, newhashvalue); |
| 1182 | |
| 1183 | segment_num = newbucket >> hashp->sshift; |
| 1184 | segment_ndx = MOD(newbucket, hashp->ssize); |
| 1185 | |
| 1186 | segp = hashp->dir[segment_num]; |
| 1187 | |
| 1188 | if (segp == NULL) |
| 1189 | hash_corrupted(hashp); |
| 1190 | |
| 1191 | prevBucketPtr = &segp[segment_ndx]; |
| 1192 | currBucket = *prevBucketPtr; |
| 1193 | |
| 1194 | /* |
| 1195 | * Follow collision chain looking for matching key |
| 1196 | */ |
| 1197 | match = hashp->match; /* save one fetch in inner loop */ |
| 1198 | keysize = hashp->keysize; /* ditto */ |
| 1199 | |
| 1200 | while (currBucket != NULL) |
| 1201 | { |
| 1202 | if (currBucket->hashvalue == newhashvalue && |
| 1203 | match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0) |
| 1204 | break; |
| 1205 | prevBucketPtr = &(currBucket->link); |
| 1206 | currBucket = *prevBucketPtr; |
| 1207 | #if HASH_STATISTICS |
| 1208 | hash_collisions++; |
| 1209 | hctl->collisions++; |
| 1210 | #endif |
| 1211 | } |
| 1212 | |
| 1213 | if (currBucket != NULL) |
| 1214 | return false; /* collision with an existing entry */ |
| 1215 | |
| 1216 | currBucket = existingElement; |
| 1217 | |
| 1218 | /* |
| 1219 | * If old and new hash values belong to the same bucket, we need not |
| 1220 | * change any chain links, and indeed should not since this simplistic |
| 1221 | * update will corrupt the list if currBucket is the last element. (We |
| 1222 | * cannot fall out earlier, however, since we need to scan the bucket to |
| 1223 | * check for duplicate keys.) |
| 1224 | */ |
| 1225 | if (bucket != newbucket) |
| 1226 | { |
| 1227 | /* OK to remove record from old hash bucket's chain. */ |
| 1228 | *oldPrevPtr = currBucket->link; |
| 1229 | |
| 1230 | /* link into new hashbucket chain */ |
| 1231 | *prevBucketPtr = currBucket; |
| 1232 | currBucket->link = NULL; |
| 1233 | } |
| 1234 | |
| 1235 | /* copy new key into record */ |
| 1236 | currBucket->hashvalue = newhashvalue; |
| 1237 | hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize); |
| 1238 | |
| 1239 | /* rest of record is untouched */ |
| 1240 | |
| 1241 | return true; |
| 1242 | } |
| 1243 | |
| 1244 | /* |
| 1245 | * Allocate a new hashtable entry if possible; return NULL if out of memory. |
| 1246 | * (Or, if the underlying space allocator throws error for out-of-memory, |
| 1247 | * we won't return at all.) |
| 1248 | */ |
| 1249 | static HASHBUCKET |
| 1250 | get_hash_entry(HTAB *hashp, int freelist_idx) |
| 1251 | { |
| 1252 | HASHHDR *hctl = hashp->hctl; |
| 1253 | HASHBUCKET newElement; |
| 1254 | |
| 1255 | for (;;) |
| 1256 | { |
| 1257 | /* if partitioned, must lock to touch nentries and freeList */ |
| 1258 | if (IS_PARTITIONED(hctl)) |
| 1259 | SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); |
| 1260 | |
| 1261 | /* try to get an entry from the freelist */ |
| 1262 | newElement = hctl->freeList[freelist_idx].freeList; |
| 1263 | |
| 1264 | if (newElement != NULL) |
| 1265 | break; |
| 1266 | |
| 1267 | if (IS_PARTITIONED(hctl)) |
| 1268 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
| 1269 | |
| 1270 | /* |
| 1271 | * No free elements in this freelist. In a partitioned table, there |
| 1272 | * might be entries in other freelists, but to reduce contention we |
| 1273 | * prefer to first try to get another chunk of buckets from the main |
| 1274 | * shmem allocator. If that fails, though, we *MUST* root through all |
| 1275 | * the other freelists before giving up. There are multiple callers |
| 1276 | * that assume that they can allocate every element in the initially |
| 1277 | * requested table size, or that deleting an element guarantees they |
| 1278 | * can insert a new element, even if shared memory is entirely full. |
| 1279 | * Failing because the needed element is in a different freelist is |
| 1280 | * not acceptable. |
| 1281 | */ |
| 1282 | if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx)) |
| 1283 | { |
| 1284 | int borrow_from_idx; |
| 1285 | |
| 1286 | if (!IS_PARTITIONED(hctl)) |
| 1287 | return NULL; /* out of memory */ |
| 1288 | |
| 1289 | /* try to borrow element from another freelist */ |
| 1290 | borrow_from_idx = freelist_idx; |
| 1291 | for (;;) |
| 1292 | { |
| 1293 | borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS; |
| 1294 | if (borrow_from_idx == freelist_idx) |
| 1295 | break; /* examined all freelists, fail */ |
| 1296 | |
| 1297 | SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex)); |
| 1298 | newElement = hctl->freeList[borrow_from_idx].freeList; |
| 1299 | |
| 1300 | if (newElement != NULL) |
| 1301 | { |
| 1302 | hctl->freeList[borrow_from_idx].freeList = newElement->link; |
| 1303 | SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex)); |
| 1304 | |
| 1305 | /* careful: count the new element in its proper freelist */ |
| 1306 | SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); |
| 1307 | hctl->freeList[freelist_idx].nentries++; |
| 1308 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
| 1309 | |
| 1310 | return newElement; |
| 1311 | } |
| 1312 | |
| 1313 | SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex)); |
| 1314 | } |
| 1315 | |
| 1316 | /* no elements available to borrow either, so out of memory */ |
| 1317 | return NULL; |
| 1318 | } |
| 1319 | } |
| 1320 | |
| 1321 | /* remove entry from freelist, bump nentries */ |
| 1322 | hctl->freeList[freelist_idx].freeList = newElement->link; |
| 1323 | hctl->freeList[freelist_idx].nentries++; |
| 1324 | |
| 1325 | if (IS_PARTITIONED(hctl)) |
| 1326 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
| 1327 | |
| 1328 | return newElement; |
| 1329 | } |
| 1330 | |
| 1331 | /* |
| 1332 | * hash_get_num_entries -- get the number of entries in a hashtable |
| 1333 | */ |
| 1334 | long |
| 1335 | hash_get_num_entries(HTAB *hashp) |
| 1336 | { |
| 1337 | int i; |
| 1338 | long sum = hashp->hctl->freeList[0].nentries; |
| 1339 | |
| 1340 | /* |
| 1341 | * We currently don't bother with acquiring the mutexes; it's only |
| 1342 | * sensible to call this function if you've got lock on all partitions of |
| 1343 | * the table. |
| 1344 | */ |
| 1345 | if (IS_PARTITIONED(hashp->hctl)) |
| 1346 | { |
| 1347 | for (i = 1; i < NUM_FREELISTS; i++) |
| 1348 | sum += hashp->hctl->freeList[i].nentries; |
| 1349 | } |
| 1350 | |
| 1351 | return sum; |
| 1352 | } |
| 1353 | |
| 1354 | /* |
| 1355 | * hash_seq_init/_search/_term |
| 1356 | * Sequentially search through hash table and return |
| 1357 | * all the elements one by one, return NULL when no more. |
| 1358 | * |
| 1359 | * hash_seq_term should be called if and only if the scan is abandoned before |
| 1360 | * completion; if hash_seq_search returns NULL then it has already done the |
| 1361 | * end-of-scan cleanup. |
| 1362 | * |
| 1363 | * NOTE: caller may delete the returned element before continuing the scan. |
| 1364 | * However, deleting any other element while the scan is in progress is |
| 1365 | * UNDEFINED (it might be the one that curIndex is pointing at!). Also, |
| 1366 | * if elements are added to the table while the scan is in progress, it is |
| 1367 | * unspecified whether they will be visited by the scan or not. |
| 1368 | * |
| 1369 | * NOTE: it is possible to use hash_seq_init/hash_seq_search without any |
| 1370 | * worry about hash_seq_term cleanup, if the hashtable is first locked against |
| 1371 | * further insertions by calling hash_freeze. |
| 1372 | * |
| 1373 | * NOTE: to use this with a partitioned hashtable, caller had better hold |
| 1374 | * at least shared lock on all partitions of the table throughout the scan! |
| 1375 | * We can cope with insertions or deletions by our own backend, but *not* |
| 1376 | * with concurrent insertions or deletions by another. |
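 *
 * The canonical scan loop looks like this (MyHashEntry is a hypothetical
 * entry struct); no hash_seq_term call is needed because the loop runs to
 * completion:
 *
 *		HASH_SEQ_STATUS status;
 *		MyHashEntry *entry;
 *
 *		hash_seq_init(&status, myhash);
 *		while ((entry = (MyHashEntry *) hash_seq_search(&status)) != NULL)
 *			... process *entry ...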
| 1377 | */ |
| 1378 | void |
| 1379 | hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) |
| 1380 | { |
| 1381 | status->hashp = hashp; |
| 1382 | status->curBucket = 0; |
| 1383 | status->curEntry = NULL; |
| 1384 | if (!hashp->frozen) |
| 1385 | register_seq_scan(hashp); |
| 1386 | } |
| 1387 | |
| 1388 | void * |
| 1389 | hash_seq_search(HASH_SEQ_STATUS *status) |
| 1390 | { |
| 1391 | HTAB *hashp; |
| 1392 | HASHHDR *hctl; |
| 1393 | uint32 max_bucket; |
| 1394 | long ssize; |
| 1395 | long segment_num; |
| 1396 | long segment_ndx; |
| 1397 | HASHSEGMENT segp; |
| 1398 | uint32 curBucket; |
| 1399 | HASHELEMENT *curElem; |
| 1400 | |
| 1401 | if ((curElem = status->curEntry) != NULL) |
| 1402 | { |
| 1403 | /* Continuing scan of curBucket... */ |
| 1404 | status->curEntry = curElem->link; |
| 1405 | if (status->curEntry == NULL) /* end of this bucket */ |
| 1406 | ++status->curBucket; |
| 1407 | return (void *) ELEMENTKEY(curElem); |
| 1408 | } |
| 1409 | |
| 1410 | /* |
| 1411 | * Search for next nonempty bucket starting at curBucket. |
| 1412 | */ |
| 1413 | curBucket = status->curBucket; |
| 1414 | hashp = status->hashp; |
| 1415 | hctl = hashp->hctl; |
| 1416 | ssize = hashp->ssize; |
| 1417 | max_bucket = hctl->max_bucket; |
| 1418 | |
| 1419 | if (curBucket > max_bucket) |
| 1420 | { |
| 1421 | hash_seq_term(status); |
| 1422 | return NULL; /* search is done */ |
| 1423 | } |
| 1424 | |
| 1425 | /* |
| 1426 | * first find the right segment in the table directory. |
| 1427 | */ |
| 1428 | segment_num = curBucket >> hashp->sshift; |
| 1429 | segment_ndx = MOD(curBucket, ssize); |
| 1430 | |
| 1431 | segp = hashp->dir[segment_num]; |
| 1432 | |
| 1433 | /* |
| 1434 | * Pick up the first item in this bucket's chain. If chain is not empty |
| 1435 | * we can begin searching it. Otherwise we have to advance to find the |
| 1436 | * next nonempty bucket. We try to optimize that case since searching a |
| 1437 | * near-empty hashtable has to iterate this loop a lot. |
| 1438 | */ |
| 1439 | while ((curElem = segp[segment_ndx]) == NULL) |
| 1440 | { |
| 1441 | /* empty bucket, advance to next */ |
| 1442 | if (++curBucket > max_bucket) |
| 1443 | { |
| 1444 | status->curBucket = curBucket; |
| 1445 | hash_seq_term(status); |
| 1446 | return NULL; /* search is done */ |
| 1447 | } |
| 1448 | if (++segment_ndx >= ssize) |
| 1449 | { |
| 1450 | segment_num++; |
| 1451 | segment_ndx = 0; |
| 1452 | segp = hashp->dir[segment_num]; |
| 1453 | } |
| 1454 | } |
| 1455 | |
| 1456 | /* Begin scan of curBucket... */ |
| 1457 | status->curEntry = curElem->link; |
| 1458 | if (status->curEntry == NULL) /* end of this bucket */ |
| 1459 | ++curBucket; |
| 1460 | status->curBucket = curBucket; |
| 1461 | return (void *) ELEMENTKEY(curElem); |
| 1462 | } |
| 1463 | |
| 1464 | void |
| 1465 | hash_seq_term(HASH_SEQ_STATUS *status) |
| 1466 | { |
| 1467 | if (!status->hashp->frozen) |
| 1468 | deregister_seq_scan(status->hashp); |
| 1469 | } |
| 1470 | |
| 1471 | /* |
| 1472 | * hash_freeze |
| 1473 | * Freeze a hashtable against future insertions (deletions are |
| 1474 | * still allowed) |
| 1475 | * |
| 1476 | * The reason for doing this is that by preventing any more bucket splits, |
| 1477 | * we no longer need to worry about registering hash_seq_search scans, |
| 1478 | * and thus caller need not be careful about ensuring hash_seq_term gets |
| 1479 | * called at the right times. |
| 1480 | * |
| 1481 | * Multiple calls to hash_freeze() are allowed, but you can't freeze a table |
| 1482 | * with active scans (since hash_seq_term would then do the wrong thing). |
| 1483 | */ |
| 1484 | void |
| 1485 | hash_freeze(HTAB *hashp) |
| 1486 | { |
| 1487 | if (hashp->isshared) |
		elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
| 1489 | if (!hashp->frozen && has_seq_scans(hashp)) |
		elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
| 1491 | hashp->tabname); |
| 1492 | hashp->frozen = true; |
| 1493 | } |
| 1494 | |
| 1495 | |
| 1496 | /********************************* UTILITIES ************************/ |
| 1497 | |
| 1498 | /* |
| 1499 | * Expand the table by adding one more hash bucket. |
| 1500 | */ |
| 1501 | static bool |
| 1502 | expand_table(HTAB *hashp) |
| 1503 | { |
| 1504 | HASHHDR *hctl = hashp->hctl; |
| 1505 | HASHSEGMENT old_seg, |
| 1506 | new_seg; |
| 1507 | long old_bucket, |
| 1508 | new_bucket; |
| 1509 | long new_segnum, |
| 1510 | new_segndx; |
| 1511 | long old_segnum, |
| 1512 | old_segndx; |
| 1513 | HASHBUCKET *oldlink, |
| 1514 | *newlink; |
| 1515 | HASHBUCKET currElement, |
| 1516 | nextElement; |
| 1517 | |
| 1518 | Assert(!IS_PARTITIONED(hctl)); |
| 1519 | |
| 1520 | #ifdef HASH_STATISTICS |
| 1521 | hash_expansions++; |
| 1522 | #endif |
| 1523 | |
| 1524 | new_bucket = hctl->max_bucket + 1; |
| 1525 | new_segnum = new_bucket >> hashp->sshift; |
| 1526 | new_segndx = MOD(new_bucket, hashp->ssize); |
| 1527 | |
| 1528 | if (new_segnum >= hctl->nsegs) |
| 1529 | { |
| 1530 | /* Allocate new segment if necessary -- could fail if dir full */ |
| 1531 | if (new_segnum >= hctl->dsize) |
| 1532 | if (!dir_realloc(hashp)) |
| 1533 | return false; |
| 1534 | if (!(hashp->dir[new_segnum] = seg_alloc(hashp))) |
| 1535 | return false; |
| 1536 | hctl->nsegs++; |
| 1537 | } |
| 1538 | |
| 1539 | /* OK, we created a new bucket */ |
| 1540 | hctl->max_bucket++; |
| 1541 | |
| 1542 | /* |
| 1543 | * *Before* changing masks, find old bucket corresponding to same hash |
| 1544 | * values; values in that bucket may need to be relocated to new bucket. |
| 1545 | * Note that new_bucket is certainly larger than low_mask at this point, |
| 1546 | * so we can skip the first step of the regular hash mask calc. |
| 1547 | */ |
| 1548 | old_bucket = (new_bucket & hctl->low_mask); |
| 1549 | |
| 1550 | /* |
| 1551 | * If we crossed a power of 2, readjust masks. |
| 1552 | */ |
| 1553 | if ((uint32) new_bucket > hctl->high_mask) |
| 1554 | { |
| 1555 | hctl->low_mask = hctl->high_mask; |
| 1556 | hctl->high_mask = (uint32) new_bucket | hctl->low_mask; |
| 1557 | } |
| 1558 | |
| 1559 | /* |
| 1560 | * Relocate records to the new bucket. NOTE: because of the way the hash |
| 1561 | * masking is done in calc_bucket, only one old bucket can need to be |
| 1562 | * split at this point. With a different way of reducing the hash value, |
| 1563 | * that might not be true! |
| 1564 | */ |
| 1565 | old_segnum = old_bucket >> hashp->sshift; |
| 1566 | old_segndx = MOD(old_bucket, hashp->ssize); |
| 1567 | |
| 1568 | old_seg = hashp->dir[old_segnum]; |
| 1569 | new_seg = hashp->dir[new_segnum]; |
| 1570 | |
| 1571 | oldlink = &old_seg[old_segndx]; |
| 1572 | newlink = &new_seg[new_segndx]; |
| 1573 | |
| 1574 | for (currElement = *oldlink; |
| 1575 | currElement != NULL; |
| 1576 | currElement = nextElement) |
| 1577 | { |
| 1578 | nextElement = currElement->link; |
| 1579 | if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket) |
| 1580 | { |
| 1581 | *oldlink = currElement; |
| 1582 | oldlink = &currElement->link; |
| 1583 | } |
| 1584 | else |
| 1585 | { |
| 1586 | *newlink = currElement; |
| 1587 | newlink = &currElement->link; |
| 1588 | } |
| 1589 | } |
| 1590 | /* don't forget to terminate the rebuilt hash chains... */ |
| 1591 | *oldlink = NULL; |
| 1592 | *newlink = NULL; |
| 1593 | |
| 1594 | return true; |
| 1595 | } |
| 1596 | |
| 1597 | |
| 1598 | static bool |
| 1599 | dir_realloc(HTAB *hashp) |
| 1600 | { |
| 1601 | HASHSEGMENT *p; |
| 1602 | HASHSEGMENT *old_p; |
| 1603 | long new_dsize; |
| 1604 | long old_dirsize; |
| 1605 | long new_dirsize; |
| 1606 | |
| 1607 | if (hashp->hctl->max_dsize != NO_MAX_DSIZE) |
| 1608 | return false; |
| 1609 | |
| 1610 | /* Reallocate directory */ |
| 1611 | new_dsize = hashp->hctl->dsize << 1; |
| 1612 | old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT); |
| 1613 | new_dirsize = new_dsize * sizeof(HASHSEGMENT); |
| 1614 | |
| 1615 | old_p = hashp->dir; |
| 1616 | CurrentDynaHashCxt = hashp->hcxt; |
| 1617 | p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize); |
| 1618 | |
| 1619 | if (p != NULL) |
| 1620 | { |
| 1621 | memcpy(p, old_p, old_dirsize); |
| 1622 | MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize); |
| 1623 | hashp->dir = p; |
| 1624 | hashp->hctl->dsize = new_dsize; |
| 1625 | |
| 1626 | /* XXX assume the allocator is palloc, so we know how to free */ |
| 1627 | Assert(hashp->alloc == DynaHashAlloc); |
| 1628 | pfree(old_p); |
| 1629 | |
| 1630 | return true; |
| 1631 | } |
| 1632 | |
| 1633 | return false; |
| 1634 | } |
| 1635 | |
| 1636 | |
| 1637 | static HASHSEGMENT |
| 1638 | seg_alloc(HTAB *hashp) |
| 1639 | { |
| 1640 | HASHSEGMENT segp; |
| 1641 | |
| 1642 | CurrentDynaHashCxt = hashp->hcxt; |
| 1643 | segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize); |
| 1644 | |
| 1645 | if (!segp) |
| 1646 | return NULL; |
| 1647 | |
| 1648 | MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize); |
| 1649 | |
| 1650 | return segp; |
| 1651 | } |
| 1652 | |
| 1653 | /* |
| 1654 | * allocate some new elements and link them into the indicated free list |
| 1655 | */ |
| 1656 | static bool |
| 1657 | element_alloc(HTAB *hashp, int nelem, int freelist_idx) |
| 1658 | { |
| 1659 | HASHHDR *hctl = hashp->hctl; |
| 1660 | Size elementSize; |
| 1661 | HASHELEMENT *firstElement; |
| 1662 | HASHELEMENT *tmpElement; |
| 1663 | HASHELEMENT *prevElement; |
| 1664 | int i; |
| 1665 | |
| 1666 | if (hashp->isfixed) |
| 1667 | return false; |
| 1668 | |
| 1669 | /* Each element has a HASHELEMENT header plus user data. */ |
| 1670 | elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize); |
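
	/*
	 * For instance (illustrative figures, assuming a typical 64-bit build
	 * with MAXIMUM_ALIGNOF = 8): sizeof(HASHELEMENT) pads to 16 bytes, so
	 * an entrysize of 24 yields elementSize = 16 + 24 = 40 bytes per
	 * element.
	 */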
| 1671 | |
| 1672 | CurrentDynaHashCxt = hashp->hcxt; |
| 1673 | firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize); |
| 1674 | |
| 1675 | if (!firstElement) |
| 1676 | return false; |
| 1677 | |
| 1678 | /* prepare to link all the new entries into the freelist */ |
| 1679 | prevElement = NULL; |
| 1680 | tmpElement = firstElement; |
| 1681 | for (i = 0; i < nelem; i++) |
| 1682 | { |
| 1683 | tmpElement->link = prevElement; |
| 1684 | prevElement = tmpElement; |
| 1685 | tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize); |
| 1686 | } |
| 1687 | |
| 1688 | /* if partitioned, must lock to touch freeList */ |
| 1689 | if (IS_PARTITIONED(hctl)) |
| 1690 | SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); |
| 1691 | |
| 1692 | /* freelist could be nonempty if two backends did this concurrently */ |
| 1693 | firstElement->link = hctl->freeList[freelist_idx].freeList; |
| 1694 | hctl->freeList[freelist_idx].freeList = prevElement; |
| 1695 | |
| 1696 | if (IS_PARTITIONED(hctl)) |
| 1697 | SpinLockRelease(&hctl->freeList[freelist_idx].mutex); |
| 1698 | |
| 1699 | return true; |
| 1700 | } |
| 1701 | |
| 1702 | /* complain when we have detected a corrupted hashtable */ |
| 1703 | static void |
| 1704 | hash_corrupted(HTAB *hashp) |
| 1705 | { |
| 1706 | /* |
| 1707 | * If the corruption is in a shared hashtable, we'd better force a |
| 1708 | * systemwide restart. Otherwise, just shut down this one backend. |
| 1709 | */ |
| 1710 | if (hashp->isshared) |
		elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
| 1712 | else |
		elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
| 1714 | } |
| 1715 | |
| 1716 | /* calculate ceil(log base 2) of num */ |
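/*
 * For example, my_log2(1) == 0, my_log2(3) == 2, and my_log2(1024) == 10;
 * i.e. the result is the number of doublings of 1 needed to reach or
 * exceed num.
 */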
| 1717 | int |
| 1718 | my_log2(long num) |
| 1719 | { |
| 1720 | int i; |
| 1721 | long limit; |
| 1722 | |
	/* guard against too-large input, which would put us into an infinite loop */
| 1724 | if (num > LONG_MAX / 2) |
| 1725 | num = LONG_MAX / 2; |
| 1726 | |
| 1727 | for (i = 0, limit = 1; limit < num; i++, limit <<= 1) |
| 1728 | ; |
| 1729 | return i; |
| 1730 | } |
| 1731 | |
| 1732 | /* calculate first power of 2 >= num, bounded to what will fit in a long */ |
| 1733 | static long |
| 1734 | next_pow2_long(long num) |
| 1735 | { |
| 1736 | /* my_log2's internal range check is sufficient */ |
| 1737 | return 1L << my_log2(num); |
| 1738 | } |
| 1739 | |
| 1740 | /* calculate first power of 2 >= num, bounded to what will fit in an int */ |
| 1741 | static int |
| 1742 | next_pow2_int(long num) |
| 1743 | { |
| 1744 | if (num > INT_MAX / 2) |
| 1745 | num = INT_MAX / 2; |
| 1746 | return 1 << my_log2(num); |
| 1747 | } |
| 1748 | |
| 1749 | |
| 1750 | /************************* SEQ SCAN TRACKING ************************/ |
| 1751 | |
| 1752 | /* |
| 1753 | * We track active hash_seq_search scans here. The need for this mechanism |
| 1754 | * comes from the fact that a scan will get confused if a bucket split occurs |
| 1755 | * while it's in progress: it might visit entries twice, or even miss some |
| 1756 | * entirely (if it's partway through the same bucket that splits). Hence |
| 1757 | * we want to inhibit bucket splits if there are any active scans on the |
| 1758 | * table being inserted into. This is a fairly rare case in current usage, |
| 1759 | * so just postponing the split until the next insertion seems sufficient. |
| 1760 | * |
| 1761 | * Given present usages of the function, only a few scans are likely to be |
| 1762 | * open concurrently; so a finite-size stack of open scans seems sufficient, |
| 1763 | * and we don't worry that linear search is too slow. Note that we do |
| 1764 | * allow multiple scans of the same hashtable to be open concurrently. |
| 1765 | * |
| 1766 | * This mechanism can support concurrent scan and insertion in a shared |
| 1767 | * hashtable if it's the same backend doing both. It would fail otherwise, |
| 1768 | * but locking reasons seem to preclude any such scenario anyway, so we don't |
| 1769 | * worry. |
| 1770 | * |
| 1771 | * This arrangement is reasonably robust if a transient hashtable is deleted |
| 1772 | * without notifying us. The absolute worst case is we might inhibit splits |
| 1773 | * in another table created later at exactly the same address. We will give |
| 1774 | * a warning at transaction end for reference leaks, so any bugs leading to |
| 1775 | * lack of notification should be easy to catch. |
| 1776 | */ |
| 1777 | |
| 1778 | #define MAX_SEQ_SCANS 100 |
| 1779 | |
| 1780 | static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */ |
| 1781 | static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */ |
| 1782 | static int num_seq_scans = 0; |
| 1783 | |
| 1784 | |
| 1785 | /* Register a table as having an active hash_seq_search scan */ |
| 1786 | static void |
| 1787 | register_seq_scan(HTAB *hashp) |
| 1788 | { |
| 1789 | if (num_seq_scans >= MAX_SEQ_SCANS) |
		elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
| 1791 | hashp->tabname); |
| 1792 | seq_scan_tables[num_seq_scans] = hashp; |
| 1793 | seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel(); |
| 1794 | num_seq_scans++; |
| 1795 | } |
| 1796 | |
| 1797 | /* Deregister an active scan */ |
| 1798 | static void |
| 1799 | deregister_seq_scan(HTAB *hashp) |
| 1800 | { |
| 1801 | int i; |
| 1802 | |
| 1803 | /* Search backward since it's most likely at the stack top */ |
| 1804 | for (i = num_seq_scans - 1; i >= 0; i--) |
| 1805 | { |
| 1806 | if (seq_scan_tables[i] == hashp) |
| 1807 | { |
| 1808 | seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; |
| 1809 | seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; |
| 1810 | num_seq_scans--; |
| 1811 | return; |
| 1812 | } |
| 1813 | } |
	elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
| 1815 | hashp->tabname); |
| 1816 | } |
| 1817 | |
| 1818 | /* Check if a table has any active scan */ |
| 1819 | static bool |
| 1820 | has_seq_scans(HTAB *hashp) |
| 1821 | { |
| 1822 | int i; |
| 1823 | |
| 1824 | for (i = 0; i < num_seq_scans; i++) |
| 1825 | { |
| 1826 | if (seq_scan_tables[i] == hashp) |
| 1827 | return true; |
| 1828 | } |
| 1829 | return false; |
| 1830 | } |
| 1831 | |
| 1832 | /* Clean up any open scans at end of transaction */ |
| 1833 | void |
| 1834 | AtEOXact_HashTables(bool isCommit) |
| 1835 | { |
| 1836 | /* |
| 1837 | * During abort cleanup, open scans are expected; just silently clean 'em |
| 1838 | * out. An open scan at commit means someone forgot a hash_seq_term() |
| 1839 | * call, so complain. |
| 1840 | * |
| 1841 | * Note: it's tempting to try to print the tabname here, but refrain for |
| 1842 | * fear of touching deallocated memory. This isn't a user-facing message |
| 1843 | * anyway, so it needn't be pretty. |
| 1844 | */ |
| 1845 | if (isCommit) |
| 1846 | { |
| 1847 | int i; |
| 1848 | |
| 1849 | for (i = 0; i < num_seq_scans; i++) |
| 1850 | { |
			elog(WARNING, "leaked hash_seq_search scan for hash table %p",
| 1852 | seq_scan_tables[i]); |
| 1853 | } |
| 1854 | } |
| 1855 | num_seq_scans = 0; |
| 1856 | } |
| 1857 | |
| 1858 | /* Clean up any open scans at end of subtransaction */ |
| 1859 | void |
| 1860 | AtEOSubXact_HashTables(bool isCommit, int nestDepth) |
| 1861 | { |
| 1862 | int i; |
| 1863 | |
| 1864 | /* |
| 1865 | * Search backward to make cleanup easy. Note we must check all entries, |
 * not only those at the end of the array, because the deletion technique
| 1867 | * doesn't keep them in order. |
| 1868 | */ |
| 1869 | for (i = num_seq_scans - 1; i >= 0; i--) |
| 1870 | { |
| 1871 | if (seq_scan_level[i] >= nestDepth) |
| 1872 | { |
| 1873 | if (isCommit) |
				elog(WARNING, "leaked hash_seq_search scan for hash table %p",
| 1875 | seq_scan_tables[i]); |
| 1876 | seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; |
| 1877 | seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; |
| 1878 | num_seq_scans--; |
| 1879 | } |
| 1880 | } |
| 1881 | } |
| 1882 | |