| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | /* |
| 10 | * - Hash Table Creation |
| 11 | * The hash indexing scheme for BATs reserves a block of memory to |
| 12 | * maintain the hash table and a collision list. A one-to-one mapping |
| 13 | * exists between the BAT and the collision list using the BUN |
| 14 | * index. NOTE: we alloc the link list as a parallel array to the BUN |
| 15 | * array; hence the hash link array has the same size as |
| 16 | * BATcapacity(b) (not BATcount(b)). This allows us in the BUN insert |
| 17 | * and delete to assume that there is hash space iff there is BUN |
| 18 | * space. If there is no BUN space, the BATextend now destroys the |
| 19 | * hash table. |
| 20 | * |
| 21 | * The hash mask size is a power of two, so we can do bitwise AND on |
| 22 | * the hash (integer) number to quickly find the head of the bucket |
| 23 | * chain. Clearly, the hash mask size is a crucial parameter. If we |
| 24 | * know that the column is unique (tkey), we use direct hashing (mask |
| 25 | * size ~= BATcount). Otherwise we dynamically determine the mask size |
| 26 | * by starting out with mask size = BATcount/64 (just 1.5% of memory |
| 27 | * storage overhead). Then we start building the hash table on the |
| 28 | * first 25% of the BAT. As we aim for max-collisions list length of |
| 29 | * 4, the list on 25% should not exceed length 1. So, if a small |
| 30 | * number of collisssions occurs (mask/2) then we abandon the attempt |
| 31 | * and restart with a mask that is 4 times larger. This converges |
| 32 | * after three cycles to direct hashing. |
| 33 | */ |
| 34 | |
| 35 | #include "monetdb_config.h" |
| 36 | #include "gdk.h" |
| 37 | #include "gdk_private.h" |
| 38 | |
| 39 | static int |
| 40 | HASHwidth(BUN hashsize) |
| 41 | { |
| 42 | if (hashsize <= (BUN) BUN2_NONE) |
| 43 | return BUN2; |
| 44 | #if SIZEOF_BUN <= 4 |
| 45 | return BUN4; |
| 46 | #else |
| 47 | if (hashsize <= (BUN) BUN4_NONE) |
| 48 | return BUN4; |
| 49 | return BUN8; |
| 50 | #endif |
| 51 | } |
| 52 | |
| 53 | BUN |
| 54 | HASHmask(BUN cnt) |
| 55 | { |
| 56 | BUN m = cnt; |
| 57 | |
| 58 | /* find largest power of 2 smaller than or equal to cnt */ |
| 59 | m |= m >> 1; |
| 60 | m |= m >> 2; |
| 61 | m |= m >> 4; |
| 62 | m |= m >> 8; |
| 63 | m |= m >> 16; |
| 64 | #if SIZEOF_BUN == 8 |
| 65 | m |= m >> 32; |
| 66 | #endif |
| 67 | m -= m >> 1; |
| 68 | |
| 69 | /* if cnt is more than 1/3 into the gap between m and 2*m, |
| 70 | double m */ |
| 71 | if (m + m - cnt < 2 * (cnt - m)) |
| 72 | m += m; |
| 73 | if (m < BATTINY) |
| 74 | m = BATTINY; |
| 75 | return m; |
| 76 | } |
| 77 | |
| 78 | static inline void |
| 79 | HASHclear(Hash *h) |
| 80 | { |
| 81 | /* since BUN2_NONE, BUN4_NONE, BUN8_NONE |
| 82 | * are all equal to -1 (~0), i.e., have all bits set, |
| 83 | * we can use a simple memset() to clear the Hash, |
| 84 | * rather than iteratively assigning individual |
| 85 | * BUNi_NONE values in a for-loop |
| 86 | */ |
| 87 | memset(h->Hash, 0xFF, (h->mask + 1) * h->width); |
| 88 | } |
| 89 | |
| 90 | #define HASH_VERSION 2 |
| 91 | #define 6 /* nr of size_t fields in header */ |
| 92 | |
| 93 | gdk_return |
| 94 | HASHnew(Hash *h, int tpe, BUN size, BUN mask, BUN count) |
| 95 | { |
| 96 | Heap *hp = &h->heap; |
| 97 | int width = HASHwidth(size); |
| 98 | |
| 99 | if (HEAPalloc(hp, mask + size + HASH_HEADER_SIZE * SIZEOF_SIZE_T / width, width) != GDK_SUCCEED) |
| 100 | return GDK_FAIL; |
| 101 | h->heap.free = (mask + size) * width + HASH_HEADER_SIZE * SIZEOF_SIZE_T; |
| 102 | h->lim = size; |
| 103 | h->mask = mask - 1; |
| 104 | h->width = width; |
| 105 | switch (width) { |
| 106 | case BUN2: |
| 107 | h->nil = (BUN) BUN2_NONE; |
| 108 | break; |
| 109 | case BUN4: |
| 110 | h->nil = (BUN) BUN4_NONE; |
| 111 | break; |
| 112 | #ifdef BUN8 |
| 113 | case BUN8: |
| 114 | h->nil = (BUN) BUN8_NONE; |
| 115 | break; |
| 116 | #endif |
| 117 | default: |
| 118 | assert(0); |
| 119 | } |
| 120 | h->Link = h->heap.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T; |
| 121 | h->Hash = (void *) ((char *) h->Link + h->lim * width); |
| 122 | h->type = tpe; |
| 123 | HASHclear(h); /* zero the mask */ |
| 124 | ((size_t *) h->heap.base)[0] = HASH_VERSION; |
| 125 | ((size_t *) h->heap.base)[1] = size; |
| 126 | ((size_t *) h->heap.base)[2] = mask; |
| 127 | ((size_t *) h->heap.base)[3] = width; |
| 128 | ((size_t *) h->heap.base)[4] = count; |
| 129 | ((size_t *) h->heap.base)[5] = 0; /* # filled slots (chain heads) */ |
| 130 | ACCELDEBUG fprintf(stderr, "#HASHnew: create hash(size " BUNFMT ", mask " BUNFMT ", width %d, total " BUNFMT " bytes);\n" , size, mask, width, (size + mask) * width); |
| 131 | return GDK_SUCCEED; |
| 132 | } |
| 133 | |
| 134 | /* collect HASH statistics for analysis */ |
| 135 | static void |
| 136 | HASHcollisions(BAT *b, Hash *h) |
| 137 | { |
| 138 | lng cnt, entries = 0, max = 0; |
| 139 | double total = 0; |
| 140 | BUN p, i, j, nil; |
| 141 | |
| 142 | if (b == 0 || h == 0) |
| 143 | return; |
| 144 | nil = HASHnil(h); |
| 145 | for (i = 0, j = h->mask; i <= j; i++) |
| 146 | if ((p = HASHget(h, i)) != nil) { |
| 147 | entries++; |
| 148 | cnt = 0; |
| 149 | for (; p != nil; p = HASHgetlink(h, p)) |
| 150 | cnt++; |
| 151 | if (cnt > max) |
| 152 | max = cnt; |
| 153 | total += cnt; |
| 154 | } |
| 155 | fprintf(stderr, "#BAThash: statistics (" BUNFMT ", entries " LLFMT ", mask " BUNFMT ", max " LLFMT ", avg %2.6f);\n" , BATcount(b), entries, h->mask, max, entries == 0 ? 0 : total / entries); |
| 156 | } |
| 157 | |
| 158 | /* Return TRUE if we have a hash on the tail, even if we need to read |
| 159 | * one from disk. |
| 160 | * |
| 161 | * Note that the b->thash pointer can be NULL, meaning there is no |
| 162 | * hash; (Hash *) 1, meaning there is no hash loaded, but it may exist |
| 163 | * on disk; or a valid pointer to a loaded hash. These values are |
| 164 | * maintained here, in the HASHdestroy and HASHfree functions, and in |
| 165 | * BBPdiskscan during initialization. */ |
| 166 | bool |
| 167 | BATcheckhash(BAT *b) |
| 168 | { |
| 169 | bool ret; |
| 170 | lng t = 0; |
| 171 | |
| 172 | /* we don't need the lock just to read the value b->thash */ |
| 173 | if (b->thash == (Hash *) 1) { |
| 174 | /* but when we want to change it, we need the lock */ |
| 175 | ACCELDEBUG t = GDKusec(); |
| 176 | MT_lock_set(&b->batIdxLock); |
| 177 | ACCELDEBUG t = GDKusec() - t; |
| 178 | /* if still 1 now that we have the lock, we can update */ |
| 179 | if (b->thash == (Hash *) 1) { |
| 180 | Hash *h; |
| 181 | int fd; |
| 182 | |
| 183 | assert(!GDKinmemory()); |
| 184 | b->thash = NULL; |
| 185 | if ((h = GDKzalloc(sizeof(*h))) != NULL && |
| 186 | (h->heap.farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) >= 0) { |
| 187 | const char *nme = BBP_physical(b->batCacheid); |
| 188 | strconcat_len(h->heap.filename, |
| 189 | sizeof(h->heap.filename), |
| 190 | nme, ".thash" , NULL); |
| 191 | |
| 192 | /* check whether a persisted hash can be found */ |
| 193 | if ((fd = GDKfdlocate(h->heap.farmid, nme, "rb+" , "thash" )) >= 0) { |
| 194 | size_t hdata[HASH_HEADER_SIZE]; |
| 195 | struct stat st; |
| 196 | |
| 197 | if (read(fd, hdata, sizeof(hdata)) == sizeof(hdata) && |
| 198 | hdata[0] == ( |
| 199 | #ifdef PERSISTENTHASH |
| 200 | ((size_t) 1 << 24) | |
| 201 | #endif |
| 202 | HASH_VERSION) && |
| 203 | hdata[4] == (size_t) BATcount(b) && |
| 204 | fstat(fd, &st) == 0 && |
| 205 | st.st_size >= (off_t) (h->heap.size = h->heap.free = (hdata[1] + hdata[2]) * hdata[3] + HASH_HEADER_SIZE * SIZEOF_SIZE_T) && |
| 206 | HEAPload(&h->heap, nme, "thash" , false) == GDK_SUCCEED) { |
| 207 | h->lim = (BUN) hdata[1]; |
| 208 | h->type = ATOMtype(b->ttype); |
| 209 | h->mask = (BUN) (hdata[2] - 1); |
| 210 | h->width = (int) hdata[3]; |
| 211 | switch (h->width) { |
| 212 | case BUN2: |
| 213 | h->nil = (BUN) BUN2_NONE; |
| 214 | break; |
| 215 | case BUN4: |
| 216 | h->nil = (BUN) BUN4_NONE; |
| 217 | break; |
| 218 | #ifdef BUN8 |
| 219 | case BUN8: |
| 220 | h->nil = (BUN) BUN8_NONE; |
| 221 | break; |
| 222 | #endif |
| 223 | default: |
| 224 | assert(0); |
| 225 | } |
| 226 | h->Link = h->heap.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T; |
| 227 | h->Hash = (void *) ((char *) h->Link + h->lim * h->width); |
| 228 | close(fd); |
| 229 | h->heap.parentid = b->batCacheid; |
| 230 | h->heap.dirty = false; |
| 231 | BATsetprop_nolock( |
| 232 | b, |
| 233 | GDK_HASH_MASK, |
| 234 | TYPE_oid, |
| 235 | &(oid){h->mask + 1}); |
| 236 | b->thash = h; |
| 237 | ACCELDEBUG fprintf(stderr, "#BATcheckhash: reusing persisted hash %s\n" , BATgetId(b)); |
| 238 | MT_lock_unset(&b->batIdxLock); |
| 239 | return true; |
| 240 | } |
| 241 | close(fd); |
| 242 | /* unlink unusable file */ |
| 243 | GDKunlink(h->heap.farmid, BATDIR, nme, "thash" ); |
| 244 | } |
| 245 | } |
| 246 | GDKfree(h); |
| 247 | GDKclrerr(); /* we're not currently interested in errors */ |
| 248 | } |
| 249 | MT_lock_unset(&b->batIdxLock); |
| 250 | } |
| 251 | ret = b->thash != NULL; |
| 252 | ACCELDEBUG if (ret) fprintf(stderr, "#BATcheckhash: already has hash %s, waited " LLFMT " usec\n" , BATgetId(b), t); |
| 253 | return ret; |
| 254 | } |
| 255 | |
| 256 | #ifdef PERSISTENTHASH |
| 257 | static void |
| 258 | BAThashsync(void *arg) |
| 259 | { |
| 260 | BAT *b = arg; |
| 261 | int fd; |
| 262 | lng t0 = 0; |
| 263 | const char *failed = " failed" ; |
| 264 | |
| 265 | ACCELDEBUG t0 = GDKusec(); |
| 266 | |
| 267 | /* we could check whether b->thash == NULL before getting the |
| 268 | * lock, and only lock if it isn't; however, it's very |
| 269 | * unlikely that that is the case, so we don't */ |
| 270 | MT_lock_set(&b->batIdxLock); |
| 271 | if (b->thash != NULL) { |
| 272 | Heap *hp = &b->thash->heap; |
| 273 | /* only persist if parent BAT hasn't changed in the |
| 274 | * mean time */ |
| 275 | if (!b->theap.dirty && |
| 276 | ((size_t *) hp->base)[4] == b->batCount && |
| 277 | HEAPsave(hp, hp->filename, NULL) == GDK_SUCCEED) { |
| 278 | if (hp->storage == STORE_MEM) { |
| 279 | if ((fd = GDKfdlocate(hp->farmid, hp->filename, "rb+" , NULL)) >= 0) { |
| 280 | ((size_t *) hp->base)[0] |= (size_t) 1 << 24; |
| 281 | if (write(fd, hp->base, SIZEOF_SIZE_T) >= 0) { |
| 282 | failed = "" ; /* not failed */ |
| 283 | if (!(GDKdebug & NOSYNCMASK)) { |
| 284 | #if defined(NATIVE_WIN32) |
| 285 | _commit(fd); |
| 286 | #elif defined(HAVE_FDATASYNC) |
| 287 | fdatasync(fd); |
| 288 | #elif defined(HAVE_FSYNC) |
| 289 | fsync(fd); |
| 290 | #endif |
| 291 | } |
| 292 | hp->dirty = false; |
| 293 | } else { |
| 294 | perror("write hash" ); |
| 295 | } |
| 296 | close(fd); |
| 297 | } |
| 298 | } else { |
| 299 | ((size_t *) hp->base)[0] |= (size_t) 1 << 24; |
| 300 | if (!(GDKdebug & NOSYNCMASK) && |
| 301 | MT_msync(hp->base, SIZEOF_SIZE_T) < 0) { |
| 302 | ((size_t *) hp->base)[0] &= ~((size_t) 1 << 24); |
| 303 | } else { |
| 304 | hp->dirty = false; |
| 305 | failed = "" ; /* not failed */ |
| 306 | } |
| 307 | } |
| 308 | ACCELDEBUG fprintf(stderr, "#BAThash: persisting hash %s (" LLFMT " usec)%s\n" , hp->filename, GDKusec() - t0, failed); |
| 309 | } |
| 310 | } |
| 311 | MT_lock_unset(&b->batIdxLock); |
| 312 | BBPunfix(b->batCacheid); |
| 313 | } |
| 314 | #endif |
| 315 | |
| 316 | #define starthash(TYPE) \ |
| 317 | do { \ |
| 318 | const TYPE *restrict v = (const TYPE *) BUNtloc(bi, 0); \ |
| 319 | for (; p < cnt1; p++) { \ |
| 320 | c = hash_##TYPE(h, v + o - b->hseqbase); \ |
| 321 | hget = HASHget(h, c); \ |
| 322 | if (hget == hnil) { \ |
| 323 | if (nslots == maxslots) \ |
| 324 | break; /* mask too full */ \ |
| 325 | nslots++; \ |
| 326 | } \ |
| 327 | HASHputlink(h, p, hget); \ |
| 328 | HASHput(h, c, p); \ |
| 329 | o = canditer_next(&ci); \ |
| 330 | } \ |
| 331 | } while (0) |
| 332 | #define finishhash(TYPE) \ |
| 333 | do { \ |
| 334 | const TYPE *restrict v = (const TYPE *) BUNtloc(bi, 0); \ |
| 335 | for (; p < cnt; p++) { \ |
| 336 | c = hash_##TYPE(h, v + o - b->hseqbase); \ |
| 337 | c = hash_##TYPE(h, v + o - b->hseqbase); \ |
| 338 | hget = HASHget(h, c); \ |
| 339 | nslots += hget == hnil; \ |
| 340 | HASHputlink(h, p, hget); \ |
| 341 | HASHput(h, c, p); \ |
| 342 | o = canditer_next(&ci); \ |
| 343 | } \ |
| 344 | } while (0) |
| 345 | |
| 346 | /* |
| 347 | * The prime routine for the BAT layer is to create a new hash index. |
| 348 | * Its argument is the element type and the maximum number of BUNs be |
| 349 | * stored under the hash function. |
| 350 | */ |
| 351 | Hash * |
| 352 | BAThash_impl(BAT *b, BAT *s, const char *ext) |
| 353 | { |
| 354 | lng t0 = 0; |
| 355 | unsigned int tpe = ATOMbasetype(b->ttype); |
| 356 | BUN cnt, cnt1; |
| 357 | struct canditer ci; |
| 358 | BUN mask, maxmask = 0; |
| 359 | BUN p, c; |
| 360 | oid o; |
| 361 | BUN nslots; |
| 362 | BUN hnil, hget; |
| 363 | Hash *h = NULL; |
| 364 | const char *nme = GDKinmemory() ? ":inmemory" : BBP_physical(b->batCacheid); |
| 365 | BATiter bi = bat_iterator(b); |
| 366 | PROPrec *prop; |
| 367 | |
| 368 | ACCELDEBUG t0 = GDKusec(); |
| 369 | ACCELDEBUG fprintf(stderr, "#BAThash: create hash(" ALGOBATFMT ");\n" , |
| 370 | ALGOBATPAR(b)); |
| 371 | if (b->ttype == TYPE_void) { |
| 372 | if (is_oid_nil(b->tseqbase)) { |
| 373 | ACCELDEBUG fprintf(stderr, "#BAThash: cannot create hash-table on void-NIL column.\n" ); |
| 374 | GDKerror("BAThash: no hash on void/nil column\n" ); |
| 375 | return NULL; |
| 376 | } |
| 377 | ACCELDEBUG fprintf(stderr, "#BAThash: creating hash-table on void column..\n" ); |
| 378 | |
| 379 | tpe = TYPE_void; |
| 380 | } |
| 381 | |
| 382 | cnt = canditer_init(&ci, b, s); |
| 383 | |
| 384 | if ((h = GDKzalloc(sizeof(*h))) == NULL || |
| 385 | (h->heap.farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) < 0) { |
| 386 | GDKfree(h); |
| 387 | return NULL; |
| 388 | } |
| 389 | h->heap.dirty = true; |
| 390 | strconcat_len(h->heap.filename, sizeof(h->heap.filename), |
| 391 | nme, "." , ext, NULL); |
| 392 | |
| 393 | /* determine hash mask size */ |
| 394 | cnt1 = 0; |
| 395 | if (ATOMsize(tpe) == 1) { |
| 396 | /* perfect hash for one-byte sized atoms */ |
| 397 | mask = (1 << 8); |
| 398 | } else if (ATOMsize(tpe) == 2) { |
| 399 | /* perfect hash for two-byte sized atoms */ |
| 400 | mask = (1 << 16); |
| 401 | } else if (b->tkey || cnt <= 4096) { |
| 402 | /* if key, or if small, don't bother dynamically |
| 403 | * adjusting the hash mask */ |
| 404 | mask = HASHmask(cnt); |
| 405 | } else if (s == NULL && (prop = BATgetprop_nolock(b, GDK_HASH_MASK)) != NULL) { |
| 406 | assert(prop->v.vtype == TYPE_oid); |
| 407 | mask = prop->v.val.oval; |
| 408 | assert((mask & (mask - 1)) == 0); /* power of two */ |
| 409 | maxmask = HASHmask(cnt); |
| 410 | if (mask > maxmask) |
| 411 | mask = maxmask; |
| 412 | } else { |
| 413 | /* dynamic hash: we start with HASHmask(cnt)/64, or, |
| 414 | * if cnt large enough, HASHmask(cnt)/256; if there |
| 415 | * are too many collisions we try HASHmask(cnt)/64, |
| 416 | * HASHmask(cnt)/16, HASHmask(cnt)/4, and finally |
| 417 | * HASHmask(cnt), but we might skip some of these if |
| 418 | * there are many distinct values. */ |
| 419 | maxmask = HASHmask(cnt); |
| 420 | mask = maxmask >> 6; |
| 421 | while (mask > 4096) |
| 422 | mask >>= 2; |
| 423 | /* try out on first 25% of b */ |
| 424 | cnt1 = cnt >> 2; |
| 425 | } |
| 426 | |
| 427 | o = canditer_next(&ci); /* always one ahead */ |
| 428 | for (;;) { |
| 429 | BUN maxslots = (mask >> 3) - 1; /* 1/8 full is too full */ |
| 430 | |
| 431 | nslots = 0; |
| 432 | p = 0; |
| 433 | HEAPfree(&h->heap, true); |
| 434 | /* create the hash structures */ |
| 435 | if (HASHnew(h, ATOMtype(b->ttype), s ? cnt : BATcapacity(b), |
| 436 | mask, cnt) != GDK_SUCCEED) { |
| 437 | |
| 438 | GDKfree(h); |
| 439 | return NULL; |
| 440 | } |
| 441 | |
| 442 | hnil = HASHnil(h); |
| 443 | |
| 444 | switch (tpe) { |
| 445 | case TYPE_bte: |
| 446 | starthash(bte); |
| 447 | break; |
| 448 | case TYPE_sht: |
| 449 | starthash(sht); |
| 450 | break; |
| 451 | case TYPE_flt: |
| 452 | starthash(flt); |
| 453 | break; |
| 454 | case TYPE_int: |
| 455 | starthash(int); |
| 456 | break; |
| 457 | case TYPE_dbl: |
| 458 | starthash(dbl); |
| 459 | break; |
| 460 | case TYPE_lng: |
| 461 | starthash(lng); |
| 462 | break; |
| 463 | #ifdef HAVE_HGE |
| 464 | case TYPE_hge: |
| 465 | starthash(hge); |
| 466 | break; |
| 467 | #endif |
| 468 | default: |
| 469 | for (; p < cnt1; p++) { |
| 470 | const void *restrict v = BUNtail(bi, o - b->hseqbase); |
| 471 | c = heap_hash_any(b->tvheap, h, v); |
| 472 | hget = HASHget(h, c); |
| 473 | if (hget == hnil) { |
| 474 | if (nslots == maxslots) |
| 475 | break; /* mask too full */ |
| 476 | nslots++; |
| 477 | } |
| 478 | HASHputlink(h, p, hget); |
| 479 | HASHput(h, c, p); |
| 480 | o = canditer_next(&ci); |
| 481 | } |
| 482 | break; |
| 483 | } |
| 484 | ACCELDEBUG if (p < cnt1) |
| 485 | fprintf(stderr, "#BAThash(%s): abort starthash with " |
| 486 | "mask " BUNFMT " at " BUNFMT "\n" , BATgetId(b), |
| 487 | mask, p); |
| 488 | if (p == cnt1 || mask == maxmask) |
| 489 | break; |
| 490 | mask <<= 2; |
| 491 | /* if we fill up the slots fast (p <= maxslots * 1.2) |
| 492 | * increase mask size a bit more quickly */ |
| 493 | if (mask < maxmask && p <= maxslots * 1.2) |
| 494 | mask <<= 2; |
| 495 | canditer_reset(&ci); |
| 496 | o = canditer_next(&ci); |
| 497 | } |
| 498 | |
| 499 | /* finish the hashtable with the current mask */ |
| 500 | switch (tpe) { |
| 501 | case TYPE_bte: |
| 502 | finishhash(bte); |
| 503 | break; |
| 504 | case TYPE_sht: |
| 505 | finishhash(sht); |
| 506 | break; |
| 507 | case TYPE_int: |
| 508 | finishhash(int); |
| 509 | break; |
| 510 | case TYPE_flt: |
| 511 | finishhash(flt); |
| 512 | break; |
| 513 | case TYPE_dbl: |
| 514 | finishhash(dbl); |
| 515 | break; |
| 516 | case TYPE_lng: |
| 517 | finishhash(lng); |
| 518 | break; |
| 519 | #ifdef HAVE_HGE |
| 520 | case TYPE_hge: |
| 521 | finishhash(hge); |
| 522 | break; |
| 523 | #endif |
| 524 | default: |
| 525 | for (; p < cnt; p++) { |
| 526 | const void *restrict v = BUNtail(bi, o - b->hseqbase); |
| 527 | c = heap_hash_any(b->tvheap, h, v); |
| 528 | hget = HASHget(h, c); |
| 529 | nslots += hget == hnil; |
| 530 | HASHputlink(h, p, hget); |
| 531 | HASHput(h, c, p); |
| 532 | o = canditer_next(&ci); |
| 533 | } |
| 534 | break; |
| 535 | } |
| 536 | if (s == NULL) |
| 537 | BATsetprop_nolock(b, GDK_HASH_MASK, TYPE_oid, &(oid){h->mask + 1}); |
| 538 | ((size_t *) h->heap.base)[5] = (size_t) nslots; |
| 539 | #ifndef NDEBUG |
| 540 | /* clear unused part of Link array */ |
| 541 | memset((char *) h->Link + cnt * h->width, 0, (h->lim - cnt) * h->width); |
| 542 | #endif |
| 543 | h->heap.parentid = b->batCacheid; |
| 544 | /* if the number of occupied slots is equal to the bat count, |
| 545 | * all values are necessarily distinct */ |
| 546 | if (nslots == BATcount(b) && !b->tkey) { |
| 547 | b->tkey = true; |
| 548 | b->batDirtydesc = true; |
| 549 | } |
| 550 | ACCELDEBUG { |
| 551 | fprintf(stderr, "#BAThash: hash construction " LLFMT " usec\n" , GDKusec() - t0); |
| 552 | HASHcollisions(b, h); |
| 553 | } |
| 554 | return h; |
| 555 | } |
| 556 | |
| 557 | gdk_return |
| 558 | BAThash(BAT *b) |
| 559 | { |
| 560 | assert(b->batCacheid > 0); |
| 561 | if (BATcheckhash(b)) { |
| 562 | return GDK_SUCCEED; |
| 563 | } |
| 564 | MT_lock_set(&b->batIdxLock); |
| 565 | if (b->thash == NULL) { |
| 566 | if ((b->thash = BAThash_impl(b, NULL, "thash" )) == NULL) { |
| 567 | MT_lock_unset(&b->batIdxLock); |
| 568 | return GDK_FAIL; |
| 569 | } |
| 570 | #ifdef PERSISTENTHASH |
| 571 | if (BBP_status(b->batCacheid) & BBPEXISTING && !b->theap.dirty && !GDKinmemory()) { |
| 572 | MT_Id tid; |
| 573 | BBPfix(b->batCacheid); |
| 574 | char name[16]; |
| 575 | snprintf(name, sizeof(name), "hashsync%d" , b->batCacheid); |
| 576 | MT_lock_unset(&b->batIdxLock); |
| 577 | if (MT_create_thread(&tid, BAThashsync, b, |
| 578 | MT_THR_DETACHED, |
| 579 | name) < 0) { |
| 580 | /* couldn't start thread: clean up */ |
| 581 | BBPunfix(b->batCacheid); |
| 582 | } |
| 583 | return GDK_SUCCEED; |
| 584 | } else |
| 585 | ACCELDEBUG fprintf(stderr, "#BAThash: NOT persisting hash %d\n" , b->batCacheid); |
| 586 | #endif |
| 587 | } |
| 588 | MT_lock_unset(&b->batIdxLock); |
| 589 | return GDK_SUCCEED; |
| 590 | } |
| 591 | |
| 592 | /* |
| 593 | * The entry on which a value hashes can be calculated with the |
| 594 | * routine HASHprobe. |
| 595 | */ |
| 596 | BUN |
| 597 | HASHprobe(const Hash *h, const void *v) |
| 598 | { |
| 599 | switch (ATOMbasetype(h->type)) { |
| 600 | case TYPE_bte: |
| 601 | return hash_bte(h, v); |
| 602 | case TYPE_sht: |
| 603 | return hash_sht(h, v); |
| 604 | case TYPE_int: |
| 605 | case TYPE_flt: |
| 606 | return hash_int(h, v); |
| 607 | case TYPE_dbl: |
| 608 | case TYPE_lng: |
| 609 | return hash_lng(h, v); |
| 610 | #ifdef HAVE_HGE |
| 611 | case TYPE_hge: |
| 612 | return hash_hge(h, v); |
| 613 | #endif |
| 614 | default: |
| 615 | return hash_any(h, v); |
| 616 | } |
| 617 | } |
| 618 | |
| 619 | BUN |
| 620 | HASHlist(Hash *h, BUN i) |
| 621 | { |
| 622 | BUN c = 1; |
| 623 | BUN j = HASHget(h, i), nil = HASHnil(h); |
| 624 | |
| 625 | if (j == nil) |
| 626 | return 1; |
| 627 | while ((j = HASHgetlink(h, i)) != nil) { |
| 628 | c++; |
| 629 | i = j; |
| 630 | } |
| 631 | return c; |
| 632 | } |
| 633 | |
| 634 | void |
| 635 | HASHdestroy(BAT *b) |
| 636 | { |
| 637 | if (b && b->thash) { |
| 638 | Hash *hs; |
| 639 | MT_lock_set(&b->batIdxLock); |
| 640 | hs = b->thash; |
| 641 | b->thash = NULL; |
| 642 | MT_lock_unset(&b->batIdxLock); |
| 643 | if (hs == (Hash *) 1) { |
| 644 | GDKunlink(BBPselectfarm(b->batRole, b->ttype, hashheap), |
| 645 | BATDIR, |
| 646 | BBP_physical(b->batCacheid), |
| 647 | "thash" ); |
| 648 | } else if (hs) { |
| 649 | bat p = VIEWtparent(b); |
| 650 | BAT *hp = NULL; |
| 651 | |
| 652 | if (p) |
| 653 | hp = BBP_cache(p); |
| 654 | |
| 655 | if (!hp || hs != hp->thash) { |
| 656 | ACCELDEBUG if (*(size_t *) hs->heap.base & (1 << 24)) |
| 657 | fprintf(stderr, "#HASHdestroy: removing persisted hash %d\n" , b->batCacheid); |
| 658 | HEAPfree(&hs->heap, true); |
| 659 | GDKfree(hs); |
| 660 | } |
| 661 | } |
| 662 | } |
| 663 | } |
| 664 | |
| 665 | void |
| 666 | HASHfree(BAT *b) |
| 667 | { |
| 668 | if (b && b->thash) { |
| 669 | MT_lock_set(&b->batIdxLock); |
| 670 | if (b->thash && b->thash != (Hash *) 1) { |
| 671 | bool rmheap = GDKinmemory() || b->thash->heap.dirty; |
| 672 | |
| 673 | HEAPfree(&b->thash->heap, rmheap); |
| 674 | GDKfree(b->thash); |
| 675 | b->thash = rmheap ? NULL : (Hash *) 1; |
| 676 | } |
| 677 | MT_lock_unset(&b->batIdxLock); |
| 678 | } |
| 679 | } |
| 680 | |
| 681 | bool |
| 682 | HASHgonebad(BAT *b, const void *v) |
| 683 | { |
| 684 | Hash *h = b->thash; |
| 685 | BATiter bi = bat_iterator(b); |
| 686 | BUN cnt, hit; |
| 687 | |
| 688 | if (h == NULL) |
| 689 | return true; /* no hash is bad hash? */ |
| 690 | |
| 691 | if (h->mask * 2 < BATcount(b)) { |
| 692 | int (*cmp) (const void *, const void *) = ATOMcompare(b->ttype); |
| 693 | BUN i = HASHget(h, (BUN) HASHprobe(h, v)), nil = HASHnil(h); |
| 694 | for (cnt = hit = 1; i != nil; i = HASHgetlink(h, i), cnt++) |
| 695 | hit += ((*cmp) (v, BUNtail(bi, (BUN) i)) == 0); |
| 696 | |
| 697 | if (cnt / hit > 4) |
| 698 | return true; /* linked list too long */ |
| 699 | |
| 700 | /* in this case, linked lists are long but contain the |
| 701 | * desired values such hash tables may be useful for |
| 702 | * locating all duplicates */ |
| 703 | } |
| 704 | return false; /* a-ok */ |
| 705 | } |
| 706 | |