1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * - Hash Table Creation
11 * The hash indexing scheme for BATs reserves a block of memory to
12 * maintain the hash table and a collision list. A one-to-one mapping
13 * exists between the BAT and the collision list using the BUN
14 * index. NOTE: we alloc the link list as a parallel array to the BUN
15 * array; hence the hash link array has the same size as
16 * BATcapacity(b) (not BATcount(b)). This allows us in the BUN insert
17 * and delete to assume that there is hash space iff there is BUN
18 * space. If there is no BUN space, the BATextend now destroys the
19 * hash table.
20 *
21 * The hash mask size is a power of two, so we can do bitwise AND on
22 * the hash (integer) number to quickly find the head of the bucket
23 * chain. Clearly, the hash mask size is a crucial parameter. If we
24 * know that the column is unique (tkey), we use direct hashing (mask
25 * size ~= BATcount). Otherwise we dynamically determine the mask size
26 * by starting out with mask size = BATcount/64 (just 1.5% of memory
27 * storage overhead). Then we start building the hash table on the
28 * first 25% of the BAT. As we aim for max-collisions list length of
29 * 4, the list on 25% should not exceed length 1. So, if a small
30 * number of collisssions occurs (mask/2) then we abandon the attempt
31 * and restart with a mask that is 4 times larger. This converges
32 * after three cycles to direct hashing.
33 */
34
35#include "monetdb_config.h"
36#include "gdk.h"
37#include "gdk_private.h"
38
39static int
40HASHwidth(BUN hashsize)
41{
42 if (hashsize <= (BUN) BUN2_NONE)
43 return BUN2;
44#if SIZEOF_BUN <= 4
45 return BUN4;
46#else
47 if (hashsize <= (BUN) BUN4_NONE)
48 return BUN4;
49 return BUN8;
50#endif
51}
52
53BUN
54HASHmask(BUN cnt)
55{
56 BUN m = cnt;
57
58 /* find largest power of 2 smaller than or equal to cnt */
59 m |= m >> 1;
60 m |= m >> 2;
61 m |= m >> 4;
62 m |= m >> 8;
63 m |= m >> 16;
64#if SIZEOF_BUN == 8
65 m |= m >> 32;
66#endif
67 m -= m >> 1;
68
69 /* if cnt is more than 1/3 into the gap between m and 2*m,
70 double m */
71 if (m + m - cnt < 2 * (cnt - m))
72 m += m;
73 if (m < BATTINY)
74 m = BATTINY;
75 return m;
76}
77
78static inline void
79HASHclear(Hash *h)
80{
81 /* since BUN2_NONE, BUN4_NONE, BUN8_NONE
82 * are all equal to -1 (~0), i.e., have all bits set,
83 * we can use a simple memset() to clear the Hash,
84 * rather than iteratively assigning individual
85 * BUNi_NONE values in a for-loop
86 */
87 memset(h->Hash, 0xFF, (h->mask + 1) * h->width);
88}
89
90#define HASH_VERSION 2
91#define HASH_HEADER_SIZE 6 /* nr of size_t fields in header */
92
93gdk_return
94HASHnew(Hash *h, int tpe, BUN size, BUN mask, BUN count)
95{
96 Heap *hp = &h->heap;
97 int width = HASHwidth(size);
98
99 if (HEAPalloc(hp, mask + size + HASH_HEADER_SIZE * SIZEOF_SIZE_T / width, width) != GDK_SUCCEED)
100 return GDK_FAIL;
101 h->heap.free = (mask + size) * width + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
102 h->lim = size;
103 h->mask = mask - 1;
104 h->width = width;
105 switch (width) {
106 case BUN2:
107 h->nil = (BUN) BUN2_NONE;
108 break;
109 case BUN4:
110 h->nil = (BUN) BUN4_NONE;
111 break;
112#ifdef BUN8
113 case BUN8:
114 h->nil = (BUN) BUN8_NONE;
115 break;
116#endif
117 default:
118 assert(0);
119 }
120 h->Link = h->heap.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
121 h->Hash = (void *) ((char *) h->Link + h->lim * width);
122 h->type = tpe;
123 HASHclear(h); /* zero the mask */
124 ((size_t *) h->heap.base)[0] = HASH_VERSION;
125 ((size_t *) h->heap.base)[1] = size;
126 ((size_t *) h->heap.base)[2] = mask;
127 ((size_t *) h->heap.base)[3] = width;
128 ((size_t *) h->heap.base)[4] = count;
129 ((size_t *) h->heap.base)[5] = 0; /* # filled slots (chain heads) */
130 ACCELDEBUG fprintf(stderr, "#HASHnew: create hash(size " BUNFMT ", mask " BUNFMT ", width %d, total " BUNFMT " bytes);\n", size, mask, width, (size + mask) * width);
131 return GDK_SUCCEED;
132}
133
134/* collect HASH statistics for analysis */
135static void
136HASHcollisions(BAT *b, Hash *h)
137{
138 lng cnt, entries = 0, max = 0;
139 double total = 0;
140 BUN p, i, j, nil;
141
142 if (b == 0 || h == 0)
143 return;
144 nil = HASHnil(h);
145 for (i = 0, j = h->mask; i <= j; i++)
146 if ((p = HASHget(h, i)) != nil) {
147 entries++;
148 cnt = 0;
149 for (; p != nil; p = HASHgetlink(h, p))
150 cnt++;
151 if (cnt > max)
152 max = cnt;
153 total += cnt;
154 }
155 fprintf(stderr, "#BAThash: statistics (" BUNFMT ", entries " LLFMT ", mask " BUNFMT ", max " LLFMT ", avg %2.6f);\n", BATcount(b), entries, h->mask, max, entries == 0 ? 0 : total / entries);
156}
157
158/* Return TRUE if we have a hash on the tail, even if we need to read
159 * one from disk.
160 *
161 * Note that the b->thash pointer can be NULL, meaning there is no
162 * hash; (Hash *) 1, meaning there is no hash loaded, but it may exist
163 * on disk; or a valid pointer to a loaded hash. These values are
164 * maintained here, in the HASHdestroy and HASHfree functions, and in
165 * BBPdiskscan during initialization. */
166bool
167BATcheckhash(BAT *b)
168{
169 bool ret;
170 lng t = 0;
171
172 /* we don't need the lock just to read the value b->thash */
173 if (b->thash == (Hash *) 1) {
174 /* but when we want to change it, we need the lock */
175 ACCELDEBUG t = GDKusec();
176 MT_lock_set(&b->batIdxLock);
177 ACCELDEBUG t = GDKusec() - t;
178 /* if still 1 now that we have the lock, we can update */
179 if (b->thash == (Hash *) 1) {
180 Hash *h;
181 int fd;
182
183 assert(!GDKinmemory());
184 b->thash = NULL;
185 if ((h = GDKzalloc(sizeof(*h))) != NULL &&
186 (h->heap.farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) >= 0) {
187 const char *nme = BBP_physical(b->batCacheid);
188 strconcat_len(h->heap.filename,
189 sizeof(h->heap.filename),
190 nme, ".thash", NULL);
191
192 /* check whether a persisted hash can be found */
193 if ((fd = GDKfdlocate(h->heap.farmid, nme, "rb+", "thash")) >= 0) {
194 size_t hdata[HASH_HEADER_SIZE];
195 struct stat st;
196
197 if (read(fd, hdata, sizeof(hdata)) == sizeof(hdata) &&
198 hdata[0] == (
199#ifdef PERSISTENTHASH
200 ((size_t) 1 << 24) |
201#endif
202 HASH_VERSION) &&
203 hdata[4] == (size_t) BATcount(b) &&
204 fstat(fd, &st) == 0 &&
205 st.st_size >= (off_t) (h->heap.size = h->heap.free = (hdata[1] + hdata[2]) * hdata[3] + HASH_HEADER_SIZE * SIZEOF_SIZE_T) &&
206 HEAPload(&h->heap, nme, "thash", false) == GDK_SUCCEED) {
207 h->lim = (BUN) hdata[1];
208 h->type = ATOMtype(b->ttype);
209 h->mask = (BUN) (hdata[2] - 1);
210 h->width = (int) hdata[3];
211 switch (h->width) {
212 case BUN2:
213 h->nil = (BUN) BUN2_NONE;
214 break;
215 case BUN4:
216 h->nil = (BUN) BUN4_NONE;
217 break;
218#ifdef BUN8
219 case BUN8:
220 h->nil = (BUN) BUN8_NONE;
221 break;
222#endif
223 default:
224 assert(0);
225 }
226 h->Link = h->heap.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
227 h->Hash = (void *) ((char *) h->Link + h->lim * h->width);
228 close(fd);
229 h->heap.parentid = b->batCacheid;
230 h->heap.dirty = false;
231 BATsetprop_nolock(
232 b,
233 GDK_HASH_MASK,
234 TYPE_oid,
235 &(oid){h->mask + 1});
236 b->thash = h;
237 ACCELDEBUG fprintf(stderr, "#BATcheckhash: reusing persisted hash %s\n", BATgetId(b));
238 MT_lock_unset(&b->batIdxLock);
239 return true;
240 }
241 close(fd);
242 /* unlink unusable file */
243 GDKunlink(h->heap.farmid, BATDIR, nme, "thash");
244 }
245 }
246 GDKfree(h);
247 GDKclrerr(); /* we're not currently interested in errors */
248 }
249 MT_lock_unset(&b->batIdxLock);
250 }
251 ret = b->thash != NULL;
252 ACCELDEBUG if (ret) fprintf(stderr, "#BATcheckhash: already has hash %s, waited " LLFMT " usec\n", BATgetId(b), t);
253 return ret;
254}
255
256#ifdef PERSISTENTHASH
257static void
258BAThashsync(void *arg)
259{
260 BAT *b = arg;
261 int fd;
262 lng t0 = 0;
263 const char *failed = " failed";
264
265 ACCELDEBUG t0 = GDKusec();
266
267 /* we could check whether b->thash == NULL before getting the
268 * lock, and only lock if it isn't; however, it's very
269 * unlikely that that is the case, so we don't */
270 MT_lock_set(&b->batIdxLock);
271 if (b->thash != NULL) {
272 Heap *hp = &b->thash->heap;
273 /* only persist if parent BAT hasn't changed in the
274 * mean time */
275 if (!b->theap.dirty &&
276 ((size_t *) hp->base)[4] == b->batCount &&
277 HEAPsave(hp, hp->filename, NULL) == GDK_SUCCEED) {
278 if (hp->storage == STORE_MEM) {
279 if ((fd = GDKfdlocate(hp->farmid, hp->filename, "rb+", NULL)) >= 0) {
280 ((size_t *) hp->base)[0] |= (size_t) 1 << 24;
281 if (write(fd, hp->base, SIZEOF_SIZE_T) >= 0) {
282 failed = ""; /* not failed */
283 if (!(GDKdebug & NOSYNCMASK)) {
284#if defined(NATIVE_WIN32)
285 _commit(fd);
286#elif defined(HAVE_FDATASYNC)
287 fdatasync(fd);
288#elif defined(HAVE_FSYNC)
289 fsync(fd);
290#endif
291 }
292 hp->dirty = false;
293 } else {
294 perror("write hash");
295 }
296 close(fd);
297 }
298 } else {
299 ((size_t *) hp->base)[0] |= (size_t) 1 << 24;
300 if (!(GDKdebug & NOSYNCMASK) &&
301 MT_msync(hp->base, SIZEOF_SIZE_T) < 0) {
302 ((size_t *) hp->base)[0] &= ~((size_t) 1 << 24);
303 } else {
304 hp->dirty = false;
305 failed = ""; /* not failed */
306 }
307 }
308 ACCELDEBUG fprintf(stderr, "#BAThash: persisting hash %s (" LLFMT " usec)%s\n", hp->filename, GDKusec() - t0, failed);
309 }
310 }
311 MT_lock_unset(&b->batIdxLock);
312 BBPunfix(b->batCacheid);
313}
314#endif
315
316#define starthash(TYPE) \
317 do { \
318 const TYPE *restrict v = (const TYPE *) BUNtloc(bi, 0); \
319 for (; p < cnt1; p++) { \
320 c = hash_##TYPE(h, v + o - b->hseqbase); \
321 hget = HASHget(h, c); \
322 if (hget == hnil) { \
323 if (nslots == maxslots) \
324 break; /* mask too full */ \
325 nslots++; \
326 } \
327 HASHputlink(h, p, hget); \
328 HASHput(h, c, p); \
329 o = canditer_next(&ci); \
330 } \
331 } while (0)
332#define finishhash(TYPE) \
333 do { \
334 const TYPE *restrict v = (const TYPE *) BUNtloc(bi, 0); \
335 for (; p < cnt; p++) { \
336 c = hash_##TYPE(h, v + o - b->hseqbase); \
337 c = hash_##TYPE(h, v + o - b->hseqbase); \
338 hget = HASHget(h, c); \
339 nslots += hget == hnil; \
340 HASHputlink(h, p, hget); \
341 HASHput(h, c, p); \
342 o = canditer_next(&ci); \
343 } \
344 } while (0)
345
346/*
347 * The prime routine for the BAT layer is to create a new hash index.
348 * Its argument is the element type and the maximum number of BUNs be
349 * stored under the hash function.
350 */
351Hash *
352BAThash_impl(BAT *b, BAT *s, const char *ext)
353{
354 lng t0 = 0;
355 unsigned int tpe = ATOMbasetype(b->ttype);
356 BUN cnt, cnt1;
357 struct canditer ci;
358 BUN mask, maxmask = 0;
359 BUN p, c;
360 oid o;
361 BUN nslots;
362 BUN hnil, hget;
363 Hash *h = NULL;
364 const char *nme = GDKinmemory() ? ":inmemory" : BBP_physical(b->batCacheid);
365 BATiter bi = bat_iterator(b);
366 PROPrec *prop;
367
368 ACCELDEBUG t0 = GDKusec();
369 ACCELDEBUG fprintf(stderr, "#BAThash: create hash(" ALGOBATFMT ");\n",
370 ALGOBATPAR(b));
371 if (b->ttype == TYPE_void) {
372 if (is_oid_nil(b->tseqbase)) {
373 ACCELDEBUG fprintf(stderr, "#BAThash: cannot create hash-table on void-NIL column.\n");
374 GDKerror("BAThash: no hash on void/nil column\n");
375 return NULL;
376 }
377 ACCELDEBUG fprintf(stderr, "#BAThash: creating hash-table on void column..\n");
378
379 tpe = TYPE_void;
380 }
381
382 cnt = canditer_init(&ci, b, s);
383
384 if ((h = GDKzalloc(sizeof(*h))) == NULL ||
385 (h->heap.farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) < 0) {
386 GDKfree(h);
387 return NULL;
388 }
389 h->heap.dirty = true;
390 strconcat_len(h->heap.filename, sizeof(h->heap.filename),
391 nme, ".", ext, NULL);
392
393 /* determine hash mask size */
394 cnt1 = 0;
395 if (ATOMsize(tpe) == 1) {
396 /* perfect hash for one-byte sized atoms */
397 mask = (1 << 8);
398 } else if (ATOMsize(tpe) == 2) {
399 /* perfect hash for two-byte sized atoms */
400 mask = (1 << 16);
401 } else if (b->tkey || cnt <= 4096) {
402 /* if key, or if small, don't bother dynamically
403 * adjusting the hash mask */
404 mask = HASHmask(cnt);
405 } else if (s == NULL && (prop = BATgetprop_nolock(b, GDK_HASH_MASK)) != NULL) {
406 assert(prop->v.vtype == TYPE_oid);
407 mask = prop->v.val.oval;
408 assert((mask & (mask - 1)) == 0); /* power of two */
409 maxmask = HASHmask(cnt);
410 if (mask > maxmask)
411 mask = maxmask;
412 } else {
413 /* dynamic hash: we start with HASHmask(cnt)/64, or,
414 * if cnt large enough, HASHmask(cnt)/256; if there
415 * are too many collisions we try HASHmask(cnt)/64,
416 * HASHmask(cnt)/16, HASHmask(cnt)/4, and finally
417 * HASHmask(cnt), but we might skip some of these if
418 * there are many distinct values. */
419 maxmask = HASHmask(cnt);
420 mask = maxmask >> 6;
421 while (mask > 4096)
422 mask >>= 2;
423 /* try out on first 25% of b */
424 cnt1 = cnt >> 2;
425 }
426
427 o = canditer_next(&ci); /* always one ahead */
428 for (;;) {
429 BUN maxslots = (mask >> 3) - 1; /* 1/8 full is too full */
430
431 nslots = 0;
432 p = 0;
433 HEAPfree(&h->heap, true);
434 /* create the hash structures */
435 if (HASHnew(h, ATOMtype(b->ttype), s ? cnt : BATcapacity(b),
436 mask, cnt) != GDK_SUCCEED) {
437
438 GDKfree(h);
439 return NULL;
440 }
441
442 hnil = HASHnil(h);
443
444 switch (tpe) {
445 case TYPE_bte:
446 starthash(bte);
447 break;
448 case TYPE_sht:
449 starthash(sht);
450 break;
451 case TYPE_flt:
452 starthash(flt);
453 break;
454 case TYPE_int:
455 starthash(int);
456 break;
457 case TYPE_dbl:
458 starthash(dbl);
459 break;
460 case TYPE_lng:
461 starthash(lng);
462 break;
463#ifdef HAVE_HGE
464 case TYPE_hge:
465 starthash(hge);
466 break;
467#endif
468 default:
469 for (; p < cnt1; p++) {
470 const void *restrict v = BUNtail(bi, o - b->hseqbase);
471 c = heap_hash_any(b->tvheap, h, v);
472 hget = HASHget(h, c);
473 if (hget == hnil) {
474 if (nslots == maxslots)
475 break; /* mask too full */
476 nslots++;
477 }
478 HASHputlink(h, p, hget);
479 HASHput(h, c, p);
480 o = canditer_next(&ci);
481 }
482 break;
483 }
484 ACCELDEBUG if (p < cnt1)
485 fprintf(stderr, "#BAThash(%s): abort starthash with "
486 "mask " BUNFMT " at " BUNFMT "\n", BATgetId(b),
487 mask, p);
488 if (p == cnt1 || mask == maxmask)
489 break;
490 mask <<= 2;
491 /* if we fill up the slots fast (p <= maxslots * 1.2)
492 * increase mask size a bit more quickly */
493 if (mask < maxmask && p <= maxslots * 1.2)
494 mask <<= 2;
495 canditer_reset(&ci);
496 o = canditer_next(&ci);
497 }
498
499 /* finish the hashtable with the current mask */
500 switch (tpe) {
501 case TYPE_bte:
502 finishhash(bte);
503 break;
504 case TYPE_sht:
505 finishhash(sht);
506 break;
507 case TYPE_int:
508 finishhash(int);
509 break;
510 case TYPE_flt:
511 finishhash(flt);
512 break;
513 case TYPE_dbl:
514 finishhash(dbl);
515 break;
516 case TYPE_lng:
517 finishhash(lng);
518 break;
519#ifdef HAVE_HGE
520 case TYPE_hge:
521 finishhash(hge);
522 break;
523#endif
524 default:
525 for (; p < cnt; p++) {
526 const void *restrict v = BUNtail(bi, o - b->hseqbase);
527 c = heap_hash_any(b->tvheap, h, v);
528 hget = HASHget(h, c);
529 nslots += hget == hnil;
530 HASHputlink(h, p, hget);
531 HASHput(h, c, p);
532 o = canditer_next(&ci);
533 }
534 break;
535 }
536 if (s == NULL)
537 BATsetprop_nolock(b, GDK_HASH_MASK, TYPE_oid, &(oid){h->mask + 1});
538 ((size_t *) h->heap.base)[5] = (size_t) nslots;
539#ifndef NDEBUG
540 /* clear unused part of Link array */
541 memset((char *) h->Link + cnt * h->width, 0, (h->lim - cnt) * h->width);
542#endif
543 h->heap.parentid = b->batCacheid;
544 /* if the number of occupied slots is equal to the bat count,
545 * all values are necessarily distinct */
546 if (nslots == BATcount(b) && !b->tkey) {
547 b->tkey = true;
548 b->batDirtydesc = true;
549 }
550 ACCELDEBUG {
551 fprintf(stderr, "#BAThash: hash construction " LLFMT " usec\n", GDKusec() - t0);
552 HASHcollisions(b, h);
553 }
554 return h;
555}
556
557gdk_return
558BAThash(BAT *b)
559{
560 assert(b->batCacheid > 0);
561 if (BATcheckhash(b)) {
562 return GDK_SUCCEED;
563 }
564 MT_lock_set(&b->batIdxLock);
565 if (b->thash == NULL) {
566 if ((b->thash = BAThash_impl(b, NULL, "thash")) == NULL) {
567 MT_lock_unset(&b->batIdxLock);
568 return GDK_FAIL;
569 }
570#ifdef PERSISTENTHASH
571 if (BBP_status(b->batCacheid) & BBPEXISTING && !b->theap.dirty && !GDKinmemory()) {
572 MT_Id tid;
573 BBPfix(b->batCacheid);
574 char name[16];
575 snprintf(name, sizeof(name), "hashsync%d", b->batCacheid);
576 MT_lock_unset(&b->batIdxLock);
577 if (MT_create_thread(&tid, BAThashsync, b,
578 MT_THR_DETACHED,
579 name) < 0) {
580 /* couldn't start thread: clean up */
581 BBPunfix(b->batCacheid);
582 }
583 return GDK_SUCCEED;
584 } else
585 ACCELDEBUG fprintf(stderr, "#BAThash: NOT persisting hash %d\n", b->batCacheid);
586#endif
587 }
588 MT_lock_unset(&b->batIdxLock);
589 return GDK_SUCCEED;
590}
591
592/*
593 * The entry on which a value hashes can be calculated with the
594 * routine HASHprobe.
595 */
596BUN
597HASHprobe(const Hash *h, const void *v)
598{
599 switch (ATOMbasetype(h->type)) {
600 case TYPE_bte:
601 return hash_bte(h, v);
602 case TYPE_sht:
603 return hash_sht(h, v);
604 case TYPE_int:
605 case TYPE_flt:
606 return hash_int(h, v);
607 case TYPE_dbl:
608 case TYPE_lng:
609 return hash_lng(h, v);
610#ifdef HAVE_HGE
611 case TYPE_hge:
612 return hash_hge(h, v);
613#endif
614 default:
615 return hash_any(h, v);
616 }
617}
618
619BUN
620HASHlist(Hash *h, BUN i)
621{
622 BUN c = 1;
623 BUN j = HASHget(h, i), nil = HASHnil(h);
624
625 if (j == nil)
626 return 1;
627 while ((j = HASHgetlink(h, i)) != nil) {
628 c++;
629 i = j;
630 }
631 return c;
632}
633
634void
635HASHdestroy(BAT *b)
636{
637 if (b && b->thash) {
638 Hash *hs;
639 MT_lock_set(&b->batIdxLock);
640 hs = b->thash;
641 b->thash = NULL;
642 MT_lock_unset(&b->batIdxLock);
643 if (hs == (Hash *) 1) {
644 GDKunlink(BBPselectfarm(b->batRole, b->ttype, hashheap),
645 BATDIR,
646 BBP_physical(b->batCacheid),
647 "thash");
648 } else if (hs) {
649 bat p = VIEWtparent(b);
650 BAT *hp = NULL;
651
652 if (p)
653 hp = BBP_cache(p);
654
655 if (!hp || hs != hp->thash) {
656 ACCELDEBUG if (*(size_t *) hs->heap.base & (1 << 24))
657 fprintf(stderr, "#HASHdestroy: removing persisted hash %d\n", b->batCacheid);
658 HEAPfree(&hs->heap, true);
659 GDKfree(hs);
660 }
661 }
662 }
663}
664
665void
666HASHfree(BAT *b)
667{
668 if (b && b->thash) {
669 MT_lock_set(&b->batIdxLock);
670 if (b->thash && b->thash != (Hash *) 1) {
671 bool rmheap = GDKinmemory() || b->thash->heap.dirty;
672
673 HEAPfree(&b->thash->heap, rmheap);
674 GDKfree(b->thash);
675 b->thash = rmheap ? NULL : (Hash *) 1;
676 }
677 MT_lock_unset(&b->batIdxLock);
678 }
679}
680
681bool
682HASHgonebad(BAT *b, const void *v)
683{
684 Hash *h = b->thash;
685 BATiter bi = bat_iterator(b);
686 BUN cnt, hit;
687
688 if (h == NULL)
689 return true; /* no hash is bad hash? */
690
691 if (h->mask * 2 < BATcount(b)) {
692 int (*cmp) (const void *, const void *) = ATOMcompare(b->ttype);
693 BUN i = HASHget(h, (BUN) HASHprobe(h, v)), nil = HASHnil(h);
694 for (cnt = hit = 1; i != nil; i = HASHgetlink(h, i), cnt++)
695 hit += ((*cmp) (v, BUNtail(bi, (BUN) i)) == 0);
696
697 if (cnt / hit > 4)
698 return true; /* linked list too long */
699
700 /* in this case, linked lists are long but contain the
701 * desired values such hash tables may be useful for
702 * locating all duplicates */
703 }
704 return false; /* a-ok */
705}
706