1 | /* hash - hashing table processing. |
2 | |
3 | Copyright (C) 1998-2004, 2006-2007, 2009-2019 Free Software Foundation, Inc. |
4 | |
5 | Written by Jim Meyering, 1992. |
6 | |
7 | This program is free software: you can redistribute it and/or modify |
8 | it under the terms of the GNU General Public License as published by |
9 | the Free Software Foundation; either version 3 of the License, or |
10 | (at your option) any later version. |
11 | |
12 | This program is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | GNU General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU General Public License |
18 | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
19 | |
20 | /* A generic hash table package. */ |
21 | |
22 | /* Define USE_OBSTACK to 1 if you want the allocator to use obstacks instead |
23 | of malloc. If you change USE_OBSTACK, you have to recompile! */ |
24 | |
25 | #include <config.h> |
26 | |
27 | #include "hash.h" |
28 | |
29 | #include "bitrotate.h" |
30 | #include "xalloc-oversized.h" |
31 | |
32 | #include <stdint.h> |
33 | #include <stdio.h> |
34 | #include <stdlib.h> |
35 | |
36 | #if USE_OBSTACK |
37 | # include "obstack.h" |
38 | # ifndef obstack_chunk_alloc |
39 | # define obstack_chunk_alloc malloc |
40 | # endif |
41 | # ifndef obstack_chunk_free |
42 | # define obstack_chunk_free free |
43 | # endif |
44 | #endif |
45 | |
/* One internal table entry, i.e. one link in a bucket's chain.  The bucket
   array itself is made of these structs, so a chain's head doubles as the
   bucket slot; a NULL DATA pointer marks an empty slot (see the
   bucket->data == NULL tests throughout this file).  */
struct hash_entry
  {
    void *data;                 /* user data; NULL in an unused bucket head */
    struct hash_entry *next;    /* next entry in this bucket's chain, or NULL */
  };
51 | |
struct hash_table
  {
    /* The array of buckets starts at BUCKET and extends to BUCKET_LIMIT-1,
       for a possibility of N_BUCKETS.  Among those, N_BUCKETS_USED buckets
       are not empty, there are N_ENTRIES active entries in the table.  */
    struct hash_entry *bucket;
    struct hash_entry const *bucket_limit;
    size_t n_buckets;
    size_t n_buckets_used;
    size_t n_entries;

    /* Tuning arguments, kept in a physically separate structure.  */
    const Hash_tuning *tuning;

    /* Three functions are given to 'hash_initialize', see the documentation
       block for this function.  In a word, HASHER randomizes a user entry
       into a number from 0 up to some maximum minus 1; COMPARATOR returns
       true if two user entries compare equally; and DATA_FREER is the cleanup
       function for a user entry.  */
    Hash_hasher hasher;
    Hash_comparator comparator;
    Hash_data_freer data_freer;

    /* A linked list of freed struct hash_entry structs, recycled by
       allocate_entry before resorting to malloc.  */
    struct hash_entry *free_entry_list;

#if USE_OBSTACK
    /* Whenever obstacks are used, it is possible to allocate all overflowed
       entries into a single stack, so they all can be freed in a single
       operation.  It is not clear if the speedup is worth the trouble.  */
    struct obstack entry_stack;
#endif
  };
85 | |
86 | /* A hash table contains many internal entries, each holding a pointer to |
87 | some user-provided data (also called a user entry). An entry indistinctly |
88 | refers to both the internal entry and its associated user entry. A user |
89 | entry contents may be hashed by a randomization function (the hashing |
90 | function, or just "hasher" for short) into a number (or "slot") between 0 |
91 | and the current table size. At each slot position in the hash table, |
92 | starts a linked chain of entries for which the user data all hash to this |
93 | slot. A bucket is the collection of all entries hashing to the same slot. |
94 | |
95 | A good "hasher" function will distribute entries rather evenly in buckets. |
96 | In the ideal case, the length of each bucket is roughly the number of |
97 | entries divided by the table size. Finding the slot for a data is usually |
98 | done in constant time by the "hasher", and the later finding of a precise |
99 | entry is linear in time with the size of the bucket. Consequently, a |
100 | larger hash table size (that is, a larger number of buckets) is prone to |
101 | yielding shorter chains, *given* the "hasher" function behaves properly. |
102 | |
103 | Long buckets slow down the lookup algorithm. One might use big hash table |
104 | sizes in hope to reduce the average length of buckets, but this might |
105 | become inordinate, as unused slots in the hash table take some space. The |
106 | best bet is to make sure you are using a good "hasher" function (beware |
107 | that those are not that easy to write! :-), and to use a table size |
108 | larger than the actual number of entries. */ |
109 | |
110 | /* If an insertion makes the ratio of nonempty buckets to table size larger |
111 | than the growth threshold (a number between 0.0 and 1.0), then increase |
112 | the table size by multiplying by the growth factor (a number greater than |
113 | 1.0). The growth threshold defaults to 0.8, and the growth factor |
114 | defaults to 1.414, meaning that the table will have doubled its size |
115 | every second time 80% of the buckets get used. */ |
116 | #define DEFAULT_GROWTH_THRESHOLD 0.8f |
117 | #define DEFAULT_GROWTH_FACTOR 1.414f |
118 | |
119 | /* If a deletion empties a bucket and causes the ratio of used buckets to |
120 | table size to become smaller than the shrink threshold (a number between |
121 | 0.0 and 1.0), then shrink the table by multiplying by the shrink factor (a |
122 | number greater than the shrink threshold but smaller than 1.0). The shrink |
123 | threshold and factor default to 0.0 and 1.0, meaning that the table never |
124 | shrinks. */ |
125 | #define DEFAULT_SHRINK_THRESHOLD 0.0f |
126 | #define DEFAULT_SHRINK_FACTOR 1.0f |
127 | |
/* Use this to initialize or reset a TUNING structure to
   some sensible values.  */
static const Hash_tuning default_tuning =
  {
    DEFAULT_SHRINK_THRESHOLD,
    DEFAULT_SHRINK_FACTOR,
    DEFAULT_GROWTH_THRESHOLD,
    DEFAULT_GROWTH_FACTOR,
    false   /* NOTE(review): presumably is_n_buckets, per field order in hash.h
               — confirm against the header.  */
  };
138 | |
139 | /* Information and lookup. */ |
140 | |
141 | /* The following few functions provide information about the overall hash |
142 | table organization: the number of entries, number of buckets and maximum |
143 | length of buckets. */ |
144 | |
/* Return the number of buckets in the hash table.  The table size, the total
   number of buckets (used plus unused), or the maximum number of slots, are
   the same quantity.  */

size_t
hash_get_n_buckets (const Hash_table *table)
{
  return table->n_buckets;
}
154 | |
/* Return the number of slots in use (non-empty buckets).  */

size_t
hash_get_n_buckets_used (const Hash_table *table)
{
  return table->n_buckets_used;
}
162 | |
/* Return the number of active entries (user data items) in the table.  */

size_t
hash_get_n_entries (const Hash_table *table)
{
  return table->n_entries;
}
170 | |
171 | /* Return the length of the longest chain (bucket). */ |
172 | |
173 | size_t |
174 | hash_get_max_bucket_length (const Hash_table *table) |
175 | { |
176 | struct hash_entry const *bucket; |
177 | size_t max_bucket_length = 0; |
178 | |
179 | for (bucket = table->bucket; bucket < table->bucket_limit; bucket++) |
180 | { |
181 | if (bucket->data) |
182 | { |
183 | struct hash_entry const *cursor = bucket; |
184 | size_t bucket_length = 1; |
185 | |
186 | while (cursor = cursor->next, cursor) |
187 | bucket_length++; |
188 | |
189 | if (bucket_length > max_bucket_length) |
190 | max_bucket_length = bucket_length; |
191 | } |
192 | } |
193 | |
194 | return max_bucket_length; |
195 | } |
196 | |
197 | /* Do a mild validation of a hash table, by traversing it and checking two |
198 | statistics. */ |
199 | |
200 | bool |
201 | hash_table_ok (const Hash_table *table) |
202 | { |
203 | struct hash_entry const *bucket; |
204 | size_t n_buckets_used = 0; |
205 | size_t n_entries = 0; |
206 | |
207 | for (bucket = table->bucket; bucket < table->bucket_limit; bucket++) |
208 | { |
209 | if (bucket->data) |
210 | { |
211 | struct hash_entry const *cursor = bucket; |
212 | |
213 | /* Count bucket head. */ |
214 | n_buckets_used++; |
215 | n_entries++; |
216 | |
217 | /* Count bucket overflow. */ |
218 | while (cursor = cursor->next, cursor) |
219 | n_entries++; |
220 | } |
221 | } |
222 | |
223 | if (n_buckets_used == table->n_buckets_used && n_entries == table->n_entries) |
224 | return true; |
225 | |
226 | return false; |
227 | } |
228 | |
229 | void |
230 | hash_print_statistics (const Hash_table *table, FILE *stream) |
231 | { |
232 | size_t n_entries = hash_get_n_entries (table); |
233 | size_t n_buckets = hash_get_n_buckets (table); |
234 | size_t n_buckets_used = hash_get_n_buckets_used (table); |
235 | size_t max_bucket_length = hash_get_max_bucket_length (table); |
236 | |
237 | fprintf (stream, "# entries: %lu\n" , (unsigned long int) n_entries); |
238 | fprintf (stream, "# buckets: %lu\n" , (unsigned long int) n_buckets); |
239 | fprintf (stream, "# buckets used: %lu (%.2f%%)\n" , |
240 | (unsigned long int) n_buckets_used, |
241 | (100.0 * n_buckets_used) / n_buckets); |
242 | fprintf (stream, "max bucket length: %lu\n" , |
243 | (unsigned long int) max_bucket_length); |
244 | } |
245 | |
246 | /* Hash KEY and return a pointer to the selected bucket. |
247 | If TABLE->hasher misbehaves, abort. */ |
248 | static struct hash_entry * |
249 | safe_hasher (const Hash_table *table, const void *key) |
250 | { |
251 | size_t n = table->hasher (key, table->n_buckets); |
252 | if (! (n < table->n_buckets)) |
253 | abort (); |
254 | return table->bucket + n; |
255 | } |
256 | |
257 | /* If ENTRY matches an entry already in the hash table, return the |
258 | entry from the table. Otherwise, return NULL. */ |
259 | |
260 | void * |
261 | hash_lookup (const Hash_table *table, const void *entry) |
262 | { |
263 | struct hash_entry const *bucket = safe_hasher (table, entry); |
264 | struct hash_entry const *cursor; |
265 | |
266 | if (bucket->data == NULL) |
267 | return NULL; |
268 | |
269 | for (cursor = bucket; cursor; cursor = cursor->next) |
270 | if (entry == cursor->data || table->comparator (entry, cursor->data)) |
271 | return cursor->data; |
272 | |
273 | return NULL; |
274 | } |
275 | |
276 | /* Walking. */ |
277 | |
278 | /* The functions in this page traverse the hash table and process the |
279 | contained entries. For the traversal to work properly, the hash table |
280 | should not be resized nor modified while any particular entry is being |
281 | processed. In particular, entries should not be added, and an entry |
282 | may be removed only if there is no shrink threshold and the entry being |
283 | removed has already been passed to hash_get_next. */ |
284 | |
285 | /* Return the first data in the table, or NULL if the table is empty. */ |
286 | |
287 | void * |
288 | hash_get_first (const Hash_table *table) |
289 | { |
290 | struct hash_entry const *bucket; |
291 | |
292 | if (table->n_entries == 0) |
293 | return NULL; |
294 | |
295 | for (bucket = table->bucket; ; bucket++) |
296 | if (! (bucket < table->bucket_limit)) |
297 | abort (); |
298 | else if (bucket->data) |
299 | return bucket->data; |
300 | } |
301 | |
302 | /* Return the user data for the entry following ENTRY, where ENTRY has been |
303 | returned by a previous call to either 'hash_get_first' or 'hash_get_next'. |
304 | Return NULL if there are no more entries. */ |
305 | |
306 | void * |
307 | hash_get_next (const Hash_table *table, const void *entry) |
308 | { |
309 | struct hash_entry const *bucket = safe_hasher (table, entry); |
310 | struct hash_entry const *cursor; |
311 | |
312 | /* Find next entry in the same bucket. */ |
313 | cursor = bucket; |
314 | do |
315 | { |
316 | if (cursor->data == entry && cursor->next) |
317 | return cursor->next->data; |
318 | cursor = cursor->next; |
319 | } |
320 | while (cursor != NULL); |
321 | |
322 | /* Find first entry in any subsequent bucket. */ |
323 | while (++bucket < table->bucket_limit) |
324 | if (bucket->data) |
325 | return bucket->data; |
326 | |
327 | /* None found. */ |
328 | return NULL; |
329 | } |
330 | |
331 | /* Fill BUFFER with pointers to active user entries in the hash table, then |
332 | return the number of pointers copied. Do not copy more than BUFFER_SIZE |
333 | pointers. */ |
334 | |
335 | size_t |
336 | hash_get_entries (const Hash_table *table, void **buffer, |
337 | size_t buffer_size) |
338 | { |
339 | size_t counter = 0; |
340 | struct hash_entry const *bucket; |
341 | struct hash_entry const *cursor; |
342 | |
343 | for (bucket = table->bucket; bucket < table->bucket_limit; bucket++) |
344 | { |
345 | if (bucket->data) |
346 | { |
347 | for (cursor = bucket; cursor; cursor = cursor->next) |
348 | { |
349 | if (counter >= buffer_size) |
350 | return counter; |
351 | buffer[counter++] = cursor->data; |
352 | } |
353 | } |
354 | } |
355 | |
356 | return counter; |
357 | } |
358 | |
359 | /* Call a PROCESSOR function for each entry of a hash table, and return the |
360 | number of entries for which the processor function returned success. A |
361 | pointer to some PROCESSOR_DATA which will be made available to each call to |
362 | the processor function. The PROCESSOR accepts two arguments: the first is |
363 | the user entry being walked into, the second is the value of PROCESSOR_DATA |
364 | as received. The walking continue for as long as the PROCESSOR function |
365 | returns nonzero. When it returns zero, the walking is interrupted. */ |
366 | |
367 | size_t |
368 | hash_do_for_each (const Hash_table *table, Hash_processor processor, |
369 | void *processor_data) |
370 | { |
371 | size_t counter = 0; |
372 | struct hash_entry const *bucket; |
373 | struct hash_entry const *cursor; |
374 | |
375 | for (bucket = table->bucket; bucket < table->bucket_limit; bucket++) |
376 | { |
377 | if (bucket->data) |
378 | { |
379 | for (cursor = bucket; cursor; cursor = cursor->next) |
380 | { |
381 | if (! processor (cursor->data, processor_data)) |
382 | return counter; |
383 | counter++; |
384 | } |
385 | } |
386 | } |
387 | |
388 | return counter; |
389 | } |
390 | |
391 | /* Allocation and clean-up. */ |
392 | |
393 | /* Return a hash index for a NUL-terminated STRING between 0 and N_BUCKETS-1. |
394 | This is a convenience routine for constructing other hashing functions. */ |
395 | |
396 | #if USE_DIFF_HASH |
397 | |
/* About hashings, Paul Eggert writes to me (FP), on 1994-01-01: "Please see
   B. J. McKenzie, R. Harries & T. Bell, Selecting a hashing algorithm,
   Software--practice & experience 20, 2 (Feb 1990), 209-224.  Good hash
   algorithms tend to be domain-specific, so what's good for [diffutils'] io.c
   may not be good for your application." */

size_t
hash_string (const char *string, size_t n_buckets)
{
# define HASH_ONE_CHAR(Value, Byte) \
  ((Byte) + rotl_sz (Value, 7))

  /* Mix each byte into VALUE by adding it to VALUE rotated left by 7
     bits (rotl_sz, from bitrotate.h); reduce modulo N_BUCKETS only
     once, at the end.  */
  size_t value = 0;
  unsigned char ch;

  for (; (ch = *string); string++)
    value = HASH_ONE_CHAR (value, ch);
  return value % n_buckets;

# undef HASH_ONE_CHAR
}
419 | |
420 | #else /* not USE_DIFF_HASH */ |
421 | |
/* This one comes from 'recode', and performs a bit better than the above as
   per a few experiments.  It is inspired from a hashing routine found in the
   very old Cyber 'snoop', itself written in typical Greg Mansfield style.
   (By the way, what happened to this excellent man?  Is he still alive?)  */

size_t
hash_string (const char *string, size_t n_buckets)
{
  size_t value = 0;
  unsigned char ch;

  /* Classic multiply-by-31 string hash, reduced modulo N_BUCKETS at
     every step so VALUE always stays below N_BUCKETS.  */
  while ((ch = *string++))
    value = (value * 31 + ch) % n_buckets;
  return value;
}
437 | |
438 | #endif /* not USE_DIFF_HASH */ |
439 | |
440 | /* Return true if CANDIDATE is a prime number. CANDIDATE should be an odd |
441 | number at least equal to 11. */ |
442 | |
443 | static bool _GL_ATTRIBUTE_CONST |
444 | is_prime (size_t candidate) |
445 | { |
446 | size_t divisor = 3; |
447 | size_t square = divisor * divisor; |
448 | |
449 | while (square < candidate && (candidate % divisor)) |
450 | { |
451 | divisor++; |
452 | square += 4 * divisor; |
453 | divisor++; |
454 | } |
455 | |
456 | return (candidate % divisor ? true : false); |
457 | } |
458 | |
459 | /* Round a given CANDIDATE number up to the nearest prime, and return that |
460 | prime. Primes lower than 10 are merely skipped. */ |
461 | |
462 | static size_t _GL_ATTRIBUTE_CONST |
463 | next_prime (size_t candidate) |
464 | { |
465 | /* Skip small primes. */ |
466 | if (candidate < 10) |
467 | candidate = 10; |
468 | |
469 | /* Make it definitely odd. */ |
470 | candidate |= 1; |
471 | |
472 | while (SIZE_MAX != candidate && !is_prime (candidate)) |
473 | candidate += 2; |
474 | |
475 | return candidate; |
476 | } |
477 | |
/* Reset *TUNING to the default tuning parameters (see default_tuning
   above: no shrinking, growth factor 1.414 at 80% occupancy).  */
void
hash_reset_tuning (Hash_tuning *tuning)
{
  *tuning = default_tuning;
}
483 | |
/* If the user passes a NULL hasher, we hash the raw pointer.  */
static size_t
raw_hasher (const void *data, size_t n)
{
  /* When hashing unique pointers, it is often the case that they were
     generated by malloc and thus have the property that the low-order
     bits are 0.  As this tends to give poorer performance with small
     tables, we rotate the pointer value before performing division,
     in an attempt to improve hash quality.  */
  size_t val = rotr_sz ((size_t) data, 3);
  return val % n;
}
496 | |
/* If the user passes a NULL comparator, we use pointer comparison:
   two entries are equal iff they are the same pointer.  */
static bool
raw_comparator (const void *a, const void *b)
{
  return a == b;
}
503 | |
504 | |
505 | /* For the given hash TABLE, check the user supplied tuning structure for |
506 | reasonable values, and return true if there is no gross error with it. |
507 | Otherwise, definitively reset the TUNING field to some acceptable default |
508 | in the hash table (that is, the user loses the right of further modifying |
509 | tuning arguments), and return false. */ |
510 | |
511 | static bool |
512 | check_tuning (Hash_table *table) |
513 | { |
514 | const Hash_tuning *tuning = table->tuning; |
515 | float epsilon; |
516 | if (tuning == &default_tuning) |
517 | return true; |
518 | |
519 | /* Be a bit stricter than mathematics would require, so that |
520 | rounding errors in size calculations do not cause allocations to |
521 | fail to grow or shrink as they should. The smallest allocation |
522 | is 11 (due to next_prime's algorithm), so an epsilon of 0.1 |
523 | should be good enough. */ |
524 | epsilon = 0.1f; |
525 | |
526 | if (epsilon < tuning->growth_threshold |
527 | && tuning->growth_threshold < 1 - epsilon |
528 | && 1 + epsilon < tuning->growth_factor |
529 | && 0 <= tuning->shrink_threshold |
530 | && tuning->shrink_threshold + epsilon < tuning->shrink_factor |
531 | && tuning->shrink_factor <= 1 |
532 | && tuning->shrink_threshold + epsilon < tuning->growth_threshold) |
533 | return true; |
534 | |
535 | table->tuning = &default_tuning; |
536 | return false; |
537 | } |
538 | |
/* Compute the size of the bucket array for the given CANDIDATE and
   TUNING, or return 0 if there is no possible way to allocate that
   many entries.  */

static size_t _GL_ATTRIBUTE_PURE
compute_bucket_size (size_t candidate, const Hash_tuning *tuning)
{
  if (!tuning->is_n_buckets)
    {
      /* CANDIDATE counts entries, not buckets: divide by the growth
         threshold so that CANDIDATE insertions fit before the first
         resize.  The float comparison against SIZE_MAX guards the
         conversion back to size_t from overflowing.  */
      float new_candidate = candidate / tuning->growth_threshold;
      if (SIZE_MAX <= new_candidate)
        return 0;
      candidate = new_candidate;
    }
  /* Bucket counts are kept prime to help the modulo-based hashers
     spread entries evenly.  */
  candidate = next_prime (candidate);
  /* Refuse sizes whose bucket array could not be allocated.  */
  if (xalloc_oversized (candidate, sizeof (struct hash_entry *)))
    return 0;
  return candidate;
}
558 | |
559 | /* Allocate and return a new hash table, or NULL upon failure. The initial |
560 | number of buckets is automatically selected so as to _guarantee_ that you |
561 | may insert at least CANDIDATE different user entries before any growth of |
the hash table size occurs.  So, if you have a reasonably tight a-priori upper
563 | bound on the number of entries you intend to insert in the hash table, you |
564 | may save some table memory and insertion time, by specifying it here. If |
565 | the IS_N_BUCKETS field of the TUNING structure is true, the CANDIDATE |
566 | argument has its meaning changed to the wanted number of buckets. |
567 | |
568 | TUNING points to a structure of user-supplied values, in case some fine |
569 | tuning is wanted over the default behavior of the hasher. If TUNING is |
570 | NULL, the default tuning parameters are used instead. If TUNING is |
571 | provided but the values requested are out of bounds or might cause |
572 | rounding errors, return NULL. |
573 | |
574 | The user-supplied HASHER function, when not NULL, accepts two |
575 | arguments ENTRY and TABLE_SIZE. It computes, by hashing ENTRY contents, a |
576 | slot number for that entry which should be in the range 0..TABLE_SIZE-1. |
577 | This slot number is then returned. |
578 | |
579 | The user-supplied COMPARATOR function, when not NULL, accepts two |
580 | arguments pointing to user data, it then returns true for a pair of entries |
581 | that compare equal, or false otherwise. This function is internally called |
582 | on entries which are already known to hash to the same bucket index, |
583 | but which are distinct pointers. |
584 | |
585 | The user-supplied DATA_FREER function, when not NULL, may be later called |
586 | with the user data as an argument, just before the entry containing the |
587 | data gets freed. This happens from within 'hash_free' or 'hash_clear'. |
588 | You should specify this function only if you want these functions to free |
589 | all of your 'data' data. This is typically the case when your data is |
590 | simply an auxiliary struct that you have malloc'd to aggregate several |
591 | values. */ |
592 | |
593 | Hash_table * |
594 | hash_initialize (size_t candidate, const Hash_tuning *tuning, |
595 | Hash_hasher hasher, Hash_comparator comparator, |
596 | Hash_data_freer data_freer) |
597 | { |
598 | Hash_table *table; |
599 | |
600 | if (hasher == NULL) |
601 | hasher = raw_hasher; |
602 | if (comparator == NULL) |
603 | comparator = raw_comparator; |
604 | |
605 | table = malloc (sizeof *table); |
606 | if (table == NULL) |
607 | return NULL; |
608 | |
609 | if (!tuning) |
610 | tuning = &default_tuning; |
611 | table->tuning = tuning; |
612 | if (!check_tuning (table)) |
613 | { |
614 | /* Fail if the tuning options are invalid. This is the only occasion |
615 | when the user gets some feedback about it. Once the table is created, |
616 | if the user provides invalid tuning options, we silently revert to |
617 | using the defaults, and ignore further request to change the tuning |
618 | options. */ |
619 | goto fail; |
620 | } |
621 | |
622 | table->n_buckets = compute_bucket_size (candidate, tuning); |
623 | if (!table->n_buckets) |
624 | goto fail; |
625 | |
626 | table->bucket = calloc (table->n_buckets, sizeof *table->bucket); |
627 | if (table->bucket == NULL) |
628 | goto fail; |
629 | table->bucket_limit = table->bucket + table->n_buckets; |
630 | table->n_buckets_used = 0; |
631 | table->n_entries = 0; |
632 | |
633 | table->hasher = hasher; |
634 | table->comparator = comparator; |
635 | table->data_freer = data_freer; |
636 | |
637 | table->free_entry_list = NULL; |
638 | #if USE_OBSTACK |
639 | obstack_init (&table->entry_stack); |
640 | #endif |
641 | return table; |
642 | |
643 | fail: |
644 | free (table); |
645 | return NULL; |
646 | } |
647 | |
/* Make all buckets empty, placing any chained entries on the free list.
   Apply the user-specified function data_freer (if any) to the data of any
   affected entries.  The bucket array itself is kept at its current size;
   the table ends up valid and empty.  */

void
hash_clear (Hash_table *table)
{
  struct hash_entry *bucket;

  for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
    {
      if (bucket->data)
        {
          struct hash_entry *cursor;
          struct hash_entry *next;

          /* Free the bucket overflow: each overflow entry is pushed onto
             the table's free list for later recycling.  */
          for (cursor = bucket->next; cursor; cursor = next)
            {
              if (table->data_freer)
                table->data_freer (cursor->data);
              cursor->data = NULL;

              next = cursor->next;
              /* Relinking is done one entry at a time, as it is to be expected
                 that overflows are either rare or short.  */
              cursor->next = table->free_entry_list;
              table->free_entry_list = cursor;
            }

          /* Free the bucket head.  The head is part of the bucket array,
             so it is merely cleared, not relinked.  */
          if (table->data_freer)
            table->data_freer (bucket->data);
          bucket->data = NULL;
          bucket->next = NULL;
        }
    }

  table->n_buckets_used = 0;
  table->n_entries = 0;
}
689 | |
/* Reclaim all storage associated with a hash table.  If a data_freer
   function has been supplied by the user when the hash table was created,
   this function applies it to the data of each entry before freeing that
   entry.  */

void
hash_free (Hash_table *table)
{
  struct hash_entry *bucket;
  struct hash_entry *cursor;
  struct hash_entry *next;

  /* Call the user data_freer function.  Skipped entirely when the table
     is empty or no freer was supplied.  */
  if (table->data_freer && table->n_entries)
    {
      for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
        {
          if (bucket->data)
            {
              for (cursor = bucket; cursor; cursor = cursor->next)
                table->data_freer (cursor->data);
            }
        }
    }

#if USE_OBSTACK

  /* All overflow entries came from the obstack, so one call releases
     them all (including those on the free list).  */
  obstack_free (&table->entry_stack, NULL);

#else

  /* Free all bucket overflowed entries.  Bucket heads live inside the
     bucket array and are freed with it, below.  */
  for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
    {
      for (cursor = bucket->next; cursor; cursor = next)
        {
          next = cursor->next;
          free (cursor);
        }
    }

  /* Also reclaim the internal list of previously freed entries.  */
  for (cursor = table->free_entry_list; cursor; cursor = next)
    {
      next = cursor->next;
      free (cursor);
    }

#endif

  /* Free the remainder of the hash table structure.  */
  free (table->bucket);
  free (table);
}
744 | |
745 | /* Insertion and deletion. */ |
746 | |
/* Get a new hash entry for a bucket overflow, possibly by recycling a
   previously freed one.  If this is not possible, allocate a new one.
   May return NULL if malloc fails.  (NOTE(review): in the USE_OBSTACK
   build, obstack_alloc presumably invokes the obstack failure handler
   rather than returning NULL — confirm against obstack documentation.)  */

static struct hash_entry *
allocate_entry (Hash_table *table)
{
  struct hash_entry *new;

  if (table->free_entry_list)
    {
      /* Pop the head of the free list.  */
      new = table->free_entry_list;
      table->free_entry_list = new->next;
    }
  else
    {
#if USE_OBSTACK
      new = obstack_alloc (&table->entry_stack, sizeof *new);
#else
      new = malloc (sizeof *new);
#endif
    }

  return new;
}
771 | |
/* Free a hash entry which was part of some bucket overflow,
   saving it for later recycling.  The entry is pushed onto the head of
   the table's free list; its payload pointer is cleared first.  */

static void
free_entry (Hash_table *table, struct hash_entry *entry)
{
  entry->data = NULL;
  entry->next = table->free_entry_list;
  table->free_entry_list = entry;
}
782 | |
/* This private function is used to help with insertion and deletion.  When
   ENTRY matches an entry in the table, return a pointer to the corresponding
   user data and set *BUCKET_HEAD to the head of the selected bucket.
   Otherwise, return NULL.  When DELETE is true and ENTRY matches an entry in
   the table, unlink the matching entry.  */

static void *
hash_find_entry (Hash_table *table, const void *entry,
                 struct hash_entry **bucket_head, bool delete)
{
  struct hash_entry *bucket = safe_hasher (table, entry);
  struct hash_entry *cursor;

  *bucket_head = bucket;

  /* Test for empty bucket.  */
  if (bucket->data == NULL)
    return NULL;

  /* See if the entry is the first in the bucket.  Pointer identity
     short-circuits the comparator.  */
  if (entry == bucket->data || table->comparator (entry, bucket->data))
    {
      void *data = bucket->data;

      if (delete)
        {
          if (bucket->next)
            {
              struct hash_entry *next = bucket->next;

              /* Bump the first overflow entry into the bucket head, then save
                 the previous first overflow entry for later recycling.  */
              *bucket = *next;
              free_entry (table, next);
            }
          else
            {
              /* The chain had a single entry: just empty the head.  */
              bucket->data = NULL;
            }
        }

      return data;
    }

  /* Scan the bucket overflow.  CURSOR stays one link behind the entry
     being examined, so unlinking is a single pointer update.  */
  for (cursor = bucket; cursor->next; cursor = cursor->next)
    {
      if (entry == cursor->next->data
          || table->comparator (entry, cursor->next->data))
        {
          void *data = cursor->next->data;

          if (delete)
            {
              struct hash_entry *next = cursor->next;

              /* Unlink the entry to delete, then save the freed entry for later
                 recycling.  */
              cursor->next = next->next;
              free_entry (table, next);
            }

          return data;
        }
    }

  /* No entry found.  */
  return NULL;
}
852 | |
/* Internal helper, to move entries from SRC to DST.  Both tables must
   share the same free entry list.  If SAFE, only move overflow
   entries, saving bucket heads for later, so that no allocations will
   occur.  Return false if the free entry list is exhausted and an
   allocation fails.  */

static bool
transfer_entries (Hash_table *dst, Hash_table *src, bool safe)
{
  struct hash_entry *bucket;
  struct hash_entry *cursor;
  struct hash_entry *next;
  for (bucket = src->bucket; bucket < src->bucket_limit; bucket++)
    if (bucket->data)
      {
        void *data;
        struct hash_entry *new_bucket;

        /* Within each bucket, transfer overflow entries first and
           then the bucket head, to minimize memory pressure.  After
           all, the only time we might allocate is when moving the
           bucket head, but moving overflow entries first may create
           free entries that can be recycled by the time we finally
           get to the bucket head.  */
        for (cursor = bucket->next; cursor; cursor = next)
          {
            data = cursor->data;
            new_bucket = safe_hasher (dst, data);

            /* Save the link now: CURSOR is about to be relinked.  */
            next = cursor->next;

            if (new_bucket->data)
              {
                /* Merely relink an existing entry, when moving from a
                   bucket overflow into a bucket overflow.  */
                cursor->next = new_bucket->next;
                new_bucket->next = cursor;
              }
            else
              {
                /* Free an existing entry, when moving from a bucket
                   overflow into a bucket header.  */
                new_bucket->data = data;
                dst->n_buckets_used++;
                free_entry (dst, cursor);
              }
          }
        /* Now move the bucket head.  Be sure that if we fail due to
           allocation failure that the src table is in a consistent
           state.  */
        data = bucket->data;
        bucket->next = NULL;
        if (safe)
          continue;
        new_bucket = safe_hasher (dst, data);

        if (new_bucket->data)
          {
            /* Allocate or recycle an entry, when moving from a bucket
               header into a bucket overflow.  */
            struct hash_entry *new_entry = allocate_entry (dst);

            if (new_entry == NULL)
              return false;

            new_entry->data = data;
            new_entry->next = new_bucket->next;
            new_bucket->next = new_entry;
          }
        else
          {
            /* Move from one bucket header to another.  */
            new_bucket->data = data;
            dst->n_buckets_used++;
          }
        /* The head was moved (or will be, by the SAFE pass's caller);
           mark this source bucket empty only on the unsafe path.  */
        bucket->data = NULL;
        src->n_buckets_used--;
      }
  return true;
}
933 | |
/* For an already existing hash table, change the number of buckets through
   specifying CANDIDATE.  The contents of the hash table are preserved.  The
   new number of buckets is automatically selected so as to _guarantee_ that
   the table may receive at least CANDIDATE different user entries, including
   those already in the table, before any other growth of the hash table size
   occurs.  If TUNING->IS_N_BUCKETS is true, then CANDIDATE specifies the
   exact number of buckets desired.  Return true iff the rehash succeeded.  */

bool
hash_rehash (Hash_table *table, size_t candidate)
{
  Hash_table storage;
  Hash_table *new_table;
  size_t new_size = compute_bucket_size (candidate, table->tuning);

  if (!new_size)
    return false;
  if (new_size == table->n_buckets)
    return true;
  /* Build the replacement table in local STORAGE; commit its fields to
     *TABLE only once the transfer fully succeeds.  calloc leaves every
     new bucket empty (all-bits-zero data and next pointers).  */
  new_table = &storage;
  new_table->bucket = calloc (new_size, sizeof *new_table->bucket);
  if (new_table->bucket == NULL)
    return false;
  new_table->n_buckets = new_size;
  new_table->bucket_limit = new_table->bucket + new_size;
  new_table->n_buckets_used = 0;
  new_table->n_entries = 0;
  new_table->tuning = table->tuning;
  new_table->hasher = table->hasher;
  new_table->comparator = table->comparator;
  new_table->data_freer = table->data_freer;

  /* In order for the transfer to successfully complete, we need
     additional overflow entries when distinct buckets in the old
     table collide into a common bucket in the new table.  The worst
     case possible is a hasher that gives a good spread with the old
     size, but returns a constant with the new size; if we were to
     guarantee table->n_buckets_used-1 free entries in advance, then
     the transfer would be guaranteed to not allocate memory.
     However, for large tables, a guarantee of no further allocation
     introduces a lot of extra memory pressure, all for an unlikely
     corner case (most rehashes reduce, rather than increase, the
     number of overflow entries needed).  So, we instead ensure that
     the transfer process can be reversed if we hit a memory
     allocation failure mid-transfer.  */

  /* Merely reuse the extra old space into the new table.  */
#if USE_OBSTACK
  new_table->entry_stack = table->entry_stack;
#endif
  new_table->free_entry_list = table->free_entry_list;

  if (transfer_entries (new_table, table, false))
    {
      /* Entries transferred successfully; tie up the loose ends.  */
      free (table->bucket);
      table->bucket = new_table->bucket;
      table->bucket_limit = new_table->bucket_limit;
      table->n_buckets = new_table->n_buckets;
      table->n_buckets_used = new_table->n_buckets_used;
      table->free_entry_list = new_table->free_entry_list;
      /* table->n_entries and table->entry_stack already hold their value.  */
      return true;
    }

  /* We've allocated new_table->bucket (and possibly some entries),
     exhausted the free list, and moved some but not all entries into
     new_table.  We must undo the partial move before returning
     failure.  The only way to get into this situation is if new_table
     uses fewer buckets than the old table, so we will reclaim some
     free entries as overflows in the new table are put back into
     distinct buckets in the old table.

     There are some pathological cases where a single pass through the
     table requires more intermediate overflow entries than using two
     passes.  Two passes give worse cache performance and takes
     longer, but at this point, we're already out of memory, so slow
     and safe is better than failure.  */
  table->free_entry_list = new_table->free_entry_list;
  if (! (transfer_entries (table, new_table, true)
         && transfer_entries (table, new_table, false)))
    abort ();
  /* table->n_entries already holds its value.  */
  free (new_table->bucket);
  return false;
}
1020 | |
/* Insert ENTRY into hash TABLE if there is not already a matching entry.

   Return -1 upon memory allocation failure.
   Return 1 if insertion succeeded.
   Return 0 if there is already a matching entry in the table,
   and in that case, if MATCHED_ENT is non-NULL, set *MATCHED_ENT
   to that entry.

   This interface is easier to use than hash_insert when you must
   distinguish between the latter two cases.  More importantly,
   hash_insert is unusable for some types of ENTRY values.  When using
   hash_insert, the only way to distinguish those cases is to compare
   the return value and ENTRY.  That works only when you can have two
   different ENTRY values that point to data that compares "equal".  Thus,
   when the ENTRY value is a simple scalar, you must use
   hash_insert_if_absent.  ENTRY must not be NULL.  */
int
hash_insert_if_absent (Hash_table *table, void const *entry,
                       void const **matched_ent)
{
  void *data;
  struct hash_entry *bucket;

  /* The caller cannot insert a NULL entry, since hash_lookup returns NULL
     to indicate "not found", and hash_find_entry uses "bucket->data == NULL"
     to indicate an empty bucket.  */
  if (! entry)
    abort ();

  /* If there's a matching entry already in the table, return that.  */
  if ((data = hash_find_entry (table, entry, &bucket, false)) != NULL)
    {
      if (matched_ent)
        *matched_ent = data;
      return 0;
    }

  /* If the growth threshold of the buckets in use has been reached, increase
     the table size and rehash.  There's no point in checking the number of
     entries:  if the hashing function is ill-conditioned, rehashing is not
     likely to improve it.  */

  if (table->n_buckets_used
      > table->tuning->growth_threshold * table->n_buckets)
    {
      /* Check more fully, before starting real work.  If tuning arguments
         became invalid, the second check will rely on proper defaults.  */
      check_tuning (table);
      if (table->n_buckets_used
          > table->tuning->growth_threshold * table->n_buckets)
        {
          const Hash_tuning *tuning = table->tuning;
          /* Compute the candidate in floating point so the
             multiplication cannot wrap around as size_t arithmetic
             would; any product at or beyond SIZE_MAX is caught by the
             test below.  */
          float candidate =
            (tuning->is_n_buckets
             ? (table->n_buckets * tuning->growth_factor)
             : (table->n_buckets * tuning->growth_factor
                * tuning->growth_threshold));

          if (SIZE_MAX <= candidate)
            return -1;

          /* If the rehash fails, arrange to return -1.  */
          if (!hash_rehash (table, candidate))
            return -1;

          /* Update the bucket we are interested in: the rehash
             invalidated BUCKET, and ENTRY, absent before the rehash,
             must still be absent afterwards.  */
          if (hash_find_entry (table, entry, &bucket, false) != NULL)
            abort ();
        }
    }

  /* ENTRY is not matched, it should be inserted.  */

  if (bucket->data)
    {
      struct hash_entry *new_entry = allocate_entry (table);

      if (new_entry == NULL)
        return -1;

      /* Add ENTRY in the overflow of the bucket.  */

      new_entry->data = (void *) entry;
      new_entry->next = bucket->next;
      bucket->next = new_entry;
      table->n_entries++;
      return 1;
    }

  /* Add ENTRY right in the bucket head.  */

  bucket->data = (void *) entry;
  table->n_entries++;
  table->n_buckets_used++;

  return 1;
}
1118 | |
1119 | /* If ENTRY matches an entry already in the hash table, return the pointer |
1120 | to the entry from the table. Otherwise, insert ENTRY and return ENTRY. |
1121 | Return NULL if the storage required for insertion cannot be allocated. |
1122 | This implementation does not support duplicate entries or insertion of |
1123 | NULL. */ |
1124 | |
1125 | void * |
1126 | hash_insert (Hash_table *table, void const *entry) |
1127 | { |
1128 | void const *matched_ent; |
1129 | int err = hash_insert_if_absent (table, entry, &matched_ent); |
1130 | return (err == -1 |
1131 | ? NULL |
1132 | : (void *) (err == 0 ? matched_ent : entry)); |
1133 | } |
1134 | |
1135 | /* If ENTRY is already in the table, remove it and return the just-deleted |
1136 | data (the user may want to deallocate its storage). If ENTRY is not in the |
1137 | table, don't modify the table and return NULL. */ |
1138 | |
1139 | void * |
1140 | hash_delete (Hash_table *table, const void *entry) |
1141 | { |
1142 | void *data; |
1143 | struct hash_entry *bucket; |
1144 | |
1145 | data = hash_find_entry (table, entry, &bucket, true); |
1146 | if (!data) |
1147 | return NULL; |
1148 | |
1149 | table->n_entries--; |
1150 | if (!bucket->data) |
1151 | { |
1152 | table->n_buckets_used--; |
1153 | |
1154 | /* If the shrink threshold of the buckets in use has been reached, |
1155 | rehash into a smaller table. */ |
1156 | |
1157 | if (table->n_buckets_used |
1158 | < table->tuning->shrink_threshold * table->n_buckets) |
1159 | { |
1160 | /* Check more fully, before starting real work. If tuning arguments |
1161 | became invalid, the second check will rely on proper defaults. */ |
1162 | check_tuning (table); |
1163 | if (table->n_buckets_used |
1164 | < table->tuning->shrink_threshold * table->n_buckets) |
1165 | { |
1166 | const Hash_tuning *tuning = table->tuning; |
1167 | size_t candidate = |
1168 | (tuning->is_n_buckets |
1169 | ? table->n_buckets * tuning->shrink_factor |
1170 | : (table->n_buckets * tuning->shrink_factor |
1171 | * tuning->growth_threshold)); |
1172 | |
1173 | if (!hash_rehash (table, candidate)) |
1174 | { |
1175 | /* Failure to allocate memory in an attempt to |
1176 | shrink the table is not fatal. But since memory |
1177 | is low, we can at least be kind and free any |
1178 | spare entries, rather than keeping them tied up |
1179 | in the free entry list. */ |
1180 | #if ! USE_OBSTACK |
1181 | struct hash_entry *cursor = table->free_entry_list; |
1182 | struct hash_entry *next; |
1183 | while (cursor) |
1184 | { |
1185 | next = cursor->next; |
1186 | free (cursor); |
1187 | cursor = next; |
1188 | } |
1189 | table->free_entry_list = NULL; |
1190 | #endif |
1191 | } |
1192 | } |
1193 | } |
1194 | } |
1195 | |
1196 | return data; |
1197 | } |
1198 | |
1199 | /* Testing. */ |
1200 | |
1201 | #if TESTING |
1202 | |
1203 | void |
1204 | hash_print (const Hash_table *table) |
1205 | { |
1206 | struct hash_entry *bucket = (struct hash_entry *) table->bucket; |
1207 | |
1208 | for ( ; bucket < table->bucket_limit; bucket++) |
1209 | { |
1210 | struct hash_entry *cursor; |
1211 | |
1212 | if (bucket) |
1213 | printf ("%lu:\n" , (unsigned long int) (bucket - table->bucket)); |
1214 | |
1215 | for (cursor = bucket; cursor; cursor = cursor->next) |
1216 | { |
1217 | char const *s = cursor->data; |
1218 | /* FIXME */ |
1219 | if (s) |
1220 | printf (" %s\n" , s); |
1221 | } |
1222 | } |
1223 | } |
1224 | |
1225 | #endif /* TESTING */ |
1226 | |