1// This is an open source non-commercial project. Dear PVS-Studio, please check
2// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4/// @file hashtab.c
5///
6/// Handling of a hashtable with Vim-specific properties.
7///
8/// Each item in a hashtable has a NUL terminated string key. A key can appear
9/// only once in the table.
10///
11/// A hash number is computed from the key for quick lookup. When the hashes
12/// of two different keys point to the same entry an algorithm is used to
13/// iterate over other entries in the table until the right one is found.
14/// To make the iteration work removed keys are different from entries where a
15/// key was never present.
16///
17/// The mechanism has been partly based on how Python Dictionaries are
18/// implemented. The algorithm is from Knuth Vol. 3, Sec. 6.4.
19///
20/// The hashtable grows to accommodate more entries when needed. At least 1/3
21/// of the entries is empty to keep the lookup efficient (at the cost of extra
22/// memory).
23
24#include <assert.h>
25#include <stdbool.h>
26#include <string.h>
27#include <inttypes.h>
28
29#include "nvim/vim.h"
30#include "nvim/ascii.h"
31#include "nvim/hashtab.h"
32#include "nvim/message.h"
33#include "nvim/memory.h"
34
35// Magic value for algorithm that walks through the array.
36#define PERTURB_SHIFT 5
37
38#ifdef INCLUDE_GENERATED_DECLARATIONS
39# include "hashtab.c.generated.h"
40#endif
41
42char hash_removed;
43
44/// Initialize an empty hash table.
45void hash_init(hashtab_T *ht)
46{
47 // This zeroes all "ht_" entries and all the "hi_key" in "ht_smallarray".
48 memset(ht, 0, sizeof(hashtab_T));
49 ht->ht_array = ht->ht_smallarray;
50 ht->ht_mask = HT_INIT_SIZE - 1;
51}
52
53/// Free the array of a hash table without freeing contained values.
54///
55/// If "ht" is not freed (after calling this) then you should call hash_init()
56/// right next!
57void hash_clear(hashtab_T *ht)
58{
59 if (ht->ht_array != ht->ht_smallarray) {
60 xfree(ht->ht_array);
61 }
62}
63
64/// Free the array of a hash table and all contained values.
65///
66/// @param off the offset from start of value to start of key (@see hashitem_T).
67void hash_clear_all(hashtab_T *ht, unsigned int off)
68{
69 size_t todo = ht->ht_used;
70 for (hashitem_T *hi = ht->ht_array; todo > 0; ++hi) {
71 if (!HASHITEM_EMPTY(hi)) {
72 xfree(hi->hi_key - off);
73 todo--;
74 }
75 }
76 hash_clear(ht);
77}
78
79/// Find item for given "key" in hashtable "ht".
80///
81/// @param key The key of the looked-for item. Must not be NULL.
82///
83/// @return Pointer to the hash item corresponding to the given key.
84/// If not found, then return pointer to the empty item that would be
85/// used for that key.
86/// WARNING: Returned pointer becomes invalid as soon as the hash table
87/// is changed in any way.
88hashitem_T *hash_find(const hashtab_T *const ht, const char_u *const key)
89{
90 return hash_lookup(ht, (const char *)key, STRLEN(key), hash_hash(key));
91}
92
93/// Like hash_find, but key is not NUL-terminated
94///
95/// @param[in] ht Hashtab to look in.
96/// @param[in] key Key of the looked-for item. Must not be NULL.
97/// @param[in] len Key length.
98///
99/// @return Pointer to the hash item corresponding to the given key.
100/// If not found, then return pointer to the empty item that would be
101/// used for that key.
102///
103/// @warning Returned pointer becomes invalid as soon as the hash table
104/// is changed in any way.
105hashitem_T *hash_find_len(const hashtab_T *const ht, const char *const key,
106 const size_t len)
107{
108 return hash_lookup(ht, key, len, hash_hash_len(key, len));
109}
110
111/// Like hash_find(), but caller computes "hash".
112///
113/// @param[in] key The key of the looked-for item. Must not be NULL.
114/// @param[in] key_len Key length.
115/// @param[in] hash The precomputed hash for the key.
116///
117/// @return Pointer to the hashitem corresponding to the given key.
118/// If not found, then return pointer to the empty item that would be
119/// used for that key.
120/// WARNING: Returned pointer becomes invalid as soon as the hash table
121/// is changed in any way.
122hashitem_T *hash_lookup(const hashtab_T *const ht,
123 const char *const key, const size_t key_len,
124 const hash_T hash)
125{
126#ifdef HT_DEBUG
127 hash_count_lookup++;
128#endif // ifdef HT_DEBUG
129
130 // Quickly handle the most common situations:
131 // - return if there is no item at all
132 // - skip over a removed item
133 // - return if the item matches
134 hash_T idx = hash & ht->ht_mask;
135 hashitem_T *hi = &ht->ht_array[idx];
136
137 if (hi->hi_key == NULL) {
138 return hi;
139 }
140
141 hashitem_T *freeitem = NULL;
142 if (hi->hi_key == HI_KEY_REMOVED) {
143 freeitem = hi;
144 } else if ((hi->hi_hash == hash)
145 && (STRNCMP(hi->hi_key, key, key_len) == 0)
146 && hi->hi_key[key_len] == NUL) {
147 return hi;
148 }
149
150 // Need to search through the table to find the key. The algorithm
151 // to step through the table starts with large steps, gradually becoming
152 // smaller down to (1/4 table size + 1). This means it goes through all
153 // table entries in the end.
154 // When we run into a NULL key it's clear that the key isn't there.
155 // Return the first available slot found (can be a slot of a removed
156 // item).
157 for (hash_T perturb = hash;; perturb >>= PERTURB_SHIFT) {
158#ifdef HT_DEBUG
159 // count a "miss" for hashtab lookup
160 hash_count_perturb++;
161#endif // ifdef HT_DEBUG
162 idx = 5 * idx + perturb + 1;
163 hi = &ht->ht_array[idx & ht->ht_mask];
164
165 if (hi->hi_key == NULL) {
166 return freeitem == NULL ? hi : freeitem;
167 }
168
169 if ((hi->hi_hash == hash)
170 && (hi->hi_key != HI_KEY_REMOVED)
171 && (STRNCMP(hi->hi_key, key, key_len) == 0)
172 && hi->hi_key[key_len] == NUL) {
173 return hi;
174 }
175
176 if ((hi->hi_key == HI_KEY_REMOVED) && (freeitem == NULL)) {
177 freeitem = hi;
178 }
179 }
180}
181
182/// Print the efficiency of hashtable lookups.
183///
184/// Useful when trying different hash algorithms.
185/// Called when exiting.
186void hash_debug_results(void)
187{
188#ifdef HT_DEBUG
189 fprintf(stderr, "\r\n\r\n\r\n\r\n");
190 fprintf(stderr, "Number of hashtable lookups: %" PRId64 "\r\n",
191 (int64_t)hash_count_lookup);
192 fprintf(stderr, "Number of perturb loops: %" PRId64 "\r\n",
193 (int64_t)hash_count_perturb);
194 fprintf(stderr, "Percentage of perturb loops: %" PRId64 "%%\r\n",
195 (int64_t)(hash_count_perturb * 100 / hash_count_lookup));
196#endif // ifdef HT_DEBUG
197}
198
199/// Add item for key "key" to hashtable "ht".
200///
201/// @param key Pointer to the key for the new item. The key has to be contained
202/// in the new item (@see hashitem_T). Must not be NULL.
203///
204/// @return OK if success.
205/// FAIL if key already present
206int hash_add(hashtab_T *ht, char_u *key)
207{
208 hash_T hash = hash_hash(key);
209 hashitem_T *hi = hash_lookup(ht, (const char *)key, STRLEN(key), hash);
210 if (!HASHITEM_EMPTY(hi)) {
211 internal_error("hash_add()");
212 return FAIL;
213 }
214 hash_add_item(ht, hi, key, hash);
215 return OK;
216}
217
218/// Add item "hi" for key "key" to hashtable "ht".
219///
220/// @param hi The hash item to be used. Must have been obtained through
221/// hash_lookup() and point to an empty item.
222/// @param key Pointer to the key for the new item. The key has to be contained
223/// in the new item (@see hashitem_T). Must not be NULL.
224/// @param hash The precomputed hash value for the key.
225void hash_add_item(hashtab_T *ht, hashitem_T *hi, char_u *key, hash_T hash)
226{
227 ht->ht_used++;
228 if (hi->hi_key == NULL) {
229 ht->ht_filled++;
230 }
231 hi->hi_key = key;
232 hi->hi_hash = hash;
233
234 // When the space gets low may resize the array.
235 hash_may_resize(ht, 0);
236}
237
238/// Remove item "hi" from hashtable "ht".
239///
240/// Caller must take care of freeing the item itself.
241///
242/// @param hi The hash item to be removed.
243/// It must have been obtained with hash_lookup().
244void hash_remove(hashtab_T *ht, hashitem_T *hi)
245{
246 ht->ht_used--;
247 hi->hi_key = HI_KEY_REMOVED;
248 hash_may_resize(ht, 0);
249}
250
251/// Lock hashtable (prevent changes in ht_array).
252///
253/// Don't use this when items are to be added!
254/// Must call hash_unlock() later.
255void hash_lock(hashtab_T *ht)
256{
257 ht->ht_locked++;
258}
259
260/// Unlock hashtable (allow changes in ht_array again).
261///
262/// Table will be resized (shrunk) when necessary.
263/// This must balance a call to hash_lock().
264void hash_unlock(hashtab_T *ht)
265{
266 ht->ht_locked--;
267 hash_may_resize(ht, 0);
268}
269
270/// Resize hastable (new size can be given or automatically computed).
271///
272/// @param minitems Minimum number of items the new table should hold.
273/// If zero, new size will depend on currently used items:
274/// - Shrink when too much empty space.
275/// - Grow when not enough empty space.
276/// If non-zero, passed minitems will be used.
277static void hash_may_resize(hashtab_T *ht, size_t minitems)
278{
279 // Don't resize a locked table.
280 if (ht->ht_locked > 0) {
281 return;
282 }
283
284#ifdef HT_DEBUG
285 if (ht->ht_used > ht->ht_filled) {
286 EMSG("hash_may_resize(): more used than filled");
287 }
288
289 if (ht->ht_filled >= ht->ht_mask + 1) {
290 EMSG("hash_may_resize(): table completely filled");
291 }
292#endif // ifdef HT_DEBUG
293
294 size_t minsize;
295 if (minitems == 0) {
296 // Return quickly for small tables with at least two NULL items.
297 // items are required for the lookup to decide a key isn't there.
298 if ((ht->ht_filled < HT_INIT_SIZE - 1)
299 && (ht->ht_array == ht->ht_smallarray)) {
300 return;
301 }
302
303 // Grow or refill the array when it's more than 2/3 full (including
304 // removed items, so that they get cleaned up).
305 // Shrink the array when it's less than 1/5 full. When growing it is
306 // at least 1/4 full (avoids repeated grow-shrink operations)
307 size_t oldsize = ht->ht_mask + 1;
308 if ((ht->ht_filled * 3 < oldsize * 2) && (ht->ht_used > oldsize / 5)) {
309 return;
310 }
311
312 if (ht->ht_used > 1000) {
313 // it's big, don't make too much room
314 minsize = ht->ht_used * 2;
315 } else {
316 // make plenty of room
317 minsize = ht->ht_used * 4;
318 }
319 } else {
320 // Use specified size.
321 if (minitems < ht->ht_used) {
322 // just in case...
323 minitems = ht->ht_used;
324 }
325 // array is up to 2/3 full
326 minsize = minitems * 3 / 2;
327 }
328
329 size_t newsize = HT_INIT_SIZE;
330 while (newsize < minsize) {
331 // make sure it's always a power of 2
332 newsize <<= 1;
333 // assert newsize didn't overflow
334 assert(newsize != 0);
335 }
336
337 bool newarray_is_small = newsize == HT_INIT_SIZE;
338 bool keep_smallarray = newarray_is_small
339 && ht->ht_array == ht->ht_smallarray;
340
341 // Make sure that oldarray and newarray do not overlap,
342 // so that copying is possible.
343 hashitem_T temparray[HT_INIT_SIZE];
344 hashitem_T *oldarray = keep_smallarray
345 ? memcpy(temparray, ht->ht_smallarray, sizeof(temparray))
346 : ht->ht_array;
347 hashitem_T *newarray = newarray_is_small
348 ? ht->ht_smallarray
349 : xmalloc(sizeof(hashitem_T) * newsize);
350
351 memset(newarray, 0, sizeof(hashitem_T) * newsize);
352
353 // Move all the items from the old array to the new one, placing them in
354 // the right spot. The new array won't have any removed items, thus this
355 // is also a cleanup action.
356 hash_T newmask = newsize - 1;
357 size_t todo = ht->ht_used;
358
359 for (hashitem_T *olditem = oldarray; todo > 0; ++olditem) {
360 if (HASHITEM_EMPTY(olditem)) {
361 continue;
362 }
363 // The algorithm to find the spot to add the item is identical to
364 // the algorithm to find an item in hash_lookup(). But we only
365 // need to search for a NULL key, thus it's simpler.
366 hash_T newi = olditem->hi_hash & newmask;
367 hashitem_T *newitem = &newarray[newi];
368 if (newitem->hi_key != NULL) {
369 for (hash_T perturb = olditem->hi_hash;; perturb >>= PERTURB_SHIFT) {
370 newi = 5 * newi + perturb + 1;
371 newitem = &newarray[newi & newmask];
372 if (newitem->hi_key == NULL) {
373 break;
374 }
375 }
376 }
377 *newitem = *olditem;
378 todo--;
379 }
380
381 if (ht->ht_array != ht->ht_smallarray) {
382 xfree(ht->ht_array);
383 }
384 ht->ht_array = newarray;
385 ht->ht_mask = newmask;
386 ht->ht_filled = ht->ht_used;
387}
388
389#define HASH_CYCLE_BODY(hash, p) \
390 hash = hash * 101 + *p++
391
392/// Get the hash number for a key.
393///
394/// If you think you know a better hash function: Compile with HT_DEBUG set and
395/// run a script that uses hashtables a lot. Vim will then print statistics
396/// when exiting. Try that with the current hash algorithm and yours. The
397/// lower the percentage the better.
398hash_T hash_hash(const char_u *key)
399{
400 hash_T hash = *key;
401
402 if (hash == 0) {
403 return (hash_T)0;
404 }
405
406 // A simplistic algorithm that appears to do very well.
407 // Suggested by George Reilly.
408 const uint8_t *p = key + 1;
409 while (*p != NUL) {
410 HASH_CYCLE_BODY(hash, p);
411 }
412
413 return hash;
414}
415
416/// Get the hash number for a key that is not a NUL-terminated string
417///
418/// @warning Function does not check whether key contains NUL. But you will not
419/// be able to get hash entry in this case.
420///
421/// @param[in] key Key.
422/// @param[in] len Key length.
423///
424/// @return Key hash.
425hash_T hash_hash_len(const char *key, const size_t len)
426 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
427{
428 if (len == 0) {
429 return 0;
430 }
431
432 hash_T hash = *(uint8_t *)key;
433 const uint8_t *end = (uint8_t *)key + len;
434
435 const uint8_t *p = (const uint8_t *)key + 1;
436 while (p < end) {
437 HASH_CYCLE_BODY(hash, p);
438 }
439
440 return hash;
441}
442
443#undef HASH_CYCLE_BODY
444
445/// Function to get HI_KEY_REMOVED value
446///
447/// Used for testing because luajit ffi does not allow getting addresses of
448/// globals.
449const char_u *_hash_key_removed(void)
450 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
451{
452 return HI_KEY_REMOVED;
453}
454