1 | // This is an open source non-commercial project. Dear PVS-Studio, please check |
2 | // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com |
3 | |
4 | /// @file hashtab.c |
5 | /// |
6 | /// Handling of a hashtable with Vim-specific properties. |
7 | /// |
8 | /// Each item in a hashtable has a NUL terminated string key. A key can appear |
9 | /// only once in the table. |
10 | /// |
11 | /// A hash number is computed from the key for quick lookup. When the hashes |
12 | /// of two different keys point to the same entry an algorithm is used to |
13 | /// iterate over other entries in the table until the right one is found. |
14 | /// To make the iteration work removed keys are different from entries where a |
15 | /// key was never present. |
16 | /// |
17 | /// The mechanism has been partly based on how Python Dictionaries are |
18 | /// implemented. The algorithm is from Knuth Vol. 3, Sec. 6.4. |
19 | /// |
20 | /// The hashtable grows to accommodate more entries when needed. At least 1/3 |
21 | /// of the entries is empty to keep the lookup efficient (at the cost of extra |
22 | /// memory). |
23 | |
24 | #include <assert.h> |
25 | #include <stdbool.h> |
26 | #include <string.h> |
27 | #include <inttypes.h> |
28 | |
29 | #include "nvim/vim.h" |
30 | #include "nvim/ascii.h" |
31 | #include "nvim/hashtab.h" |
32 | #include "nvim/message.h" |
33 | #include "nvim/memory.h" |
34 | |
35 | // Magic value for algorithm that walks through the array. |
36 | #define PERTURB_SHIFT 5 |
37 | |
38 | #ifdef INCLUDE_GENERATED_DECLARATIONS |
39 | # include "hashtab.c.generated.h" |
40 | #endif |
41 | |
42 | char hash_removed; |
43 | |
44 | /// Initialize an empty hash table. |
45 | void hash_init(hashtab_T *ht) |
46 | { |
47 | // This zeroes all "ht_" entries and all the "hi_key" in "ht_smallarray". |
48 | memset(ht, 0, sizeof(hashtab_T)); |
49 | ht->ht_array = ht->ht_smallarray; |
50 | ht->ht_mask = HT_INIT_SIZE - 1; |
51 | } |
52 | |
53 | /// Free the array of a hash table without freeing contained values. |
54 | /// |
55 | /// If "ht" is not freed (after calling this) then you should call hash_init() |
56 | /// right next! |
57 | void hash_clear(hashtab_T *ht) |
58 | { |
59 | if (ht->ht_array != ht->ht_smallarray) { |
60 | xfree(ht->ht_array); |
61 | } |
62 | } |
63 | |
64 | /// Free the array of a hash table and all contained values. |
65 | /// |
66 | /// @param off the offset from start of value to start of key (@see hashitem_T). |
67 | void hash_clear_all(hashtab_T *ht, unsigned int off) |
68 | { |
69 | size_t todo = ht->ht_used; |
70 | for (hashitem_T *hi = ht->ht_array; todo > 0; ++hi) { |
71 | if (!HASHITEM_EMPTY(hi)) { |
72 | xfree(hi->hi_key - off); |
73 | todo--; |
74 | } |
75 | } |
76 | hash_clear(ht); |
77 | } |
78 | |
79 | /// Find item for given "key" in hashtable "ht". |
80 | /// |
81 | /// @param key The key of the looked-for item. Must not be NULL. |
82 | /// |
83 | /// @return Pointer to the hash item corresponding to the given key. |
84 | /// If not found, then return pointer to the empty item that would be |
85 | /// used for that key. |
86 | /// WARNING: Returned pointer becomes invalid as soon as the hash table |
87 | /// is changed in any way. |
88 | hashitem_T *hash_find(const hashtab_T *const ht, const char_u *const key) |
89 | { |
90 | return hash_lookup(ht, (const char *)key, STRLEN(key), hash_hash(key)); |
91 | } |
92 | |
93 | /// Like hash_find, but key is not NUL-terminated |
94 | /// |
95 | /// @param[in] ht Hashtab to look in. |
96 | /// @param[in] key Key of the looked-for item. Must not be NULL. |
97 | /// @param[in] len Key length. |
98 | /// |
99 | /// @return Pointer to the hash item corresponding to the given key. |
100 | /// If not found, then return pointer to the empty item that would be |
101 | /// used for that key. |
102 | /// |
103 | /// @warning Returned pointer becomes invalid as soon as the hash table |
104 | /// is changed in any way. |
105 | hashitem_T *hash_find_len(const hashtab_T *const ht, const char *const key, |
106 | const size_t len) |
107 | { |
108 | return hash_lookup(ht, key, len, hash_hash_len(key, len)); |
109 | } |
110 | |
111 | /// Like hash_find(), but caller computes "hash". |
112 | /// |
113 | /// @param[in] key The key of the looked-for item. Must not be NULL. |
114 | /// @param[in] key_len Key length. |
115 | /// @param[in] hash The precomputed hash for the key. |
116 | /// |
117 | /// @return Pointer to the hashitem corresponding to the given key. |
118 | /// If not found, then return pointer to the empty item that would be |
119 | /// used for that key. |
120 | /// WARNING: Returned pointer becomes invalid as soon as the hash table |
121 | /// is changed in any way. |
122 | hashitem_T *hash_lookup(const hashtab_T *const ht, |
123 | const char *const key, const size_t key_len, |
124 | const hash_T hash) |
125 | { |
126 | #ifdef HT_DEBUG |
127 | hash_count_lookup++; |
128 | #endif // ifdef HT_DEBUG |
129 | |
130 | // Quickly handle the most common situations: |
131 | // - return if there is no item at all |
132 | // - skip over a removed item |
133 | // - return if the item matches |
134 | hash_T idx = hash & ht->ht_mask; |
135 | hashitem_T *hi = &ht->ht_array[idx]; |
136 | |
137 | if (hi->hi_key == NULL) { |
138 | return hi; |
139 | } |
140 | |
141 | hashitem_T *freeitem = NULL; |
142 | if (hi->hi_key == HI_KEY_REMOVED) { |
143 | freeitem = hi; |
144 | } else if ((hi->hi_hash == hash) |
145 | && (STRNCMP(hi->hi_key, key, key_len) == 0) |
146 | && hi->hi_key[key_len] == NUL) { |
147 | return hi; |
148 | } |
149 | |
150 | // Need to search through the table to find the key. The algorithm |
151 | // to step through the table starts with large steps, gradually becoming |
152 | // smaller down to (1/4 table size + 1). This means it goes through all |
153 | // table entries in the end. |
154 | // When we run into a NULL key it's clear that the key isn't there. |
155 | // Return the first available slot found (can be a slot of a removed |
156 | // item). |
157 | for (hash_T perturb = hash;; perturb >>= PERTURB_SHIFT) { |
158 | #ifdef HT_DEBUG |
159 | // count a "miss" for hashtab lookup |
160 | hash_count_perturb++; |
161 | #endif // ifdef HT_DEBUG |
162 | idx = 5 * idx + perturb + 1; |
163 | hi = &ht->ht_array[idx & ht->ht_mask]; |
164 | |
165 | if (hi->hi_key == NULL) { |
166 | return freeitem == NULL ? hi : freeitem; |
167 | } |
168 | |
169 | if ((hi->hi_hash == hash) |
170 | && (hi->hi_key != HI_KEY_REMOVED) |
171 | && (STRNCMP(hi->hi_key, key, key_len) == 0) |
172 | && hi->hi_key[key_len] == NUL) { |
173 | return hi; |
174 | } |
175 | |
176 | if ((hi->hi_key == HI_KEY_REMOVED) && (freeitem == NULL)) { |
177 | freeitem = hi; |
178 | } |
179 | } |
180 | } |
181 | |
182 | /// Print the efficiency of hashtable lookups. |
183 | /// |
184 | /// Useful when trying different hash algorithms. |
185 | /// Called when exiting. |
186 | void hash_debug_results(void) |
187 | { |
188 | #ifdef HT_DEBUG |
189 | fprintf(stderr, "\r\n\r\n\r\n\r\n" ); |
190 | fprintf(stderr, "Number of hashtable lookups: %" PRId64 "\r\n" , |
191 | (int64_t)hash_count_lookup); |
192 | fprintf(stderr, "Number of perturb loops: %" PRId64 "\r\n" , |
193 | (int64_t)hash_count_perturb); |
194 | fprintf(stderr, "Percentage of perturb loops: %" PRId64 "%%\r\n" , |
195 | (int64_t)(hash_count_perturb * 100 / hash_count_lookup)); |
196 | #endif // ifdef HT_DEBUG |
197 | } |
198 | |
199 | /// Add item for key "key" to hashtable "ht". |
200 | /// |
201 | /// @param key Pointer to the key for the new item. The key has to be contained |
202 | /// in the new item (@see hashitem_T). Must not be NULL. |
203 | /// |
204 | /// @return OK if success. |
205 | /// FAIL if key already present |
206 | int hash_add(hashtab_T *ht, char_u *key) |
207 | { |
208 | hash_T hash = hash_hash(key); |
209 | hashitem_T *hi = hash_lookup(ht, (const char *)key, STRLEN(key), hash); |
210 | if (!HASHITEM_EMPTY(hi)) { |
211 | internal_error("hash_add()" ); |
212 | return FAIL; |
213 | } |
214 | hash_add_item(ht, hi, key, hash); |
215 | return OK; |
216 | } |
217 | |
218 | /// Add item "hi" for key "key" to hashtable "ht". |
219 | /// |
220 | /// @param hi The hash item to be used. Must have been obtained through |
221 | /// hash_lookup() and point to an empty item. |
222 | /// @param key Pointer to the key for the new item. The key has to be contained |
223 | /// in the new item (@see hashitem_T). Must not be NULL. |
224 | /// @param hash The precomputed hash value for the key. |
225 | void hash_add_item(hashtab_T *ht, hashitem_T *hi, char_u *key, hash_T hash) |
226 | { |
227 | ht->ht_used++; |
228 | if (hi->hi_key == NULL) { |
229 | ht->ht_filled++; |
230 | } |
231 | hi->hi_key = key; |
232 | hi->hi_hash = hash; |
233 | |
234 | // When the space gets low may resize the array. |
235 | hash_may_resize(ht, 0); |
236 | } |
237 | |
238 | /// Remove item "hi" from hashtable "ht". |
239 | /// |
240 | /// Caller must take care of freeing the item itself. |
241 | /// |
242 | /// @param hi The hash item to be removed. |
243 | /// It must have been obtained with hash_lookup(). |
244 | void hash_remove(hashtab_T *ht, hashitem_T *hi) |
245 | { |
246 | ht->ht_used--; |
247 | hi->hi_key = HI_KEY_REMOVED; |
248 | hash_may_resize(ht, 0); |
249 | } |
250 | |
251 | /// Lock hashtable (prevent changes in ht_array). |
252 | /// |
253 | /// Don't use this when items are to be added! |
254 | /// Must call hash_unlock() later. |
255 | void hash_lock(hashtab_T *ht) |
256 | { |
257 | ht->ht_locked++; |
258 | } |
259 | |
260 | /// Unlock hashtable (allow changes in ht_array again). |
261 | /// |
262 | /// Table will be resized (shrunk) when necessary. |
263 | /// This must balance a call to hash_lock(). |
264 | void hash_unlock(hashtab_T *ht) |
265 | { |
266 | ht->ht_locked--; |
267 | hash_may_resize(ht, 0); |
268 | } |
269 | |
270 | /// Resize hastable (new size can be given or automatically computed). |
271 | /// |
272 | /// @param minitems Minimum number of items the new table should hold. |
273 | /// If zero, new size will depend on currently used items: |
274 | /// - Shrink when too much empty space. |
275 | /// - Grow when not enough empty space. |
276 | /// If non-zero, passed minitems will be used. |
277 | static void hash_may_resize(hashtab_T *ht, size_t minitems) |
278 | { |
279 | // Don't resize a locked table. |
280 | if (ht->ht_locked > 0) { |
281 | return; |
282 | } |
283 | |
284 | #ifdef HT_DEBUG |
285 | if (ht->ht_used > ht->ht_filled) { |
286 | EMSG("hash_may_resize(): more used than filled" ); |
287 | } |
288 | |
289 | if (ht->ht_filled >= ht->ht_mask + 1) { |
290 | EMSG("hash_may_resize(): table completely filled" ); |
291 | } |
292 | #endif // ifdef HT_DEBUG |
293 | |
294 | size_t minsize; |
295 | if (minitems == 0) { |
296 | // Return quickly for small tables with at least two NULL items. |
297 | // items are required for the lookup to decide a key isn't there. |
298 | if ((ht->ht_filled < HT_INIT_SIZE - 1) |
299 | && (ht->ht_array == ht->ht_smallarray)) { |
300 | return; |
301 | } |
302 | |
303 | // Grow or refill the array when it's more than 2/3 full (including |
304 | // removed items, so that they get cleaned up). |
305 | // Shrink the array when it's less than 1/5 full. When growing it is |
306 | // at least 1/4 full (avoids repeated grow-shrink operations) |
307 | size_t oldsize = ht->ht_mask + 1; |
308 | if ((ht->ht_filled * 3 < oldsize * 2) && (ht->ht_used > oldsize / 5)) { |
309 | return; |
310 | } |
311 | |
312 | if (ht->ht_used > 1000) { |
313 | // it's big, don't make too much room |
314 | minsize = ht->ht_used * 2; |
315 | } else { |
316 | // make plenty of room |
317 | minsize = ht->ht_used * 4; |
318 | } |
319 | } else { |
320 | // Use specified size. |
321 | if (minitems < ht->ht_used) { |
322 | // just in case... |
323 | minitems = ht->ht_used; |
324 | } |
325 | // array is up to 2/3 full |
326 | minsize = minitems * 3 / 2; |
327 | } |
328 | |
329 | size_t newsize = HT_INIT_SIZE; |
330 | while (newsize < minsize) { |
331 | // make sure it's always a power of 2 |
332 | newsize <<= 1; |
333 | // assert newsize didn't overflow |
334 | assert(newsize != 0); |
335 | } |
336 | |
337 | bool newarray_is_small = newsize == HT_INIT_SIZE; |
338 | bool keep_smallarray = newarray_is_small |
339 | && ht->ht_array == ht->ht_smallarray; |
340 | |
341 | // Make sure that oldarray and newarray do not overlap, |
342 | // so that copying is possible. |
343 | hashitem_T temparray[HT_INIT_SIZE]; |
344 | hashitem_T *oldarray = keep_smallarray |
345 | ? memcpy(temparray, ht->ht_smallarray, sizeof(temparray)) |
346 | : ht->ht_array; |
347 | hashitem_T *newarray = newarray_is_small |
348 | ? ht->ht_smallarray |
349 | : xmalloc(sizeof(hashitem_T) * newsize); |
350 | |
351 | memset(newarray, 0, sizeof(hashitem_T) * newsize); |
352 | |
353 | // Move all the items from the old array to the new one, placing them in |
354 | // the right spot. The new array won't have any removed items, thus this |
355 | // is also a cleanup action. |
356 | hash_T newmask = newsize - 1; |
357 | size_t todo = ht->ht_used; |
358 | |
359 | for (hashitem_T *olditem = oldarray; todo > 0; ++olditem) { |
360 | if (HASHITEM_EMPTY(olditem)) { |
361 | continue; |
362 | } |
363 | // The algorithm to find the spot to add the item is identical to |
364 | // the algorithm to find an item in hash_lookup(). But we only |
365 | // need to search for a NULL key, thus it's simpler. |
366 | hash_T newi = olditem->hi_hash & newmask; |
367 | hashitem_T *newitem = &newarray[newi]; |
368 | if (newitem->hi_key != NULL) { |
369 | for (hash_T perturb = olditem->hi_hash;; perturb >>= PERTURB_SHIFT) { |
370 | newi = 5 * newi + perturb + 1; |
371 | newitem = &newarray[newi & newmask]; |
372 | if (newitem->hi_key == NULL) { |
373 | break; |
374 | } |
375 | } |
376 | } |
377 | *newitem = *olditem; |
378 | todo--; |
379 | } |
380 | |
381 | if (ht->ht_array != ht->ht_smallarray) { |
382 | xfree(ht->ht_array); |
383 | } |
384 | ht->ht_array = newarray; |
385 | ht->ht_mask = newmask; |
386 | ht->ht_filled = ht->ht_used; |
387 | } |
388 | |
389 | #define HASH_CYCLE_BODY(hash, p) \ |
390 | hash = hash * 101 + *p++ |
391 | |
392 | /// Get the hash number for a key. |
393 | /// |
394 | /// If you think you know a better hash function: Compile with HT_DEBUG set and |
395 | /// run a script that uses hashtables a lot. Vim will then print statistics |
396 | /// when exiting. Try that with the current hash algorithm and yours. The |
397 | /// lower the percentage the better. |
398 | hash_T hash_hash(const char_u *key) |
399 | { |
400 | hash_T hash = *key; |
401 | |
402 | if (hash == 0) { |
403 | return (hash_T)0; |
404 | } |
405 | |
406 | // A simplistic algorithm that appears to do very well. |
407 | // Suggested by George Reilly. |
408 | const uint8_t *p = key + 1; |
409 | while (*p != NUL) { |
410 | HASH_CYCLE_BODY(hash, p); |
411 | } |
412 | |
413 | return hash; |
414 | } |
415 | |
416 | /// Get the hash number for a key that is not a NUL-terminated string |
417 | /// |
418 | /// @warning Function does not check whether key contains NUL. But you will not |
419 | /// be able to get hash entry in this case. |
420 | /// |
421 | /// @param[in] key Key. |
422 | /// @param[in] len Key length. |
423 | /// |
424 | /// @return Key hash. |
425 | hash_T hash_hash_len(const char *key, const size_t len) |
426 | FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT |
427 | { |
428 | if (len == 0) { |
429 | return 0; |
430 | } |
431 | |
432 | hash_T hash = *(uint8_t *)key; |
433 | const uint8_t *end = (uint8_t *)key + len; |
434 | |
435 | const uint8_t *p = (const uint8_t *)key + 1; |
436 | while (p < end) { |
437 | HASH_CYCLE_BODY(hash, p); |
438 | } |
439 | |
440 | return hash; |
441 | } |
442 | |
443 | #undef HASH_CYCLE_BODY |
444 | |
445 | /// Function to get HI_KEY_REMOVED value |
446 | /// |
447 | /// Used for testing because luajit ffi does not allow getting addresses of |
448 | /// globals. |
449 | const char_u *_hash_key_removed(void) |
450 | FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT |
451 | { |
452 | return HI_KEY_REMOVED; |
453 | } |
454 | |