1 | // Licensed to the .NET Foundation under one or more agreements. |
2 | // The .NET Foundation licenses this file to you under the MIT license. |
3 | // See the LICENSE file in the project root for more information. |
4 | |
5 | #ifndef _SMALLHASHTABLE_H_ |
6 | #define _SMALLHASHTABLE_H_ |
7 | |
8 | // genLog2 is defined in compiler.hpp |
9 | unsigned genLog2(unsigned value); |
10 | |
11 | //------------------------------------------------------------------------ |
12 | // HashTableInfo: a concept that provides equality and hashing methods for |
13 | // a particular key type. Used by HashTableBase and its |
14 | // subclasses. |
15 | template <typename TKey> |
16 | struct HashTableInfo |
17 | { |
18 | // static bool Equals(const TKey& x, const TKey& y); |
19 | // static unsigned GetHashCode(const TKey& key); |
20 | }; |
21 | |
22 | //------------------------------------------------------------------------ |
23 | // HashTableInfo<TKey*>: specialized version of HashTableInfo for pointer- |
24 | // typed keys. |
25 | template <typename TKey> |
26 | struct HashTableInfo<TKey*> |
27 | { |
28 | static bool Equals(const TKey* x, const TKey* y) |
29 | { |
30 | return x == y; |
31 | } |
32 | |
33 | static unsigned GetHashCode(const TKey* key) |
34 | { |
35 | // Shift off bits that are not likely to be significant |
36 | size_t keyval = reinterpret_cast<size_t>(key) >> ConstLog2<__alignof(TKey)>::value; |
37 | |
38 | // Truncate and return the result |
39 | return static_cast<unsigned>(keyval); |
40 | } |
41 | }; |
42 | |
43 | //------------------------------------------------------------------------ |
44 | // HashTableInfo<int>: specialized version of HashTableInfo for int- |
45 | // typed keys. |
46 | template <> |
47 | struct HashTableInfo<int> |
48 | { |
49 | static bool Equals(int x, int y) |
50 | { |
51 | return x == y; |
52 | } |
53 | |
54 | static unsigned GetHashCode(int key) |
55 | { |
56 | // Cast and return the key |
57 | return static_cast<unsigned>(key); |
58 | } |
59 | }; |
60 | |
61 | //------------------------------------------------------------------------ |
62 | // HashTableInfo<unsigned>: specialized version of HashTableInfo for unsigned- |
63 | // typed keys. |
64 | template <> |
65 | struct HashTableInfo<unsigned> |
66 | { |
67 | static bool Equals(unsigned x, unsigned y) |
68 | { |
69 | return x == y; |
70 | } |
71 | |
72 | static unsigned GetHashCode(unsigned key) |
73 | { |
74 | // Return the key itself |
75 | return key; |
76 | } |
77 | }; |
78 | |
79 | //------------------------------------------------------------------------ |
80 | // HashTableBase: base type for HashTable and SmallHashTable. This class |
81 | // provides the vast majority of the implementation. The |
82 | // subclasses differ in the storage they use at the time of |
83 | // construction: HashTable allocates the initial bucket |
84 | // array on the heap; SmallHashTable contains a small inline |
85 | // array. |
86 | // |
87 | // This implementation is based on the ideas presented in Herlihy, Shavit, |
88 | // and Tzafrir '08 (http://mcg.cs.tau.ac.il/papers/disc2008-hopscotch.pdf), |
89 | // though it does not currently implement the hopscotch algorithm. |
90 | // |
91 | // The approach taken is intended to perform well in both space and speed. |
92 | // This approach is a hybrid of separate chaining and open addressing with |
93 | // linear probing: collisions are resolved using a bucket chain, but that |
94 | // chain is stored in the bucket array itself. |
95 | // |
96 | // Resolving collisions using a bucket chain avoids the primary clustering |
97 | // issue common in linearly-probed open addressed hash tables, while using |
98 | // buckets as chain nodes avoids the allocaiton traffic typical of chained |
99 | // tables. Applying the hopscotch algorithm in the aforementioned paper |
100 | // could further improve performance by optimizing access patterns for |
101 | // better cache usage. |
102 | // |
103 | // Template parameters: |
104 | // TKey - The type of the table's keys. |
105 | // TValue - The type of the table's values. |
106 | // TKeyInfo - A type that conforms to the HashTableInfo<TKey> concept. |
107 | template <typename TKey, typename TValue, typename TKeyInfo = HashTableInfo<TKey>, typename TAllocator = CompAllocator> |
108 | class HashTableBase |
109 | { |
110 | friend class KeyValuePair; |
111 | friend class Iterator; |
112 | |
113 | enum : unsigned |
114 | { |
115 | InitialNumBuckets = 8 |
116 | }; |
117 | |
118 | protected: |
119 | //------------------------------------------------------------------------ |
120 | // HashTableBase::Bucket: provides storage for the key-value pairs that |
121 | // make up the contents of the table. |
122 | // |
123 | // The "home" bucket for a particular key is the bucket indexed by the |
124 | // key's hash code modulo the size of the bucket array (the "home index"). |
125 | // |
126 | // The home bucket is always considered to be part of the chain that it |
127 | // roots, even if it is also part of the chain rooted at a different |
128 | // bucket. `m_firstOffset` indicates the offset of the first non-home |
129 | // bucket in the home bucket's chain. If the `m_firstOffset` of a bucket |
130 | // is 0, the chain rooted at that bucket is empty. |
131 | // |
132 | // The index of the next bucket in a chain is calculated by adding the |
133 | // value in `m_nextOffset` to the index of the current bucket. If |
134 | // `m_nextOffset` is 0, the current bucket is the end of its chain. Each |
135 | // bucket in a chain must be occupied (i.e. `m_isFull` will be true). |
136 | struct Bucket |
137 | { |
138 | bool m_isFull; // True if the bucket is occupied; false otherwise. |
139 | |
140 | unsigned m_firstOffset; // The offset to the first node in the chain for this bucket index. |
141 | unsigned m_nextOffset; // The offset to the next node in the chain for this bucket index. |
142 | |
143 | unsigned m_hash; // The hash code for the element stored in this bucket. |
144 | TKey m_key; // The key for the element stored in this bucket. |
145 | TValue m_value; // The value for the element stored in this bucket. |
146 | }; |
147 | |
148 | private: |
149 | TAllocator m_alloc; // The memory allocator. |
150 | Bucket* m_buckets; // The bucket array. |
151 | unsigned m_numBuckets; // The number of buckets in the bucket array. |
152 | unsigned m_numFullBuckets; // The number of occupied buckets. |
153 | |
154 | //------------------------------------------------------------------------ |
155 | // HashTableBase::Insert: inserts a key-value pair into a bucket array. |
156 | // |
157 | // Arguments: |
158 | // buckets - The bucket array in which to insert the key-value pair. |
159 | // numBuckets - The number of buckets in the bucket array. |
160 | // hash - The hash code of the key to insert. |
161 | // key - The key to insert. |
162 | // value - The value to insert. |
163 | // |
164 | // Returns: |
165 | // True if the key-value pair was successfully inserted; false |
166 | // otherwise. |
167 | static bool Insert(Bucket* buckets, unsigned numBuckets, unsigned hash, const TKey& key, const TValue& value) |
168 | { |
169 | const unsigned mask = numBuckets - 1; |
170 | unsigned homeIndex = hash & mask; |
171 | |
172 | Bucket* home = &buckets[homeIndex]; |
173 | if (!home->m_isFull) |
174 | { |
175 | // The home bucket is empty; use it. |
176 | // |
177 | // Note that `m_firstOffset` does not need to be updated: whether or not it is non-zero, |
178 | // it is already correct, since we're inserting at the head of the list. `m_nextOffset` |
179 | // must be 0, however, since this node should not be part of a list. |
180 | assert(home->m_nextOffset == 0); |
181 | |
182 | home->m_isFull = true; |
183 | home->m_hash = hash; |
184 | home->m_key = key; |
185 | home->m_value = value; |
186 | return true; |
187 | } |
188 | |
189 | // If the home bucket is full, probe to find the next empty bucket. |
190 | unsigned precedingIndexInChain = homeIndex; |
191 | unsigned nextIndexInChain = (homeIndex + home->m_firstOffset) & mask; |
192 | for (unsigned j = 1; j < numBuckets; j++) |
193 | { |
194 | unsigned bucketIndex = (homeIndex + j) & mask; |
195 | Bucket* bucket = &buckets[bucketIndex]; |
196 | if (bucketIndex == nextIndexInChain) |
197 | { |
198 | assert(bucket->m_isFull); |
199 | precedingIndexInChain = bucketIndex; |
200 | nextIndexInChain = (bucketIndex + bucket->m_nextOffset) & mask; |
201 | } |
202 | else if (!bucket->m_isFull) |
203 | { |
204 | bucket->m_isFull = true; |
205 | if (precedingIndexInChain == nextIndexInChain) |
206 | { |
207 | bucket->m_nextOffset = 0; |
208 | } |
209 | else |
210 | { |
211 | assert(((nextIndexInChain - bucketIndex) & mask) > 0); |
212 | bucket->m_nextOffset = (nextIndexInChain - bucketIndex) & mask; |
213 | } |
214 | |
215 | unsigned offset = (bucketIndex - precedingIndexInChain) & mask; |
216 | assert(offset != 0); |
217 | |
218 | if (precedingIndexInChain == homeIndex) |
219 | { |
220 | buckets[precedingIndexInChain].m_firstOffset = offset; |
221 | } |
222 | else |
223 | { |
224 | buckets[precedingIndexInChain].m_nextOffset = offset; |
225 | } |
226 | |
227 | bucket->m_hash = hash; |
228 | bucket->m_key = key; |
229 | bucket->m_value = value; |
230 | return true; |
231 | } |
232 | } |
233 | |
234 | // No more free buckets. |
235 | return false; |
236 | } |
237 | |
238 | //------------------------------------------------------------------------ |
239 | // HashTableBase::TryGetBucket: attempts to get the bucket that holds a |
240 | // particular key. |
241 | // |
242 | // Arguments: |
243 | // hash - The hash code of the key to find. |
244 | // key - The key to find. |
245 | // precedingIndex - An output parameter that will hold the index of the |
246 | // preceding bucket in the chain for the key. May be |
247 | // equal to `bucketIndex` if the key is stored in its |
248 | // home bucket. |
249 | // bucketIndex - An output parameter that will hold the index of the |
250 | // bucket that stores the key. |
251 | // |
252 | // Returns: |
253 | // True if the key was successfully found; false otherwise. |
254 | bool TryGetBucket(unsigned hash, const TKey& key, unsigned* precedingIndex, unsigned* bucketIndex) const |
255 | { |
256 | if (m_numBuckets == 0) |
257 | { |
258 | return false; |
259 | } |
260 | |
261 | const unsigned mask = m_numBuckets - 1; |
262 | unsigned index = hash & mask; |
263 | |
264 | Bucket* bucket = &m_buckets[index]; |
265 | if (bucket->m_isFull && bucket->m_hash == hash && TKeyInfo::Equals(bucket->m_key, key)) |
266 | { |
267 | *precedingIndex = index; |
268 | *bucketIndex = index; |
269 | return true; |
270 | } |
271 | |
272 | for (unsigned offset = bucket->m_firstOffset; offset != 0; offset = bucket->m_nextOffset) |
273 | { |
274 | unsigned precedingIndexInChain = index; |
275 | |
276 | index = (index + offset) & mask; |
277 | bucket = &m_buckets[index]; |
278 | |
279 | assert(bucket->m_isFull); |
280 | if (bucket->m_hash == hash && TKeyInfo::Equals(bucket->m_key, key)) |
281 | { |
282 | *precedingIndex = precedingIndexInChain; |
283 | *bucketIndex = index; |
284 | return true; |
285 | } |
286 | } |
287 | |
288 | return false; |
289 | } |
290 | |
291 | //------------------------------------------------------------------------ |
292 | // HashTableBase::Resize: allocates a new bucket array twice the size of |
293 | // the current array and copies the key-value pairs |
294 | // from the current bucket array into the new array. |
295 | void Resize() |
296 | { |
297 | Bucket* currentBuckets = m_buckets; |
298 | |
299 | unsigned newNumBuckets = m_numBuckets == 0 ? InitialNumBuckets : m_numBuckets * 2; |
300 | Bucket* newBuckets = m_alloc.template allocate<Bucket>(newNumBuckets); |
301 | memset(newBuckets, 0, sizeof(Bucket) * newNumBuckets); |
302 | |
303 | for (unsigned currentIndex = 0; currentIndex < m_numBuckets; currentIndex++) |
304 | { |
305 | Bucket* currentBucket = ¤tBuckets[currentIndex]; |
306 | if (!currentBucket->m_isFull) |
307 | { |
308 | continue; |
309 | } |
310 | |
311 | bool inserted = |
312 | Insert(newBuckets, newNumBuckets, currentBucket->m_hash, currentBucket->m_key, currentBucket->m_value); |
313 | (assert(inserted), (void)inserted); |
314 | } |
315 | |
316 | m_numBuckets = newNumBuckets; |
317 | m_buckets = newBuckets; |
318 | } |
319 | |
320 | protected: |
321 | HashTableBase(TAllocator alloc, Bucket* buckets, unsigned numBuckets) |
322 | : m_alloc(alloc), m_buckets(buckets), m_numBuckets(numBuckets), m_numFullBuckets(0) |
323 | { |
324 | if (numBuckets > 0) |
325 | { |
326 | assert((numBuckets & (numBuckets - 1)) == 0); // Size must be a power of 2 |
327 | assert(m_buckets != nullptr); |
328 | |
329 | memset(m_buckets, 0, sizeof(Bucket) * numBuckets); |
330 | } |
331 | } |
332 | |
333 | public: |
334 | #ifdef DEBUG |
335 | class Iterator; |
336 | |
337 | class KeyValuePair final |
338 | { |
339 | friend class HashTableBase<TKey, TValue, TKeyInfo>::Iterator; |
340 | |
341 | Bucket* m_bucket; |
342 | |
343 | KeyValuePair(Bucket* bucket) : m_bucket(bucket) |
344 | { |
345 | assert(m_bucket != nullptr); |
346 | } |
347 | |
348 | public: |
349 | KeyValuePair() : m_bucket(nullptr) |
350 | { |
351 | } |
352 | |
353 | inline TKey& Key() |
354 | { |
355 | return m_bucket->m_key; |
356 | } |
357 | |
358 | inline TValue& Value() |
359 | { |
360 | return m_bucket->m_value; |
361 | } |
362 | }; |
363 | |
364 | // NOTE: HashTableBase only provides iterators in debug builds because the order in which |
365 | // the iterator type produces values is undefined (e.g. it is not related to the order in |
366 | // which key-value pairs were inserted). |
367 | class Iterator final |
368 | { |
369 | friend class HashTableBase<TKey, TValue, TKeyInfo>; |
370 | |
371 | Bucket* m_buckets; |
372 | unsigned m_numBuckets; |
373 | unsigned m_index; |
374 | |
375 | Iterator(Bucket* buckets, unsigned numBuckets, unsigned index) |
376 | : m_buckets(buckets), m_numBuckets(numBuckets), m_index(index) |
377 | { |
378 | assert((buckets != nullptr) || (numBuckets == 0)); |
379 | assert(index <= numBuckets); |
380 | |
381 | // Advance to the first occupied bucket |
382 | while (m_index != m_numBuckets && !m_buckets[m_index].m_isFull) |
383 | { |
384 | m_index++; |
385 | } |
386 | } |
387 | |
388 | public: |
389 | Iterator() : m_buckets(nullptr), m_numBuckets(0), m_index(0) |
390 | { |
391 | } |
392 | |
393 | KeyValuePair operator*() const |
394 | { |
395 | if (m_index >= m_numBuckets) |
396 | { |
397 | return KeyValuePair(); |
398 | } |
399 | |
400 | Bucket* bucket = &m_buckets[m_index]; |
401 | assert(bucket->m_isFull); |
402 | return KeyValuePair(bucket); |
403 | } |
404 | |
405 | KeyValuePair operator->() const |
406 | { |
407 | return this->operator*(); |
408 | } |
409 | |
410 | bool operator==(const Iterator& other) const |
411 | { |
412 | return (m_buckets == other.m_buckets) && (m_index == other.m_index); |
413 | } |
414 | |
415 | bool operator!=(const Iterator& other) const |
416 | { |
417 | return (m_buckets != other.m_buckets) || (m_index != other.m_index); |
418 | } |
419 | |
420 | Iterator& operator++() |
421 | { |
422 | do |
423 | { |
424 | m_index++; |
425 | } while (m_index != m_numBuckets && !m_buckets[m_index].m_isFull); |
426 | |
427 | return *this; |
428 | } |
429 | }; |
430 | |
431 | Iterator begin() const |
432 | { |
433 | return Iterator(m_buckets, m_numBuckets, 0); |
434 | } |
435 | |
436 | Iterator end() const |
437 | { |
438 | return Iterator(m_buckets, m_numBuckets, m_numBuckets); |
439 | } |
440 | #endif // DEBUG |
441 | |
442 | unsigned Count() const |
443 | { |
444 | return m_numFullBuckets; |
445 | } |
446 | |
447 | void Clear() |
448 | { |
449 | if (m_numBuckets > 0) |
450 | { |
451 | memset(m_buckets, 0, sizeof(Bucket) * m_numBuckets); |
452 | m_numFullBuckets = 0; |
453 | } |
454 | } |
455 | |
456 | //------------------------------------------------------------------------ |
457 | // HashTableBase::AddOrUpdate: adds a key-value pair to the hash table if |
458 | // the key does not already exist in the |
459 | // table, or updates the value if the key |
460 | // already exists. |
461 | // |
462 | // Arguments: |
463 | // key - The key for which to add or update a value. |
464 | // value - The value. |
465 | // |
466 | // Returns: |
467 | // True if the value was added; false if it was updated. |
468 | bool AddOrUpdate(const TKey& key, const TValue& value) |
469 | { |
470 | unsigned hash = TKeyInfo::GetHashCode(key); |
471 | |
472 | unsigned unused, index; |
473 | if (TryGetBucket(hash, key, &unused, &index)) |
474 | { |
475 | m_buckets[index].m_value = value; |
476 | return false; |
477 | } |
478 | |
479 | // If the load is greater than 0.8, resize the table before inserting. |
480 | if ((m_numFullBuckets * 5) >= (m_numBuckets * 4)) |
481 | { |
482 | Resize(); |
483 | } |
484 | |
485 | bool inserted = Insert(m_buckets, m_numBuckets, hash, key, value); |
486 | (assert(inserted), (void)inserted); |
487 | |
488 | m_numFullBuckets++; |
489 | |
490 | return true; |
491 | } |
492 | |
493 | //------------------------------------------------------------------------ |
494 | // HashTableBase::TryRemove: removes a key from the hash table and returns |
495 | // its value if the key exists in the table. |
496 | // |
497 | // Arguments: |
498 | // key - The key to remove from the table. |
499 | // value - An output parameter that will hold the value for the removed |
500 | // key. |
501 | // |
502 | // Returns: |
503 | // True if the key was removed from the table; false otherwise. |
504 | bool TryRemove(const TKey& key, TValue* value) |
505 | { |
506 | unsigned hash = TKeyInfo::GetHashCode(key); |
507 | |
508 | unsigned precedingIndexInChain, bucketIndex; |
509 | if (!TryGetBucket(hash, key, &precedingIndexInChain, &bucketIndex)) |
510 | { |
511 | return false; |
512 | } |
513 | |
514 | Bucket* bucket = &m_buckets[bucketIndex]; |
515 | |
516 | if (precedingIndexInChain != bucketIndex) |
517 | { |
518 | const unsigned mask = m_numBuckets - 1; |
519 | unsigned homeIndex = hash & mask; |
520 | |
521 | unsigned nextOffset; |
522 | if (bucket->m_nextOffset == 0) |
523 | { |
524 | nextOffset = 0; |
525 | } |
526 | else |
527 | { |
528 | unsigned nextIndexInChain = (bucketIndex + bucket->m_nextOffset) & mask; |
529 | nextOffset = (nextIndexInChain - precedingIndexInChain) & mask; |
530 | } |
531 | |
532 | if (precedingIndexInChain == homeIndex) |
533 | { |
534 | m_buckets[precedingIndexInChain].m_firstOffset = nextOffset; |
535 | } |
536 | else |
537 | { |
538 | m_buckets[precedingIndexInChain].m_nextOffset = nextOffset; |
539 | } |
540 | } |
541 | |
542 | bucket->m_isFull = false; |
543 | bucket->m_nextOffset = 0; |
544 | |
545 | m_numFullBuckets--; |
546 | |
547 | *value = bucket->m_value; |
548 | return true; |
549 | } |
550 | |
551 | //------------------------------------------------------------------------ |
552 | // HashTableBase::TryGetValue: retrieves the value for a key if the key |
553 | // exists in the table. |
554 | // |
555 | // Arguments: |
556 | // key - The key to find from the table. |
557 | // value - An output parameter that will hold the value for the key. |
558 | // |
559 | // Returns: |
560 | // True if the key was found in the table; false otherwise. |
561 | bool TryGetValue(const TKey& key, TValue* value) const |
562 | { |
563 | unsigned unused, index; |
564 | if (!TryGetBucket(TKeyInfo::GetHashCode(key), key, &unused, &index)) |
565 | { |
566 | return false; |
567 | } |
568 | |
569 | *value = m_buckets[index].m_value; |
570 | return true; |
571 | } |
572 | |
573 | //------------------------------------------------------------------------ |
574 | // HashTableBase::Contains: returns true if a key exists in the table and |
575 | // false otherwise. |
576 | // |
577 | // Arguments: |
578 | // key - The key to find from the table. |
579 | // |
580 | // Returns: |
581 | // True if the key was found in the table; false otherwise. |
582 | bool Contains(const TKey& key) const |
583 | { |
584 | unsigned unused, index; |
585 | return TryGetBucket(TKeyInfo::GetHashCode(key), key, &unused, &index); |
586 | } |
587 | }; |
588 | |
589 | //------------------------------------------------------------------------ |
590 | // HashTable: a simple subclass of `HashTableBase` that always uses heap |
591 | // storage for its bucket array. |
592 | template <typename TKey, typename TValue, typename TKeyInfo = HashTableInfo<TKey>, typename TAllocator = CompAllocator> |
593 | class HashTable final : public HashTableBase<TKey, TValue, TKeyInfo, TAllocator> |
594 | { |
595 | typedef HashTableBase<TKey, TValue, TKeyInfo, TAllocator> TBase; |
596 | |
597 | static unsigned RoundUp(unsigned initialSize) |
598 | { |
599 | return 1 << genLog2(initialSize); |
600 | } |
601 | |
602 | public: |
603 | HashTable(TAllocator alloc) : TBase(alloc, nullptr, 0) |
604 | { |
605 | } |
606 | |
607 | HashTable(TAllocator alloc, unsigned initialSize) |
608 | : TBase(alloc, alloc.template allocate<TBase::Bucket>(RoundUp(initialSize)), RoundUp(initialSize)) |
609 | { |
610 | } |
611 | }; |
612 | |
613 | //------------------------------------------------------------------------ |
614 | // SmallHashTable: an alternative to `HashTable` that stores the initial |
615 | // bucket array inline. Most useful for situations where |
616 | // the number of key-value pairs that will be stored in |
617 | // the map at any given time falls below a certain |
618 | // threshold. Switches to heap storage once the initial |
619 | // inline storage is exhausted. |
620 | template <typename TKey, |
621 | typename TValue, |
622 | unsigned NumInlineBuckets = 8, |
623 | typename TKeyInfo = HashTableInfo<TKey>, |
624 | typename TAllocator = CompAllocator> |
625 | class SmallHashTable final : public HashTableBase<TKey, TValue, TKeyInfo, TAllocator> |
626 | { |
627 | typedef HashTableBase<TKey, TValue, TKeyInfo, TAllocator> TBase; |
628 | |
629 | enum : unsigned |
630 | { |
631 | RoundedNumInlineBuckets = 1 << ConstLog2<NumInlineBuckets>::value |
632 | }; |
633 | |
634 | typename TBase::Bucket m_inlineBuckets[RoundedNumInlineBuckets]; |
635 | |
636 | public: |
637 | SmallHashTable(TAllocator alloc) : TBase(alloc, m_inlineBuckets, RoundedNumInlineBuckets) |
638 | { |
639 | } |
640 | }; |
641 | |
642 | #endif // _SMALLHASHTABLE_H_ |
643 | |