1 | /* |
2 | * Copyright 2017-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #pragma once |
18 | |
19 | #include <cstddef> |
20 | #include <cstdint> |
21 | #include <cstring> |
22 | |
23 | #include <array> |
24 | #include <iterator> |
25 | #include <limits> |
26 | #include <memory> |
27 | #include <new> |
28 | #include <type_traits> |
29 | #include <utility> |
30 | #include <vector> |
31 | |
32 | #include <folly/Bits.h> |
33 | #include <folly/ConstexprMath.h> |
34 | #include <folly/Likely.h> |
35 | #include <folly/Portability.h> |
36 | #include <folly/ScopeGuard.h> |
37 | #include <folly/Traits.h> |
38 | #include <folly/functional/ApplyTuple.h> |
39 | #include <folly/functional/Invoke.h> |
40 | #include <folly/lang/Align.h> |
41 | #include <folly/lang/Assume.h> |
42 | #include <folly/lang/Exception.h> |
43 | #include <folly/lang/Launder.h> |
44 | #include <folly/lang/SafeAssert.h> |
45 | #include <folly/portability/Builtins.h> |
46 | |
47 | #include <folly/container/HeterogeneousAccess.h> |
48 | #include <folly/container/detail/F14Defaults.h> |
49 | #include <folly/container/detail/F14IntrinsicsAvailability.h> |
50 | |
51 | #if FOLLY_ASAN_ENABLED && defined(FOLLY_TLS) |
52 | #define FOLLY_F14_TLS_IF_ASAN FOLLY_TLS |
53 | #else |
54 | #define FOLLY_F14_TLS_IF_ASAN |
55 | #endif |
56 | |
57 | #if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE |
58 | |
59 | #if FOLLY_F14_CRC_INTRINSIC_AVAILABLE |
60 | #if FOLLY_NEON |
61 | #include <arm_acle.h> // __crc32cd |
62 | #else |
63 | #include <nmmintrin.h> // _mm_crc32_u64 |
64 | #endif |
65 | #else |
66 | #ifdef _WIN32 |
67 | #include <intrin.h> // _mul128 in fallback bit mixer |
68 | #endif |
69 | #endif |
70 | |
71 | #if FOLLY_NEON |
#include <arm_neon.h> // uint8x16_t intrinsics
73 | #else // SSE2 |
74 | #include <immintrin.h> // __m128i intrinsics |
75 | #include <xmmintrin.h> // _mm_prefetch |
76 | #endif |
77 | |
78 | #endif |
79 | |
80 | #ifndef FOLLY_F14_PERTURB_INSERTION_ORDER |
81 | #define FOLLY_F14_PERTURB_INSERTION_ORDER folly::kIsDebug |
82 | #endif |
83 | |
84 | namespace folly { |
85 | |
86 | struct F14TableStats { |
87 | char const* policy; |
88 | std::size_t size{0}; |
89 | std::size_t valueSize{0}; |
90 | std::size_t bucketCount{0}; |
91 | std::size_t chunkCount{0}; |
92 | std::vector<std::size_t> chunkOccupancyHisto; |
93 | std::vector<std::size_t> chunkOutboundOverflowHisto; |
94 | std::vector<std::size_t> chunkHostedOverflowHisto; |
95 | std::vector<std::size_t> keyProbeLengthHisto; |
96 | std::vector<std::size_t> missProbeLengthHisto; |
97 | std::size_t totalBytes{0}; |
98 | std::size_t overheadBytes{0}; |
99 | |
100 | private: |
101 | template <typename T> |
102 | static auto computeHelper(T const* m) -> decltype(m->computeStats()) { |
103 | return m->computeStats(); |
104 | } |
105 | |
106 | static F14TableStats computeHelper(...) { |
107 | return {}; |
108 | } |
109 | |
110 | public: |
111 | template <typename T> |
112 | static F14TableStats compute(T const& m) { |
113 | return computeHelper(&m); |
114 | } |
115 | }; |
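
// Illustrative use (a sketch: assumes a container that exposes
// computeStats(), as the F14 maps and sets do when the SIMD table is
// in use; any other argument binds the variadic overload and yields an
// empty stats object):
//
//   folly::F14ValueMap<int, int> m;
//   for (int i = 0; i < 1000; ++i) {
//     m[i] = i;
//   }
//   auto stats = folly::F14TableStats::compute(m);
//   // stats.size == 1000; the histograms describe probe behavior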
116 | |
117 | namespace f14 { |
118 | namespace detail { |
119 | |
120 | template <F14IntrinsicsMode> |
121 | struct F14LinkCheck {}; |
122 | |
123 | template <> |
124 | struct F14LinkCheck<getF14IntrinsicsMode()> { |
125 | // The purpose of this method is to trigger a link failure if |
126 | // compilation flags vary across compilation units. The definition |
127 | // is in F14Table.cpp, so only one of F14LinkCheck<None>::check, |
128 | // F14LinkCheck<Simd>::check, or F14LinkCheck<SimdAndCrc>::check will |
129 | // be available at link time. |
130 | // |
131 | // To cause a link failure the function must be invoked in code that |
132 | // is not optimized away, so we call it on a couple of cold paths |
133 | // (exception handling paths in copy construction and rehash). LTO may |
134 | // remove it entirely, but that's fine. |
135 | static void check() noexcept; |
136 | }; |
137 | |
138 | bool tlsPendingSafeInserts(std::ptrdiff_t delta = 0); |
139 | std::size_t tlsMinstdRand(std::size_t n); |
140 | |
141 | #if defined(_LIBCPP_VERSION) |
142 | |
143 | template <typename K, typename V, typename H> |
144 | struct StdNodeReplica { |
145 | void* next; |
146 | std::size_t hash; |
147 | V value; |
148 | }; |
149 | |
150 | #else |
151 | |
152 | template <typename H> |
153 | struct StdIsFastHash : std::true_type {}; |
154 | template <> |
155 | struct StdIsFastHash<std::hash<long double>> : std::false_type {}; |
156 | template <typename... Args> |
157 | struct StdIsFastHash<std::hash<std::basic_string<Args...>>> : std::false_type { |
158 | }; |
159 | |
160 | // TODO: add specialization for std::basic_string_view |
161 | |
// Mimic the internal node of STL unordered containers to estimate size
163 | template <typename K, typename V, typename H, typename Enable = void> |
164 | struct StdNodeReplica { |
165 | void* next; |
166 | V value; |
167 | }; |
168 | template <typename K, typename V, typename H> |
169 | struct StdNodeReplica< |
170 | K, |
171 | V, |
172 | H, |
173 | std::enable_if_t< |
174 | !StdIsFastHash<H>::value || !is_nothrow_invocable<H, K>::value>> { |
175 | void* next; |
176 | V value; |
177 | std::size_t hash; |
178 | }; |
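
// Rough numbers these replicas produce (illustrative, for 64-bit
// libstdc++): std::hash<uint64_t> is fast and nothrow, so for
// std::unordered_map<uint64_t, uint64_t> the hashless replica applies
// and the estimated node size is
// sizeof(void*) + sizeof(std::pair<uint64_t const, uint64_t>) == 24;
// a slow or throwing hasher selects the variant with a cached hash,
// adding another std::size_t.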
179 | |
180 | #endif |
181 | |
182 | } // namespace detail |
183 | } // namespace f14 |
184 | |
185 | #if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE |
186 | namespace f14 { |
187 | namespace detail { |
188 | template <typename Policy> |
189 | class F14Table; |
190 | } // namespace detail |
191 | } // namespace f14 |
192 | |
193 | class F14HashToken final { |
194 | public: |
195 | F14HashToken() = default; |
196 | |
197 | private: |
198 | using HashPair = std::pair<std::size_t, std::size_t>; |
199 | |
200 | explicit F14HashToken(HashPair hp) : hp_(hp) {} |
201 | explicit operator HashPair() const { |
202 | return hp_; |
203 | } |
204 | |
205 | HashPair hp_; |
206 | |
207 | template <typename Policy> |
208 | friend class f14::detail::F14Table; |
209 | }; |
210 | |
211 | namespace f14 { |
212 | namespace detail { |
213 | //// Defaults should be selected using void |
214 | |
215 | template <typename Arg, typename Default> |
216 | using VoidDefault = |
217 | std::conditional_t<std::is_same<Arg, Default>::value, void, Arg>; |
218 | |
219 | template <typename Arg, typename Default> |
220 | using Defaulted = |
221 | typename std::conditional_t<std::is_same<Arg, void>::value, Default, Arg>; |
222 | |
223 | template < |
224 | typename TableKey, |
225 | typename Hasher, |
226 | typename KeyEqual, |
227 | typename ArgKey, |
228 | typename Void = void> |
229 | struct EligibleForHeterogeneousFind : std::false_type {}; |
230 | |
231 | template < |
232 | typename TableKey, |
233 | typename Hasher, |
234 | typename KeyEqual, |
235 | typename ArgKey> |
236 | struct EligibleForHeterogeneousFind< |
237 | TableKey, |
238 | Hasher, |
239 | KeyEqual, |
240 | ArgKey, |
241 | void_t< |
242 | typename Hasher::is_transparent, |
243 | typename KeyEqual::is_transparent, |
244 | invoke_result_t<Hasher, ArgKey const&>, |
245 | invoke_result_t<KeyEqual, ArgKey const&, TableKey const&>>> |
246 | : std::true_type {}; |
247 | |
248 | template < |
249 | typename TableKey, |
250 | typename Hasher, |
251 | typename KeyEqual, |
252 | typename ArgKey> |
253 | using EligibleForHeterogeneousInsert = Conjunction< |
254 | EligibleForHeterogeneousFind<TableKey, Hasher, KeyEqual, ArgKey>, |
255 | std::is_constructible<TableKey, ArgKey>>; |
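
// For example (a sketch assuming a transparent string hasher and key
// equality, such as the HeterogeneousAccess defaults for std::string):
// with TableKey = std::string and ArgKey = std::string_view, both
// functors declare is_transparent and are invocable on string_view, so
// EligibleForHeterogeneousFind is true_type; std::string is also
// constructible from std::string_view, making heterogeneous insert
// eligible as well.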
256 | |
257 | template < |
258 | typename TableKey, |
259 | typename Hasher, |
260 | typename KeyEqual, |
261 | typename KeyArg0OrBool, |
262 | typename... KeyArgs> |
263 | using KeyTypeForEmplaceHelper = std::conditional_t< |
264 | sizeof...(KeyArgs) == 1 && |
265 | (std::is_same<remove_cvref_t<KeyArg0OrBool>, TableKey>::value || |
266 | EligibleForHeterogeneousFind< |
267 | TableKey, |
268 | Hasher, |
269 | KeyEqual, |
270 | KeyArg0OrBool>::value), |
271 | KeyArg0OrBool&&, |
272 | TableKey>; |
273 | |
274 | template < |
275 | typename TableKey, |
276 | typename Hasher, |
277 | typename KeyEqual, |
278 | typename... KeyArgs> |
279 | using KeyTypeForEmplace = KeyTypeForEmplaceHelper< |
280 | TableKey, |
281 | Hasher, |
282 | KeyEqual, |
283 | std::tuple_element_t<0, std::tuple<KeyArgs..., bool>>, |
284 | KeyArgs...>; |
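
// For example, with TableKey = std::string: if KeyArgs is
// {std::string&&} then remove_cvref_t matches TableKey, so the key is
// forwarded through as std::string&&; if KeyArgs is {char const*} and
// the hasher is not transparent, neither test passes and TableKey
// itself is selected, so a temporary std::string key is constructed
// for the probe.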
285 | |
286 | //////////////// |
287 | |
288 | template <typename T> |
289 | FOLLY_ALWAYS_INLINE static void prefetchAddr(T const* ptr) { |
290 | #ifndef _WIN32 |
291 | __builtin_prefetch(static_cast<void const*>(ptr)); |
292 | #elif FOLLY_NEON |
293 | __prefetch(static_cast<void const*>(ptr)); |
294 | #else |
295 | _mm_prefetch( |
296 | static_cast<char const*>(static_cast<void const*>(ptr)), _MM_HINT_T0); |
297 | #endif |
298 | } |
299 | |
300 | template <typename T> |
301 | FOLLY_ALWAYS_INLINE static unsigned findFirstSetNonZero(T mask) { |
302 | assume(mask != 0); |
303 | if (sizeof(mask) == sizeof(unsigned)) { |
304 | return __builtin_ctz(static_cast<unsigned>(mask)); |
305 | } else { |
306 | return __builtin_ctzll(mask); |
307 | } |
308 | } |
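
// e.g. findFirstSetNonZero(uint32_t{0b101000}) == 3.  Calling it with
// a zero mask is undefined behavior: the assume() above licenses the
// compiler to treat the ctz intrinsics' results as always valid.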
309 | |
310 | #if FOLLY_NEON |
311 | using TagVector = uint8x16_t; |
312 | |
313 | using MaskType = uint64_t; |
314 | |
315 | constexpr unsigned kMaskSpacing = 4; |
316 | #else // SSE2 |
317 | using TagVector = __m128i; |
318 | |
319 | using MaskType = uint32_t; |
320 | |
321 | constexpr unsigned kMaskSpacing = 1; |
322 | #endif |
323 | |
// We could use unaligned loads to relax this requirement, but that
// would both incur a performance penalty and require a bulkier packed
// ItemIter format
327 | constexpr std::size_t kRequiredVectorAlignment = |
328 | constexpr_max(std::size_t{16}, alignof(max_align_t)); |
329 | |
330 | using EmptyTagVectorType = std::aligned_storage_t< |
331 | sizeof(TagVector) + kRequiredVectorAlignment, |
332 | alignof(max_align_t)>; |
333 | |
334 | extern EmptyTagVectorType kEmptyTagVector; |
335 | |
336 | template <unsigned BitCount> |
337 | struct FullMask { |
338 | static constexpr MaskType value = |
339 | (FullMask<BitCount - 1>::value << kMaskSpacing) + 1; |
340 | }; |
341 | |
342 | template <> |
343 | struct FullMask<1> : std::integral_constant<MaskType, 1> {}; |
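
// Worked example: with SSE2 (kMaskSpacing == 1), FullMask<14>::value ==
// 0x3fff, one bit per tag.  With NEON (kMaskSpacing == 4) each tag owns
// a nibble, so FullMask<14>::value == 0x0011111111111111.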
344 | |
345 | #if FOLLY_ARM |
346 | // Mask iteration is different for ARM because that is the only platform |
347 | // for which the mask is bigger than a register. |
348 | |
349 | // Iterates a mask, optimized for the case that only a few bits are set |
350 | class SparseMaskIter { |
  static_assert(kMaskSpacing == 4, "");
352 | |
353 | uint32_t interleavedMask_; |
354 | |
355 | public: |
356 | explicit SparseMaskIter(MaskType mask) |
357 | : interleavedMask_{static_cast<uint32_t>(((mask >> 32) << 2) | mask)} {} |
358 | |
359 | bool hasNext() { |
360 | return interleavedMask_ != 0; |
361 | } |
362 | |
363 | unsigned next() { |
    FOLLY_SAFE_DCHECK(hasNext(), "");
365 | unsigned i = findFirstSetNonZero(interleavedMask_); |
366 | interleavedMask_ &= (interleavedMask_ - 1); |
367 | return ((i >> 2) | (i << 2)) & 0xf; |
368 | } |
369 | }; |
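
// Worked example of the interleaving above: in the 64-bit NEON mask a
// match at index i sets bit 4 * i.  ((mask >> 32) << 2) | mask folds
// the high eight nibbles in between the low eight, so a match at index
// i >= 8 lands on bit 4 * (i - 8) + 2.  A lone match at index 9 thus
// sets bit 6 of interleavedMask_, next() finds i == 6, and
// ((6 >> 2) | (6 << 2)) & 0xf == (1 | 24) & 0xf == 9, the original
// index.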
370 | |
371 | // Iterates a mask, optimized for the case that most bits are set |
372 | class DenseMaskIter { |
  static_assert(kMaskSpacing == 4, "");
374 | |
375 | std::size_t count_; |
376 | unsigned index_; |
377 | uint8_t const* tags_; |
378 | |
379 | public: |
380 | explicit DenseMaskIter(uint8_t const* tags, MaskType mask) { |
381 | if (mask == 0) { |
382 | count_ = 0; |
383 | } else { |
384 | count_ = popcount(static_cast<uint32_t>(((mask >> 32) << 2) | mask)); |
385 | if (LIKELY((mask & 1) != 0)) { |
386 | index_ = 0; |
387 | } else { |
388 | index_ = findFirstSetNonZero(mask) / kMaskSpacing; |
389 | } |
390 | tags_ = tags; |
391 | } |
392 | } |
393 | |
394 | bool hasNext() { |
395 | return count_ > 0; |
396 | } |
397 | |
398 | unsigned next() { |
399 | auto rv = index_; |
400 | --count_; |
401 | if (count_ > 0) { |
402 | do { |
403 | ++index_; |
404 | } while ((tags_[index_] & 0x80) == 0); |
405 | } |
    FOLLY_SAFE_DCHECK(index_ < 16, "");
407 | return rv; |
408 | } |
409 | }; |
410 | |
411 | #else |
412 | // Iterates a mask, optimized for the case that only a few bits are set |
413 | class SparseMaskIter { |
414 | MaskType mask_; |
415 | |
416 | public: |
417 | explicit SparseMaskIter(MaskType mask) : mask_{mask} {} |
418 | |
419 | bool hasNext() { |
420 | return mask_ != 0; |
421 | } |
422 | |
423 | unsigned next() { |
    FOLLY_SAFE_DCHECK(hasNext(), "");
425 | unsigned i = findFirstSetNonZero(mask_); |
426 | mask_ &= (mask_ - 1); |
427 | return i / kMaskSpacing; |
428 | } |
429 | }; |
430 | |
431 | // Iterates a mask, optimized for the case that most bits are set |
432 | class DenseMaskIter { |
433 | MaskType mask_; |
434 | unsigned index_{0}; |
435 | |
436 | public: |
437 | explicit DenseMaskIter(uint8_t const*, MaskType mask) : mask_{mask} {} |
438 | |
439 | bool hasNext() { |
440 | return mask_ != 0; |
441 | } |
442 | |
443 | unsigned next() { |
    FOLLY_SAFE_DCHECK(hasNext(), "");
445 | if (LIKELY((mask_ & 1) != 0)) { |
446 | mask_ >>= kMaskSpacing; |
447 | return index_++; |
448 | } else { |
449 | unsigned s = findFirstSetNonZero(mask_); |
450 | unsigned rv = index_ + (s / kMaskSpacing); |
451 | mask_ >>= (s + kMaskSpacing); |
452 | index_ = rv + 1; |
453 | return rv; |
454 | } |
455 | } |
456 | }; |
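
// e.g. with kMaskSpacing == 1 and mask 0b1101, either iterator visits
// indexes 0, 2, 3.  DenseMaskIter walks forward expecting few holes;
// SparseMaskIter repeatedly clears the lowest set bit, which is cheaper
// when only one or two bits are set.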
457 | #endif |
458 | |
// Iterates a mask, returning pairs of [begin, end) indexes covering
// blocks of set bits
461 | class MaskRangeIter { |
462 | MaskType mask_; |
463 | unsigned shift_{0}; |
464 | |
465 | public: |
466 | explicit MaskRangeIter(MaskType mask) { |
467 | // If kMaskSpacing is > 1 then there will be empty bits even for |
468 | // contiguous ranges. Fill them in. |
469 | mask_ = mask * ((1 << kMaskSpacing) - 1); |
470 | } |
471 | |
472 | bool hasNext() { |
473 | return mask_ != 0; |
474 | } |
475 | |
476 | std::pair<unsigned, unsigned> next() { |
    FOLLY_SAFE_DCHECK(hasNext(), "");
478 | auto s = shift_; |
479 | unsigned b = findFirstSetNonZero(mask_); |
480 | unsigned e = findFirstSetNonZero(~(mask_ | (mask_ - 1))); |
481 | mask_ >>= e; |
482 | shift_ = s + e; |
483 | return std::make_pair((s + b) / kMaskSpacing, (s + e) / kMaskSpacing); |
484 | } |
485 | }; |
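
// e.g. with kMaskSpacing == 1 and mask 0b0111'0110, next() yields
// {1, 3} and then {4, 7}, the two maximal runs of set bits.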
486 | |
487 | // Holds the result of an index query that has an optional result, |
488 | // interpreting a mask of 0 to be the empty answer and the index of the |
489 | // last set bit to be the non-empty answer |
490 | class LastOccupiedInMask { |
491 | MaskType mask_; |
492 | |
493 | public: |
494 | explicit LastOccupiedInMask(MaskType mask) : mask_{mask} {} |
495 | |
496 | bool hasIndex() const { |
497 | return mask_ != 0; |
498 | } |
499 | |
500 | unsigned index() const { |
501 | assume(mask_ != 0); |
502 | return (findLastSet(mask_) - 1) / kMaskSpacing; |
503 | } |
504 | }; |
505 | |
506 | // Holds the result of an index query that has an optional result, |
507 | // interpreting a mask of 0 to be the empty answer and the index of the |
508 | // first set bit to be the non-empty answer |
509 | class FirstEmptyInMask { |
510 | MaskType mask_; |
511 | |
512 | public: |
513 | explicit FirstEmptyInMask(MaskType mask) : mask_{mask} {} |
514 | |
515 | bool hasIndex() const { |
516 | return mask_ != 0; |
517 | } |
518 | |
519 | unsigned index() const { |
    FOLLY_SAFE_DCHECK(mask_ != 0, "");
521 | return findFirstSetNonZero(mask_) / kMaskSpacing; |
522 | } |
523 | }; |
524 | |
525 | template <typename ItemType> |
526 | struct alignas(kRequiredVectorAlignment) F14Chunk { |
527 | using Item = ItemType; |
528 | |
529 | // For our 16 byte vector alignment (and assuming alignof(Item) >= |
530 | // 4) kCapacity of 14 is the most space efficient. Slightly smaller |
531 | // or larger capacities can help with cache alignment in a couple of |
  // cases without wasting too much space, but once the items are larger
  // than that we're unlikely to get much benefit anyway.  The only case we
534 | // optimize is using kCapacity of 12 for 4 byte items, which makes the |
535 | // chunk take exactly 1 cache line, and adding 16 bytes of padding for |
536 | // 16 byte items so that a chunk takes exactly 4 cache lines. |
537 | static constexpr unsigned kCapacity = sizeof(Item) == 4 ? 12 : 14; |
538 | |
539 | static constexpr unsigned kDesiredCapacity = kCapacity - 2; |
540 | |
541 | static constexpr unsigned kAllocatedCapacity = |
542 | kCapacity + (sizeof(Item) == 16 ? 1 : 0); |
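
  // Resulting chunk sizes for the common cases (16 bytes of tags_ +
  // control_ + outboundOverflowCount_ metadata up front):
  //   4-byte items:  16 + 12 * 4  == 64 bytes,  exactly 1 cache line
  //   8-byte items:  16 + 14 * 8  == 128 bytes, exactly 2 cache lines
  //   16-byte items: 16 + 15 * 16 == 256 bytes, exactly 4 cache lines
  //   (the 15th slot is the kAllocatedCapacity padding mentioned above)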
543 | |
  // If kCapacity == 12 then we get 16 bits of capacityScale by using
  // tags 12 and 13; otherwise we only get the low 4 bits of control_
546 | static constexpr std::size_t kCapacityScaleBits = kCapacity == 12 ? 16 : 4; |
547 | static constexpr std::size_t kCapacityScaleShift = kCapacityScaleBits - 4; |
548 | |
549 | static constexpr MaskType kFullMask = FullMask<kCapacity>::value; |
550 | |
551 | // Non-empty tags have their top bit set. tags_ array might be bigger |
552 | // than kCapacity to keep alignment of first item. |
553 | std::array<uint8_t, 14> tags_; |
554 | |
555 | // Bits 0..3 of chunk 0 record the scaling factor between the number of |
556 | // chunks and the max size without rehash. Bits 4-7 in any chunk are a |
557 | // 4-bit counter of the number of values in this chunk that were placed |
558 | // because they overflowed their desired chunk (hostedOverflowCount). |
559 | uint8_t control_; |
560 | |
561 | // The number of values that would have been placed into this chunk if |
562 | // there had been space, including values that also overflowed previous |
563 | // full chunks. This value saturates; once it becomes 255 it no longer |
564 | // increases nor decreases. |
565 | uint8_t outboundOverflowCount_; |
566 | |
567 | std::array<aligned_storage_for_t<Item>, kAllocatedCapacity> rawItems_; |
568 | |
569 | static F14Chunk* emptyInstance() { |
570 | auto raw = reinterpret_cast<char*>(&kEmptyTagVector); |
571 | if (kRequiredVectorAlignment > alignof(max_align_t)) { |
572 | auto delta = kRequiredVectorAlignment - |
573 | (reinterpret_cast<uintptr_t>(raw) % kRequiredVectorAlignment); |
574 | raw += delta; |
575 | } |
576 | auto rv = reinterpret_cast<F14Chunk*>(raw); |
577 | FOLLY_SAFE_DCHECK( |
        (reinterpret_cast<uintptr_t>(rv) % kRequiredVectorAlignment) == 0, "");
579 | return rv; |
580 | } |
581 | |
582 | void clear() { |
583 | // tags_ = {}; control_ = 0; outboundOverflowCount_ = 0; |
584 | |
585 | // gcc < 6 doesn't exploit chunk alignment to generate the optimal |
586 | // SSE clear from memset. This is very hot code, so it is worth |
587 | // handling that case specially. |
588 | #if FOLLY_SSE >= 2 && __GNUC__ <= 5 && !__clang__ |
589 | // this doesn't violate strict aliasing rules because __m128i is |
590 | // tagged as __may_alias__ |
591 | auto* v = static_cast<__m128i*>(static_cast<void*>(&tags_[0])); |
592 | _mm_store_si128(v, _mm_setzero_si128()); |
593 | #else |
594 | std::memset(&tags_[0], '\0', 16); |
595 | #endif |
596 | } |
597 | |
598 | void copyOverflowInfoFrom(F14Chunk const& rhs) { |
    FOLLY_SAFE_DCHECK(hostedOverflowCount() == 0, "");
600 | control_ += static_cast<uint8_t>(rhs.control_ & 0xf0); |
601 | outboundOverflowCount_ = rhs.outboundOverflowCount_; |
602 | } |
603 | |
604 | unsigned hostedOverflowCount() const { |
605 | return control_ >> 4; |
606 | } |
607 | |
608 | static constexpr uint8_t kIncrHostedOverflowCount = 0x10; |
609 | static constexpr uint8_t kDecrHostedOverflowCount = |
610 | static_cast<uint8_t>(-0x10); |
611 | |
612 | void adjustHostedOverflowCount(uint8_t op) { |
613 | control_ += op; |
614 | } |
615 | |
616 | bool eof() const { |
617 | return capacityScale() != 0; |
618 | } |
619 | |
620 | std::size_t capacityScale() const { |
621 | if (kCapacityScaleBits == 4) { |
622 | return control_ & 0xf; |
623 | } else { |
624 | uint16_t v; |
625 | std::memcpy(&v, &tags_[12], 2); |
626 | return v; |
627 | } |
628 | } |
629 | |
630 | void setCapacityScale(std::size_t scale) { |
631 | FOLLY_SAFE_DCHECK( |
632 | this != emptyInstance() && scale > 0 && |
633 | scale < (std::size_t{1} << kCapacityScaleBits), |
634 | "" ); |
635 | if (kCapacityScaleBits == 4) { |
636 | control_ = (control_ & ~0xf) | static_cast<uint8_t>(scale); |
637 | } else { |
638 | uint16_t v = static_cast<uint16_t>(scale); |
639 | std::memcpy(&tags_[12], &v, 2); |
640 | } |
641 | } |
642 | |
643 | void markEof(std::size_t scale) { |
644 | folly::assume(control_ == 0); |
645 | setCapacityScale(scale); |
646 | } |
647 | |
648 | unsigned outboundOverflowCount() const { |
649 | return outboundOverflowCount_; |
650 | } |
651 | |
652 | void incrOutboundOverflowCount() { |
653 | if (outboundOverflowCount_ != 255) { |
654 | ++outboundOverflowCount_; |
655 | } |
656 | } |
657 | |
658 | void decrOutboundOverflowCount() { |
659 | if (outboundOverflowCount_ != 255) { |
660 | --outboundOverflowCount_; |
661 | } |
662 | } |
663 | |
664 | std::size_t tag(std::size_t index) const { |
665 | return tags_[index]; |
666 | } |
667 | |
668 | void setTag(std::size_t index, std::size_t tag) { |
669 | FOLLY_SAFE_DCHECK( |
        this != emptyInstance() && tag >= 0x80 && tag <= 0xff, "");
671 | tags_[index] = static_cast<uint8_t>(tag); |
672 | } |
673 | |
674 | void clearTag(std::size_t index) { |
675 | tags_[index] = 0; |
676 | } |
677 | |
678 | #if FOLLY_NEON |
679 | //////// |
680 | // Tag filtering using NEON intrinsics |
681 | |
682 | SparseMaskIter tagMatchIter(std::size_t needle) const { |
    FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
684 | uint8x16_t tagV = vld1q_u8(&tags_[0]); |
685 | auto needleV = vdupq_n_u8(static_cast<uint8_t>(needle)); |
686 | auto eqV = vceqq_u8(tagV, needleV); |
687 | // get info from every byte into the bottom half of every uint16_t |
    // by shifting right 4, then narrowing to get it into a 64-bit vector
689 | uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(eqV), 4); |
690 | uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask; |
691 | return SparseMaskIter(mask); |
692 | } |
693 | |
694 | MaskType occupiedMask() const { |
695 | uint8x16_t tagV = vld1q_u8(&tags_[0]); |
696 | // signed shift extends top bit to all bits |
697 | auto occupiedV = |
698 | vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(tagV), 7)); |
699 | uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(occupiedV), 4); |
700 | return vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask; |
701 | } |
702 | #else |
703 | //////// |
704 | // Tag filtering using SSE2 intrinsics |
705 | |
706 | TagVector const* tagVector() const { |
707 | return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0])); |
708 | } |
709 | |
710 | SparseMaskIter tagMatchIter(std::size_t needle) const { |
    FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
712 | auto tagV = _mm_load_si128(tagVector()); |
713 | |
714 | // TRICKY! It may seem strange to have a std::size_t needle and narrow |
715 | // it at the last moment, rather than making HashPair::second be a |
716 | // uint8_t, but the latter choice sometimes leads to a performance |
717 | // problem. |
718 | // |
719 | // On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands |
720 | // to multiple instructions. One of those is a MOVD of either 4 or |
721 | // 8 byte width. Only the bottom byte of that move actually affects |
722 | // the result, but if a 1-byte needle has been spilled then this will |
723 | // be a 4 byte load. GCC 5.5 has been observed to reload needle |
724 | // (or perhaps fuse a reload and part of a previous static_cast) |
725 | // needle using a MOVZX with a 1 byte load in parallel with the MOVD. |
726 | // This combination causes a failure of store-to-load forwarding, |
727 | // which has a big performance penalty (60 nanoseconds per find on |
728 | // a microbenchmark). Keeping needle >= 4 bytes avoids the problem |
729 | // and also happens to result in slightly more compact assembly. |
730 | auto needleV = _mm_set1_epi8(static_cast<uint8_t>(needle)); |
731 | auto eqV = _mm_cmpeq_epi8(tagV, needleV); |
732 | auto mask = _mm_movemask_epi8(eqV) & kFullMask; |
733 | return SparseMaskIter{mask}; |
734 | } |
735 | |
736 | MaskType occupiedMask() const { |
737 | auto tagV = _mm_load_si128(tagVector()); |
738 | return _mm_movemask_epi8(tagV) & kFullMask; |
739 | } |
740 | #endif |
741 | |
742 | DenseMaskIter occupiedIter() const { |
743 | return DenseMaskIter{&tags_[0], occupiedMask()}; |
744 | } |
745 | |
746 | MaskRangeIter occupiedRangeIter() const { |
747 | return MaskRangeIter{occupiedMask()}; |
748 | } |
749 | |
750 | LastOccupiedInMask lastOccupied() const { |
751 | return LastOccupiedInMask{occupiedMask()}; |
752 | } |
753 | |
754 | FirstEmptyInMask firstEmpty() const { |
755 | return FirstEmptyInMask{occupiedMask() ^ kFullMask}; |
756 | } |
757 | |
758 | bool occupied(std::size_t index) const { |
    FOLLY_SAFE_DCHECK(tags_[index] == 0 || (tags_[index] & 0x80) != 0, "");
760 | return tags_[index] != 0; |
761 | } |
762 | |
763 | Item* itemAddr(std::size_t i) const { |
764 | return static_cast<Item*>( |
765 | const_cast<void*>(static_cast<void const*>(&rawItems_[i]))); |
766 | } |
767 | |
768 | Item& item(std::size_t i) { |
    FOLLY_SAFE_DCHECK(this->occupied(i), "");
770 | return *launder(itemAddr(i)); |
771 | } |
772 | |
773 | Item const& citem(std::size_t i) const { |
    FOLLY_SAFE_DCHECK(this->occupied(i), "");
775 | return *launder(itemAddr(i)); |
776 | } |
777 | |
778 | static F14Chunk& owner(Item& item, std::size_t index) { |
779 | auto rawAddr = |
780 | static_cast<uint8_t*>(static_cast<void*>(std::addressof(item))) - |
781 | offsetof(F14Chunk, rawItems_) - index * sizeof(Item); |
782 | auto chunkAddr = static_cast<F14Chunk*>(static_cast<void*>(rawAddr)); |
    FOLLY_SAFE_DCHECK(std::addressof(item) == chunkAddr->itemAddr(index), "");
784 | return *chunkAddr; |
785 | } |
786 | }; |
787 | |
788 | //////////////// |
789 | |
790 | // PackedChunkItemPtr points to an Item in an F14Chunk, allowing both the |
791 | // Item& and its index to be recovered. It sorts by the address of the |
792 | // item, and it only works for items that are in a properly-aligned chunk. |
793 | |
794 | // generic form, not actually packed |
795 | template <typename Ptr> |
796 | class PackedChunkItemPtr { |
797 | public: |
798 | PackedChunkItemPtr(Ptr p, std::size_t i) noexcept : ptr_{p}, index_{i} { |
    FOLLY_SAFE_DCHECK(ptr_ != nullptr || index_ == 0, "");
800 | } |
801 | |
802 | Ptr ptr() const { |
803 | return ptr_; |
804 | } |
805 | |
806 | std::size_t index() const { |
807 | return index_; |
808 | } |
809 | |
810 | bool operator<(PackedChunkItemPtr const& rhs) const { |
    FOLLY_SAFE_DCHECK(ptr_ != rhs.ptr_ || index_ == rhs.index_, "");
812 | return ptr_ < rhs.ptr_; |
813 | } |
814 | |
815 | bool operator==(PackedChunkItemPtr const& rhs) const { |
    FOLLY_SAFE_DCHECK(ptr_ != rhs.ptr_ || index_ == rhs.index_, "");
817 | return ptr_ == rhs.ptr_; |
818 | } |
819 | |
820 | bool operator!=(PackedChunkItemPtr const& rhs) const { |
821 | return !(*this == rhs); |
822 | } |
823 | |
824 | private: |
825 | Ptr ptr_; |
826 | std::size_t index_; |
827 | }; |
828 | |
829 | // Bare pointer form, packed into a uintptr_t. Uses only bits wasted by |
830 | // alignment, so it works on 32-bit and 64-bit platforms |
831 | template <typename T> |
832 | class PackedChunkItemPtr<T*> { |
  static_assert((alignof(F14Chunk<T>) % 16) == 0, "");
834 | |
835 | // Chunks are 16-byte aligned, so we can maintain a packed pointer to a |
836 | // chunk item by packing the 4-bit item index into the least significant |
837 | // bits of a pointer to the chunk itself. This makes ItemIter::pack |
838 | // more expensive, however, since it has to compute the chunk address. |
839 | // |
840 | // Chunk items have varying alignment constraints, so it would seem |
841 | // to be that we can't do a similar trick while using only bit masking |
842 | // operations on the Item* itself. It happens to be, however, that if |
843 | // sizeof(Item) is not a multiple of 16 then we can recover a portion |
844 | // of the index bits from the knowledge that the Item-s are stored in |
845 | // an array that is itself 16-byte aligned. |
846 | // |
847 | // If kAlignBits is the number of trailing zero bits in sizeof(Item) |
848 | // (up to 4), then we can borrow those bits to store kAlignBits of the |
849 | // index directly. We can recover (4 - kAlignBits) bits of the index |
850 | // from the item pointer itself, by defining/observing that |
851 | // |
852 | // A = kAlignBits (A <= 4) |
853 | // |
854 | // S = (sizeof(Item) % 16) >> A (shifted-away bits are all zero) |
855 | // |
856 | // R = (itemPtr % 16) >> A (shifted-away bits are all zero) |
857 | // |
858 | // M = 16 >> A |
859 | // |
860 | // itemPtr % 16 = (index * sizeof(Item)) % 16 |
861 | // |
862 | // (R * 2^A) % 16 = (index * (sizeof(Item) % 16)) % 16 |
863 | // |
864 | // (R * 2^A) % 16 = (index * 2^A * S) % 16 |
865 | // |
866 | // R % M = (index * S) % M |
867 | // |
868 | // S is relatively prime with M, so a multiplicative inverse is easy |
869 | // to compute |
870 | // |
871 | // Sinv = S^(M - 1) % M |
872 | // |
873 | // (R * Sinv) % M = index % M |
874 | // |
875 | // This lets us recover the bottom bits of the index. When sizeof(T) |
876 | // is 8-byte aligned kSizeInverse will always be 1. When sizeof(T) |
877 | // is 4-byte aligned kSizeInverse will be either 1 or 3. |
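  //
  // Worked example: sizeof(Item) == 12 (4-byte aligned) and index == 6.
  // Then A == 2, S == (12 % 16) >> 2 == 3, M == 16 >> 2 == 4, and
  // Sinv == 3^3 % 4 == 3.  The top two index bits (6 >> 2 == 1) are
  // stored directly in the pointer's low bits.  For the rest,
  // itemPtr % 16 == (6 * 12) % 16 == 8, so R == 8 >> 2 == 2 and
  // (R * Sinv) % M == 6 % 4 == 2, the low two bits of the index;
  // recombining gives (1 << 2) | 2 == 6.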
878 | |
879 | // returns pow(x, y) % m |
880 | static constexpr uintptr_t powerMod(uintptr_t x, uintptr_t y, uintptr_t m) { |
881 | return y == 0 ? 1 : (x * powerMod(x, y - 1, m)) % m; |
882 | } |
883 | |
884 | static constexpr uintptr_t kIndexBits = 4; |
885 | static constexpr uintptr_t kIndexMask = (uintptr_t{1} << kIndexBits) - 1; |
886 | |
887 | static constexpr uintptr_t kAlignBits = constexpr_min( |
888 | uintptr_t{4}, |
889 | constexpr_find_first_set(uintptr_t{sizeof(T)}) - 1); |
890 | |
891 | static constexpr uintptr_t kAlignMask = (uintptr_t{1} << kAlignBits) - 1; |
892 | |
893 | static constexpr uintptr_t kModulus = uintptr_t{1} |
894 | << (kIndexBits - kAlignBits); |
895 | static constexpr uintptr_t kSizeInverse = |
896 | powerMod(sizeof(T) >> kAlignBits, kModulus - 1, kModulus); |
897 | |
898 | public: |
899 | PackedChunkItemPtr(T* p, std::size_t i) noexcept { |
900 | uintptr_t encoded = i >> (kIndexBits - kAlignBits); |
901 | assume((encoded & ~kAlignMask) == 0); |
902 | raw_ = reinterpret_cast<uintptr_t>(p) | encoded; |
    FOLLY_SAFE_DCHECK(p == ptr(), "");
    FOLLY_SAFE_DCHECK(i == index(), "");
905 | } |
906 | |
907 | T* ptr() const { |
908 | return reinterpret_cast<T*>(raw_ & ~kAlignMask); |
909 | } |
910 | |
911 | std::size_t index() const { |
912 | auto encoded = (raw_ & kAlignMask) << (kIndexBits - kAlignBits); |
913 | auto deduced = |
914 | ((raw_ >> kAlignBits) * kSizeInverse) & (kIndexMask >> kAlignBits); |
915 | return encoded | deduced; |
916 | } |
917 | |
918 | bool operator<(PackedChunkItemPtr const& rhs) const { |
919 | return raw_ < rhs.raw_; |
920 | } |
921 | bool operator==(PackedChunkItemPtr const& rhs) const { |
922 | return raw_ == rhs.raw_; |
923 | } |
924 | bool operator!=(PackedChunkItemPtr const& rhs) const { |
925 | return !(*this == rhs); |
926 | } |
927 | |
928 | private: |
929 | uintptr_t raw_; |
930 | }; |
931 | |
932 | template <typename ChunkPtr> |
933 | class F14ItemIter { |
934 | private: |
935 | using Chunk = typename std::pointer_traits<ChunkPtr>::element_type; |
936 | |
937 | public: |
938 | using Item = typename Chunk::Item; |
939 | using ItemPtr = typename std::pointer_traits<ChunkPtr>::template rebind<Item>; |
940 | using ItemConstPtr = |
941 | typename std::pointer_traits<ChunkPtr>::template rebind<Item const>; |
942 | |
943 | using Packed = PackedChunkItemPtr<ItemPtr>; |
944 | |
945 | //// PUBLIC |
946 | |
947 | F14ItemIter() noexcept : itemPtr_{nullptr}, index_{0} {} |
948 | |
949 | // default copy and move constructors and assignment operators are correct |
950 | |
951 | explicit F14ItemIter(Packed const& packed) |
952 | : itemPtr_{packed.ptr()}, index_{packed.index()} {} |
953 | |
954 | F14ItemIter(ChunkPtr chunk, std::size_t index) |
955 | : itemPtr_{std::pointer_traits<ItemPtr>::pointer_to(chunk->item(index))}, |
956 | index_{index} { |
    FOLLY_SAFE_DCHECK(index < Chunk::kCapacity, "");
958 | assume( |
959 | std::pointer_traits<ItemPtr>::pointer_to(chunk->item(index)) != |
960 | nullptr); |
961 | assume(itemPtr_ != nullptr); |
962 | } |
963 | |
964 | FOLLY_ALWAYS_INLINE void advanceImpl(bool checkEof, bool likelyDead) { |
965 | auto c = chunk(); |
966 | |
967 | // common case is packed entries |
968 | while (index_ > 0) { |
969 | --index_; |
970 | --itemPtr_; |
971 | if (LIKELY(c->occupied(index_))) { |
972 | return; |
973 | } |
974 | } |
975 | |
976 | // It's fairly common for an iterator to be advanced and then become |
977 | // dead, for example in the return value from erase(iter) or in |
978 | // the last step of a loop. We'd like to make sure that the entire |
979 | // advance() method can be eliminated by the compiler's dead code |
980 | // elimination pass. To do that it must eliminate the loops, which |
981 | // requires it to prove that they have no side effects. It's easy |
982 | // to show that there are no escaping stores, but at the moment |
983 | // compilers also consider an infinite loop to be a side effect. |
984 | // (There are parts of the standard that would allow them to treat |
985 | // this as undefined behavior, but at the moment they don't exploit |
986 | // those clauses.) |
987 | // |
988 | // The following loop should really be a while loop, which would |
989 | // save a register, some instructions, and a conditional branch, |
990 | // but by writing it as a for loop the compiler can prove to itself |
991 | // that it will eventually terminate. (No matter that even if the |
992 | // loop executed in a single cycle it would take about 200 years to |
993 | // run all 2^64 iterations.) |
994 | // |
995 | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82776 has the bug we |
    // filed about the issue.  The loop below stands in for "while (true)".
997 | for (std::size_t i = 1; !likelyDead || i != 0; ++i) { |
998 | if (checkEof) { |
999 | // exhausted the current chunk |
1000 | if (UNLIKELY(c->eof())) { |
          FOLLY_SAFE_DCHECK(index_ == 0, "");
1002 | itemPtr_ = nullptr; |
1003 | return; |
1004 | } |
1005 | } else { |
        FOLLY_SAFE_DCHECK(!c->eof(), "");
1007 | } |
1008 | --c; |
1009 | auto last = c->lastOccupied(); |
1010 | if (checkEof && !likelyDead) { |
1011 | prefetchAddr(&*c - 1); |
1012 | } |
1013 | if (LIKELY(last.hasIndex())) { |
1014 | index_ = last.index(); |
1015 | itemPtr_ = std::pointer_traits<ItemPtr>::pointer_to(c->item(index_)); |
1016 | return; |
1017 | } |
1018 | } |
1019 | } |
1020 | |
1021 | void precheckedAdvance() { |
1022 | advanceImpl(false, false); |
1023 | } |
1024 | |
1025 | FOLLY_ALWAYS_INLINE void advance() { |
1026 | advanceImpl(true, false); |
1027 | } |
1028 | |
1029 | FOLLY_ALWAYS_INLINE void advanceLikelyDead() { |
1030 | advanceImpl(true, true); |
1031 | } |
1032 | |
1033 | ChunkPtr chunk() const { |
1034 | return std::pointer_traits<ChunkPtr>::pointer_to( |
1035 | Chunk::owner(*itemPtr_, index_)); |
1036 | } |
1037 | |
1038 | std::size_t index() const { |
1039 | return index_; |
1040 | } |
1041 | |
1042 | Item* itemAddr() const { |
1043 | return std::addressof(*itemPtr_); |
1044 | } |
1045 | Item& item() const { |
1046 | return *itemPtr_; |
1047 | } |
1048 | Item const& citem() const { |
1049 | return *itemPtr_; |
1050 | } |
1051 | |
1052 | bool atEnd() const { |
1053 | return itemPtr_ == nullptr; |
1054 | } |
1055 | |
1056 | Packed pack() const { |
1057 | return Packed{itemPtr_, static_cast<uint8_t>(index_)}; |
1058 | } |
1059 | |
1060 | bool operator==(F14ItemIter const& rhs) const { |
1061 | // this form makes iter == end() into a single null check after inlining |
1062 | // and constant propagation |
1063 | return itemPtr_ == rhs.itemPtr_; |
1064 | } |
1065 | |
1066 | bool operator!=(F14ItemIter const& rhs) const { |
1067 | return !(*this == rhs); |
1068 | } |
1069 | |
1070 | private: |
1071 | ItemPtr itemPtr_; |
1072 | std::size_t index_; |
1073 | }; |
1074 | |
1075 | //////////////// |
1076 | |
1077 | template <typename SizeType, typename ItemIter, bool EnablePackedItemIter> |
1078 | struct SizeAndPackedBegin { |
1079 | SizeType size_{0}; |
1080 | |
1081 | private: |
1082 | typename ItemIter::Packed packedBegin_{ItemIter{}.pack()}; |
1083 | |
1084 | public: |
1085 | typename ItemIter::Packed& packedBegin() { |
1086 | return packedBegin_; |
1087 | } |
1088 | |
1089 | typename ItemIter::Packed const& packedBegin() const { |
1090 | return packedBegin_; |
1091 | } |
1092 | }; |
1093 | |
1094 | template <typename SizeType, typename ItemIter> |
1095 | struct SizeAndPackedBegin<SizeType, ItemIter, false> { |
1096 | SizeType size_{0}; |
1097 | |
1098 | [[noreturn]] typename ItemIter::Packed& packedBegin() { |
1099 | assume_unreachable(); |
1100 | } |
1101 | |
1102 | [[noreturn]] typename ItemIter::Packed const& packedBegin() const { |
1103 | assume_unreachable(); |
1104 | } |
1105 | }; |
1106 | |
1107 | template <typename Policy> |
1108 | class F14Table : public Policy { |
1109 | public: |
1110 | using Item = typename Policy::Item; |
1111 | |
1112 | using value_type = typename Policy::Value; |
1113 | using allocator_type = typename Policy::Alloc; |
1114 | |
1115 | private: |
1116 | using Alloc = typename Policy::Alloc; |
1117 | using AllocTraits = typename Policy::AllocTraits; |
1118 | using Hasher = typename Policy::Hasher; |
1119 | using InternalSizeType = typename Policy::InternalSizeType; |
1120 | using KeyEqual = typename Policy::KeyEqual; |
1121 | |
1122 | using Policy::kAllocIsAlwaysEqual; |
1123 | using Policy::kContinuousCapacity; |
1124 | using Policy::kDefaultConstructIsNoexcept; |
1125 | using Policy::kEnableItemIteration; |
1126 | using Policy::kSwapIsNoexcept; |
1127 | |
1128 | using Policy::destroyItemOnClear; |
1129 | using Policy::isAvalanchingHasher; |
1130 | using Policy::prefetchBeforeCopy; |
1131 | using Policy::prefetchBeforeDestroy; |
1132 | using Policy::prefetchBeforeRehash; |
1133 | |
1134 | using ByteAlloc = typename AllocTraits::template rebind_alloc<uint8_t>; |
1135 | using BytePtr = typename std::allocator_traits<ByteAlloc>::pointer; |
1136 | |
1137 | using Chunk = F14Chunk<Item>; |
1138 | using ChunkPtr = |
1139 | typename std::pointer_traits<BytePtr>::template rebind<Chunk>; |
1140 | |
1141 | using HashPair = typename F14HashToken::HashPair; |
1142 | |
1143 | public: |
1144 | using ItemIter = F14ItemIter<ChunkPtr>; |
1145 | |
1146 | private: |
1147 | //////// begin fields |
1148 | |
1149 | ChunkPtr chunks_{Chunk::emptyInstance()}; |
1150 | InternalSizeType chunkMask_{0}; |
1151 | SizeAndPackedBegin<InternalSizeType, ItemIter, kEnableItemIteration> |
1152 | sizeAndPackedBegin_; |
1153 | |
1154 | //////// end fields |
1155 | |
1156 | void swapContents(F14Table& rhs) noexcept { |
1157 | using std::swap; |
1158 | swap(chunks_, rhs.chunks_); |
1159 | swap(chunkMask_, rhs.chunkMask_); |
1160 | swap(sizeAndPackedBegin_.size_, rhs.sizeAndPackedBegin_.size_); |
1161 | if (kEnableItemIteration) { |
1162 | swap( |
1163 | sizeAndPackedBegin_.packedBegin(), |
1164 | rhs.sizeAndPackedBegin_.packedBegin()); |
1165 | } |
1166 | } |
1167 | |
1168 | public: |
1169 | F14Table( |
1170 | std::size_t initialCapacity, |
1171 | Hasher const& hasher, |
1172 | KeyEqual const& keyEqual, |
1173 | Alloc const& alloc) |
1174 | : Policy{hasher, keyEqual, alloc} { |
1175 | if (initialCapacity > 0) { |
1176 | reserve(initialCapacity); |
1177 | } |
1178 | } |
1179 | |
1180 | F14Table(F14Table const& rhs) : Policy{rhs} { |
1181 | buildFromF14Table(rhs); |
1182 | } |
1183 | |
1184 | F14Table(F14Table const& rhs, Alloc const& alloc) : Policy{rhs, alloc} { |
1185 | buildFromF14Table(rhs); |
1186 | } |
1187 | |
1188 | F14Table(F14Table&& rhs) noexcept( |
1189 | std::is_nothrow_move_constructible<Hasher>::value&& |
1190 | std::is_nothrow_move_constructible<KeyEqual>::value&& |
1191 | std::is_nothrow_move_constructible<Alloc>::value) |
1192 | : Policy{std::move(rhs)} { |
1193 | swapContents(rhs); |
1194 | } |
1195 | |
1196 | F14Table(F14Table&& rhs, Alloc const& alloc) noexcept(kAllocIsAlwaysEqual) |
1197 | : Policy{std::move(rhs), alloc} { |
1198 | if (kAllocIsAlwaysEqual || this->alloc() == rhs.alloc()) { |
1199 | // move storage (common case) |
1200 | swapContents(rhs); |
1201 | } else { |
1202 | // new storage because allocators unequal, move values (rare case) |
1203 | buildFromF14Table(std::move(rhs)); |
1204 | } |
1205 | } |
1206 | |
1207 | F14Table& operator=(F14Table const& rhs) { |
1208 | if (this != &rhs) { |
1209 | reset(); |
1210 | static_cast<Policy&>(*this) = rhs; |
1211 | buildFromF14Table(rhs); |
1212 | } |
1213 | return *this; |
1214 | } |
1215 | |
1216 | F14Table& operator=(F14Table&& rhs) noexcept( |
1217 | std::is_nothrow_move_assignable<Hasher>::value&& |
1218 | std::is_nothrow_move_assignable<KeyEqual>::value && |
1219 | (kAllocIsAlwaysEqual || |
1220 | (AllocTraits::propagate_on_container_move_assignment::value && |
1221 | std::is_nothrow_move_assignable<Alloc>::value))) { |
1222 | if (this != &rhs) { |
1223 | reset(); |
1224 | static_cast<Policy&>(*this) = std::move(rhs); |
1225 | if (AllocTraits::propagate_on_container_move_assignment::value || |
1226 | kAllocIsAlwaysEqual || this->alloc() == rhs.alloc()) { |
1227 | // move storage (common case) |
1228 | swapContents(rhs); |
1229 | } else { |
1230 | // new storage because allocators unequal, move values (rare case) |
1231 | buildFromF14Table(std::move(rhs)); |
1232 | } |
1233 | } |
1234 | return *this; |
1235 | } |
1236 | |
1237 | ~F14Table() { |
1238 | reset(); |
1239 | } |
1240 | |
1241 | void swap(F14Table& rhs) noexcept(kSwapIsNoexcept) { |
1242 | // If propagate_on_container_swap is false and allocators are |
1243 | // not equal, the only way to accomplish a swap would be to do |
1244 | // dynamic allocation and then move (or swap) each contained value. |
1245 | // AllocatorAwareContainer-s are not supposed to attempt this, but |
1246 | // rather are supposed to have undefined behavior in that case. |
1247 | FOLLY_SAFE_CHECK( |
1248 | AllocTraits::propagate_on_container_swap::value || |
1249 | kAllocIsAlwaysEqual || this->alloc() == rhs.alloc(), |
1250 | "swap is undefined for unequal non-propagating allocators" ); |
1251 | this->swapPolicy(rhs); |
1252 | swapContents(rhs); |
1253 | } |
1254 | |
1255 | private: |
1256 | //////// hash helpers |
1257 | |
1258 | // Hash values are used to compute the desired position, which is the |
1259 | // chunk index at which we would like to place a value (if there is no |
1260 | // overflow), and the tag, which is an additional 8 bits of entropy. |
1261 | // |
1262 | // The standard's definition of hash function quality only refers to |
1263 | // the probability of collisions of the entire hash value, not to the |
1264 | // probability of collisions of the results of shifting or masking the |
1265 | // hash value. Some hash functions, however, provide this stronger |
1266 | // guarantee (not quite the same as the definition of avalanching, |
1267 | // but similar). |
1268 | // |
1269 | // If the user-supplied hasher is an avalanching one (each bit of the |
1270 | // hash value has a 50% chance of being the same for differing hash |
1271 | // inputs), then we can just take 1 byte of the hash value for the tag |
1272 | // and the rest for the desired position. Avalanching hashers also |
1273 | // let us map hash value to array index position with just a bitmask |
1274 | // without risking clumping. (Many hash tables just accept the risk |
1275 | // and do it regardless.) |
1276 | // |
1277 | // std::hash<std::string> avalanches in all implementations we've |
1278 | // examined: libstdc++-v3 uses MurmurHash2, and libc++ uses CityHash |
1279 | // or MurmurHash2. The other std::hash specializations, however, do not |
1280 | // have this property. std::hash for integral and pointer values is the |
1281 | // identity function on libstdc++-v3 and libc++, in particular. In our |
1282 | // experience it is also fairly common for user-defined specializations |
1283 | // of std::hash to combine fields in an ad-hoc way that does not evenly |
1284 | // distribute entropy among the bits of the result (a + 37 * b, for |
1285 | // example, where a and b are integer fields). |
1286 | // |
1287 | // For hash functions we don't trust to avalanche, we repair things by |
1288 | // applying a bit mixer to the user-supplied hash. |
1289 | |
1290 | #if FOLLY_X64 || FOLLY_AARCH64 |
1291 | // 64-bit |
1292 | static HashPair splitHash(std::size_t hash) { |
    static_assert(sizeof(std::size_t) == sizeof(uint64_t), "");
1294 | std::size_t tag; |
1295 | if (!isAvalanchingHasher()) { |
1296 | #if FOLLY_F14_CRC_INTRINSIC_AVAILABLE |
1297 | #if FOLLY_SSE |
1298 | // SSE4.2 CRC |
1299 | std::size_t c = _mm_crc32_u64(0, hash); |
1300 | tag = (c >> 24) | 0x80; |
1301 | hash += c; |
1302 | #else |
1303 | // CRC is optional on armv8 (-march=armv8-a+crc), standard on armv8.1 |
1304 | std::size_t c = __crc32cd(0, hash); |
1305 | tag = (c >> 24) | 0x80; |
1306 | hash += c; |
1307 | #endif |
1308 | #else |
1309 | // The mixer below is not fully avalanching for all 64 bits of |
1310 | // output, but looks quite good for bits 18..63 and puts plenty |
1311 | // of entropy even lower when considering multiple bits together |
1312 | // (like the tag). Importantly, when under register pressure it |
1313 | // uses fewer registers, instructions, and immediate constants |
1314 | // than the alternatives, resulting in compact code that is more |
1315 | // easily inlinable. In one instantiation a modified Murmur mixer |
1316 | // was 48 bytes of assembly (even after using the same multiplicand |
1317 | // for both steps) and this one was 27 bytes, for example. |
1318 | auto const kMul = 0xc4ceb9fe1a85ec53ULL; |
1319 | #ifdef _WIN32 |
1320 | __int64 signedHi; |
1321 | __int64 signedLo = _mul128( |
1322 | static_cast<__int64>(hash), static_cast<__int64>(kMul), &signedHi); |
1323 | auto hi = static_cast<uint64_t>(signedHi); |
1324 | auto lo = static_cast<uint64_t>(signedLo); |
1325 | #else |
1326 | auto hi = static_cast<uint64_t>( |
1327 | (static_cast<unsigned __int128>(hash) * kMul) >> 64); |
1328 | auto lo = hash * kMul; |
1329 | #endif |
1330 | hash = hi ^ lo; |
1331 | hash *= kMul; |
1332 | tag = ((hash >> 15) & 0x7f) | 0x80; |
1333 | hash >>= 22; |
1334 | #endif |
1335 | } else { |
1336 | // we don't trust the top bit |
1337 | tag = (hash >> 56) | 0x80; |
1338 | } |
1339 | return std::make_pair(hash, tag); |
1340 | } |
1341 | #else |
1342 | // 32-bit |
1343 | static HashPair splitHash(std::size_t hash) { |
    static_assert(sizeof(std::size_t) == sizeof(uint32_t), "");
1345 | uint8_t tag; |
1346 | if (!isAvalanchingHasher()) { |
1347 | #if FOLLY_F14_CRC_INTRINSIC_AVAILABLE |
1348 | #if FOLLY_SSE |
1349 | // SSE4.2 CRC |
1350 | auto c = _mm_crc32_u32(0, hash); |
1351 | tag = static_cast<uint8_t>(~(c >> 25)); |
1352 | hash += c; |
1353 | #else |
1354 | auto c = __crc32cw(0, hash); |
1355 | tag = static_cast<uint8_t>(~(c >> 25)); |
1356 | hash += c; |
1357 | #endif |
1358 | #else |
1359 | // finalizer for 32-bit murmur2 |
1360 | hash ^= hash >> 13; |
1361 | hash *= 0x5bd1e995; |
1362 | hash ^= hash >> 15; |
1363 | tag = static_cast<uint8_t>(~(hash >> 25)); |
1364 | #endif |
1365 | } else { |
1366 | // we don't trust the top bit |
1367 | tag = (hash >> 24) | 0x80; |
1368 | } |
1369 | return std::make_pair(hash, tag); |
1370 | } |
1371 | #endif |
1372 | |
1373 | //////// memory management helpers |
1374 | |
1375 | static std::size_t computeCapacity( |
1376 | std::size_t chunkCount, |
1377 | std::size_t scale) { |
    FOLLY_SAFE_DCHECK(!(chunkCount > 1 && scale == 0), "");
    FOLLY_SAFE_DCHECK(
        scale < (std::size_t{1} << Chunk::kCapacityScaleBits), "");
    FOLLY_SAFE_DCHECK((chunkCount & (chunkCount - 1)) == 0, "");
1382 | return (((chunkCount - 1) >> Chunk::kCapacityScaleShift) + 1) * scale; |
1383 | } |
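
  // e.g. with full-size chunks (kCapacityScaleShift == 0) this is just
  // chunkCount * scale: 8 chunks at scale 12 give capacity 96.  In the
  // single-chunk kCapacity == 12 layout (kCapacityScaleShift == 12) the
  // 16-bit scale encodes the capacity directly.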
1384 | |
1385 | std::pair<std::size_t, std::size_t> computeChunkCountAndScale( |
1386 | std::size_t desiredCapacity, |
1387 | bool continuousSingleChunkCapacity, |
1388 | bool continuousMultiChunkCapacity) const { |
1389 | if (desiredCapacity <= Chunk::kCapacity) { |
1390 | // we can go to 100% capacity in a single chunk with no problem |
1391 | if (!continuousSingleChunkCapacity) { |
1392 | if (desiredCapacity <= 2) { |
1393 | desiredCapacity = 2; |
1394 | } else if (desiredCapacity <= 6) { |
1395 | desiredCapacity = 6; |
1396 | } else { |
1397 | desiredCapacity = Chunk::kCapacity; |
1398 | } |
1399 | } |
1400 | auto rv = std::make_pair(std::size_t{1}, desiredCapacity); |
1401 | FOLLY_SAFE_DCHECK( |
          computeCapacity(rv.first, rv.second) == desiredCapacity, "");
1403 | return rv; |
1404 | } else { |
1405 | std::size_t minChunks = |
1406 | (desiredCapacity - 1) / Chunk::kDesiredCapacity + 1; |
1407 | std::size_t chunkPow = findLastSet(minChunks - 1); |
1408 | if (chunkPow == 8 * sizeof(std::size_t)) { |
1409 | throw_exception<std::bad_alloc>(); |
1410 | } |
1411 | |
1412 | std::size_t chunkCount = std::size_t{1} << chunkPow; |
1413 | |
1414 | // Let cc * scale be the actual capacity. |
1415 | // cc = ((chunkCount - 1) >> kCapacityScaleShift) + 1. |
1416 | // If chunkPow >= kCapacityScaleShift, then cc = chunkCount >> |
1417 | // kCapacityScaleShift = 1 << (chunkPow - kCapacityScaleShift), |
1418 | // otherwise it equals 1 = 1 << 0. Let cc = 1 << ss. |
1419 | std::size_t ss = chunkPow >= Chunk::kCapacityScaleShift |
1420 | ? chunkPow - Chunk::kCapacityScaleShift |
1421 | : 0; |
1422 | |
1423 | std::size_t scale; |
1424 | if (continuousMultiChunkCapacity) { |
1425 | // (1 << ss) * scale >= desiredCapacity |
1426 | scale = ((desiredCapacity - 1) >> ss) + 1; |
1427 | } else { |
1428 | // (1 << ss) * scale == chunkCount * kDesiredCapacity |
1429 | scale = Chunk::kDesiredCapacity << (chunkPow - ss); |
1430 | } |
1431 | |
1432 | std::size_t actualCapacity = computeCapacity(chunkCount, scale); |
      FOLLY_SAFE_DCHECK(actualCapacity >= desiredCapacity, "");
1434 | if (actualCapacity > max_size()) { |
1435 | throw_exception<std::bad_alloc>(); |
1436 | } |
1437 | |
1438 | return std::make_pair(chunkCount, scale); |
1439 | } |
1440 | } |
1441 | |
1442 | static std::size_t chunkAllocSize( |
1443 | std::size_t chunkCount, |
1444 | std::size_t capacityScale) { |
    FOLLY_SAFE_DCHECK(chunkCount > 0, "");
    FOLLY_SAFE_DCHECK(!(chunkCount > 1 && capacityScale == 0), "");
    if (chunkCount == 1) {
      static_assert(offsetof(Chunk, rawItems_) == 16, "");
1449 | return 16 + sizeof(Item) * computeCapacity(1, capacityScale); |
1450 | } else { |
1451 | return sizeof(Chunk) * chunkCount; |
1452 | } |
1453 | } |
1454 | |
1455 | ChunkPtr initializeChunks( |
1456 | BytePtr raw, |
1457 | std::size_t chunkCount, |
1458 | std::size_t capacityScale) { |
    static_assert(std::is_trivial<Chunk>::value, "F14Chunk should be POD");
1460 | auto chunks = static_cast<Chunk*>(static_cast<void*>(&*raw)); |
1461 | for (std::size_t i = 0; i < chunkCount; ++i) { |
1462 | chunks[i].clear(); |
1463 | } |
1464 | chunks[0].markEof(capacityScale); |
1465 | return std::pointer_traits<ChunkPtr>::pointer_to(*chunks); |
1466 | } |
1467 | |
1468 | std::size_t itemCount() const noexcept { |
1469 | if (chunkMask_ == 0) { |
1470 | return computeCapacity(1, chunks_->capacityScale()); |
1471 | } else { |
1472 | return (chunkMask_ + 1) * Chunk::kCapacity; |
1473 | } |
1474 | } |
1475 | |
1476 | public: |
1477 | ItemIter begin() const noexcept { |
    FOLLY_SAFE_DCHECK(kEnableItemIteration, "");
1479 | return ItemIter{sizeAndPackedBegin_.packedBegin()}; |
1480 | } |
1481 | |
1482 | ItemIter end() const noexcept { |
1483 | return ItemIter{}; |
1484 | } |
1485 | |
1486 | bool empty() const noexcept { |
1487 | return size() == 0; |
1488 | } |
1489 | |
1490 | InternalSizeType size() const noexcept { |
1491 | return sizeAndPackedBegin_.size_; |
1492 | } |
1493 | |
1494 | std::size_t max_size() const noexcept { |
1495 | auto& a = this->alloc(); |
1496 | return std::min<std::size_t>( |
1497 | (std::numeric_limits<InternalSizeType>::max)(), |
1498 | AllocTraits::max_size(a)); |
1499 | } |
1500 | |
1501 | std::size_t bucket_count() const noexcept { |
1502 | return computeCapacity(chunkMask_ + 1, chunks_->capacityScale()); |
1503 | } |
1504 | |
1505 | std::size_t max_bucket_count() const noexcept { |
1506 | return max_size(); |
1507 | } |
1508 | |
1509 | float load_factor() const noexcept { |
1510 | return empty() |
1511 | ? 0.0f |
1512 | : static_cast<float>(size()) / static_cast<float>(bucket_count()); |
1513 | } |
1514 | |
1515 | float max_load_factor() const noexcept { |
1516 | return 1.0f; |
1517 | } |
1518 | |
1519 | void max_load_factor(float) noexcept { |
1520 | // Probing hash tables can't run load factors >= 1 (unlike chaining |
1521 | // tables). In addition, we have measured that there is little or |
1522 | // no performance advantage to running a smaller load factor (cache |
1523 | // locality losses outweigh the small reduction in probe lengths, |
1524 | // often making it slower). Therefore, we've decided to just fix |
1525 | // max_load_factor at 1.0f regardless of what the user requests. |
1526 | // This has an additional advantage that we don't have to store it. |
1527 | // Taking alignment into consideration this makes every F14 table |
1528 | // 8 bytes smaller, and is part of the reason an empty F14NodeMap |
1529 | // is almost half the size of an empty std::unordered_map (32 vs |
1530 | // 56 bytes). |
1531 | // |
1532 | // I don't have a strong opinion on whether we should remove this |
1533 | // method or leave a stub, let ngbronson or xshi know if you have a |
1534 | // compelling argument either way. |
1535 | } |
1536 | |
1537 | private: |
1538 | // Our probe strategy is to advance through additional chunks with |
1539 | // a stride that is key-specific. This is called double hashing, |
1540 | // and is a well known and high quality probing strategy. So long as |
1541 | // the stride and the chunk count are relatively prime, we will visit |
1542 | // every chunk once and then return to the original chunk, letting us |
1543 | // detect and end the cycle. The chunk count is a power of two, so |
1544 | // we can satisfy the relatively prime part by choosing an odd stride. |
1545 | // We've already computed a high quality secondary hash value for the |
1546 | // tag, so we just use it for the second probe hash as well. |
1547 | // |
1548 | // At the maximum load factor of 12/14, expected probe length for a |
1549 | // find hit is 1.041, with 99% of keys found in the first three chunks. |
1550 | // Expected probe length for a find miss (or insert) is 1.275, with a |
  // p99 probe length of 4 (fewer than 1% of failing finds look at 5 or
1552 | // more chunks). |
1553 | // |
1554 | // This code is structured so you can try various ways of encoding |
1555 | // the current probe state. For example, at the moment the probe's |
1556 | // state is the position in the cycle and the resulting chunk index is |
1557 | // computed from that inside probeCurrentIndex. We could also make the |
1558 | // probe state the chunk index, and then increment it by hp.second * |
1559 | // 2 + 1 in probeAdvance. Wrapping can be applied early or late as |
1560 | // well. This particular code seems to be easier for the optimizer |
1561 | // to understand. |
1562 | // |
1563 | // We could also implement probing strategies that resulted in the same |
1564 | // tour for every key initially assigned to a chunk (linear probing or |
1565 | // quadratic), but that results in longer probe lengths. In particular, |
1566 | // the cache locality wins of linear probing are not worth the increase |
1567 | // in probe lengths (extra work and less branch predictability) in |
1568 | // our experiments. |
1569 | |
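  // Worked example: with 8 chunks (chunkMask_ == 7) and hp.second ==
  // 0x9e, probeDelta returns 2 * 0x9e + 1 == 0x13d.  The stride is odd,
  // hence relatively prime to the power-of-two chunk count, so indices
  // hp.first + k * 0x13d visit all 8 chunks (mod 8) before repeating.
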
1570 | std::size_t probeDelta(HashPair hp) const { |
1571 | return 2 * hp.second + 1; |
1572 | } |
1573 | |
1574 | template <typename K> |
1575 | FOLLY_ALWAYS_INLINE ItemIter findImpl(HashPair hp, K const& key) const { |
1576 | std::size_t index = hp.first; |
1577 | std::size_t step = probeDelta(hp); |
1578 | for (std::size_t tries = 0; tries <= chunkMask_; ++tries) { |
1579 | ChunkPtr chunk = chunks_ + (index & chunkMask_); |
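// The tag match below touches the chunk's first cache line; when a
// chunk spills past one cache line, warm up the item area as well.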
1580 | if (sizeof(Chunk) > 64) { |
1581 | prefetchAddr(chunk->itemAddr(8)); |
1582 | } |
1583 | auto hits = chunk->tagMatchIter(hp.second); |
1584 | while (hits.hasNext()) { |
1585 | auto i = hits.next(); |
1586 | if (LIKELY(this->keyMatchesItem(key, chunk->item(i)))) { |
1587 | // Tag match and key match were both successful. The chance |
1588 | // of a false tag match is 1/128 for each key in the chunk |
1589 | // (with a proper hash function). |
1590 | return ItemIter{chunk, i}; |
1591 | } |
1592 | } |
1593 | if (LIKELY(chunk->outboundOverflowCount() == 0)) { |
1594 | // No keys that wanted to be placed in this chunk were denied |
1595 | // entry, so our search is over. This is the common case. |
1596 | break; |
1597 | } |
1598 | index += step; |
1599 | } |
// Exiting the loop because tries is exhausted is rare, but possible.
// It means we have searched every chunk, and that each chunk is
// currently on the probe path of some key present in the map that
// visited it but ended up placed elsewhere.
1604 | return ItemIter{}; |
1605 | } |
1606 | |
1607 | public: |
// Prehashing splits the work of find(key) into two calls, enabling
// you to manually implement loop pipelining for hot bulk lookups.
// prehash computes the hash and prefetches the first probed memory
// location, and the two-arg find(F14HashToken,K) performs the rest
// of the search (a usage sketch follows the token overload below).
1612 | template <typename K> |
1613 | F14HashToken prehash(K const& key) const { |
1614 | FOLLY_SAFE_DCHECK(chunks_ != nullptr, "" ); |
1615 | auto hp = splitHash(this->computeKeyHash(key)); |
1616 | ChunkPtr firstChunk = chunks_ + (hp.first & chunkMask_); |
1617 | prefetchAddr(firstChunk); |
1618 | return F14HashToken(std::move(hp)); |
1619 | } |
1620 | |
1621 | template <typename K> |
1622 | FOLLY_ALWAYS_INLINE ItemIter find(K const& key) const { |
1623 | auto hp = splitHash(this->computeKeyHash(key)); |
1624 | return findImpl(hp, key); |
1625 | } |
1626 | |
1627 | template <typename K> |
1628 | FOLLY_ALWAYS_INLINE ItemIter |
1629 | find(F14HashToken const& token, K const& key) const { |
1630 | FOLLY_SAFE_DCHECK( |
1631 | splitHash(this->computeKeyHash(key)) == static_cast<HashPair>(token), |
1632 | "" ); |
1633 | return findImpl(static_cast<HashPair>(token), key); |
1634 | } |
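// A sketch of the pipelined bulk-lookup pattern that the two-call API
// enables (illustrative only: F14FastMap, the key type, and the batch
// width of 8 are arbitrary choices, not part of this file):
//
//   #include <algorithm>
//   #include <array>
//   #include <vector>
//   #include <folly/container/F14Map.h>
//
//   int sumValues(folly::F14FastMap<long, int> const& m,
//                 std::vector<long> const& keys) {
//     int sum = 0;
//     std::array<folly::F14HashToken, 8> tokens;
//     for (std::size_t i = 0; i < keys.size(); i += 8) {
//       std::size_t n = std::min<std::size_t>(8, keys.size() - i);
//       for (std::size_t j = 0; j < n; ++j) {
//         tokens[j] = m.prehash(keys[i + j]); // hash + prefetch
//       }
//       for (std::size_t j = 0; j < n; ++j) {
//         auto it = m.find(tokens[j], keys[i + j]);
//         if (it != m.end()) {
//           sum += it->second;
//         }
//       }
//     }
//     return sum;
//   }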
1635 | |
1636 | private: |
1637 | void adjustSizeAndBeginAfterInsert(ItemIter iter) { |
1638 | if (kEnableItemIteration) { |
1639 | // packedBegin is the max of all valid ItemIter::pack() |
1640 | auto packed = iter.pack(); |
1641 | if (sizeAndPackedBegin_.packedBegin() < packed) { |
1642 | sizeAndPackedBegin_.packedBegin() = packed; |
1643 | } |
1644 | } |
1645 | |
1646 | ++sizeAndPackedBegin_.size_; |
1647 | } |
1648 | |
// Ignores hp if iter.chunk()->hostedOverflowCount() == 0
1650 | void eraseBlank(ItemIter iter, HashPair hp) { |
1651 | iter.chunk()->clearTag(iter.index()); |
1652 | |
1653 | if (iter.chunk()->hostedOverflowCount() != 0) { |
// Walk this key's probe path, undoing the overflow bookkeeping:
// decrement outboundOverflowCount on every chunk probed through,
// and decrement hostedOverflowCount on the erased item's chunk if
// the path passed through at least one other chunk first.
1655 | std::size_t index = hp.first; |
1656 | std::size_t delta = probeDelta(hp); |
1657 | uint8_t hostedOp = 0; |
1658 | while (true) { |
1659 | ChunkPtr chunk = chunks_ + (index & chunkMask_); |
1660 | if (chunk == iter.chunk()) { |
1661 | chunk->adjustHostedOverflowCount(hostedOp); |
1662 | break; |
1663 | } |
1664 | chunk->decrOutboundOverflowCount(); |
1665 | hostedOp = Chunk::kDecrHostedOverflowCount; |
1666 | index += delta; |
1667 | } |
1668 | } |
1669 | } |
1670 | |
1671 | void adjustSizeAndBeginBeforeErase(ItemIter iter) { |
1672 | --sizeAndPackedBegin_.size_; |
1673 | if (kEnableItemIteration) { |
1674 | if (iter.pack() == sizeAndPackedBegin_.packedBegin()) { |
1675 | if (size() == 0) { |
1676 | iter = ItemIter{}; |
1677 | } else { |
1678 | iter.precheckedAdvance(); |
1679 | } |
1680 | sizeAndPackedBegin_.packedBegin() = iter.pack(); |
1681 | } |
1682 | } |
1683 | } |
1684 | |
1685 | template <typename... Args> |
1686 | void insertAtBlank(ItemIter pos, HashPair hp, Args&&... args) { |
1687 | try { |
1688 | auto dst = pos.itemAddr(); |
1689 | this->constructValueAtItem(*this, dst, std::forward<Args>(args)...); |
1690 | } catch (...) { |
1691 | eraseBlank(pos, hp); |
1692 | throw; |
1693 | } |
1694 | adjustSizeAndBeginAfterInsert(pos); |
1695 | } |
1696 | |
1697 | ItemIter allocateTag(uint8_t* fullness, HashPair hp) { |
1698 | ChunkPtr chunk; |
1699 | std::size_t index = hp.first; |
1700 | std::size_t delta = probeDelta(hp); |
1701 | uint8_t hostedOp = 0; |
1702 | while (true) { |
1703 | index &= chunkMask_; |
1704 | chunk = chunks_ + index; |
1705 | if (LIKELY(fullness[index] < Chunk::kCapacity)) { |
1706 | break; |
1707 | } |
1708 | chunk->incrOutboundOverflowCount(); |
1709 | hostedOp = Chunk::kIncrHostedOverflowCount; |
1710 | index += delta; |
1711 | } |
1712 | unsigned itemIndex = fullness[index]++; |
1713 | FOLLY_SAFE_DCHECK(!chunk->occupied(itemIndex), "" ); |
1714 | chunk->setTag(itemIndex, hp.second); |
1715 | chunk->adjustHostedOverflowCount(hostedOp); |
1716 | return ItemIter{chunk, itemIndex}; |
1717 | } |
1718 | |
1719 | ChunkPtr lastOccupiedChunk() const { |
1720 | FOLLY_SAFE_DCHECK(size() > 0, "" ); |
1721 | if (kEnableItemIteration) { |
1722 | return begin().chunk(); |
1723 | } else { |
1724 | return chunks_ + chunkMask_; |
1725 | } |
1726 | } |
1727 | |
1728 | template <typename T> |
1729 | void directBuildFrom(T&& src) { |
1730 | FOLLY_SAFE_DCHECK(src.size() > 0 && chunkMask_ == src.chunkMask_, "" ); |
1731 | |
1732 | // We use std::forward<T> to allow portions of src to be moved out by |
1733 | // either beforeBuild or afterBuild, but we are just relying on good |
1734 | // behavior of our Policy superclass to ensure that any particular |
1735 | // field of this is a donor at most once. |
1736 | |
1737 | auto undoState = |
1738 | this->beforeBuild(src.size(), bucket_count(), std::forward<T>(src)); |
1739 | bool success = false; |
1740 | SCOPE_EXIT { |
1741 | this->afterBuild( |
1742 | undoState, success, src.size(), bucket_count(), std::forward<T>(src)); |
1743 | }; |
1744 | |
1745 | // Copy can fail part-way through if a Value copy constructor throws. |
1746 | // Failing afterBuild is limited in its cleanup power in this case, |
1747 | // because it can't enumerate the items that were actually copied. |
1748 | // Fortunately we can divide the situation into cases where all of |
1749 | // the state is owned by the table itself (F14Node and F14Value), |
1750 | // for which clearImpl() can do partial cleanup, and cases where all |
1751 | // of the values are owned by the policy (F14Vector), in which case |
1752 | // partial failure should not occur. Sorry for the subtle invariants |
1753 | // in the Policy API. |
1754 | |
1755 | if (is_trivially_copyable<Item>::value && !this->destroyItemOnClear() && |
1756 | itemCount() == src.itemCount()) { |
1757 | FOLLY_SAFE_DCHECK(chunkMask_ == src.chunkMask_, "" ); |
1758 | |
1759 | auto scale = chunks_->capacityScale(); |
1760 | |
1761 | // most happy path |
1762 | auto n = chunkAllocSize(chunkMask_ + 1, scale); |
1763 | std::memcpy(&chunks_[0], &src.chunks_[0], n); |
1764 | sizeAndPackedBegin_.size_ = src.size(); |
1765 | if (kEnableItemIteration) { |
1766 | auto srcBegin = src.begin(); |
1767 | sizeAndPackedBegin_.packedBegin() = |
1768 | ItemIter{chunks_ + (srcBegin.chunk() - src.chunks_), |
1769 | srcBegin.index()} |
1770 | .pack(); |
1771 | } |
1772 | if (kContinuousCapacity) { |
1773 | // capacityScale might not match even if itemCount matches |
1774 | chunks_->setCapacityScale(scale); |
1775 | } |
1776 | } else { |
1777 | std::size_t maxChunkIndex = src.lastOccupiedChunk() - src.chunks_; |
1778 | |
1779 | // happy path, no rehash but pack items toward bottom of chunk and |
1780 | // use copy constructor |
1781 | auto srcChunk = &src.chunks_[maxChunkIndex]; |
1782 | Chunk* dstChunk = &chunks_[maxChunkIndex]; |
1783 | do { |
1784 | dstChunk->copyOverflowInfoFrom(*srcChunk); |
1785 | |
1786 | auto iter = srcChunk->occupiedIter(); |
1787 | if (prefetchBeforeCopy()) { |
1788 | for (auto piter = iter; piter.hasNext();) { |
1789 | this->prefetchValue(srcChunk->citem(piter.next())); |
1790 | } |
1791 | } |
1792 | |
1793 | std::size_t dstI = 0; |
1794 | for (; iter.hasNext(); ++dstI) { |
1795 | auto srcI = iter.next(); |
1796 | auto&& srcArg = |
1797 | std::forward<T>(src).buildArgForItem(srcChunk->item(srcI)); |
1798 | auto dst = dstChunk->itemAddr(dstI); |
1799 | this->constructValueAtItem( |
1800 | 0, dst, std::forward<decltype(srcArg)>(srcArg)); |
1801 | dstChunk->setTag(dstI, srcChunk->tag(srcI)); |
1802 | ++sizeAndPackedBegin_.size_; |
1803 | } |
1804 | |
1805 | --srcChunk; |
1806 | --dstChunk; |
1807 | } while (size() != src.size()); |
1808 | |
1809 | // reset doesn't care about packedBegin, so we don't fix it until the end |
1810 | if (kEnableItemIteration) { |
1811 | sizeAndPackedBegin_.packedBegin() = |
1812 | ItemIter{chunks_ + maxChunkIndex, |
1813 | chunks_[maxChunkIndex].lastOccupied().index()} |
1814 | .pack(); |
1815 | } |
1816 | } |
1817 | |
1818 | success = true; |
1819 | } |
1820 | |
1821 | template <typename T> |
1822 | void rehashBuildFrom(T&& src) { |
1823 | FOLLY_SAFE_DCHECK(src.chunkMask_ > chunkMask_, "" ); |
1824 | |
1825 | // 1 byte per chunk means < 1 bit per value temporary overhead |
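// (each chunk hosts up to Chunk::kCapacity == 14 items, so 8 bits
// per chunk is at most 8/14 of a bit per value)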
1826 | std::array<uint8_t, 256> stackBuf; |
1827 | uint8_t* fullness; |
1828 | auto cc = chunkMask_ + 1; |
1829 | if (cc <= stackBuf.size()) { |
1830 | fullness = stackBuf.data(); |
1831 | } else { |
1832 | ByteAlloc a{this->alloc()}; |
1833 | fullness = &*std::allocator_traits<ByteAlloc>::allocate(a, cc); |
1834 | } |
1835 | SCOPE_EXIT { |
1836 | if (cc > stackBuf.size()) { |
1837 | ByteAlloc a{this->alloc()}; |
1838 | std::allocator_traits<ByteAlloc>::deallocate( |
1839 | a, |
1840 | std::pointer_traits<typename std::allocator_traits< |
1841 | ByteAlloc>::pointer>::pointer_to(*fullness), |
1842 | cc); |
1843 | } |
1844 | }; |
1845 | std::memset(fullness, '\0', cc); |
1846 | |
1847 | // We use std::forward<T> to allow portions of src to be moved out by |
1848 | // either beforeBuild or afterBuild, but we are just relying on good |
1849 | // behavior of our Policy superclass to ensure that any particular |
1850 | // field of this is a donor at most once. |
1851 | |
1852 | // Exception safety requires beforeBuild to happen after all of the |
1853 | // allocate() calls. |
1854 | auto undoState = |
1855 | this->beforeBuild(src.size(), bucket_count(), std::forward<T>(src)); |
1856 | bool success = false; |
1857 | SCOPE_EXIT { |
1858 | this->afterBuild( |
1859 | undoState, success, src.size(), bucket_count(), std::forward<T>(src)); |
1860 | }; |
1861 | |
1862 | // The current table is at a valid state at all points for policies |
1863 | // in which non-trivial values are owned by the main table (F14Node |
1864 | // and F14Value), so reset() will clean things up properly if we |
1865 | // fail partway through. For the case that the policy manages value |
1866 | // lifecycle (F14Vector) then nothing after beforeBuild can throw and |
1867 | // we don't have to worry about partial failure. |
1868 | |
1869 | std::size_t srcChunkIndex = src.lastOccupiedChunk() - src.chunks_; |
1870 | while (true) { |
1871 | auto srcChunk = &src.chunks_[srcChunkIndex]; |
1872 | auto iter = srcChunk->occupiedIter(); |
1873 | if (prefetchBeforeRehash()) { |
1874 | for (auto piter = iter; piter.hasNext();) { |
1875 | this->prefetchValue(srcChunk->item(piter.next())); |
1876 | } |
1877 | } |
1878 | if (srcChunk->hostedOverflowCount() == 0) { |
1879 | // all items are in their preferred chunk (no probing), so we |
1880 | // don't need to compute any hash values |
1881 | while (iter.hasNext()) { |
1882 | auto i = iter.next(); |
1883 | auto& srcItem = srcChunk->item(i); |
1884 | auto&& srcArg = std::forward<T>(src).buildArgForItem(srcItem); |
1885 | HashPair hp{srcChunkIndex, srcChunk->tag(i)}; |
1886 | insertAtBlank( |
1887 | allocateTag(fullness, hp), |
1888 | hp, |
1889 | std::forward<decltype(srcArg)>(srcArg)); |
1890 | } |
1891 | } else { |
1892 | // any chunk's items might be in here |
1893 | while (iter.hasNext()) { |
1894 | auto i = iter.next(); |
1895 | auto& srcItem = srcChunk->item(i); |
1896 | auto&& srcArg = std::forward<T>(src).buildArgForItem(srcItem); |
1897 | auto const& srcKey = src.keyForValue(srcArg); |
1898 | auto hp = splitHash(this->computeKeyHash(srcKey)); |
1899 | FOLLY_SAFE_DCHECK(hp.second == srcChunk->tag(i), "" ); |
1900 | insertAtBlank( |
1901 | allocateTag(fullness, hp), |
1902 | hp, |
1903 | std::forward<decltype(srcArg)>(srcArg)); |
1904 | } |
1905 | } |
1906 | if (srcChunkIndex == 0) { |
1907 | break; |
1908 | } |
1909 | --srcChunkIndex; |
1910 | } |
1911 | |
1912 | success = true; |
1913 | } |
1914 | |
1915 | template <typename T> |
1916 | FOLLY_NOINLINE void buildFromF14Table(T&& src) { |
1917 | FOLLY_SAFE_DCHECK(bucket_count() == 0, "" ); |
1918 | if (src.size() == 0) { |
1919 | return; |
1920 | } |
1921 | |
1922 | // Use the source's capacity, unless it is oversized. |
1923 | auto upperLimit = computeChunkCountAndScale(src.size(), false, false); |
1924 | auto ccas = |
1925 | std::make_pair(src.chunkMask_ + 1, src.chunks_->capacityScale()); |
1926 | FOLLY_SAFE_DCHECK( |
1927 | ccas.first >= upperLimit.first, |
1928 | "rounded chunk count can't be bigger than actual" ); |
1929 | if (ccas.first > upperLimit.first || ccas.second > upperLimit.second) { |
1930 | ccas = upperLimit; |
1931 | } |
1932 | rehashImpl(0, 1, 0, ccas.first, ccas.second); |
1933 | |
1934 | try { |
1935 | if (chunkMask_ == src.chunkMask_) { |
1936 | directBuildFrom(std::forward<T>(src)); |
1937 | } else { |
1938 | rehashBuildFrom(std::forward<T>(src)); |
1939 | } |
1940 | } catch (...) { |
1941 | reset(); |
1942 | F14LinkCheck<getF14IntrinsicsMode()>::check(); |
1943 | throw; |
1944 | } |
1945 | } |
1946 | |
1947 | void reserveImpl(std::size_t desiredCapacity) { |
1948 | desiredCapacity = std::max<std::size_t>(desiredCapacity, size()); |
1949 | if (desiredCapacity == 0) { |
1950 | reset(); |
1951 | return; |
1952 | } |
1953 | |
1954 | auto origChunkCount = chunkMask_ + 1; |
1955 | auto origCapacityScale = chunks_->capacityScale(); |
1956 | auto origCapacity = computeCapacity(origChunkCount, origCapacityScale); |
1957 | |
1958 | // This came from an explicit reserve() or rehash() call, so there's |
1959 | // a good chance the capacity is exactly right. To avoid O(n^2) |
1960 | // behavior, we don't do rehashes that decrease the size by less |
1961 | // than 1/8, and if we have a requested increase of less than 1/8 we |
1962 | // instead go to the next power of two. |
1963 | |
1964 | if (desiredCapacity <= origCapacity && |
1965 | desiredCapacity >= origCapacity - origCapacity / 8) { |
1966 | return; |
1967 | } |
1968 | bool attemptExact = |
1969 | !(desiredCapacity > origCapacity && |
1970 | desiredCapacity < origCapacity + origCapacity / 8); |
1971 | |
1972 | std::size_t newChunkCount; |
1973 | std::size_t newCapacityScale; |
1974 | std::tie(newChunkCount, newCapacityScale) = computeChunkCountAndScale( |
1975 | desiredCapacity, attemptExact, kContinuousCapacity && attemptExact); |
1976 | auto newCapacity = computeCapacity(newChunkCount, newCapacityScale); |
1977 | |
1978 | if (origCapacity != newCapacity) { |
1979 | rehashImpl( |
1980 | size(), |
1981 | origChunkCount, |
1982 | origCapacityScale, |
1983 | newChunkCount, |
1984 | newCapacityScale); |
1985 | } |
1986 | } |
1987 | |
1988 | FOLLY_NOINLINE void reserveForInsertImpl( |
1989 | std::size_t capacityMinusOne, |
1990 | std::size_t origChunkCount, |
1991 | std::size_t origCapacityScale, |
1992 | std::size_t origCapacity) { |
1993 | FOLLY_SAFE_DCHECK(capacityMinusOne >= size(), "" ); |
1994 | std::size_t capacity = capacityMinusOne + 1; |
1995 | |
1996 | // we want to grow by between 2^0.5 and 2^1.5 ending at a "good" |
1997 | // size, so we grow by 2^0.5 and then round up |
1998 | |
// 1.01101_2 == 1.40625, slightly under 2^0.5 == 1.41421...
2000 | std::size_t minGrowth = origCapacity + (origCapacity >> 2) + |
2001 | (origCapacity >> 3) + (origCapacity >> 5); |
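// e.g. origCapacity == 96: 96 + 24 + 12 + 3 == 135 == 96 * 1.40625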
2002 | capacity = std::max<std::size_t>(capacity, minGrowth); |
2003 | |
2004 | std::size_t newChunkCount; |
2005 | std::size_t newCapacityScale; |
2006 | std::tie(newChunkCount, newCapacityScale) = |
2007 | computeChunkCountAndScale(capacity, false, false); |
2008 | |
2009 | FOLLY_SAFE_DCHECK( |
2010 | computeCapacity(newChunkCount, newCapacityScale) > origCapacity, "" ); |
2011 | |
2012 | rehashImpl( |
2013 | size(), |
2014 | origChunkCount, |
2015 | origCapacityScale, |
2016 | newChunkCount, |
2017 | newCapacityScale); |
2018 | } |
2019 | |
2020 | void rehashImpl( |
2021 | std::size_t origSize, |
2022 | std::size_t origChunkCount, |
2023 | std::size_t origCapacityScale, |
2024 | std::size_t newChunkCount, |
2025 | std::size_t newCapacityScale) { |
2026 | auto origChunks = chunks_; |
2027 | auto origCapacity = computeCapacity(origChunkCount, origCapacityScale); |
2028 | auto origAllocSize = chunkAllocSize(origChunkCount, origCapacityScale); |
2029 | auto newCapacity = computeCapacity(newChunkCount, newCapacityScale); |
2030 | auto newAllocSize = chunkAllocSize(newChunkCount, newCapacityScale); |
2031 | |
2032 | BytePtr rawAllocation; |
2033 | auto undoState = this->beforeRehash( |
2034 | origSize, origCapacity, newCapacity, newAllocSize, rawAllocation); |
2035 | chunks_ = initializeChunks(rawAllocation, newChunkCount, newCapacityScale); |
2036 | |
2037 | FOLLY_SAFE_DCHECK( |
2038 | newChunkCount < std::numeric_limits<InternalSizeType>::max(), "" ); |
2039 | chunkMask_ = static_cast<InternalSizeType>(newChunkCount - 1); |
2040 | |
2041 | bool success = false; |
2042 | SCOPE_EXIT { |
2043 | // this SCOPE_EXIT reverts chunks_ and chunkMask_ if necessary |
2044 | BytePtr finishedRawAllocation = nullptr; |
2045 | std::size_t finishedAllocSize = 0; |
2046 | if (LIKELY(success)) { |
2047 | if (origCapacity > 0) { |
2048 | finishedRawAllocation = std::pointer_traits<BytePtr>::pointer_to( |
2049 | *static_cast<uint8_t*>(static_cast<void*>(&*origChunks))); |
2050 | finishedAllocSize = origAllocSize; |
2051 | } |
2052 | } else { |
2053 | finishedRawAllocation = rawAllocation; |
2054 | finishedAllocSize = newAllocSize; |
2055 | chunks_ = origChunks; |
2056 | FOLLY_SAFE_DCHECK( |
2057 | origChunkCount < std::numeric_limits<InternalSizeType>::max(), "" ); |
2058 | chunkMask_ = static_cast<InternalSizeType>(origChunkCount - 1); |
2059 | F14LinkCheck<getF14IntrinsicsMode()>::check(); |
2060 | } |
2061 | |
2062 | this->afterRehash( |
2063 | std::move(undoState), |
2064 | success, |
2065 | origSize, |
2066 | origCapacity, |
2067 | newCapacity, |
2068 | finishedRawAllocation, |
2069 | finishedAllocSize); |
2070 | }; |
2071 | |
2072 | if (origSize == 0) { |
2073 | // nothing to do |
2074 | } else if (origChunkCount == 1 && newChunkCount == 1) { |
2075 | // no mask, no chunk scan, no hash computation, no probing |
2076 | auto srcChunk = origChunks; |
2077 | auto dstChunk = chunks_; |
2078 | std::size_t srcI = 0; |
2079 | std::size_t dstI = 0; |
2080 | while (dstI < origSize) { |
2081 | if (LIKELY(srcChunk->occupied(srcI))) { |
2082 | dstChunk->setTag(dstI, srcChunk->tag(srcI)); |
2083 | this->moveItemDuringRehash( |
2084 | dstChunk->itemAddr(dstI), srcChunk->item(srcI)); |
2085 | ++dstI; |
2086 | } |
2087 | ++srcI; |
2088 | } |
2089 | if (kEnableItemIteration) { |
2090 | sizeAndPackedBegin_.packedBegin() = ItemIter{dstChunk, dstI - 1}.pack(); |
2091 | } |
2092 | } else { |
2093 | // 1 byte per chunk means < 1 bit per value temporary overhead |
2094 | std::array<uint8_t, 256> stackBuf; |
2095 | uint8_t* fullness; |
2096 | if (newChunkCount <= stackBuf.size()) { |
2097 | fullness = stackBuf.data(); |
2098 | } else { |
2099 | ByteAlloc a{this->alloc()}; |
2100 | // may throw |
2101 | fullness = |
2102 | &*std::allocator_traits<ByteAlloc>::allocate(a, newChunkCount); |
2103 | } |
2104 | std::memset(fullness, '\0', newChunkCount); |
2105 | SCOPE_EXIT { |
2106 | if (newChunkCount > stackBuf.size()) { |
2107 | ByteAlloc a{this->alloc()}; |
2108 | std::allocator_traits<ByteAlloc>::deallocate( |
2109 | a, |
2110 | std::pointer_traits<typename std::allocator_traits< |
2111 | ByteAlloc>::pointer>::pointer_to(*fullness), |
2112 | newChunkCount); |
2113 | } |
2114 | }; |
2115 | |
2116 | auto srcChunk = origChunks + origChunkCount - 1; |
2117 | std::size_t remaining = origSize; |
2118 | while (remaining > 0) { |
2119 | auto iter = srcChunk->occupiedIter(); |
2120 | if (prefetchBeforeRehash()) { |
2121 | for (auto piter = iter; piter.hasNext();) { |
2122 | this->prefetchValue(srcChunk->item(piter.next())); |
2123 | } |
2124 | } |
2125 | while (iter.hasNext()) { |
2126 | --remaining; |
2127 | auto srcI = iter.next(); |
2128 | Item& srcItem = srcChunk->item(srcI); |
2129 | auto hp = splitHash( |
2130 | this->computeItemHash(const_cast<Item const&>(srcItem))); |
2131 | FOLLY_SAFE_DCHECK(hp.second == srcChunk->tag(srcI), "" ); |
2132 | |
2133 | auto dstIter = allocateTag(fullness, hp); |
2134 | this->moveItemDuringRehash(dstIter.itemAddr(), srcItem); |
2135 | } |
2136 | --srcChunk; |
2137 | } |
2138 | |
2139 | if (kEnableItemIteration) { |
// this code replaces size() invocations of adjustSizeAndBeginAfterInsert
2141 | std::size_t i = chunkMask_; |
2142 | while (fullness[i] == 0) { |
2143 | --i; |
2144 | } |
2145 | sizeAndPackedBegin_.packedBegin() = |
2146 | ItemIter{chunks_ + i, std::size_t{fullness[i]} - 1}.pack(); |
2147 | } |
2148 | } |
2149 | |
2150 | success = true; |
2151 | } |
2152 | |
2153 | // Randomization to help expose bugs when running tests in debug or |
2154 | // sanitizer builds |
2155 | |
2156 | FOLLY_ALWAYS_INLINE void debugModeOnReserve(std::size_t capacity) { |
2157 | if (kIsSanitizeAddress || kIsDebug) { |
2158 | if (capacity > size()) { |
2159 | tlsPendingSafeInserts(static_cast<std::ptrdiff_t>(capacity - size())); |
2160 | } |
2161 | } |
2162 | } |
2163 | |
2164 | void debugModeSpuriousRehash() { |
2165 | auto cc = chunkMask_ + 1; |
2166 | auto ss = chunks_->capacityScale(); |
2167 | rehashImpl(size(), cc, ss, cc, ss); |
2168 | } |
2169 | |
2170 | FOLLY_ALWAYS_INLINE void debugModeBeforeInsert() { |
// When running under ASAN, we add a spurious rehash with 1/size()
// probability before every insert. This makes it much more likely
// that reference stability problems for F14Value and F14Vector will
// be exposed. The most common pattern that causes this is
2175 | // |
2176 | // auto& ref = map[k1]; map[k2] = foo(ref); |
2177 | // |
2178 | // One way to fix this is to call map.reserve(N) before such a |
2179 | // sequence, where N is the number of keys that might be inserted |
2180 | // within the section that retains references plus the existing size. |
2181 | if (kIsSanitizeAddress && !tlsPendingSafeInserts() && size() > 0 && |
2182 | tlsMinstdRand(size()) == 0) { |
2183 | debugModeSpuriousRehash(); |
2184 | } |
2185 | } |
2186 | |
2187 | FOLLY_ALWAYS_INLINE void debugModeAfterInsert() { |
2188 | if (kIsSanitizeAddress || kIsDebug) { |
2189 | tlsPendingSafeInserts(-1); |
2190 | } |
2191 | } |
2192 | |
2193 | FOLLY_ALWAYS_INLINE void debugModePerturbSlotInsertOrder( |
2194 | ChunkPtr chunk, |
2195 | std::size_t& itemIndex) { |
2196 | FOLLY_SAFE_DCHECK(!chunk->occupied(itemIndex), "" ); |
2197 | constexpr bool perturbSlot = FOLLY_F14_PERTURB_INSERTION_ORDER; |
2198 | if (perturbSlot && !tlsPendingSafeInserts()) { |
2199 | std::size_t e = chunkMask_ == 0 ? bucket_count() : Chunk::kCapacity; |
2200 | std::size_t i = itemIndex + tlsMinstdRand(e - itemIndex); |
2201 | if (!chunk->occupied(i)) { |
2202 | itemIndex = i; |
2203 | } |
2204 | } |
2205 | } |
2206 | |
2207 | public: |
2208 | // user has no control over max_load_factor |
2209 | |
2210 | void rehash(std::size_t capacity) { |
2211 | reserve(capacity); |
2212 | } |
2213 | |
2214 | void reserve(std::size_t capacity) { |
2215 | // We want to support the pattern |
2216 | // map.reserve(map.size() + 2); auto& r1 = map[k1]; auto& r2 = map[k2]; |
2217 | debugModeOnReserve(capacity); |
2218 | reserveImpl(capacity); |
2219 | } |
2220 | |
// Rehashes if necessary to guarantee room to insert incoming more
// items without a further rehash
2222 | void reserveForInsert(size_t incoming = 1) { |
2223 | FOLLY_SAFE_DCHECK(incoming > 0, "" ); |
2224 | |
2225 | auto needed = size() + incoming; |
2226 | auto chunkCount = chunkMask_ + 1; |
2227 | auto scale = chunks_->capacityScale(); |
2228 | auto existing = computeCapacity(chunkCount, scale); |
2229 | if (needed - 1 >= existing) { |
2230 | reserveForInsertImpl(needed - 1, chunkCount, scale, existing); |
2231 | } |
2232 | } |
2233 | |
// Returns (pos, true) if a value was constructed and (pos, false) if
// the key was already present. key is only used during the search;
// all constructor args for an inserted value come from args...
// key won't be accessed after args are touched.
2237 | template <typename K, typename... Args> |
2238 | std::pair<ItemIter, bool> tryEmplaceValue(K const& key, Args&&... args) { |
2239 | const auto hp = splitHash(this->computeKeyHash(key)); |
2240 | |
2241 | if (size() > 0) { |
2242 | auto existing = findImpl(hp, key); |
2243 | if (!existing.atEnd()) { |
2244 | return std::make_pair(existing, false); |
2245 | } |
2246 | } |
2247 | |
2248 | debugModeBeforeInsert(); |
2249 | |
2250 | reserveForInsert(); |
2251 | |
2252 | std::size_t index = hp.first; |
2253 | ChunkPtr chunk = chunks_ + (index & chunkMask_); |
2254 | auto firstEmpty = chunk->firstEmpty(); |
2255 | |
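// If the preferred chunk is full, walk the same probe path that
// findImpl uses: bump outboundOverflowCount on each full chunk we
// pass, and mark the chunk that ends up hosting the item with a
// hosted overflow increment so that eraseBlank can undo the
// bookkeeping later.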
2256 | if (!firstEmpty.hasIndex()) { |
2257 | std::size_t delta = probeDelta(hp); |
2258 | do { |
2259 | chunk->incrOutboundOverflowCount(); |
2260 | index += delta; |
2261 | chunk = chunks_ + (index & chunkMask_); |
2262 | firstEmpty = chunk->firstEmpty(); |
2263 | } while (!firstEmpty.hasIndex()); |
2264 | chunk->adjustHostedOverflowCount(Chunk::kIncrHostedOverflowCount); |
2265 | } |
2266 | std::size_t itemIndex = firstEmpty.index(); |
2267 | |
2268 | debugModePerturbSlotInsertOrder(chunk, itemIndex); |
2269 | |
2270 | chunk->setTag(itemIndex, hp.second); |
2271 | ItemIter iter{chunk, itemIndex}; |
2272 | |
2273 | // insertAtBlank will clear the tag if the constructor throws |
2274 | insertAtBlank(iter, hp, std::forward<Args>(args)...); |
2275 | |
2276 | debugModeAfterInsert(); |
2277 | |
2278 | return std::make_pair(iter, true); |
2279 | } |
2280 | |
2281 | private: |
2282 | template <bool Reset> |
2283 | void clearImpl() noexcept { |
2284 | if (chunks_ == Chunk::emptyInstance()) { |
2285 | FOLLY_SAFE_DCHECK(empty() && bucket_count() == 0, "" ); |
2286 | return; |
2287 | } |
2288 | |
2289 | // turn clear into reset if the table is >= 16 chunks so that |
2290 | // we don't get too low a load factor |
2291 | bool willReset = Reset || chunkMask_ + 1 >= 16; |
2292 | |
2293 | auto origSize = size(); |
2294 | auto origCapacity = bucket_count(); |
2295 | if (willReset) { |
2296 | this->beforeReset(origSize, origCapacity); |
2297 | } else { |
2298 | this->beforeClear(origSize, origCapacity); |
2299 | } |
2300 | |
2301 | if (!empty()) { |
2302 | if (destroyItemOnClear()) { |
2303 | for (std::size_t ci = 0; ci <= chunkMask_; ++ci) { |
2304 | ChunkPtr chunk = chunks_ + ci; |
2305 | auto iter = chunk->occupiedIter(); |
2306 | if (prefetchBeforeDestroy()) { |
2307 | for (auto piter = iter; piter.hasNext();) { |
2308 | this->prefetchValue(chunk->item(piter.next())); |
2309 | } |
2310 | } |
2311 | while (iter.hasNext()) { |
2312 | this->destroyItem(chunk->item(iter.next())); |
2313 | } |
2314 | } |
2315 | } |
2316 | if (!willReset) { |
2317 | // It's okay to do this in a separate loop because we only do it |
2318 | // when the chunk count is small. That avoids a branch when we |
2319 | // are promoting a clear to a reset for a large table. |
2320 | auto scale = chunks_[0].capacityScale(); |
2321 | for (std::size_t ci = 0; ci <= chunkMask_; ++ci) { |
2322 | chunks_[ci].clear(); |
2323 | } |
2324 | chunks_[0].markEof(scale); |
2325 | } |
2326 | if (kEnableItemIteration) { |
2327 | sizeAndPackedBegin_.packedBegin() = ItemIter{}.pack(); |
2328 | } |
2329 | sizeAndPackedBegin_.size_ = 0; |
2330 | } |
2331 | |
2332 | if (willReset) { |
2333 | BytePtr rawAllocation = std::pointer_traits<BytePtr>::pointer_to( |
2334 | *static_cast<uint8_t*>(static_cast<void*>(&*chunks_))); |
2335 | std::size_t rawSize = |
2336 | chunkAllocSize(chunkMask_ + 1, chunks_->capacityScale()); |
2337 | |
2338 | chunks_ = Chunk::emptyInstance(); |
2339 | chunkMask_ = 0; |
2340 | |
2341 | this->afterReset(origSize, origCapacity, rawAllocation, rawSize); |
2342 | } else { |
2343 | this->afterClear(origSize, origCapacity); |
2344 | } |
2345 | } |
2346 | |
2347 | void eraseImpl(ItemIter pos, HashPair hp) { |
2348 | this->destroyItem(pos.item()); |
2349 | adjustSizeAndBeginBeforeErase(pos); |
2350 | eraseBlank(pos, hp); |
2351 | } |
2352 | |
2353 | public: |
2354 | // The item needs to still be hashable during this call. If you want |
2355 | // to intercept the value before it is destroyed (to extract it, for |
2356 | // example), use eraseIterInto(pos, beforeDestroy). |
2357 | void eraseIter(ItemIter pos) { |
2358 | eraseIterInto(pos, [](value_type&&) {}); |
2359 | } |
2360 | |
2361 | // The item needs to still be hashable during this call. If you want |
2362 | // to intercept the value before it is destroyed (to extract it, for |
2363 | // example), do so in the beforeDestroy callback. |
2364 | template <typename BeforeDestroy> |
2365 | void eraseIterInto(ItemIter pos, BeforeDestroy&& beforeDestroy) { |
2366 | HashPair hp{}; |
2367 | if (pos.chunk()->hostedOverflowCount() != 0) { |
2368 | hp = splitHash(this->computeItemHash(pos.citem())); |
2369 | } |
2370 | beforeDestroy(this->valueAtItemForExtract(pos.item())); |
2371 | eraseImpl(pos, hp); |
2372 | } |
2373 | |
2374 | template <typename K> |
2375 | std::size_t eraseKey(K const& key) { |
2376 | return eraseKeyInto(key, [](value_type&&) {}); |
2377 | } |
2378 | |
2379 | template <typename K, typename BeforeDestroy> |
2380 | std::size_t eraseKeyInto(K const& key, BeforeDestroy&& beforeDestroy) { |
2381 | if (UNLIKELY(size() == 0)) { |
2382 | return 0; |
2383 | } |
2384 | auto hp = splitHash(this->computeKeyHash(key)); |
2385 | auto iter = findImpl(hp, key); |
2386 | if (!iter.atEnd()) { |
2387 | beforeDestroy(this->valueAtItemForExtract(iter.item())); |
2388 | eraseImpl(iter, hp); |
2389 | return 1; |
2390 | } else { |
2391 | return 0; |
2392 | } |
2393 | } |
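// A sketch of using the beforeDestroy hook to extract a value while
// it is erased (illustrative only; assumes value_type is
// move-constructible and that "table" is an instance of this table):
//
//   #include <optional>
//
//   std::optional<value_type> out;
//   std::size_t n = table.eraseKeyInto(key, [&](value_type&& v) {
//     out.emplace(std::move(v));
//   });
//   // n == 1 iff key was present; out then holds the extracted value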
2394 | |
2395 | void clear() noexcept { |
2396 | if (kIsSanitizeAddress) { |
2397 | // force recycling of heap memory |
2398 | auto bc = bucket_count(); |
2399 | reset(); |
2400 | try { |
2401 | reserveImpl(bc); |
2402 | } catch (std::bad_alloc const&) { |
2403 | // ASAN mode only, keep going |
2404 | } |
2405 | } else { |
2406 | clearImpl<false>(); |
2407 | } |
2408 | } |
2409 | |
2410 | // Like clear(), but always frees all dynamic storage allocated |
2411 | // by the table. |
2412 | void reset() noexcept { |
2413 | clearImpl<true>(); |
2414 | } |
2415 | |
2416 | // Get memory footprint, not including sizeof(*this). |
2417 | std::size_t getAllocatedMemorySize() const { |
2418 | std::size_t sum = 0; |
2419 | visitAllocationClasses( |
2420 | [&sum](std::size_t bytes, std::size_t n) { sum += bytes * n; }); |
2421 | return sum; |
2422 | } |
2423 | |
2424 | // Enumerates classes of allocated memory blocks currently owned |
2425 | // by this table, calling visitor(allocationSize, allocationCount). |
2426 | // This can be used to get a more accurate indication of memory footprint |
2427 | // than getAllocatedMemorySize() if you have some way of computing the |
2428 | // internal fragmentation of the allocator, such as JEMalloc's nallocx. |
2429 | // The visitor might be called twice with the same allocationSize. The |
2430 | // visitor's computation should produce the same result for visitor(8, |
2431 | // 2) as for two calls to visitor(8, 1), for example. The visitor may |
2432 | // be called with a zero allocationCount. |
2433 | template <typename V> |
2434 | void visitAllocationClasses(V&& visitor) const { |
2435 | auto scale = chunks_->capacityScale(); |
2436 | this->visitPolicyAllocationClasses( |
2437 | scale == 0 ? 0 : chunkAllocSize(chunkMask_ + 1, scale), |
2438 | size(), |
2439 | bucket_count(), |
2440 | visitor); |
2441 | } |
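// A sketch of pairing the visitor with jemalloc's nallocx to estimate
// the true heap footprint (illustrative only; nallocx is
// jemalloc-specific and "table" is any F14 table instance):
//
//   #include <jemalloc/jemalloc.h>
//
//   std::size_t realBytes = 0;
//   table.visitAllocationClasses([&](std::size_t bytes, std::size_t n) {
//     if (bytes > 0) {
//       realBytes += nallocx(bytes, 0) * n;
//     }
//   });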
2442 | |
2443 | // visitor should take an Item const& |
2444 | template <typename V> |
2445 | void visitItems(V&& visitor) const { |
2446 | if (empty()) { |
2447 | return; |
2448 | } |
2449 | std::size_t maxChunkIndex = lastOccupiedChunk() - chunks_; |
2450 | auto chunk = &chunks_[0]; |
2451 | for (std::size_t i = 0; i <= maxChunkIndex; ++i, ++chunk) { |
2452 | auto iter = chunk->occupiedIter(); |
2453 | if (prefetchBeforeCopy()) { |
2454 | for (auto piter = iter; piter.hasNext();) { |
2455 | this->prefetchValue(chunk->citem(piter.next())); |
2456 | } |
2457 | } |
2458 | while (iter.hasNext()) { |
2459 | visitor(chunk->citem(iter.next())); |
2460 | } |
2461 | } |
2462 | } |
2463 | |
2464 | // visitor should take two Item const* |
2465 | template <typename V> |
2466 | void visitContiguousItemRanges(V&& visitor) const { |
2467 | if (empty()) { |
2468 | return; |
2469 | } |
2470 | std::size_t maxChunkIndex = lastOccupiedChunk() - chunks_; |
2471 | auto chunk = &chunks_[0]; |
2472 | for (std::size_t i = 0; i <= maxChunkIndex; ++i, ++chunk) { |
2473 | for (auto iter = chunk->occupiedRangeIter(); iter.hasNext();) { |
2474 | auto be = iter.next(); |
2475 | FOLLY_SAFE_DCHECK( |
2476 | chunk->occupied(be.first) && chunk->occupied(be.second - 1), "" ); |
2477 | Item const* b = chunk->itemAddr(be.first); |
2478 | visitor(b, b + (be.second - be.first)); |
2479 | } |
2480 | } |
2481 | } |
2482 | |
2483 | private: |
2484 | static std::size_t& histoAt( |
2485 | std::vector<std::size_t>& histo, |
2486 | std::size_t index) { |
2487 | if (histo.size() <= index) { |
2488 | histo.resize(index + 1); |
2489 | } |
2490 | return histo.at(index); |
2491 | } |
2492 | |
2493 | public: |
2494 | // Expensive |
2495 | F14TableStats computeStats() const { |
2496 | F14TableStats stats; |
2497 | |
2498 | if (kIsDebug && kEnableItemIteration) { |
2499 | // validate iteration |
2500 | std::size_t n = 0; |
2501 | ItemIter prev; |
2502 | for (auto iter = begin(); iter != end(); iter.advance()) { |
2503 | FOLLY_SAFE_DCHECK(n == 0 || iter.pack() < prev.pack(), "" ); |
2504 | ++n; |
2505 | prev = iter; |
2506 | } |
2507 | FOLLY_SAFE_DCHECK(n == size(), "" ); |
2508 | } |
2509 | |
2510 | FOLLY_SAFE_DCHECK( |
2511 | (chunks_ == Chunk::emptyInstance()) == (bucket_count() == 0), "" ); |
2512 | |
2513 | std::size_t n1 = 0; |
2514 | std::size_t n2 = 0; |
2515 | auto cc = bucket_count() == 0 ? 0 : chunkMask_ + 1; |
2516 | for (std::size_t ci = 0; ci < cc; ++ci) { |
2517 | ChunkPtr chunk = chunks_ + ci; |
2518 | FOLLY_SAFE_DCHECK(chunk->eof() == (ci == 0), "" ); |
2519 | |
2520 | auto iter = chunk->occupiedIter(); |
2521 | |
2522 | std::size_t chunkOccupied = 0; |
2523 | for (auto piter = iter; piter.hasNext(); piter.next()) { |
2524 | ++chunkOccupied; |
2525 | } |
2526 | n1 += chunkOccupied; |
2527 | |
2528 | histoAt(stats.chunkOccupancyHisto, chunkOccupied)++; |
2529 | histoAt( |
2530 | stats.chunkOutboundOverflowHisto, chunk->outboundOverflowCount())++; |
2531 | histoAt(stats.chunkHostedOverflowHisto, chunk->hostedOverflowCount())++; |
2532 | |
2533 | while (iter.hasNext()) { |
2534 | auto ii = iter.next(); |
2535 | ++n2; |
2536 | |
2537 | { |
2538 | auto& item = chunk->citem(ii); |
2539 | auto hp = splitHash(this->computeItemHash(item)); |
2540 | FOLLY_SAFE_DCHECK(chunk->tag(ii) == hp.second, "" ); |
2541 | |
2542 | std::size_t dist = 1; |
2543 | std::size_t index = hp.first; |
2544 | std::size_t delta = probeDelta(hp); |
2545 | while ((index & chunkMask_) != ci) { |
2546 | index += delta; |
2547 | ++dist; |
2548 | } |
2549 | |
2550 | histoAt(stats.keyProbeLengthHisto, dist)++; |
2551 | } |
2552 | |
2553 | // misses could have any tag, so we do the dumb but accurate |
2554 | // thing and just try them all |
2555 | for (std::size_t ti = 0; ti < 256; ++ti) { |
uint8_t tag = static_cast<uint8_t>(ti);
2557 | HashPair hp{ci, tag}; |
2558 | |
2559 | std::size_t dist = 1; |
2560 | std::size_t index = hp.first; |
2561 | std::size_t delta = probeDelta(hp); |
2562 | for (std::size_t tries = 0; tries <= chunkMask_ && |
2563 | chunks_[index & chunkMask_].outboundOverflowCount() != 0; |
2564 | ++tries) { |
2565 | index += delta; |
2566 | ++dist; |
2567 | } |
2568 | |
2569 | histoAt(stats.missProbeLengthHisto, dist)++; |
2570 | } |
2571 | } |
2572 | } |
2573 | |
2574 | FOLLY_SAFE_DCHECK(n1 == size(), "" ); |
2575 | FOLLY_SAFE_DCHECK(n2 == size(), "" ); |
2576 | |
2577 | #if FOLLY_HAS_RTTI |
2578 | stats.policy = typeid(Policy).name(); |
2579 | #endif |
2580 | stats.size = size(); |
2581 | stats.valueSize = sizeof(value_type); |
2582 | stats.bucketCount = bucket_count(); |
2583 | stats.chunkCount = cc; |
2584 | |
2585 | stats.totalBytes = sizeof(*this) + getAllocatedMemorySize(); |
2586 | stats.overheadBytes = stats.totalBytes - size() * sizeof(value_type); |
2587 | |
2588 | return stats; |
2589 | } |
2590 | }; |
2591 | } // namespace detail |
2592 | } // namespace f14 |
2593 | |
2594 | #endif // FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE |
2595 | |
2596 | namespace f14 { |
2597 | namespace test { |
2598 | inline void disableInsertOrderRandomization() { |
2599 | if (kIsSanitizeAddress || kIsDebug) { |
2600 | detail::tlsPendingSafeInserts(static_cast<std::ptrdiff_t>( |
2601 | (std::numeric_limits<std::size_t>::max)() / 2)); |
2602 | } |
2603 | } |
2604 | } // namespace test |
2605 | } // namespace f14 |
2606 | } // namespace folly |
2607 | |