1 | // Copyright (c) 2010, Google Inc. |
2 | // All rights reserved. |
3 | // |
4 | // Redistribution and use in source and binary forms, with or without |
5 | // modification, are permitted provided that the following conditions are |
6 | // met: |
7 | // |
8 | // * Redistributions of source code must retain the above copyright |
9 | // notice, this list of conditions and the following disclaimer. |
10 | // * Redistributions in binary form must reproduce the above |
11 | // copyright notice, this list of conditions and the following disclaimer |
12 | // in the documentation and/or other materials provided with the |
13 | // distribution. |
14 | // * Neither the name of Google Inc. nor the names of its |
15 | // contributors may be used to endorse or promote products derived from |
16 | // this software without specific prior written permission. |
17 | // |
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | |
30 | // --- |
31 | // |
32 | // Provides classes shared by both sparse and dense hashtable. |
33 | // |
34 | // sh_hashtable_settings has parameters for growing and shrinking |
35 | // a hashtable. It also packages zero-size functor (ie. hasher). |
36 | // |
37 | // Other functions and classes provide common code for serializing |
38 | // and deserializing hashtables to a stream (such as a FILE*). |
39 | |
40 | #pragma once |
41 | |
#include <cassert>
#include <cstddef>   // for size_t
#include <cstdio>
#include <iosfwd>
#include <stdexcept>    // For length_error
#include <type_traits>  // for std::true_type/std::false_type (traits below)
47 | |
48 | namespace google { |
49 | namespace sparsehash_internal { |
50 | |
// C++14-style void_t (std::void_t is C++17): maps any well-formed pack of
// types to `void`.  Used below for SFINAE-based member-type detection.
template <typename... Ts>
struct make_void {
  using type = void;
};

template <typename... Ts>
using void_t = typename make_void<Ts...>::type;
53 | |
54 | template <class HashFcn, class = void> |
55 | struct has_is_transparent : std::false_type {}; |
56 | |
57 | template <class HashFcn> |
58 | struct has_is_transparent<HashFcn, void_t<typename HashFcn::transparent_key_equal::is_transparent>> : std::true_type {}; |
59 | |
60 | template <class HashFcn, class = void, class = void> |
61 | struct has_transparent_key_equal : std::false_type {}; |
62 | |
63 | template <class HashFcn, class Key> |
64 | struct has_transparent_key_equal<HashFcn, Key, void_t<typename HashFcn::transparent_key_equal>> : std::true_type {}; |
65 | |
66 | template <class HashFcn, class EqualKey, bool = has_transparent_key_equal<HashFcn>::value> |
67 | struct key_equal_chosen { |
68 | using type = EqualKey; |
69 | }; |
70 | |
71 | template <class HashFcn, class EqualKey> |
72 | struct key_equal_chosen<HashFcn, EqualKey, true> { |
73 | using type = typename HashFcn::transparent_key_equal; |
74 | }; |
75 | |
// Adaptor methods for reading/writing data from an INPUT or OUTPUT
77 | // variable passed to serialize() or unserialize(). For now we |
78 | // have implemented INPUT/OUTPUT for FILE*, istream*/ostream* (note |
79 | // they are pointers, unlike typical use), or else a pointer to |
80 | // something that supports a Read()/Write() method. |
81 | // |
82 | // For technical reasons, we implement read_data/write_data in two |
83 | // stages. The actual work is done in *_data_internal, which takes |
84 | // the stream argument twice: once as a template type, and once with |
85 | // normal type information. (We only use the second version.) We do |
86 | // this because of how C++ picks what function overload to use. If we |
87 | // implemented this the naive way: |
88 | // bool read_data(istream* is, const void* data, size_t length); |
89 | // template<typename T> read_data(T* fp, const void* data, size_t length); |
90 | // C++ would prefer the second version for every stream type except |
91 | // istream. However, we want C++ to prefer the first version for |
92 | // streams that are *subclasses* of istream, such as istringstream. |
93 | // This is not possible given the way template types are resolved. So |
94 | // we split the stream argument in two, one of which is templated and |
95 | // one of which is not. The specialized functions (like the istream |
96 | // version above) ignore the template arg and use the second, 'type' |
97 | // arg, getting subclass matching as normal. The 'catch-all' |
98 | // functions (the second version above) use the template arg to deduce |
99 | // the type, and use a second, void* arg to achieve the desired |
100 | // 'catch-all' semantics. |
101 | |
102 | // ----- low-level I/O for FILE* ---- |
103 | |
// Reads `length` bytes from `fp` into `data`; returns true on success.
// The first (ignored) pointer exists only for overload resolution -- see
// the comment block above.  A zero-length read trivially succeeds: the
// previous fread(data, 0, 1, fp) call returned 0 and falsely reported
// failure (fread returns 0 whenever size is 0).
template <typename Ignored>
inline bool read_data_internal(Ignored*, FILE* fp, void* data, size_t length) {
  return length == 0 || fread(data, length, 1, fp) == 1;
}

// Writes `length` bytes from `data` to `fp`; returns true on success.
// As above, a zero-length write trivially succeeds.
template <typename Ignored>
inline bool write_data_internal(Ignored*, FILE* fp, const void* data,
                                size_t length) {
  return length == 0 || fwrite(data, length, 1, fp) == 1;
}
114 | |
115 | // ----- low-level I/O for iostream ---- |
116 | |
117 | // We want the caller to be responsible for #including <iostream>, not |
118 | // us, because iostream is a big header! According to the standard, |
119 | // it's only legal to delay the instantiation the way we want to if |
120 | // the istream/ostream is a template type. So we jump through hoops. |
// Templated so that <istream> need not be included here (see the comment
// above); instantiated only when somebody actually passes an istream*.
template <typename IStreamT>
inline bool read_data_internal_for_istream(IStreamT* stream, void* data,
                                           size_t length) {
  // read() sets failbit on a short read, so good() reports whether all
  // `length` bytes arrived.
  return stream->read(static_cast<char*>(data), length).good();
}
// Overload selected for std::istream and its subclasses (istringstream,
// ifstream, ...); the first argument is ignored.
template <typename Ignored>
inline bool read_data_internal(Ignored*, std::istream* stream, void* data,
                               size_t length) {
  return read_data_internal_for_istream(stream, data, length);
}
131 | |
// Templated so that <ostream> need not be included here; instantiated only
// when somebody actually passes an ostream*.
template <typename OStreamT>
inline bool write_data_internal_for_ostream(OStreamT* stream, const void* data,
                                            size_t length) {
  // good() is false if the stream entered a failed state during write().
  return stream->write(static_cast<const char*>(data), length).good();
}
// Overload selected for std::ostream and its subclasses (ostringstream,
// ofstream, ...); the first argument is ignored.
template <typename Ignored>
inline bool write_data_internal(Ignored*, std::ostream* stream,
                                const void* data, size_t length) {
  return write_data_internal_for_ostream(stream, data, length);
}
142 | |
143 | // ----- low-level I/O for custom streams ---- |
144 | |
// Catch-all overloads for user-supplied stream types.  The second `void*`
// parameter makes these the least-preferred overloads during resolution
// (see the discussion above).
//
// INPUT must expose Read(buffer, length) returning the byte count read.
template <typename INPUT>
inline bool read_data_internal(INPUT* fp, void*, void* data, size_t length) {
  const size_t bytes_read = static_cast<size_t>(fp->Read(data, length));
  return bytes_read == length;
}

// OUTPUT must expose Write(buffer, length) returning the byte count written.
template <typename OUTPUT>
inline bool write_data_internal(OUTPUT* fp, void*, const void* data,
                                size_t length) {
  const size_t bytes_written = static_cast<size_t>(fp->Write(data, length));
  return bytes_written == length;
}
159 | |
160 | // ----- low-level I/O: the public API ---- |
161 | |
// Public entry points.  The stream is passed twice -- once as a deduced
// template argument and once as a plain value -- so that overload
// resolution on the second argument picks the right *_internal variant
// (FILE*, istream*/ostream*, or the Read()/Write() catch-all).
template <typename INPUT>
inline bool read_data(INPUT* stream, void* data, size_t length) {
  return read_data_internal(stream, stream, data, length);
}

template <typename OUTPUT>
inline bool write_data(OUTPUT* stream, const void* data, size_t length) {
  return write_data_internal(stream, stream, data, length);
}
171 | |
172 | // Uses read_data() and write_data() to read/write an integer. |
173 | // length is the number of bytes to read/write (which may differ |
174 | // from sizeof(IntType), allowing us to save on a 32-bit system |
175 | // and load on a 64-bit system). Excess bytes are taken to be 0. |
176 | // INPUT and OUTPUT must match legal inputs to read/write_data (above). |
// Reads a `length`-byte big-endian integer from `fp` into *value.
// `length` may differ from sizeof(IntType); when the stored width exceeds
// the in-memory width (e.g. saved on a 64-bit system, loaded on a 32-bit
// one) the excess high-order bytes are taken to be 0 and are skipped.
// The skip also avoids shifting by >= the bit-width of IntType, which is
// undefined behavior (the previous code shifted unconditionally).
template <typename INPUT, typename IntType>
bool read_bigendian_number(INPUT* fp, IntType* value, size_t length) {
  *value = 0;
  unsigned char byte;
  // We require IntType to be unsigned or else the shifting gets all screwy.
  static_assert(static_cast<IntType>(-1) > static_cast<IntType>(0),
                "serializing int requires an unsigned type" );
  for (size_t i = 0; i < length; ++i) {
    if (!read_data(fp, &byte, sizeof(byte))) return false;
    const size_t shift = (length - 1 - i) * 8;
    // Bytes beyond IntType's width are assumed 0 by the writer; folding
    // them in would be a UB shift, so drop them.
    if (shift < sizeof(IntType) * 8) {
      *value |= static_cast<IntType>(byte) << shift;
    }
  }
  return true;
}
190 | |
// Writes `value` to `fp` as a `length`-byte big-endian integer
// (most-significant byte first).  Bytes beyond sizeof(IntType) are
// emitted as 0, matching what read_bigendian_number expects.
template <typename OUTPUT, typename IntType>
bool write_bigendian_number(OUTPUT* fp, IntType value, size_t length) {
  // We require IntType to be unsigned or else the shifting gets all screwy.
  static_assert(static_cast<IntType>(-1) > static_cast<IntType>(0),
                "serializing int requires an unsigned type" );
  for (size_t i = 0; i < length; ++i) {
    const size_t byte_index = length - 1 - i;  // 0 == least-significant byte
    unsigned char byte = 0;
    if (byte_index < sizeof(value)) {  // guard keeps the shift < bit-width
      byte = static_cast<unsigned char>((value >> (byte_index * 8)) & 255);
    }
    if (!write_data(fp, &byte, sizeof(byte))) return false;
  }
  return true;
}
206 | |
// A serializer usable with serialize()/unserialize() when value_type is
// "simple enough": a POD type containing no pointers.  Raw bytes are
// copied verbatim -- endianness is NOT normalized, so data written on one
// architecture may not read back correctly on another.
// This is the type used for NopointerSerializer.
template <typename value_type>
struct pod_serializer {
  // Deserialize: fill *value with raw bytes from the stream.
  template <typename INPUT>
  bool operator()(INPUT* stream, value_type* value) const {
    return read_data(stream, value, sizeof(*value));
  }

  // Serialize: emit value's raw bytes to the stream.
  template <typename OUTPUT>
  bool operator()(OUTPUT* stream, const value_type& value) const {
    return write_data(stream, &value, sizeof(value));
  }
};
224 | |
225 | // Settings contains parameters for growing and shrinking the table. |
226 | // It also packages zero-size functor (ie. hasher). |
227 | // |
228 | // It does some munging of the hash value in cases where we think |
229 | // (fear) the original hash function might not be very good. In |
230 | // particular, the default hash of pointers is the identity hash, |
231 | // so probably all the low bits are 0. We identify when we think |
232 | // we're hashing a pointer, and chop off the low bits. Note this |
233 | // isn't perfect: even when the key is a pointer, we can't tell |
234 | // for sure that the hash is the identity hash. If it's not, this |
235 | // is needless work (and possibly, though not likely, harmful). |
236 | |
template <typename Key, typename HashFunc, typename SizeType,
          int HT_MIN_BUCKETS>
class sh_hashtable_settings : public HashFunc {
 public:
  typedef Key key_type;
  typedef HashFunc hasher;
  typedef SizeType size_type;
  // If HashFunc advertises a transparent_key_equal, that key_equal must
  // itself be transparent (declare is_transparent); otherwise heterogeneous
  // lookup would be unsound.
  static_assert(!has_transparent_key_equal<HashFunc>::value || has_is_transparent<HashFunc, void>::value,
                "hash provided non-transparent key_equal" );

 public:
  // hf: the hash functor; stored via the HashFunc base class (so a
  //     stateless functor costs no space, via the empty-base optimization).
  // ht_occupancy_flt: how full the table may get before enlarging.
  // ht_empty_flt: how empty the table may get before shrinking.
  // Thresholds start at 0; the table calls reset_thresholds() once it
  // knows its bucket count.
  sh_hashtable_settings(const hasher& hf, const float ht_occupancy_flt,
                        const float ht_empty_flt)
      : hasher(hf),
        enlarge_threshold_(0),
        shrink_threshold_(0),
        consider_shrink_(false),
        use_empty_(false),
        use_deleted_(false),
        num_ht_copies_(0) {
    set_enlarge_factor(ht_occupancy_flt);
    set_shrink_factor(ht_empty_flt);
  }

  // Hashes v with the user's hasher, post-processed by hash_munger (which
  // chops known-zero low bits when Key is a pointer type -- see below).
  template<typename K>
  size_type hash(const K& v) const {
    // We munge the hash value when we don't trust hasher::operator().
    return hash_munger<Key>::MungedHash(hasher::operator()(v));
  }

  float enlarge_factor() const { return enlarge_factor_; }
  void set_enlarge_factor(float f) { enlarge_factor_ = f; }
  float shrink_factor() const { return shrink_factor_; }
  void set_shrink_factor(float f) { shrink_factor_ = f; }

  size_type enlarge_threshold() const { return enlarge_threshold_; }
  void set_enlarge_threshold(size_type t) { enlarge_threshold_ = t; }
  size_type shrink_threshold() const { return shrink_threshold_; }
  void set_shrink_threshold(size_type t) { shrink_threshold_ = t; }

  // Occupancy at which a table of x buckets should grow.
  size_type enlarge_size(size_type x) const {
    return static_cast<size_type>(x * enlarge_factor_);
  }
  // Occupancy below which a table of x buckets should shrink.
  size_type shrink_size(size_type x) const {
    return static_cast<size_type>(x * shrink_factor_);
  }

  bool consider_shrink() const { return consider_shrink_; }
  void set_consider_shrink(bool t) { consider_shrink_ = t; }

  bool use_empty() const { return use_empty_; }
  void set_use_empty(bool t) { use_empty_ = t; }

  bool use_deleted() const { return use_deleted_; }
  void set_use_deleted(bool t) { use_deleted_ = t; }

  // Copy/move counter, exposed in the table's size_type (the counter
  // itself is an unsigned int).
  size_type num_ht_copies() const {
    return static_cast<size_type>(num_ht_copies_);
  }
  void inc_num_ht_copies() { ++num_ht_copies_; }

  // Reset the enlarge and shrink thresholds
  void reset_thresholds(size_type num_buckets) {
    set_enlarge_threshold(enlarge_size(num_buckets));
    set_shrink_threshold(shrink_size(num_buckets));
    // whatever caused us to reset already considered
    set_consider_shrink(false);
  }

  // Caller is responsible for calling reset_threshold right after
  // set_resizing_parameters.  Clamps shrink to grow/2 so the table cannot
  // oscillate (grow, then immediately qualify for shrinking).
  void set_resizing_parameters(float shrink, float grow) {
    assert(shrink >= 0.0);
    assert(grow <= 1.0);
    if (shrink > grow / 2.0f)
      shrink = grow / 2.0f;  // otherwise we thrash hashtable size
    set_shrink_factor(shrink);
    set_enlarge_factor(grow);
  }

  // This is the smallest size a hashtable can be without being too crowded
  // If you like, you can give a min #buckets as well as a min #elts
  // Doubles from HT_MIN_BUCKETS until both constraints are met; throws
  // std::length_error if doubling would wrap size_type.
  size_type min_buckets(size_type num_elts, size_type min_buckets_wanted) {
    float enlarge = enlarge_factor();
    size_type sz = HT_MIN_BUCKETS;  // min buckets allowed
    while (sz < min_buckets_wanted ||
           num_elts >= static_cast<size_type>(sz * enlarge)) {
      // This just prevents overflowing size_type, since sz can exceed
      // max_size() here.
      if (static_cast<size_type>(sz * 2) < sz) {
        throw std::length_error("resize overflow" );  // protect against overflow
      }
      sz *= 2;
    }
    return sz;
  }

 private:
  // Default munger: trust the hash and pass it through unchanged.
  template <class HashKey>
  class hash_munger {
   public:
    static size_t MungedHash(size_t hash) { return hash; }
  };
  // This matches when the hashtable key is a pointer.  The default pointer
  // hash is often the identity, whose low bits are always 0 because of
  // alignment, so divide those known-zero bits away.
  template <class HashKey>
  class hash_munger<HashKey*> {
   public:
    static size_t MungedHash(size_t hash) {
      // TODO(csilvers): consider rotating instead:
      // static const int shift = (sizeof(void *) == 4) ? 2 : 3;
      // return (hash << (sizeof(hash) * 8) - shift)) | (hash >>
      // shift);
      // This matters if we ever change sparse/dense_hash_* to compare
      // hashes before comparing actual values. It's speedy on x86.
      return hash / sizeof(void*);  // get rid of known-0 bits
    }
  };

  size_type enlarge_threshold_;  // table.size() * enlarge_factor
  size_type shrink_threshold_;   // table.size() * shrink_factor
  float enlarge_factor_;         // how full before resize
  float shrink_factor_;          // how empty before resize
  // consider_shrink=true if we should try to shrink before next insert
  bool consider_shrink_;
  bool use_empty_;    // used only by densehashtable, not sparsehashtable
  bool use_deleted_;  // false until delkey has been set
  // num_ht_copies is a counter incremented every Copy/Move
  unsigned int num_ht_copies_;
};
366 | |
367 | } // namespace sparsehash_internal |
368 | } // namespace google |
369 | |