1// Copyright (c) 2010, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8// * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14// * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30// ---
31//
32// Provides classes shared by both sparse and dense hashtable.
33//
34// sh_hashtable_settings has parameters for growing and shrinking
35// a hashtable. It also packages zero-size functor (ie. hasher).
36//
37// Other functions and classes provide common code for serializing
38// and deserializing hashtables to a stream (such as a FILE*).
39
40#pragma once
41
#include <cassert>
#include <cstddef>     // for size_t
#include <cstdio>      // for FILE, fread, fwrite
#include <iosfwd>      // forward declarations of istream/ostream
#include <stdexcept>   // for length_error
#include <type_traits> // for false_type/true_type (was relied on transitively)
47
48namespace google {
49namespace sparsehash_internal {
50
// C++14-style void_t backport: maps any well-formed pack of types to void.
// Used below to form SFINAE-friendly detection traits.
template <typename... Ts>
struct make_void {
  using type = void;
};
template <typename... Ts>
using void_t = typename make_void<Ts...>::type;
53
// Trait: true iff HashFcn::transparent_key_equal::is_transparent names a
// type (the marker that enables heterogeneous lookup).  Primary template:
// false unless the partial specialization below matches.
template <class HashFcn, class = void>
struct has_is_transparent : std::false_type {};

// Matches only when the nested name is well-formed (SFINAE via void_t).
template <class HashFcn>
struct has_is_transparent<HashFcn, void_t<typename HashFcn::transparent_key_equal::is_transparent>> : std::true_type {};
59
// Trait: true iff HashFcn declares a nested transparent_key_equal type.
// The middle parameter is not used by the trait itself; it only gives the
// partial specialization a second slot to match against.
template <class HashFcn, class = void, class = void>
struct has_transparent_key_equal : std::false_type {};

template <class HashFcn, class Key>
struct has_transparent_key_equal<HashFcn, Key, void_t<typename HashFcn::transparent_key_equal>> : std::true_type {};
65
// Selects the key-equality functor the hashtable should use: EqualKey by
// default, but HashFcn::transparent_key_equal when the hasher provides one
// (allowing lookups without constructing a key_type).
template <class HashFcn, class EqualKey, bool = has_transparent_key_equal<HashFcn>::value>
struct key_equal_chosen {
  using type = EqualKey;
};

// Specialization taken when the hasher supplies a transparent key_equal.
template <class HashFcn, class EqualKey>
struct key_equal_chosen<HashFcn, EqualKey, true> {
  using type = typename HashFcn::transparent_key_equal;
};
75
// Adaptor methods for reading/writing data from an INPUT or OUTPUT
77// variable passed to serialize() or unserialize(). For now we
78// have implemented INPUT/OUTPUT for FILE*, istream*/ostream* (note
79// they are pointers, unlike typical use), or else a pointer to
80// something that supports a Read()/Write() method.
81//
82// For technical reasons, we implement read_data/write_data in two
83// stages. The actual work is done in *_data_internal, which takes
84// the stream argument twice: once as a template type, and once with
85// normal type information. (We only use the second version.) We do
86// this because of how C++ picks what function overload to use. If we
87// implemented this the naive way:
88// bool read_data(istream* is, const void* data, size_t length);
89// template<typename T> read_data(T* fp, const void* data, size_t length);
90// C++ would prefer the second version for every stream type except
91// istream. However, we want C++ to prefer the first version for
92// streams that are *subclasses* of istream, such as istringstream.
93// This is not possible given the way template types are resolved. So
94// we split the stream argument in two, one of which is templated and
95// one of which is not. The specialized functions (like the istream
96// version above) ignore the template arg and use the second, 'type'
97// arg, getting subclass matching as normal. The 'catch-all'
98// functions (the second version above) use the template arg to deduce
99// the type, and use a second, void* arg to achieve the desired
100// 'catch-all' semantics.
101
102// ----- low-level I/O for FILE* ----
103
// Reads exactly `length` bytes from `fp` into `data`.  Returns true iff the
// full read succeeded.  The first (templated) parameter exists only for the
// overload-dispatch scheme described above and is ignored.
template <typename Ignored>
inline bool read_data_internal(Ignored*, FILE* fp, void* data, size_t length) {
  // A zero-byte read trivially succeeds.  Without this guard it would be
  // reported as failure: the C standard makes fread() return 0 (not 1)
  // whenever size or nmemb is 0.
  if (length == 0) return true;
  return fread(data, length, 1, fp) == 1;
}
108
// Writes exactly `length` bytes from `data` to `fp`.  Returns true iff the
// full write succeeded.  The first (templated) parameter exists only for the
// overload-dispatch scheme described above and is ignored.
template <typename Ignored>
inline bool write_data_internal(Ignored*, FILE* fp, const void* data,
                                size_t length) {
  // A zero-byte write trivially succeeds.  Without this guard it would be
  // reported as failure: the C standard makes fwrite() return 0 (not 1)
  // whenever size or nmemb is 0.
  if (length == 0) return true;
  return fwrite(data, length, 1, fp) == 1;
}
114
115// ----- low-level I/O for iostream ----
116
117// We want the caller to be responsible for #including <iostream>, not
118// us, because iostream is a big header! According to the standard,
119// it's only legal to delay the instantiation the way we want to if
120// the istream/ostream is a template type. So we jump through hoops.
// Reads `length` bytes from the (istream-like) stream into `data`.
// Templated on the stream type so this header never needs to include
// <iostream>; the caller provides the concrete type.
template <typename ISTREAM>
inline bool read_data_internal_for_istream(ISTREAM* fp, void* data,
                                           size_t length) {
  fp->read(static_cast<char*>(data), length);
  // good() is false if the read came up short (eof/fail) or the stream is bad.
  return fp->good();
}
// Dispatch overload for std::istream and its subclasses: preferred over the
// templated FILE*/catch-all versions because its second parameter names the
// concrete stream type (see the overload-resolution discussion above).
template <typename Ignored>
inline bool read_data_internal(Ignored*, std::istream* fp, void* data,
                               size_t length) {
  return read_data_internal_for_istream(fp, data, length);
}
131
// Writes `length` bytes from `data` to the (ostream-like) stream.
// Templated on the stream type so this header never needs to include
// <iostream>; the caller provides the concrete type.
template <typename OSTREAM>
inline bool write_data_internal_for_ostream(OSTREAM* fp, const void* data,
                                            size_t length) {
  fp->write(static_cast<const char*>(data), length);
  // good() is false if the write failed or the stream is in a bad state.
  return fp->good();
}
// Dispatch overload for std::ostream and its subclasses: preferred over the
// templated FILE*/catch-all versions because its second parameter names the
// concrete stream type (see the overload-resolution discussion above).
template <typename Ignored>
inline bool write_data_internal(Ignored*, std::ostream* fp, const void* data,
                                size_t length) {
  return write_data_internal_for_ostream(fp, data, length);
}
142
143// ----- low-level I/O for custom streams ----
144
145// The INPUT type needs to support a Read() method that takes a
146// buffer and a length and returns the number of bytes read.
// Catch-all overload for custom input streams.  INPUT must provide a
// Read(buffer, length) method returning the number of bytes read.  The
// unnamed void* parameter exists only so this overload loses to the typed
// (FILE*/istream*) ones above during overload resolution.
template <typename INPUT>
inline bool read_data_internal(INPUT* fp, void*, void* data, size_t length) {
  const size_t bytes_read = static_cast<size_t>(fp->Read(data, length));
  return bytes_read == length;
}
151
152// The OUTPUT type needs to support a Write() operation that takes
153// a buffer and a length and returns the number of bytes written.
// Catch-all overload for custom output streams.  OUTPUT must provide a
// Write(buffer, length) method returning the number of bytes written.  The
// unnamed void* parameter exists only so this overload loses to the typed
// (FILE*/ostream*) ones above during overload resolution.
template <typename OUTPUT>
inline bool write_data_internal(OUTPUT* fp, void*, const void* data,
                                size_t length) {
  const size_t bytes_written = static_cast<size_t>(fp->Write(data, length));
  return bytes_written == length;
}
159
160// ----- low-level I/O: the public API ----
161
// Public entry point: reads `length` bytes from `fp` into `data`.  `fp` may
// be a FILE*, an istream* (caller must #include <iostream>), or a pointer to
// a class with a Read(buffer, length) method.  Returns true iff all bytes
// were read.
template <typename INPUT>
inline bool read_data(INPUT* fp, void* data, size_t length) {
  // fp is passed twice: once for template deduction (catch-all) and once
  // with its static type, so stream subclasses bind to the typed overloads.
  return read_data_internal(fp, fp, data, length);
}
166
// Public entry point: writes `length` bytes from `data` to `fp`.  `fp` may
// be a FILE*, an ostream* (caller must #include <iostream>), or a pointer to
// a class with a Write(buffer, length) method.  Returns true iff all bytes
// were written.
template <typename OUTPUT>
inline bool write_data(OUTPUT* fp, const void* data, size_t length) {
  // fp is passed twice: once for template deduction (catch-all) and once
  // with its static type, so stream subclasses bind to the typed overloads.
  return write_data_internal(fp, fp, data, length);
}
171
172// Uses read_data() and write_data() to read/write an integer.
173// length is the number of bytes to read/write (which may differ
174// from sizeof(IntType), allowing us to save on a 32-bit system
175// and load on a 64-bit system). Excess bytes are taken to be 0.
176// INPUT and OUTPUT must match legal inputs to read/write_data (above).
// Reads a big-endian integer of `length` bytes from fp into *value.
// `length` may differ from sizeof(IntType); missing high bytes are zero.
// Returns false as soon as any byte fails to read.
template <typename INPUT, typename IntType>
bool read_bigendian_number(INPUT* fp, IntType* value, size_t length) {
  // IntType must be unsigned, or sign extension would corrupt the shifts.
  static_assert(static_cast<IntType>(-1) > static_cast<IntType>(0),
                "serializing int requires an unsigned type");
  *value = 0;
  for (size_t i = 0; i < length; ++i) {
    unsigned char byte;
    if (!read_data(fp, &byte, sizeof(byte))) return false;
    // Big-endian: the first byte read is the most significant.
    const size_t shift = (length - 1 - i) * 8;
    *value |= static_cast<IntType>(byte) << shift;
  }
  return true;
}
190
// Writes `value` as a big-endian integer of `length` bytes to fp.  If
// `length` exceeds sizeof(IntType), the extra high bytes are written as 0.
// Returns false as soon as any byte fails to write.
template <typename OUTPUT, typename IntType>
bool write_bigendian_number(OUTPUT* fp, IntType value, size_t length) {
  // IntType must be unsigned, or sign extension would corrupt the shifts.
  static_assert(static_cast<IntType>(-1) > static_cast<IntType>(0),
                "serializing int requires an unsigned type");
  for (size_t i = 0; i < length; ++i) {
    // Big-endian: emit the most significant byte first.
    const size_t byte_index = length - 1 - i;
    unsigned char byte = 0;
    if (byte_index < sizeof(value)) {
      // Guarded so we never shift by >= the width of IntType (which is UB).
      byte = static_cast<unsigned char>((value >> (byte_index * 8)) & 255);
    }
    if (!write_data(fp, &byte, sizeof(byte))) return false;
  }
  return true;
}
206
207// If your keys and values are simple enough, you can pass this
208// serializer to serialize()/unserialize(). "Simple enough" means
209// value_type is a POD type that contains no pointers. Note,
210// however, we don't try to normalize endianness.
211// This is the type used for NopointerSerializer.
// Serializer functor for trivially-copyable value_types: reads/writes the
// raw object bytes.  No endianness normalization is performed (see the
// comment above), so serialized data is not portable across byte orders.
template <typename value_type>
struct pod_serializer {
  // Reads sizeof(value_type) bytes from fp into *value; true on success.
  template <typename INPUT>
  bool operator()(INPUT* fp, value_type* value) const {
    return read_data(fp, value, sizeof(*value));
  }

  // Writes the raw bytes of value to fp; true on success.
  template <typename OUTPUT>
  bool operator()(OUTPUT* fp, const value_type& value) const {
    return write_data(fp, &value, sizeof(value));
  }
};
224
225// Settings contains parameters for growing and shrinking the table.
226// It also packages zero-size functor (ie. hasher).
227//
228// It does some munging of the hash value in cases where we think
229// (fear) the original hash function might not be very good. In
230// particular, the default hash of pointers is the identity hash,
231// so probably all the low bits are 0. We identify when we think
232// we're hashing a pointer, and chop off the low bits. Note this
233// isn't perfect: even when the key is a pointer, we can't tell
234// for sure that the hash is the identity hash. If it's not, this
235// is needless work (and possibly, though not likely, harmful).
236
template <typename Key, typename HashFunc, typename SizeType,
          int HT_MIN_BUCKETS>
class sh_hashtable_settings : public HashFunc {
 public:
  typedef Key key_type;
  typedef HashFunc hasher;
  typedef SizeType size_type;
  // A hasher that advertises a transparent_key_equal must also mark it
  // is_transparent; otherwise heterogeneous lookup would be claimed but
  // not actually usable.
  static_assert(!has_transparent_key_equal<HashFunc>::value || has_is_transparent<HashFunc, void>::value,
                "hash provided non-transparent key_equal");

 public:
  // hf: the hash functor, stored via the HashFunc base class (so a
  //     stateless hasher typically adds no storage — empty-base optimization).
  // ht_occupancy_flt: fraction full at which the table should grow.
  // ht_empty_flt: fraction occupied below which the table should shrink.
  // Thresholds start at 0; callers invoke reset_thresholds() once the
  // bucket count is known.
  sh_hashtable_settings(const hasher& hf, const float ht_occupancy_flt,
                        const float ht_empty_flt)
      : hasher(hf),
        enlarge_threshold_(0),
        shrink_threshold_(0),
        consider_shrink_(false),
        use_empty_(false),
        use_deleted_(false),
        num_ht_copies_(0) {
    set_enlarge_factor(ht_occupancy_flt);
    set_shrink_factor(ht_empty_flt);
  }

  // Hashes v (any type the hasher accepts), munging the result when the
  // table's key type is a pointer (see hash_munger below).
  template<typename K>
  size_type hash(const K& v) const {
    // We munge the hash value when we don't trust hasher::operator().
    return hash_munger<Key>::MungedHash(hasher::operator()(v));
  }

  // Resize-policy accessors.  Note: setting a factor does NOT recompute the
  // corresponding threshold; call reset_thresholds() afterwards.
  float enlarge_factor() const { return enlarge_factor_; }
  void set_enlarge_factor(float f) { enlarge_factor_ = f; }
  float shrink_factor() const { return shrink_factor_; }
  void set_shrink_factor(float f) { shrink_factor_ = f; }

  size_type enlarge_threshold() const { return enlarge_threshold_; }
  void set_enlarge_threshold(size_type t) { enlarge_threshold_ = t; }
  size_type shrink_threshold() const { return shrink_threshold_; }
  void set_shrink_threshold(size_type t) { shrink_threshold_ = t; }

  // Element count at which a table of x buckets counts as "too full".
  size_type enlarge_size(size_type x) const {
    return static_cast<size_type>(x * enlarge_factor_);
  }
  // Element count below which a table of x buckets counts as "too empty".
  size_type shrink_size(size_type x) const {
    return static_cast<size_type>(x * shrink_factor_);
  }

  bool consider_shrink() const { return consider_shrink_; }
  void set_consider_shrink(bool t) { consider_shrink_ = t; }

  bool use_empty() const { return use_empty_; }
  void set_use_empty(bool t) { use_empty_ = t; }

  bool use_deleted() const { return use_deleted_; }
  void set_use_deleted(bool t) { use_deleted_ = t; }

  size_type num_ht_copies() const {
    return static_cast<size_type>(num_ht_copies_);
  }
  void inc_num_ht_copies() { ++num_ht_copies_; }

  // Reset the enlarge and shrink thresholds for a table of num_buckets.
  void reset_thresholds(size_type num_buckets) {
    set_enlarge_threshold(enlarge_size(num_buckets));
    set_shrink_threshold(shrink_size(num_buckets));
    // whatever caused us to reset already considered
    set_consider_shrink(false);
  }

  // Caller is responsible for calling reset_thresholds right after
  // set_resizing_parameters.
  void set_resizing_parameters(float shrink, float grow) {
    assert(shrink >= 0.0);
    assert(grow <= 1.0);
    // Keep shrink well below grow; a table hovering near a shared boundary
    // would otherwise grow and shrink repeatedly.
    if (shrink > grow / 2.0f)
      shrink = grow / 2.0f; // otherwise we thrash hashtable size
    set_shrink_factor(shrink);
    set_enlarge_factor(grow);
  }

  // This is the smallest size a hashtable can be without being too crowded
  // If you like, you can give a min #buckets as well as a min #elts.
  // Doubles from HT_MIN_BUCKETS until both constraints are satisfied;
  // throws std::length_error if doubling would overflow size_type.
  size_type min_buckets(size_type num_elts, size_type min_buckets_wanted) {
    float enlarge = enlarge_factor();
    size_type sz = HT_MIN_BUCKETS; // min buckets allowed
    while (sz < min_buckets_wanted ||
           num_elts >= static_cast<size_type>(sz * enlarge)) {
      // This just prevents overflowing size_type, since sz can exceed
      // max_size() here.
      if (static_cast<size_type>(sz * 2) < sz) {
        throw std::length_error("resize overflow"); // protect against overflow
      }
      sz *= 2;
    }
    return sz;
  }

 private:
  // Default munger: pass the hash value through unchanged.
  template <class HashKey>
  class hash_munger {
   public:
    static size_t MungedHash(size_t hash) { return hash; }
  };
  // This matches when the hashtable key is a pointer.  An identity-style
  // pointer hash has known-zero low bits (alignment), so divide them away
  // (see the class-level comment above).
  template <class HashKey>
  class hash_munger<HashKey*> {
   public:
    static size_t MungedHash(size_t hash) {
      // TODO(csilvers): consider rotating instead:
      // static const int shift = (sizeof(void *) == 4) ? 2 : 3;
      // return (hash << (sizeof(hash) * 8) - shift)) | (hash >>
      // shift);
      // This matters if we ever change sparse/dense_hash_* to compare
      // hashes before comparing actual values. It's speedy on x86.
      return hash / sizeof(void*); // get rid of known-0 bits
    }
  };

  size_type enlarge_threshold_; // table.size() * enlarge_factor
  size_type shrink_threshold_;  // table.size() * shrink_factor
  float enlarge_factor_;        // how full before resize
  float shrink_factor_;         // how empty before resize
  // consider_shrink=true if we should try to shrink before next insert
  bool consider_shrink_;
  bool use_empty_;   // used only by densehashtable, not sparsehashtable
  bool use_deleted_; // false until delkey has been set
  // num_ht_copies is a counter incremented every Copy/Move
  unsigned int num_ht_copies_;
};
366
367} // namespace sparsehash_internal
368} // namespace google
369