// Copyright (c) 2010, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
//
// Provides classes shared by both sparse and dense hashtable.
//
// sh_hashtable_settings has parameters for growing and shrinking
// a hashtable.  It also packages the zero-size functor (i.e. the hasher).
//
// Other functions and classes provide common code for serializing
// and deserializing hashtables to a stream (such as a FILE*).

#pragma once

#include <cassert>
#include <cstddef>  // for size_t
#include <cstdio>
#include <iosfwd>
#include <stdexcept>    // for std::length_error
#include <type_traits>  // for std::true_type and std::false_type

namespace google {
namespace sparsehash_internal {

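// Pre-C++17 equivalent of std::void_t.  The indirection through
// make_void works around CWG issue 1558, under which some compilers
// ignored unused template arguments of an alias template and thereby
// defeated the intended SFINAE.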
template <typename... Ts> struct make_void { typedef void type; };
template <typename... Ts> using void_t = typename make_void<Ts...>::type;

template <class HashFcn, class = void>
struct has_is_transparent : std::false_type {};

template <class HashFcn>
struct has_is_transparent<
    HashFcn, void_t<typename HashFcn::transparent_key_equal::is_transparent>>
    : std::true_type {};

template <class HashFcn, class = void, class = void>
struct has_transparent_key_equal : std::false_type {};

template <class HashFcn, class Key>
struct has_transparent_key_equal<
    HashFcn, Key, void_t<typename HashFcn::transparent_key_equal>>
    : std::true_type {};

template <class HashFcn, class EqualKey,
          bool = has_transparent_key_equal<HashFcn>::value>
struct key_equal_chosen {
  using type = EqualKey;
};

template <class HashFcn, class EqualKey>
struct key_equal_chosen<HashFcn, EqualKey, true> {
  using type = typename HashFcn::transparent_key_equal;
};
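
// For illustration, a hypothetical hasher opting in to heterogeneous
// lookup (std::equal_to<> provides is_transparent since C++14):
//
//   struct StringHash {
//     using transparent_key_equal = std::equal_to<>;
//     size_t operator()(std::string_view s) const;
//   };
//
// For such a hasher, key_equal_chosen<StringHash, EqualKey>::type is
// std::equal_to<> rather than the user-supplied EqualKey.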

// Adaptor methods for reading/writing data from an INPUT or OUTPUT
// variable passed to serialize() or unserialize().  For now we
// have implemented INPUT/OUTPUT for FILE*, istream*/ostream* (note
// they are pointers, unlike typical use), or else a pointer to
// something that supports a Read()/Write() method.
//
// For technical reasons, we implement read_data/write_data in two
// stages.  The actual work is done in *_data_internal, which takes
// the stream argument twice: once as a template type, and once with
// normal type information.  (We only use the second version.)  We do
// this because of how C++ picks what function overload to use.  If we
// implemented this the naive way:
//    bool read_data(std::istream* is, void* data, size_t length);
//    template<typename T> bool read_data(T* fp, void* data, size_t length);
// C++ would prefer the second version for every stream type except
// istream.  However, we want C++ to prefer the first version for
// streams that are *subclasses* of istream, such as istringstream.
// This is not possible given the way template types are resolved.  So
// we split the stream argument in two, one of which is templated and
// one of which is not.  The specialized functions (like the istream
// version above) ignore the template arg and use the second, 'type'
// arg, getting subclass matching as normal.  The 'catch-all'
// functions (the second version above) use the template arg to deduce
// the type, and use a second, void* arg to achieve the desired
// 'catch-all' semantics.
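//
// A sketch of how the dispatch then plays out (hypothetical call sites,
// not part of this header):
//
//   std::istringstream iss(s);
//   read_data(&iss, buf, n);     // picks the std::istream* overload:
//                                // istringstream* -> istream* (derived-
//                                // to-base) outranks conversion to void*
//   read_data(file, buf, n);     // FILE* is an exact match
//   read_data(&my_src, buf, n);  // anything else falls through to the
//                                // catch-all, which calls my_src.Read()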

// ----- low-level I/O for FILE* ----

template <typename Ignored>
inline bool read_data_internal(Ignored*, FILE* fp, void* data, size_t length) {
  return fread(data, length, 1, fp) == 1;
}

template <typename Ignored>
inline bool write_data_internal(Ignored*, FILE* fp, const void* data,
                                size_t length) {
  return fwrite(data, length, 1, fp) == 1;
}

// ----- low-level I/O for iostream ----

// We want the caller to be responsible for #including <iostream>, not
// us, because iostream is a big header!  According to the standard,
// it's only legal to delay the instantiation the way we want to if
// the istream/ostream is a template type.  So we jump through hoops.
template <typename ISTREAM>
inline bool read_data_internal_for_istream(ISTREAM* fp, void* data,
                                           size_t length) {
  return fp->read(reinterpret_cast<char*>(data), length).good();
}
template <typename Ignored>
inline bool read_data_internal(Ignored*, std::istream* fp, void* data,
                               size_t length) {
  return read_data_internal_for_istream(fp, data, length);
}

template <typename OSTREAM>
inline bool write_data_internal_for_ostream(OSTREAM* fp, const void* data,
                                            size_t length) {
  return fp->write(reinterpret_cast<const char*>(data), length).good();
}
template <typename Ignored>
inline bool write_data_internal(Ignored*, std::ostream* fp, const void* data,
                                size_t length) {
  return write_data_internal_for_ostream(fp, data, length);
}

// ----- low-level I/O for custom streams ----

// The INPUT type needs to support a Read() method that takes a
// buffer and a length and returns the number of bytes read.
template <typename INPUT>
inline bool read_data_internal(INPUT* fp, void*, void* data, size_t length) {
  return static_cast<size_t>(fp->Read(data, length)) == length;
}

// The OUTPUT type needs to support a Write() operation that takes
// a buffer and a length and returns the number of bytes written.
template <typename OUTPUT>
inline bool write_data_internal(OUTPUT* fp, void*, const void* data,
                                size_t length) {
  return static_cast<size_t>(fp->Write(data, length)) == length;
}
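
// For example, a minimal in-memory source usable with the catch-all
// overloads might look like this (a sketch; MemInput is hypothetical,
// not part of the library):
//
//   struct MemInput {
//     const unsigned char* pos;
//     size_t left;
//     size_t Read(void* out, size_t n) {  // returns bytes actually read
//       if (n > left) n = left;
//       memcpy(out, pos, n);
//       pos += n;
//       left -= n;
//       return n;
//     }
//   };
//
// read_data(&src, buf, n) on a MemInput then resolves to the templated
// catch-all above, since MemInput* converts to nothing better than void*.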

// ----- low-level I/O: the public API ----

template <typename INPUT>
inline bool read_data(INPUT* fp, void* data, size_t length) {
  return read_data_internal(fp, fp, data, length);
}

template <typename OUTPUT>
inline bool write_data(OUTPUT* fp, const void* data, size_t length) {
  return write_data_internal(fp, fp, data, length);
}
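
// Typical usage (a hypothetical sketch; file name and value are arbitrary):
//
//   FILE* fp = fopen("table.bin", "wb");
//   const uint32_t magic = 0x12345678;
//   write_data(fp, &magic, sizeof(magic));   // FILE* overload
//
//   std::ifstream in("table.bin", std::ios::binary);
//   uint32_t check;
//   read_data(&in, &check, sizeof(check));   // std::istream* overload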

// Uses read_data() and write_data() to read/write an integer.
// length is the number of bytes to read/write (which may differ
// from sizeof(IntType), allowing us to save on a 32-bit system
// and load on a 64-bit system).  Excess bytes are taken to be 0.
// INPUT and OUTPUT must match legal inputs to read/write_data (above).
template <typename INPUT, typename IntType>
bool read_bigendian_number(INPUT* fp, IntType* value, size_t length) {
  *value = 0;
  unsigned char byte;
  // We require IntType to be unsigned or else the shifting gets all screwy.
  static_assert(static_cast<IntType>(-1) > static_cast<IntType>(0),
                "serializing int requires an unsigned type");
  for (size_t i = 0; i < length; ++i) {
    if (!read_data(fp, &byte, sizeof(byte))) return false;
    *value |= static_cast<IntType>(byte) << ((length - 1 - i) * 8);
  }
  return true;
}

template <typename OUTPUT, typename IntType>
bool write_bigendian_number(OUTPUT* fp, IntType value, size_t length) {
  unsigned char byte;
  // We require IntType to be unsigned or else the shifting gets all screwy.
  static_assert(static_cast<IntType>(-1) > static_cast<IntType>(0),
                "serializing int requires an unsigned type");
  for (size_t i = 0; i < length; ++i) {
    byte = (sizeof(value) <= length - 1 - i)
               ? 0
               : static_cast<unsigned char>((value >> ((length - 1 - i) * 8)) &
                                            255);
    if (!write_data(fp, &byte, sizeof(byte))) return false;
  }
  return true;
}
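
// Worked example (illustrative): write_bigendian_number(fp, 0x0102, 4)
// emits the bytes 00 00 01 02, most significant first; the two excess
// high-order bytes are zero.  read_bigendian_number of those four bytes
// reassembles 0x0102 whether the reader's IntType is 32 or 64 bits wide.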

// If your keys and values are simple enough, you can pass this
// serializer to serialize()/unserialize().  "Simple enough" means
// value_type is a POD type that contains no pointers.  Note,
// however, we don't try to normalize endianness.
// This is the type used for NopointerSerializer.
template <typename value_type>
struct pod_serializer {
  template <typename INPUT>
  bool operator()(INPUT* fp, value_type* value) const {
    return read_data(fp, value, sizeof(*value));
  }

  template <typename OUTPUT>
  bool operator()(OUTPUT* fp, const value_type& value) const {
    return write_data(fp, &value, sizeof(value));
  }
};
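
// Sketch of direct use (names hypothetical):
//
//   struct Point { int x, y; };   // POD, contains no pointers
//   pod_serializer<Point> ser;
//   Point p = {3, 4};
//   ser(fp, p);                   // writes sizeof(Point) raw bytes
//   Point q;
//   ser(fp2, &q);                 // reads them back into q
//
// Since the bytes are copied verbatim, the resulting file is not
// portable across endianness or struct-layout differences.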

// Settings contains parameters for growing and shrinking the table.
// It also packages the zero-size functor (i.e. the hasher).
//
// It does some munging of the hash value in cases where we think
// (fear) the original hash function might not be very good.  In
// particular, the default hash of pointers is the identity hash,
// so probably all the low bits are 0.  We identify when we think
// we're hashing a pointer, and chop off the low bits.  Note this
// isn't perfect: even when the key is a pointer, we can't tell
// for sure that the hash is the identity hash.  If it's not, this
// is needless work (and possibly, though not likely, harmful).
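//
// For example, with 8-byte pointers an identity hash of an aligned
// pointer such as 0x7ffe10 always has its low three bits zero;
// MungedHash below divides by sizeof(void*), giving 0x7ffe10 / 8 ==
// 0xfffc2, so bucket selection no longer depends on the known-zero bits.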

template <typename Key, typename HashFunc, typename SizeType,
          int HT_MIN_BUCKETS>
class sh_hashtable_settings : public HashFunc {
 public:
  typedef Key key_type;
  typedef HashFunc hasher;
  typedef SizeType size_type;
  static_assert(!has_transparent_key_equal<HashFunc>::value ||
                    has_is_transparent<HashFunc>::value,
                "hash provided non-transparent key_equal");

 public:
  sh_hashtable_settings(const hasher& hf, const float ht_occupancy_flt,
                        const float ht_empty_flt)
      : hasher(hf),
        enlarge_threshold_(0),
        shrink_threshold_(0),
        consider_shrink_(false),
        use_empty_(false),
        use_deleted_(false),
        num_ht_copies_(0) {
    set_enlarge_factor(ht_occupancy_flt);
    set_shrink_factor(ht_empty_flt);
  }

  template <typename K>
  size_type hash(const K& v) const {
    // We munge the hash value when we don't trust hasher::operator().
    return hash_munger<Key>::MungedHash(hasher::operator()(v));
  }

  float enlarge_factor() const { return enlarge_factor_; }
  void set_enlarge_factor(float f) { enlarge_factor_ = f; }
  float shrink_factor() const { return shrink_factor_; }
  void set_shrink_factor(float f) { shrink_factor_ = f; }

  size_type enlarge_threshold() const { return enlarge_threshold_; }
  void set_enlarge_threshold(size_type t) { enlarge_threshold_ = t; }
  size_type shrink_threshold() const { return shrink_threshold_; }
  void set_shrink_threshold(size_type t) { shrink_threshold_ = t; }

  size_type enlarge_size(size_type x) const {
    return static_cast<size_type>(x * enlarge_factor_);
  }
  size_type shrink_size(size_type x) const {
    return static_cast<size_type>(x * shrink_factor_);
  }

  bool consider_shrink() const { return consider_shrink_; }
  void set_consider_shrink(bool t) { consider_shrink_ = t; }

  bool use_empty() const { return use_empty_; }
  void set_use_empty(bool t) { use_empty_ = t; }

  bool use_deleted() const { return use_deleted_; }
  void set_use_deleted(bool t) { use_deleted_ = t; }

  size_type num_ht_copies() const {
    return static_cast<size_type>(num_ht_copies_);
  }
  void inc_num_ht_copies() { ++num_ht_copies_; }

  // Reset the enlarge and shrink thresholds
  void reset_thresholds(size_type num_buckets) {
    set_enlarge_threshold(enlarge_size(num_buckets));
    set_shrink_threshold(shrink_size(num_buckets));
    // whatever caused us to reset has already considered shrinking
    set_consider_shrink(false);
  }

  // The caller is responsible for calling reset_thresholds() right
  // after set_resizing_parameters().
  void set_resizing_parameters(float shrink, float grow) {
    assert(shrink >= 0.0);
    assert(grow <= 1.0);
    if (shrink > grow / 2.0f)
      shrink = grow / 2.0f;  // otherwise we thrash hashtable size
    set_shrink_factor(shrink);
    set_enlarge_factor(grow);
  }

  // This is the smallest size a hashtable can be without being too crowded.
  // If you like, you can give a min #buckets as well as a min #elts.
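  // Worked example (illustrative): with HT_MIN_BUCKETS == 32 and
  // enlarge_factor() == 0.5, min_buckets(100, 0) doubles 32 -> 64 ->
  // 128 -> 256 and returns 256, the first size sz with 100 < sz * 0.5.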
  size_type min_buckets(size_type num_elts, size_type min_buckets_wanted) {
    float enlarge = enlarge_factor();
    size_type sz = HT_MIN_BUCKETS;  // min buckets allowed
    while (sz < min_buckets_wanted ||
           num_elts >= static_cast<size_type>(sz * enlarge)) {
      // This just prevents overflowing size_type, since sz can exceed
      // max_size() here.
      if (static_cast<size_type>(sz * 2) < sz) {
        throw std::length_error("resize overflow");  // protect against overflow
      }
      sz *= 2;
    }
    return sz;
  }

 private:
  template <class HashKey>
  class hash_munger {
   public:
    static size_t MungedHash(size_t hash) { return hash; }
  };
  // This matches when the hashtable key is a pointer.
  template <class HashKey>
  class hash_munger<HashKey*> {
   public:
    static size_t MungedHash(size_t hash) {
      // TODO(csilvers): consider rotating instead:
      //   static const int shift = (sizeof(void*) == 4) ? 2 : 3;
      //   return (hash << (sizeof(hash) * 8 - shift)) | (hash >> shift);
      // This matters if we ever change sparse/dense_hash_* to compare
      // hashes before comparing actual values.  It's speedy on x86.
      return hash / sizeof(void*);  // get rid of known-0 bits
    }
  };

  size_type enlarge_threshold_;  // table.size() * enlarge_factor
  size_type shrink_threshold_;   // table.size() * shrink_factor
  float enlarge_factor_;         // how full before resize
  float shrink_factor_;          // how empty before resize
  // consider_shrink=true if we should try to shrink before next insert
  bool consider_shrink_;
  bool use_empty_;    // used only by densehashtable, not sparsehashtable
  bool use_deleted_;  // false until delkey has been set
  // num_ht_copies is a counter incremented every Copy/Move
  unsigned int num_ht_copies_;
};

}  // namespace sparsehash_internal
}  // namespace google