1// Copyright (c) 2010, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8// * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14// * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30// ---
31//
32// Provides classes shared by both sparse and dense hashtable.
33//
34// sh_hashtable_settings has parameters for growing and shrinking
35// a hashtable. It also packages zero-size functor (ie. hasher).
36//
37// Other functions and classes provide common code for serializing
38// and deserializing hashtables to a stream (such as a FILE*).
39
40#pragma once
41
#include <cassert>
#include <cstddef>     // for size_t
#include <cstdio>      // for FILE, fread, fwrite
#include <iosfwd>      // forward declarations of istream/ostream
#include <stdexcept>   // for length_error
#include <type_traits> // for false_type/true_type (was relied on transitively)
47
48namespace google {
49namespace sparsehash_internal {
50
// C++14-style void_t backport: maps any well-formed pack of types to void.
// Used below to form SFINAE-friendly detection traits.
template <typename... Ts>
struct make_void {
  using type = void;
};
template <typename... Ts>
using void_t = typename make_void<Ts...>::type;
53
// Trait: true iff HashFcn::transparent_key_equal::is_transparent names a
// type (the marker that enables heterogeneous lookup).  Primary template:
// false unless the partial specialization below matches.
template <class HashFcn, class = void>
struct has_is_transparent : std::false_type {};

// Matches only when the nested name is well-formed (SFINAE via void_t).
template <class HashFcn>
struct has_is_transparent<HashFcn, void_t<typename HashFcn::transparent_key_equal::is_transparent>> : std::true_type {};
59
// Trait: true iff HashFcn declares a nested transparent_key_equal type.
// The middle parameter is not used by the trait itself; it only gives the
// partial specialization a second slot to match against.
template <class HashFcn, class = void, class = void>
struct has_transparent_key_equal : std::false_type {};

template <class HashFcn, class Key>
struct has_transparent_key_equal<HashFcn, Key, void_t<typename HashFcn::transparent_key_equal>> : std::true_type {};
65
// Selects the key-equality functor the hashtable should use: EqualKey by
// default, but HashFcn::transparent_key_equal when the hasher provides one
// (allowing lookups without constructing a key_type).
template <class HashFcn, class EqualKey, bool = has_transparent_key_equal<HashFcn>::value>
struct key_equal_chosen {
  using type = EqualKey;
};

// Specialization taken when the hasher supplies a transparent key_equal.
template <class HashFcn, class EqualKey>
struct key_equal_chosen<HashFcn, EqualKey, true> {
  using type = typename HashFcn::transparent_key_equal;
};
75
// Adaptor methods for reading/writing data from an INPUT or OUTPUT
77// variable passed to serialize() or unserialize(). For now we
78// have implemented INPUT/OUTPUT for FILE*, istream*/ostream* (note
79// they are pointers, unlike typical use), or else a pointer to
80// something that supports a Read()/Write() method.
81//
82// For technical reasons, we implement read_data/write_data in two
83// stages. The actual work is done in *_data_internal, which takes
84// the stream argument twice: once as a template type, and once with
85// normal type information. (We only use the second version.) We do
86// this because of how C++ picks what function overload to use. If we
87// implemented this the naive way:
88// bool read_data(istream* is, const void* data, size_t length);
89// template<typename T> read_data(T* fp, const void* data, size_t length);
90// C++ would prefer the second version for every stream type except
91// istream. However, we want C++ to prefer the first version for
92// streams that are *subclasses* of istream, such as istringstream.
93// This is not possible given the way template types are resolved. So
94// we split the stream argument in two, one of which is templated and
95// one of which is not. The specialized functions (like the istream
96// version above) ignore the template arg and use the second, 'type'
97// arg, getting subclass matching as normal. The 'catch-all'
98// functions (the second version above) use the template arg to deduce
99// the type, and use a second, void* arg to achieve the desired
100// 'catch-all' semantics.
101
102// ----- low-level I/O for FILE* ----
103
// Reads exactly `length` bytes from `fp` into `data`.  Returns true iff the
// full read succeeded.  The first (templated) parameter exists only for the
// overload-dispatch scheme described above and is ignored.
template <typename Ignored>
inline bool read_data_internal(Ignored*, FILE* fp, void* data, size_t length) {
  // A zero-byte read trivially succeeds.  Without this guard it would be
  // reported as failure: the C standard makes fread() return 0 (not 1)
  // whenever size or nmemb is 0.
  if (length == 0) return true;
  return fread(data, length, 1, fp) == 1;
}
108
// Writes exactly `length` bytes from `data` to `fp`.  Returns true iff the
// full write succeeded.  The first (templated) parameter exists only for the
// overload-dispatch scheme described above and is ignored.
template <typename Ignored>
inline bool write_data_internal(Ignored*, FILE* fp, const void* data,
                                size_t length) {
  // A zero-byte write trivially succeeds.  Without this guard it would be
  // reported as failure: the C standard makes fwrite() return 0 (not 1)
  // whenever size or nmemb is 0.
  if (length == 0) return true;
  return fwrite(data, length, 1, fp) == 1;
}
114
115// ----- low-level I/O for iostream ----
116
117// We want the caller to be responsible for #including <iostream>, not
118// us, because iostream is a big header! According to the standard,
119// it's only legal to delay the instantiation the way we want to if
120// the istream/ostream is a template type. So we jump through hoops.
// Reads `length` bytes from the (istream-like) stream into `data`.
// Templated on the stream type so this header never needs to include
// <iostream>; the caller provides the concrete type.
template <typename ISTREAM>
inline bool read_data_internal_for_istream(ISTREAM* fp, void* data,
                                           size_t length) {
  fp->read(static_cast<char*>(data), length);
  // good() is false if the read came up short (eof/fail) or the stream is bad.
  return fp->good();
}
// Dispatch overload for std::istream and its subclasses: preferred over the
// templated FILE*/catch-all versions because its second parameter names the
// concrete stream type (see the overload-resolution discussion above).
template <typename Ignored>
inline bool read_data_internal(Ignored*, std::istream* fp, void* data,
                               size_t length) {
  return read_data_internal_for_istream(fp, data, length);
}
131
// Writes `length` bytes from `data` to the (ostream-like) stream.
// Templated on the stream type so this header never needs to include
// <iostream>; the caller provides the concrete type.
template <typename OSTREAM>
inline bool write_data_internal_for_ostream(OSTREAM* fp, const void* data,
                                            size_t length) {
  fp->write(static_cast<const char*>(data), length);
  // good() is false if the write failed or the stream is in a bad state.
  return fp->good();
}
// Dispatch overload for std::ostream and its subclasses: preferred over the
// templated FILE*/catch-all versions because its second parameter names the
// concrete stream type (see the overload-resolution discussion above).
template <typename Ignored>
inline bool write_data_internal(Ignored*, std::ostream* fp, const void* data,
                                size_t length) {
  return write_data_internal_for_ostream(fp, data, length);
}
142
143// ----- low-level I/O for custom streams ----
144
145// The INPUT type needs to support a Read() method that takes a
146// buffer and a length and returns the number of bytes read.
// Catch-all overload for custom input streams.  INPUT must provide a
// Read(buffer, length) method returning the number of bytes read.  The
// unnamed void* parameter exists only so this overload loses to the typed
// (FILE*/istream*) ones above during overload resolution.
template <typename INPUT>
inline bool read_data_internal(INPUT* fp, void*, void* data, size_t length) {
  const size_t bytes_read = static_cast<size_t>(fp->Read(data, length));
  return bytes_read == length;
}
151
152// The OUTPUT type needs to support a Write() operation that takes
153// a buffer and a length and returns the number of bytes written.
// Catch-all overload for custom output streams.  OUTPUT must provide a
// Write(buffer, length) method returning the number of bytes written.  The
// unnamed void* parameter exists only so this overload loses to the typed
// (FILE*/ostream*) ones above during overload resolution.
template <typename OUTPUT>
inline bool write_data_internal(OUTPUT* fp, void*, const void* data,
                                size_t length) {
  const size_t bytes_written = static_cast<size_t>(fp->Write(data, length));
  return bytes_written == length;
}
159
160// ----- low-level I/O: the public API ----
161
// Public entry point: reads `length` bytes from `fp` into `data`.  `fp` may
// be a FILE*, an istream* (caller must #include <iostream>), or a pointer to
// a class with a Read(buffer, length) method.  Returns true iff all bytes
// were read.
template <typename INPUT>
inline bool read_data(INPUT* fp, void* data, size_t length) {
  // fp is passed twice: once for template deduction (catch-all) and once
  // with its static type, so stream subclasses bind to the typed overloads.
  return read_data_internal(fp, fp, data, length);
}
166
// Public entry point: writes `length` bytes from `data` to `fp`.  `fp` may
// be a FILE*, an ostream* (caller must #include <iostream>), or a pointer to
// a class with a Write(buffer, length) method.  Returns true iff all bytes
// were written.
template <typename OUTPUT>
inline bool write_data(OUTPUT* fp, const void* data, size_t length) {
  // fp is passed twice: once for template deduction (catch-all) and once
  // with its static type, so stream subclasses bind to the typed overloads.
  return write_data_internal(fp, fp, data, length);
}
171
172// Uses read_data() and write_data() to read/write an integer.
173// length is the number of bytes to read/write (which may differ
174// from sizeof(IntType), allowing us to save on a 32-bit system
175// and load on a 64-bit system). Excess bytes are taken to be 0.
176// INPUT and OUTPUT must match legal inputs to read/write_data (above).
// Reads a big-endian integer of `length` bytes from fp into *value.
// `length` may differ from sizeof(IntType); missing high bytes are zero.
// Returns false as soon as any byte fails to read.
template <typename INPUT, typename IntType>
bool read_bigendian_number(INPUT* fp, IntType* value, size_t length) {
  // IntType must be unsigned, or sign extension would corrupt the shifts.
  static_assert(static_cast<IntType>(-1) > static_cast<IntType>(0),
                "serializing int requires an unsigned type");
  *value = 0;
  for (size_t i = 0; i < length; ++i) {
    unsigned char byte;
    if (!read_data(fp, &byte, sizeof(byte))) return false;
    // Big-endian: the first byte read is the most significant.
    const size_t shift = (length - 1 - i) * 8;
    *value |= static_cast<IntType>(byte) << shift;
  }
  return true;
}
190
// Writes `value` as a big-endian integer of `length` bytes to fp.  If
// `length` exceeds sizeof(IntType), the extra high bytes are written as 0.
// Returns false as soon as any byte fails to write.
template <typename OUTPUT, typename IntType>
bool write_bigendian_number(OUTPUT* fp, IntType value, size_t length) {
  // IntType must be unsigned, or sign extension would corrupt the shifts.
  static_assert(static_cast<IntType>(-1) > static_cast<IntType>(0),
                "serializing int requires an unsigned type");
  for (size_t i = 0; i < length; ++i) {
    // Big-endian: emit the most significant byte first.
    const size_t byte_index = length - 1 - i;
    unsigned char byte = 0;
    if (byte_index < sizeof(value)) {
      // Guarded so we never shift by >= the width of IntType (which is UB).
      byte = static_cast<unsigned char>((value >> (byte_index * 8)) & 255);
    }
    if (!write_data(fp, &byte, sizeof(byte))) return false;
  }
  return true;
}
206
207// If your keys and values are simple enough, you can pass this
208// serializer to serialize()/unserialize(). "Simple enough" means
209// value_type is a POD type that contains no pointers. Note,
210// however, we don't try to normalize endianness.
211// This is the type used for NopointerSerializer.
// Serializer functor for trivially-copyable value_types: reads/writes the
// raw object bytes.  No endianness normalization is performed (see the
// comment above), so serialized data is not portable across byte orders.
template <typename value_type>
struct pod_serializer {
  // Reads sizeof(value_type) bytes from fp into *value; true on success.
  template <typename INPUT>
  bool operator()(INPUT* fp, value_type* value) const {
    return read_data(fp, value, sizeof(*value));
  }

  // Writes the raw bytes of value to fp; true on success.
  template <typename OUTPUT>
  bool operator()(OUTPUT* fp, const value_type& value) const {
    return write_data(fp, &value, sizeof(value));
  }
};
224
225// Settings contains parameters for growing and shrinking the table.
226// It also packages zero-size functor (ie. hasher).
227//
228// It does some munging of the hash value in cases where we think
229// (fear) the original hash function might not be very good. In
230// particular, the default hash of pointers is the identity hash,
231// so probably all the low bits are 0. We identify when we think
232// we're hashing a pointer, and chop off the low bits. Note this
233// isn't perfect: even when the key is a pointer, we can't tell
234// for sure that the hash is the identity hash. If it's not, this
235// is needless work (and possibly, though not likely, harmful).
236
template <typename Key, typename HashFunc, typename SizeType,
          int HT_MIN_BUCKETS>
class sh_hashtable_settings : public HashFunc {
 public:
  typedef Key key_type;
  typedef HashFunc hasher;
  typedef SizeType size_type;
  // A hasher that advertises a transparent_key_equal must also mark it
  // is_transparent; otherwise heterogeneous lookup would be claimed but
  // not actually usable.
  static_assert(!has_transparent_key_equal<HashFunc>::value || has_is_transparent<HashFunc, void>::value,
                "hash provided non-transparent key_equal");

 public:
  // hf: the hash functor, stored via the HashFunc base class (so a
  //     stateless hasher typically adds no storage — empty-base optimization).
  // ht_occupancy_flt: fraction full at which the table should grow.
  // ht_empty_flt: fraction occupied below which the table should shrink.
  // Thresholds start at 0; callers invoke reset_thresholds() once the
  // bucket count is known.
  sh_hashtable_settings(const hasher& hf, const float ht_occupancy_flt,
                        const float ht_empty_flt)
      : hasher(hf),
        enlarge_threshold_(0),
        shrink_threshold_(0),
        consider_shrink_(false),
        use_empty_(false),
        use_deleted_(false),
        num_ht_copies_(0) {
    set_enlarge_factor(ht_occupancy_flt);
    set_shrink_factor(ht_empty_flt);
  }

  // Hashes v (any type the hasher accepts), munging the result when the
  // table's key type is a pointer (see hash_munger below).
  template<typename K>
  size_type hash(const K& v) const {
    // We munge the hash value when we don't trust hasher::operator().
    return hash_munger<Key>::MungedHash(hasher::operator()(v));
  }

  // Resize-policy accessors.  Note: setting a factor does NOT recompute the
  // corresponding threshold; call reset_thresholds() afterwards.
  float enlarge_factor() const { return enlarge_factor_; }
  void set_enlarge_factor(float f) { enlarge_factor_ = f; }
  float shrink_factor() const { return shrink_factor_; }
  void set_shrink_factor(float f) { shrink_factor_ = f; }

  size_type enlarge_threshold() const { return enlarge_threshold_; }
  void set_enlarge_threshold(size_type t) { enlarge_threshold_ = t; }
  size_type shrink_threshold() const { return shrink_threshold_; }
  void set_shrink_threshold(size_type t) { shrink_threshold_ = t; }

  // Element count at which a table of x buckets counts as "too full".
  size_type enlarge_size(size_type x) const {
    return static_cast<size_type>(x * enlarge_factor_);
  }
  // Element count below which a table of x buckets counts as "too empty".
  size_type shrink_size(size_type x) const {
    return static_cast<size_type>(x * shrink_factor_);
  }

  bool consider_shrink() const { return consider_shrink_; }
  void set_consider_shrink(bool t) { consider_shrink_ = t; }

  bool use_empty() const { return use_empty_; }
  void set_use_empty(bool t) { use_empty_ = t; }

  bool use_deleted() const { return use_deleted_; }
  void set_use_deleted(bool t) { use_deleted_ = t; }

  size_type num_ht_copies() const {
    return static_cast<size_type>(num_ht_copies_);
  }
  void inc_num_ht_copies() { ++num_ht_copies_; }

  // Reset the enlarge and shrink thresholds for a table of num_buckets.
  void reset_thresholds(size_type num_buckets) {
    set_enlarge_threshold(enlarge_size(num_buckets));
    set_shrink_threshold(shrink_size(num_buckets));
    // whatever caused us to reset already considered
    set_consider_shrink(false);
  }

  // Caller is responsible for calling reset_thresholds right after
  // set_resizing_parameters.
  void set_resizing_parameters(float shrink, float grow) {
    assert(shrink >= 0.0);
    assert(grow <= 1.0);
    // Keep shrink well below grow; a table hovering near a shared boundary
    // would otherwise grow and shrink repeatedly.
    if (shrink > grow / 2.0f)
      shrink = grow / 2.0f; // otherwise we thrash hashtable size
    set_shrink_factor(shrink);
    set_enlarge_factor(grow);
  }

  // This is the smallest size a hashtable can be without being too crowded
  // If you like, you can give a min #buckets as well as a min #elts.
  // Doubles from HT_MIN_BUCKETS until both constraints are satisfied;
  // throws std::length_error if doubling would overflow size_type.
  size_type min_buckets(size_type num_elts, size_type min_buckets_wanted) {
    float enlarge = enlarge_factor();
    size_type sz = HT_MIN_BUCKETS; // min buckets allowed
    while (sz < min_buckets_wanted ||
           num_elts >= static_cast<size_type>(sz * enlarge)) {
      // This just prevents overflowing size_type, since sz can exceed
      // max_size() here.
      if (static_cast<size_type>(sz * 2) < sz) {
        throw std::length_error("resize overflow"); // protect against overflow
      }
      sz *= 2;
    }
    return sz;
  }

 private:
  // Default munger: pass the hash value through unchanged.
  template <class HashKey>
  class hash_munger {
   public:
    static size_t MungedHash(size_t hash) { return hash; }
  };
  // This matches when the hashtable key is a pointer.  An identity-style
  // pointer hash has known-zero low bits (alignment), so divide them away
  // (see the class-level comment above).
  template <class HashKey>
  class hash_munger<HashKey*> {
   public:
    static size_t MungedHash(size_t hash) {
      // TODO(csilvers): consider rotating instead:
      // static const int shift = (sizeof(void *) == 4) ? 2 : 3;
      // return (hash << (sizeof(hash) * 8) - shift)) | (hash >>
      // shift);
      // This matters if we ever change sparse/dense_hash_* to compare
      // hashes before comparing actual values. It's speedy on x86.
      return hash / sizeof(void*); // get rid of known-0 bits
    }
  };

  size_type enlarge_threshold_; // table.size() * enlarge_factor
  size_type shrink_threshold_;  // table.size() * shrink_factor
  float enlarge_factor_;        // how full before resize
  float shrink_factor_;         // how empty before resize
  // consider_shrink=true if we should try to shrink before next insert
  bool consider_shrink_;
  bool use_empty_;   // used only by densehashtable, not sparsehashtable
  bool use_deleted_; // false until delkey has been set
  // num_ht_copies is a counter incremented every Copy/Move
  unsigned int num_ht_copies_;
};
366
367} // namespace sparsehash_internal
368} // namespace google
369