hash_map_string.cpp source code [ClickHouse/dbms/src/Interpreters/tests/hash_map_string.cpp]

1	#include <iostream>
2	#include <iomanip>
3	#include <vector>
4
5	#include <unordered_map>
6
7	#include <sparsehash/dense_hash_map>
8	#include <sparsehash/sparse_hash_map>
9
10	#include <Common/Stopwatch.h>
11
12	//#define DBMS_HASH_MAP_COUNT_COLLISIONS
13	#define DBMS_HASH_MAP_DEBUG_RESIZES
14
15	#include <Core/Types.h>
16	#include <IO/ReadBufferFromFile.h>
17	#include <IO/ReadHelpers.h>
18	#include <Compression/CompressedReadBuffer.h>
19	#include <common/StringRef.h>
20	#include <Common/HashTable/HashMap.h>
21	#include <Interpreters/AggregationCommon.h>
22
23
24	struct CompactStringRef
25	{
26	union
27	{
28	const char * data_mixed = nullptr;
29	struct
30	{
31	char dummy[`6`];
32	UInt16 size;
33	};
34	};
35
36	CompactStringRef(const char * data_, size_t size_)
37	{
38	data_mixed = data_;
39	size = size_;
40	}
41
42	CompactStringRef(const unsigned char * data_, size_t size_) : CompactStringRef (reinterpret_cast<const char *>(data_), size_) {}
43	explicit CompactStringRef(const std::string & s) : CompactStringRef (s.data(), s.size()) {}
44	CompactStringRef() {}
45
46	const char * data() const { return reinterpret_cast<const char >(reinterpret_cast*<intptr_t>(data_mixed) & `0x0000FFFFFFFFFFFFULL`); }
47
48	std::string toString() const { return std::string (data(), size); }
49	};
50
51	inline bool operator==(CompactStringRef lhs, CompactStringRef rhs)
52	{
53	if (lhs.size != rhs.size)
54	return false;
55
56	const char * lhs_data = lhs.data();
57	const char * rhs_data = rhs.data();
58	for (size_t pos = lhs.size - `1`; pos < lhs.size; --pos)
59	if (lhs_data[pos] != rhs_data[pos])
60	return false;
61
62	return true;
63	}
64
65	namespace ZeroTraits
66	{
67	template <>
68	inline bool check<CompactStringRef>(CompactStringRef x) { return nullptr == x.data_mixed; }
69
70	template <>
71	inline void set<CompactStringRef>(CompactStringRef & x) { x.data_mixed = nullptr; }
72	}
73
74	template <>
75	struct DefaultHash<CompactStringRef>
76	{
77	size_t operator() (CompactStringRef x) const
78	{
79	return CityHash_v1_0_2::CityHash64(x.data(), x.size);
80	}
81	};
82
83
84	static inline UInt64 mix(UInt64 h)
85	{
86	h ^= h >> `23`;
87	h *= `0x2127599bf4325c37ULL`;
88	h ^= h >> `47`;
89	return h;
90	}
91
92	struct FastHash64
93	{
94	size_t operator() (CompactStringRef x) const
95	{
96	const char * buf = x.data();
97	size_t len = x.size;
98
99	const UInt64 m = `0x880355f21e6d1965ULL`;
100	const UInt64 pos = reinterpret_cast<const* UInt64 *>(buf);
101	const UInt64 *end = pos + (len / `8`);
102	const unsigned char *pos2;
103	UInt64 h = len * m;
104	UInt64 v;
105
106	while (pos != end)
107	{
108	v = *pos++;
109	h ^= mix(v);
110	h *= m;
111	}
112
113	pos2 = reinterpret_cast<const unsigned char*>(pos);
114	v = `0`;
115
116	switch (len & `7`)
117	{
118	case `7`: v ^= static_cast<UInt64>(pos2[`6`]) << `48`; [[fallthrough]];
119	case `6`: v ^= static_cast<UInt64>(pos2[`5`]) << `40`; [[fallthrough]];
120	case `5`: v ^= static_cast<UInt64>(pos2[`4`]) << `32`; [[fallthrough]];
121	case `4`: v ^= static_cast<UInt64>(pos2[`3`]) << `24`; [[fallthrough]];
122	case `3`: v ^= static_cast<UInt64>(pos2[`2`]) << `16`; [[fallthrough]];
123	case `2`: v ^= static_cast<UInt64>(pos2[`1`]) << `8`; [[fallthrough]];
124	case `1`: v ^= static_cast<UInt64>(pos2[`0`]);
125	h ^= mix(v);
126	h *= m;
127	}
128
129	return mix(h);
130	}
131	};
132
133
134	#if defined(__x86_64__)
135	struct CrapWow
136	{
137	size_t operator() (CompactStringRef x) const
138	{
139	const char * key = x.data();
140	size_t len = x.size;
141	size_t seed = `0`;
142
143	const UInt64 m = `0x95b47aa3355ba1a1`, n = `0x8a970be7488fda55`;
144
145	UInt64 hash;
146	// 3 = m, 4 = n
147	// r12 = h, r13 = k, ecx = seed, r12 = key
148	asm(
149	"leaq (%%rcx,%4), %%r13\n"
150	"movq %%rdx, %%r14\n"
151	"movq %%rcx, %%r15\n"
152	"movq %%rcx, %%r12\n"
153	"addq %%rax, %%r13\n"
154	"andq $0xfffffffffffffff0, %%rcx\n"
155	"jz QW%=\n"
156	"addq %%rcx, %%r14\n\n"
157	"negq %%rcx\n"
158	"XW%=:\n"
159	"movq %4, %%rax\n"
160	"mulq (%%r14,%%rcx)\n"
161	"xorq %%rax, %%r12\n"
162	"xorq %%rdx, %%r13\n"
163	"movq %3, %%rax\n"
164	"mulq 8(%%r14,%%rcx)\n"
165	"xorq %%rdx, %%r12\n"
166	"xorq %%rax, %%r13\n"
167	"addq $16, %%rcx\n"
168	"jnz XW%=\n"
169	"QW%=:\n"
170	"movq %%r15, %%rcx\n"
171	"andq $8, %%r15\n"
172	"jz B%=\n"
173	"movq %4, %%rax\n"
174	"mulq (%%r14)\n"
175	"addq $8, %%r14\n"
176	"xorq %%rax, %%r12\n"
177	"xorq %%rdx, %%r13\n"
178	"B%=:\n"
179	"andq $7, %%rcx\n"
180	"jz F%=\n"
181	"movq $1, %%rdx\n"
182	"shlq $3, %%rcx\n"
183	"movq %3, %%rax\n"
184	"shlq %%cl, %%rdx\n"
185	"addq $-1, %%rdx\n"
186	"andq (%%r14), %%rdx\n"
187	"mulq %%rdx\n"
188	"xorq %%rdx, %%r12\n"
189	"xorq %%rax, %%r13\n"
190	"F%=:\n"
191	"leaq (%%r13,%4), %%rax\n"
192	"xorq %%r12, %%rax\n"
193	"mulq %4\n"
194	"xorq %%rdx, %%rax\n"
195	"xorq %%r12, %%rax\n"
196	"xorq %%r13, %%rax\n"
197	: "=a"(hash), "=c"(key), "=d"(key)
198	: "r"(m), "r"(n), "a"(seed), "c"(len), "d"(key)
199	: "%r12", "%r13", "%r14", "%r15", "cc"
200	);
201	return hash;
202	}
203	};
204	#endif
205
206
207	struct SimpleHash
208	{
209	size_t operator() (CompactStringRef x) const
210	{
211	const char * pos = x.data();
212	size_t size = x.size;
213
214	const char * end = pos + size;
215
216	size_t res = `0`;
217
218	if (size == `0`)
219	return `0`;
220
221	if (size < `8`)
222	{
223	memcpy(reinterpret_cast<char *>(&res), pos, size);
224	return intHash64(res);
225	}
226
227	while (pos + `8` < end)
228	{
229	UInt64 word = *reinterpret_cast<const UInt64 *>(pos);
230	res = intHash64(word ^ res);
231
232	pos += `8`;
233	}
234
235	UInt64 word = *reinterpret_cast<const UInt64 *>(end - `8`);
236	res = intHash64(word ^ res);
237
238	return res;
239	}
240	};
241
242
243	using Key = CompactStringRef;
244	using Value = UInt64;
245
246
247	struct Grower : public HashTableGrower<>
248	{
249	/// The state of this structure is enough to get the buffer size of the hash table.
250
251	/// Defines the initial size of the hash table.
252	static const size_t initial_size_degree = `16`;
253	Grower() { size_degree = initial_size_degree; }
254
255	size_t max_fill = (`1ULL` << initial_size_degree) * `0.9`;
256
257	/// The size of the hash table in the cells.
258	size_t bufSize() const { return `1ULL` << size_degree; }
259
260	size_t maxFill() const { return max_fill /1 << (size_degree - 1)/; }
261	size_t mask() const { return bufSize() - `1`; }
262
263	/// From the hash value, get the cell number in the hash table.
264	size_t place(size_t x) const { return x & mask(); }
265
266	/// The next cell in the collision resolution chain.
267	size_t next(size_t pos) const { ++pos; return pos & mask(); }
268
269	/// Whether the hash table is sufficiently full. You need to increase the size of the hash table, or remove something unnecessary from it.
270	bool overflow(size_t elems) const { return elems > maxFill(); }
271
272	/// Increase the size of the hash table.
273	void increaseSize()
274	{
275	size_degree += size_degree >= `23` ? `1` : `2`;
276	max_fill = (`1ULL` << size_degree) * `0.9`;
277	}
278
279	/// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table.
280	[[noreturn]] void set(size_t /num_elems/)
281	{
282	throw Poco::Exception (__PRETTY_FUNCTION__);
283	}
284	};
285
286
287	int main(int argc, char ** argv)
288	{
289	if (argc < `3`)
290	{
291	std::cerr << "Usage: program n m\n";
292	return `1`;
293	}
294
295	size_t n = atoi(argv[`1`]);
296	size_t m = atoi(argv[`2`]);
297
298	DB::Arena pool;
299	std::vector<Key> data(n);
300
301	std::cerr << "sizeof(Key) = " << sizeof(Key) << ", sizeof(Value) = " << sizeof(Value) << std::endl;
302
303	{
304	Stopwatch watch;
305	DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
306	DB::CompressedReadBuffer in2(in1);
307
308	std::string tmp;
309	for (size_t i = `0`; i < n && !in2.eof(); ++i)
310	{
311	DB::readStringBinary(tmp, in2);
312	data [i] = Key (pool.insert(tmp.data(), tmp.size()), tmp.size());
313	}
314
315	watch.stop();
316	std::cerr << std::fixed << std::setprecision(`2`)
317	<< "Vector. Size: " << n
318	<< ", elapsed: " << watch.elapsedSeconds()
319	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
320	<< std::endl;
321	}
322
323	if (!m \|\| m == `1`)
324	{
325	Stopwatch watch;
326
327	//using Map = HashMap<Key, Value>;
328
329	/// Saving the hash accelerates the resize by about 2 times, and the overall performance by 6-8%.
330	using Map = HashMapWithSavedHash<Key, Value, DefaultHash<Key>, Grower>;
331
332	Map map;
333	Map::LookupResult it;
334	bool inserted;
335
336	for (size_t i = `0`; i < n; ++i)
337	{
338	map.emplace(data [i], it, inserted);
339	if (inserted)
340	it->getMapped() = `0`;
341	++it->getMapped();
342	}
343
344	watch.stop();
345	std::cerr << std::fixed << std::setprecision(`2`)
346	<< "HashMap (CityHash64). Size: " << map.size()
347	<< ", elapsed: " << watch.elapsedSeconds()
348	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
349	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
350	<< ", collisions: " << map.getCollisions()
351	#endif
352	<< std::endl;
353	}
354
355	if (!m \|\| m == `2`)
356	{
357	Stopwatch watch;
358
359	using Map = HashMapWithSavedHash<Key, Value, FastHash64, Grower>;
360
361	Map map;
362	Map::LookupResult it;
363	bool inserted;
364
365	for (size_t i = `0`; i < n; ++i)
366	{
367	map.emplace(data [i], it, inserted);
368	if (inserted)
369	it->getMapped() = `0`;
370	++it->getMapped();
371	}
372
373	watch.stop();
374	std::cerr << std::fixed << std::setprecision(`2`)
375	<< "HashMap (FastHash64). Size: " << map.size()
376	<< ", elapsed: " << watch.elapsedSeconds()
377	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
378	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
379	<< ", collisions: " << map.getCollisions()
380	#endif
381	<< std::endl;
382	}
383
384	#if defined(__x86_64__)
385	if (!m \|\| m == `3`)
386	{
387	Stopwatch watch;
388
389	using Map = HashMapWithSavedHash<Key, Value, CrapWow, Grower>;
390
391	Map map;
392	Map::LookupResult it;
393	bool inserted;
394
395	for (size_t i = `0`; i < n; ++i)
396	{
397	map.emplace(data [i], it, inserted);
398	if (inserted)
399	it->getMapped() = `0`;
400	++it->getMapped();
401	}
402
403	watch.stop();
404	std::cerr << std::fixed << std::setprecision(`2`)
405	<< "HashMap (CrapWow). Size: " << map.size()
406	<< ", elapsed: " << watch.elapsedSeconds()
407	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
408	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
409	<< ", collisions: " << map.getCollisions()
410	#endif
411	<< std::endl;
412	}
413	#endif
414
415	if (!m \|\| m == `4`)
416	{
417	Stopwatch watch;
418
419	using Map = HashMapWithSavedHash<Key, Value, SimpleHash, Grower>;
420
421	Map map;
422	Map::LookupResult it;
423	bool inserted;
424
425	for (size_t i = `0`; i < n; ++i)
426	{
427	map.emplace(data [i], it, inserted);
428	if (inserted)
429	it->getMapped() = `0`;
430	++it->getMapped();
431	}
432
433	watch.stop();
434	std::cerr << std::fixed << std::setprecision(`2`)
435	<< "HashMap (SimpleHash). Size: " << map.size()
436	<< ", elapsed: " << watch.elapsedSeconds()
437	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
438	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
439	<< ", collisions: " << map.getCollisions()
440	#endif
441	<< std::endl;
442	}
443
444	if (!m \|\| m == `5`)
445	{
446	Stopwatch watch;
447
448	std::unordered_map<Key, Value, DefaultHash<Key>> map;
449	for (size_t i = `0`; i < n; ++i)
450	++map [data [i]];
451
452	watch.stop();
453	std::cerr << std::fixed << std::setprecision(`2`)
454	<< "std::unordered_map. Size: " << map.size()
455	<< ", elapsed: " << watch.elapsedSeconds()
456	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
457	<< std::endl;
458	}
459
460	if (!m \|\| m == `6`)
461	{
462	Stopwatch watch;
463
464	google::dense_hash_map<Key, Value, DefaultHash<Key>> map;
465	map.set_empty_key(Key ("\0", `1`));
466	for (size_t i = `0`; i < n; ++i)
467	++map [data [i]];
468
469	watch.stop();
470	std::cerr << std::fixed << std::setprecision(`2`)
471	<< "google::dense_hash_map. Size: " << map.size()
472	<< ", elapsed: " << watch.elapsedSeconds()
473	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
474	<< std::endl;
475	}
476
477	if (!m \|\| m == `7`)
478	{
479	Stopwatch watch;
480
481	google::sparse_hash_map<Key, Value, DefaultHash<Key>> map;
482	for (size_t i = `0`; i < n; ++i)
483	++map [data [i]];
484
485	watch.stop();
486	std::cerr << std::fixed << std::setprecision(`2`)
487	<< "google::sparse_hash_map. Size: " << map.size()
488	<< ", elapsed: " << watch.elapsedSeconds()
489	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
490	<< std::endl;
491	}
492
493	return `0`;
494	}
495

Browse the source code of ClickHouse/dbms/src/Interpreters/tests/hash_map_string.cpp