1#include <iostream>
2#include <iomanip>
3#include <vector>
4
5#include <Common/Stopwatch.h>
6
7#include <farmhash.h>
8#include <metrohash.h>
9
10#define DBMS_HASH_MAP_COUNT_COLLISIONS
11#define DBMS_HASH_MAP_DEBUG_RESIZES
12
13#include <Core/Types.h>
14#include <IO/ReadBufferFromFile.h>
15#include <IO/ReadHelpers.h>
16#include <Compression/CompressedReadBuffer.h>
17#include <common/StringRef.h>
18#include <Common/HashTable/HashMap.h>
19#include <Interpreters/AggregationCommon.h>
20
21#ifdef __SSE4_2__
22 #include <smmintrin.h>
23#endif
24
25
26/** Do this:
27for file in MobilePhoneModel PageCharset Params URLDomain UTMSource Referer URL Title; do
28 for size in 30000 100000 300000 1000000 5000000; do
29 echo
30 BEST_METHOD=0
31 BEST_RESULT=0
32 for method in {1..11}; do
33 echo -ne $file $size $method '';
34 TOTAL_ELEMS=0
35 for i in {0..1000}; do
36 TOTAL_ELEMS=$(( $TOTAL_ELEMS + $size ))
37 if [[ $TOTAL_ELEMS -gt 25000000 ]]; then break; fi
38 ./hash_map_string_3 $size $method < ${file}.bin 2>&1 |
39 grep HashMap | grep -oE '[0-9\.]+ elem';
40 done | awk -W interactive '{ if ($1 > x) { x = $1 }; printf(".") } END { print x }' | tee /tmp/hash_map_string_3_res;
41 CUR_RESULT=$(cat /tmp/hash_map_string_3_res | tr -d '.')
42 if [[ $CUR_RESULT -gt $BEST_RESULT ]]; then
43 BEST_METHOD=$method
44 BEST_RESULT=$CUR_RESULT
45 fi;
46 done;
47 echo Best: $BEST_METHOD - $BEST_RESULT
48 done;
49done
50*/
51
52
53#define DefineStringRef(STRUCT) \
54\
55struct STRUCT : public StringRef {}; \
56\
57namespace ZeroTraits \
58{ \
59 template <> \
60 inline bool check<STRUCT>(STRUCT x) { return nullptr == x.data; } \
61 \
62 template <> \
63 inline void set<STRUCT>(STRUCT & x) { x.data = nullptr; } \
64} \
65 \
66template <> \
67struct DefaultHash<STRUCT> \
68{ \
69 size_t operator() (STRUCT x) const \
70 { \
71 return CityHash_v1_0_2::CityHash64(x.data, x.size); \
72 } \
73};
74
75
76DefineStringRef(StringRef_CompareMemcmp)
77DefineStringRef(StringRef_CompareAlwaysTrue)
78
79
80inline bool operator==(StringRef_CompareMemcmp lhs, StringRef_CompareMemcmp rhs)
81{
82 if (lhs.size != rhs.size)
83 return false;
84
85 if (lhs.size == 0)
86 return true;
87
88 return 0 == memcmp(lhs.data, rhs.data, lhs.size);
89}
90
91inline bool operator==(StringRef_CompareAlwaysTrue, StringRef_CompareAlwaysTrue)
92{
93 return true;
94}
95
96
97struct FastHash64
98{
99 static inline uint64_t mix(uint64_t h)
100 {
101 h ^= h >> 23;
102 h *= 0x2127599bf4325c37ULL;
103 h ^= h >> 47;
104 return h;
105 }
106
107 size_t operator() (StringRef x) const
108 {
109 const char * buf = x.data;
110 size_t len = x.size;
111
112 const uint64_t m = 0x880355f21e6d1965ULL;
113 const uint64_t *pos = reinterpret_cast<const uint64_t *>(buf);
114 const uint64_t *end = pos + (len / 8);
115 const unsigned char *pos2;
116 uint64_t h = len * m;
117 uint64_t v;
118
119 while (pos != end)
120 {
121 v = *pos++;
122 h ^= mix(v);
123 h *= m;
124 }
125
126 pos2 = reinterpret_cast<const unsigned char*>(pos);
127 v = 0;
128
129 switch (len & 7)
130 {
131 case 7: v ^= static_cast<uint64_t>(pos2[6]) << 48; [[fallthrough]];
132 case 6: v ^= static_cast<uint64_t>(pos2[5]) << 40; [[fallthrough]];
133 case 5: v ^= static_cast<uint64_t>(pos2[4]) << 32; [[fallthrough]];
134 case 4: v ^= static_cast<uint64_t>(pos2[3]) << 24; [[fallthrough]];
135 case 3: v ^= static_cast<uint64_t>(pos2[2]) << 16; [[fallthrough]];
136 case 2: v ^= static_cast<uint64_t>(pos2[1]) << 8; [[fallthrough]];
137 case 1: v ^= static_cast<uint64_t>(pos2[0]);
138 h ^= mix(v);
139 h *= m;
140 }
141
142 return mix(h);
143 }
144};
145
146
147struct FNV1a
148{
149 size_t operator() (StringRef x) const
150 {
151 size_t res = 0xcbf29ce484222325ULL;
152
153 const char * pos = x.data;
154 const char * end = x.data + x.size;
155
156 for (; pos < end; ++pos)
157 {
158 res *= 1099511628211ULL;
159 res ^= *pos;
160 }
161
162 return res;
163 }
164};
165
166
167#ifdef __SSE4_2__
168
169struct CrapWow
170{
171 size_t operator() (StringRef x) const
172 {
173 const char * key = x.data;
174 size_t len = x.size;
175 size_t seed = 0;
176
177 const UInt64 m = 0x95b47aa3355ba1a1, n = 0x8a970be7488fda55;
178 UInt64 hash;
179 // 3 = m, 4 = n
180 // r12 = h, r13 = k, ecx = seed, r12 = key
181 asm(
182 "leaq (%%rcx,%4), %%r13\n"
183 "movq %%rdx, %%r14\n"
184 "movq %%rcx, %%r15\n"
185 "movq %%rcx, %%r12\n"
186 "addq %%rax, %%r13\n"
187 "andq $0xfffffffffffffff0, %%rcx\n"
188 "jz QW%=\n"
189 "addq %%rcx, %%r14\n\n"
190 "negq %%rcx\n"
191 "XW%=:\n"
192 "movq %4, %%rax\n"
193 "mulq (%%r14,%%rcx)\n"
194 "xorq %%rax, %%r12\n"
195 "xorq %%rdx, %%r13\n"
196 "movq %3, %%rax\n"
197 "mulq 8(%%r14,%%rcx)\n"
198 "xorq %%rdx, %%r12\n"
199 "xorq %%rax, %%r13\n"
200 "addq $16, %%rcx\n"
201 "jnz XW%=\n"
202 "QW%=:\n"
203 "movq %%r15, %%rcx\n"
204 "andq $8, %%r15\n"
205 "jz B%=\n"
206 "movq %4, %%rax\n"
207 "mulq (%%r14)\n"
208 "addq $8, %%r14\n"
209 "xorq %%rax, %%r12\n"
210 "xorq %%rdx, %%r13\n"
211 "B%=:\n"
212 "andq $7, %%rcx\n"
213 "jz F%=\n"
214 "movq $1, %%rdx\n"
215 "shlq $3, %%rcx\n"
216 "movq %3, %%rax\n"
217 "shlq %%cl, %%rdx\n"
218 "addq $-1, %%rdx\n"
219 "andq (%%r14), %%rdx\n"
220 "mulq %%rdx\n"
221 "xorq %%rdx, %%r12\n"
222 "xorq %%rax, %%r13\n"
223 "F%=:\n"
224 "leaq (%%r13,%4), %%rax\n"
225 "xorq %%r12, %%rax\n"
226 "mulq %4\n"
227 "xorq %%rdx, %%rax\n"
228 "xorq %%r12, %%rax\n"
229 "xorq %%r13, %%rax\n"
230 : "=a"(hash), "=c"(key), "=d"(key)
231 : "r"(m), "r"(n), "a"(seed), "c"(len), "d"(key)
232 : "%r12", "%r13", "%r14", "%r15", "cc"
233 );
234 return hash;
235 }
236};
237
238#endif
239
240
241struct SimpleHash
242{
243 size_t operator() (StringRef x) const
244 {
245 const char * pos = x.data;
246 size_t size = x.size;
247
248 const char * end = pos + size;
249
250 size_t res = 0;
251
252 if (size == 0)
253 return 0;
254
255 if (size < 8)
256 {
257#ifdef __SSE4_2__
258 return hashLessThan8(x.data, x.size);
259#endif
260 }
261
262 while (pos + 8 < end)
263 {
264 uint64_t word = *reinterpret_cast<const uint64_t *>(pos);
265 res = intHash64(word ^ res);
266
267 pos += 8;
268 }
269
270 uint64_t word = *reinterpret_cast<const uint64_t *>(end - 8);
271 res = intHash64(word ^ res);
272
273 return res;
274 }
275};
276
277
278struct VerySimpleHash
279{
280 size_t operator() (StringRef x) const
281 {
282 const char * pos = x.data;
283 size_t size = x.size;
284
285 const char * end = pos + size;
286
287 size_t res = 0;
288
289 if (size == 0)
290 return 0;
291
292 if (size < 8)
293 {
294#ifdef __SSE4_2__
295 return hashLessThan8(x.data, x.size);
296#endif
297 }
298
299 while (pos + 8 < end)
300 {
301 res ^= reinterpret_cast<const uint64_t *>(pos)[0];
302 res ^= res >> 33;
303 res *= 0xff51afd7ed558ccdULL;
304
305 pos += 8;
306 }
307
308 res ^= *reinterpret_cast<const uint64_t *>(end - 8);
309 res ^= res >> 33;
310 res *= 0xc4ceb9fe1a85ec53ULL;
311 res ^= res >> 33;
312
313 return res;
314 }
315};
316
317
318struct FarmHash64
319{
320 size_t operator() (StringRef x) const
321 {
322 return NAMESPACE_FOR_HASH_FUNCTIONS::Hash64(x.data, x.size);
323 }
324};
325
326
327template <void metrohash64(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out)>
328struct SMetroHash64
329{
330 size_t operator() (StringRef x) const
331 {
332 union
333 {
334 uint64_t u64;
335 std::uint8_t u8[sizeof(u64)];
336 };
337
338 metrohash64(reinterpret_cast<const std::uint8_t *>(x.data), x.size, 0, u8);
339
340 return u64;
341 }
342};
343
344
345#ifdef __SSE4_2__
346
347/*struct CRC32Hash
348{
349 size_t operator() (StringRef x) const
350 {
351 const char * pos = x.data;
352 size_t size = x.size;
353
354 if (size == 0)
355 return 0;
356
357 if (size < 8)
358 {
359 return hashLessThan8(x.data, x.size);
360 }
361
362 const char * end = pos + size;
363 size_t res = -1ULL;
364
365 do
366 {
367 uint64_t word = *reinterpret_cast<const uint64_t *>(pos);
368 res = _mm_crc32_u64(res, word);
369
370 pos += 8;
371 } while (pos + 8 < end);
372
373 uint64_t word = *reinterpret_cast<const uint64_t *>(end - 8);
374 res = _mm_crc32_u64(res, word);
375
376 return res;
377 }
378};*/
379
380
381struct CRC32ILPHash
382{
383 size_t operator() (StringRef x) const
384 {
385 const char * pos = x.data;
386 size_t size = x.size;
387
388 if (size == 0)
389 return 0;
390
391 if (size < 16)
392 {
393 return hashLessThan16(x.data, x.size);
394 }
395
396 const char * end = pos + size;
397 const char * end_16 = pos + size / 16 * 16;
398 size_t res0 = -1ULL;
399 size_t res1 = -1ULL;
400
401 do
402 {
403 uint64_t word0 = reinterpret_cast<const uint64_t *>(pos)[0];
404 uint64_t word1 = reinterpret_cast<const uint64_t *>(pos)[1];
405 res0 = _mm_crc32_u64(res0, word0);
406 res1 = _mm_crc32_u64(res1, word1);
407
408 pos += 16;
409 } while (pos < end_16);
410
411 uint64_t word0 = *reinterpret_cast<const uint64_t *>(end - 8);
412 uint64_t word1 = *reinterpret_cast<const uint64_t *>(end - 16);
413
414 /* return HashLen16(Rotate(word0 - word1, 43) + Rotate(res0, 30) + res1,
415 word0 + Rotate(word1 ^ k3, 20) - res0 + size);*/
416
417 res0 = _mm_crc32_u64(res0, word0);
418 res1 = _mm_crc32_u64(res1, word1);
419
420 return hashLen16(res0, res1);
421 }
422};
423
424#endif
425
426
427using Value = uint64_t;
428
429
430template <typename Key, typename Hash>
431void NO_INLINE bench(const std::vector<StringRef> & data, const char * name)
432{
433 Stopwatch watch;
434
435 using Map = HashMapWithSavedHash<Key, Value, Hash>;
436
437 Map map;
438 typename Map::LookupResult it;
439 bool inserted;
440
441 for (size_t i = 0, size = data.size(); i < size; ++i)
442 {
443 map.emplace(static_cast<const Key &>(data[i]), it, inserted);
444 if (inserted)
445 it->getMapped() = 0;
446 ++it->getMapped();
447 }
448
449 watch.stop();
450 std::cerr << std::fixed << std::setprecision(2)
451 << "HashMap (" << name << "). Size: " << map.size()
452 << ", elapsed: " << watch.elapsedSeconds()
453 << " (" << data.size() / watch.elapsedSeconds() << " elem/sec.)"
454#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
455 << ", collisions: " << map.getCollisions()
456#endif
457 << std::endl;
458}
459
460
461int main(int argc, char ** argv)
462{
463 if (argc < 3)
464 {
465 std::cerr << "Usage: program n m\n";
466 return 1;
467 }
468
469 size_t n = atoi(argv[1]);
470 size_t m = atoi(argv[2]);
471
472 DB::Arena pool;
473 std::vector<StringRef> data(n);
474
475 std::cerr << "sizeof(Key) = " << sizeof(StringRef) << ", sizeof(Value) = " << sizeof(Value) << std::endl;
476
477 {
478 Stopwatch watch;
479 DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
480 DB::CompressedReadBuffer in2(in1);
481
482 std::string tmp;
483 for (size_t i = 0; i < n && !in2.eof(); ++i)
484 {
485 DB::readStringBinary(tmp, in2);
486 data[i] = StringRef(pool.insert(tmp.data(), tmp.size()), tmp.size());
487 }
488
489 watch.stop();
490 std::cerr << std::fixed << std::setprecision(2)
491 << "Vector. Size: " << n
492 << ", elapsed: " << watch.elapsedSeconds()
493 << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
494 << std::endl;
495 }
496
497 if (!m || m == 1) bench<StringRef, StringRefHash64>(data, "StringRef_CityHash64");
498 if (!m || m == 2) bench<StringRef, FastHash64> (data, "StringRef_FastHash64");
499 if (!m || m == 3) bench<StringRef, SimpleHash> (data, "StringRef_SimpleHash");
500 if (!m || m == 4) bench<StringRef, FNV1a> (data, "StringRef_FNV1a");
501
502#ifdef __SSE4_2__
503 if (!m || m == 5) bench<StringRef, CrapWow> (data, "StringRef_CrapWow");
504 if (!m || m == 6) bench<StringRef, CRC32Hash> (data, "StringRef_CRC32Hash");
505 if (!m || m == 7) bench<StringRef, CRC32ILPHash> (data, "StringRef_CRC32ILPHash");
506#endif
507
508 if (!m || m == 8) bench<StringRef, VerySimpleHash> (data, "StringRef_VerySimpleHash");
509 if (!m || m == 9) bench<StringRef, FarmHash64> (data, "StringRef_FarmHash64");
510 if (!m || m == 10) bench<StringRef, SMetroHash64<metrohash64_1>>(data, "StringRef_MetroHash64_1");
511 if (!m || m == 11) bench<StringRef, SMetroHash64<metrohash64_2>>(data, "StringRef_MetroHash64_2");
512
513 return 0;
514}
515