1#pragma once
2
3#include <Core/Types.h>
4#include <Common/UInt128.h>
5
6#include <type_traits>
7
8
9/** Hash functions that are better than the trivial function std::hash.
10 *
11 * Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times.
12 * This is because of following reasons:
13 * - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits;
14 * - in typical implementation of standard library, hash function for integers is trivial and just use lower bits;
15 * - traffic is non-uniformly distributed across a day;
16 * - we are using open-addressing linear probing hash tables that are most critical to hash function quality,
17 * and trivial hash function gives disastrous results.
18 */
19
20/** Taken from MurmurHash. This is Murmur finalizer.
21 * Faster than intHash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID.
22 */
23inline DB::UInt64 intHash64(DB::UInt64 x)
24{
25 x ^= x >> 33;
26 x *= 0xff51afd7ed558ccdULL;
27 x ^= x >> 33;
28 x *= 0xc4ceb9fe1a85ec53ULL;
29 x ^= x >> 33;
30
31 return x;
32}
33
34/** CRC32C is not very high-quality as a hash function,
35 * according to avalanche and bit independence tests (see SMHasher software), as well as a small number of bits,
36 * but can behave well when used in hash tables,
37 * due to high speed (latency 3 + 1 clock cycle, throughput 1 clock cycle).
38 * Works only with SSE 4.2 support.
39 */
40#ifdef __SSE4_2__
41#include <nmmintrin.h>
42#endif
43
44#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
45#include <arm_acle.h>
46#include <arm_neon.h>
47#endif
48
49inline DB::UInt64 intHashCRC32(DB::UInt64 x)
50{
51#ifdef __SSE4_2__
52 return _mm_crc32_u64(-1ULL, x);
53#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
54 return __crc32cd(-1U, x);
55#else
56 /// On other platforms we do not have CRC32. NOTE This can be confusing.
57 return intHash64(x);
58#endif
59}
60
61
62template <typename T>
63inline size_t DefaultHash64(T key)
64{
65 union
66 {
67 T in;
68 DB::UInt64 out;
69 } u;
70 u.out = 0;
71 u.in = key;
72 return intHash64(u.out);
73}
74
75template <typename T, typename Enable = void>
76struct DefaultHash;
77
78template <typename T>
79struct DefaultHash<T, std::enable_if_t<is_arithmetic_v<T>>>
80{
81 size_t operator() (T key) const
82 {
83 return DefaultHash64<T>(key);
84 }
85};
86
87template <typename T>
88struct DefaultHash<T, std::enable_if_t<DB::IsDecimalNumber<T> && sizeof(T) <= 8>>
89{
90 size_t operator() (T key) const
91 {
92 return DefaultHash64<typename T::NativeType>(key);
93 }
94};
95
96template <typename T>
97struct DefaultHash<T, std::enable_if_t<DB::IsDecimalNumber<T> && sizeof(T) == 16>>
98{
99 size_t operator() (T key) const
100 {
101 return DefaultHash64<Int64>(key >> 64) ^ DefaultHash64<Int64>(key);
102 }
103};
104
105template <typename T> struct HashCRC32;
106
107template <typename T>
108inline size_t hashCRC32(T key)
109{
110 union
111 {
112 T in;
113 DB::UInt64 out;
114 } u;
115 u.out = 0;
116 u.in = key;
117 return intHashCRC32(u.out);
118}
119
120#define DEFINE_HASH(T) \
121template <> struct HashCRC32<T>\
122{\
123 size_t operator() (T key) const\
124 {\
125 return hashCRC32<T>(key);\
126 }\
127};
128
129DEFINE_HASH(DB::UInt8)
130DEFINE_HASH(DB::UInt16)
131DEFINE_HASH(DB::UInt32)
132DEFINE_HASH(DB::UInt64)
133DEFINE_HASH(DB::UInt128)
134DEFINE_HASH(DB::Int8)
135DEFINE_HASH(DB::Int16)
136DEFINE_HASH(DB::Int32)
137DEFINE_HASH(DB::Int64)
138DEFINE_HASH(DB::Float32)
139DEFINE_HASH(DB::Float64)
140
141#undef DEFINE_HASH
142
143
144/// It is reasonable to use for UInt8, UInt16 with sufficient hash table size.
145struct TrivialHash
146{
147 template <typename T>
148 size_t operator() (T key) const
149 {
150 return key;
151 }
152};
153
154
155/** A relatively good non-cryptographic hash function from UInt64 to UInt32.
156 * But worse (both in quality and speed) than just cutting intHash64.
157 * Taken from here: http://www.concentric.net/~ttwang/tech/inthash.htm
158 *
159 * Slightly changed compared to the function by link: shifts to the right are accidentally replaced by a cyclic shift to the right.
160 * This change did not affect the smhasher test results.
161 *
162 * It is recommended to use different salt for different tasks.
163 * That was the case that in the database values were sorted by hash (for low-quality pseudo-random spread),
164 * and in another place, in the aggregate function, the same hash was used in the hash table,
165 * as a result, this aggregate function was monstrously slowed due to collisions.
166 *
167 * NOTE Salting is far from perfect, because it commutes with first steps of calculation.
168 *
169 * NOTE As mentioned, this function is slower than intHash64.
170 * But occasionally, it is faster, when written in a loop and loop is vectorized.
171 */
172template <DB::UInt64 salt>
173inline DB::UInt32 intHash32(DB::UInt64 key)
174{
175 key ^= salt;
176
177 key = (~key) + (key << 18);
178 key = key ^ ((key >> 31) | (key << 33));
179 key = key * 21;
180 key = key ^ ((key >> 11) | (key << 53));
181 key = key + (key << 6);
182 key = key ^ ((key >> 22) | (key << 42));
183
184 return key;
185}
186
187
188/// For containers.
189template <typename T, DB::UInt64 salt = 0>
190struct IntHash32
191{
192 size_t operator() (const T & key) const
193 {
194 return intHash32<salt>(key);
195 }
196};
197