Hash.h source code [ClickHouse/dbms/src/Common/HashTable/Hash.h]

1	#pragma once
2
3	#include <Core/Types.h>
4	#include <Common/UInt128.h>
5
6	#include <type_traits>
7
8
9	/* Hash functions that are better than the trivial function std::hash.*
10	*
11	* Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times.
12	* This is because of following reasons:
13	* - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits;
14	* - in typical implementation of standard library, hash function for integers is trivial and just use lower bits;
15	* - traffic is non-uniformly distributed across a day;
16	* - we are using open-addressing linear probing hash tables that are most critical to hash function quality,
17	* and trivial hash function gives disastrous results.
18	*/
19
20	/* Taken from MurmurHash. This is Murmur finalizer.*
21	* Faster than intHash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID.
22	*/
23	inline DB::UInt64 intHash64(DB::UInt64 x)
24	{
25	x ^= x >> `33`;
26	x *= `0xff51afd7ed558ccdULL`;
27	x ^= x >> `33`;
28	x *= `0xc4ceb9fe1a85ec53ULL`;
29	x ^= x >> `33`;
30
31	return x;
32	}
33
34	/* CRC32C is not very high-quality as a hash function,*
35	* according to avalanche and bit independence tests (see SMHasher software), as well as a small number of bits,
36	* but can behave well when used in hash tables,
37	* due to high speed (latency 3 + 1 clock cycle, throughput 1 clock cycle).
38	* Works only with SSE 4.2 support.
39	*/
40	#ifdef __SSE4_2__
41	#include <nmmintrin.h>
42	#endif
43
44	#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
45	#include <arm_acle.h>
46	#include <arm_neon.h>
47	#endif
48
49	inline DB::UInt64 intHashCRC32(DB::UInt64 x)
50	{
51	#ifdef __SSE4_2__
52	return _mm_crc32_u64(-`1ULL`, x);
53	#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
54	return __crc32cd(-`1U`, x);
55	#else
56	/// On other platforms we do not have CRC32. NOTE This can be confusing.
57	return intHash64(x);
58	#endif
59	}
60
61
62	template <typename T>
63	inline size_t DefaultHash64(T key)
64	{
65	union
66	{
67	T in;
68	DB::UInt64 out;
69	} u;
70	u.out = `0`;
71	u.in = key;
72	return intHash64(u.out);
73	}
74
75	template <typename T, typename Enable = void>
76	struct DefaultHash;
77
78	template <typename T>
79	struct DefaultHash<T, std::enable_if_t<is_arithmetic_v<T>>>
80	{
81	size_t operator() (T key) const
82	{
83	return DefaultHash64<T>(key);
84	}
85	};
86
87	template <typename T>
88	struct DefaultHash<T, std::enable_if_t<DB::IsDecimalNumber<T> && sizeof(T) <= `8`>>
89	{
90	size_t operator() (T key) const
91	{
92	return DefaultHash64<typename T::NativeType>(key);
93	}
94	};
95
96	template <typename T>
97	struct DefaultHash<T, std::enable_if_t<DB::IsDecimalNumber<T> && sizeof(T) == `16`>>
98	{
99	size_t operator() (T key) const
100	{
101	return DefaultHash64<Int64>(key >> `64`) ^ DefaultHash64<Int64>(key);
102	}
103	};
104
105	template <typename T> struct HashCRC32;
106
107	template <typename T>
108	inline size_t hashCRC32(T key)
109	{
110	union
111	{
112	T in;
113	DB::UInt64 out;
114	} u;
115	u.out = `0`;
116	u.in = key;
117	return intHashCRC32(u.out);
118	}
119
120	#define DEFINE_HASH(T) \
121	template <> struct HashCRC32<T>\
122	{\
123	size_t operator() (T key) const\
124	{\
125	return hashCRC32<T>(key);\
126	}\
127	};
128
129	DEFINE_HASH(DB::UInt8)
130	DEFINE_HASH(DB::UInt16)
131	DEFINE_HASH(DB::UInt32)
132	DEFINE_HASH(DB::UInt64)
133	DEFINE_HASH(DB::UInt128)
134	DEFINE_HASH(DB::Int8)
135	DEFINE_HASH(DB::Int16)
136	DEFINE_HASH(DB::Int32)
137	DEFINE_HASH(DB::Int64)
138	DEFINE_HASH(DB::Float32)
139	DEFINE_HASH(DB::Float64)
140
141	#undef DEFINE_HASH
142
143
144	/// It is reasonable to use for UInt8, UInt16 with sufficient hash table size.
145	struct TrivialHash
146	{
147	template <typename T>
148	size_t operator() (T key) const
149	{
150	return key;
151	}
152	};
153
154
155	/* A relatively good non-cryptographic hash function from UInt64 to UInt32.*
156	* But worse (both in quality and speed) than just cutting intHash64.
157	* Taken from here: http://www.concentric.net/~ttwang/tech/inthash.htm
158	*
159	* Slightly changed compared to the function by link: shifts to the right are accidentally replaced by a cyclic shift to the right.
160	* This change did not affect the smhasher test results.
161	*
162	* It is recommended to use different salt for different tasks.
163	* That was the case that in the database values were sorted by hash (for low-quality pseudo-random spread),
164	* and in another place, in the aggregate function, the same hash was used in the hash table,
165	* as a result, this aggregate function was monstrously slowed due to collisions.
166	*
167	* NOTE Salting is far from perfect, because it commutes with first steps of calculation.
168	*
169	* NOTE As mentioned, this function is slower than intHash64.
170	* But occasionally, it is faster, when written in a loop and loop is vectorized.
171	*/
172	template <DB::UInt64 salt>
173	inline DB::UInt32 intHash32(DB::UInt64 key)
174	{
175	key ^= salt;
176
177	key = (~key) + (key << `18`);
178	key = key ^ ((key >> `31`) \| (key << `33`));
179	key = key * `21`;
180	key = key ^ ((key >> `11`) \| (key << `53`));
181	key = key + (key << `6`);
182	key = key ^ ((key >> `22`) \| (key << `42`));
183
184	return key;
185	}
186
187
188	/// For containers.
189	template <typename T, DB::UInt64 salt = `0`>
190	struct IntHash32
191	{
192	size_t operator() (const T & key) const
193	{
194	return intHash32<salt>(key);
195	}
196	};
197

Browse the source code of ClickHouse/dbms/src/Common/HashTable/Hash.h