1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | // From Apache Impala (incubating) as of 2016-02-22 |
19 | |
20 | #ifndef ARROW_UTIL_HASH_UTIL_H |
21 | #define ARROW_UTIL_HASH_UTIL_H |
22 | |
23 | #include <cassert> |
24 | #include <cstdint> |
25 | |
26 | #include "arrow/util/logging.h" |
27 | #include "arrow/util/macros.h" |
28 | #include "arrow/util/neon-util.h" |
29 | #include "arrow/util/sse-util.h" |
30 | |
// Fallback stubs so the HW_crc32_* names always exist. When hardware CRC
// support is available, the macros below redirect call sites to the SSE4.2 or
// ARM CRC implementations instead; these stubs should never be reached.
static inline uint32_t HW_crc32_u8(uint32_t crc, uint8_t v) {
  DCHECK(false) << "Hardware CRC support is not enabled";
  return 0;
}

static inline uint32_t HW_crc32_u16(uint32_t crc, uint16_t v) {
  DCHECK(false) << "Hardware CRC support is not enabled";
  return 0;
}

static inline uint32_t HW_crc32_u32(uint32_t crc, uint32_t v) {
  DCHECK(false) << "Hardware CRC support is not enabled";
  return 0;
}

static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) {
  DCHECK(false) << "Hardware CRC support is not enabled";
  return 0;
}
50 | |
51 | #ifdef ARROW_HAVE_SSE4_2 |
52 | #define HW_crc32_u8 SSE4_crc32_u8 |
53 | #define HW_crc32_u16 SSE4_crc32_u16 |
54 | #define HW_crc32_u32 SSE4_crc32_u32 |
55 | #define HW_crc32_u64 SSE4_crc32_u64 |
56 | #elif defined(ARROW_HAVE_ARM_CRC) |
57 | #define HW_crc32_u8 ARMCE_crc32_u8 |
58 | #define HW_crc32_u16 ARMCE_crc32_u16 |
59 | #define HW_crc32_u32 ARMCE_crc32_u32 |
60 | #define HW_crc32_u64 ARMCE_crc32_u64 |
61 | #endif |
62 | |
63 | namespace arrow { |
64 | |
65 | /// Utility class to compute hash values. |
66 | class HashUtil { |
67 | public: |
68 | #if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_ARM_CRC) |
69 | static constexpr bool have_hardware_crc32 = true; |
70 | #else |
71 | static constexpr bool have_hardware_crc32 = false; |
72 | #endif |
73 | |
  /// Compute the Crc32 hash for data using SSE4.2 or ARM CRC instructions. The
  /// input hash parameter is the current hash/seed value.
  /// This should only be called when SSE4.2 or ARM CRC support is available.
  /// This is ~4x faster than Fnv/Boost Hash.
  /// TODO: crc32 hashes with different seeds do not result in different hash
  /// functions. The resulting hashes are correlated.
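  ///
  /// Illustrative usage (a sketch, not part of the original docs): chain
  /// buffers by feeding each result back in as the seed; 'buf1', 'buf2' and
  /// their lengths are hypothetical.
  ///
  ///   uint32_t h = HashUtil::FNV_SEED;
  ///   h = HashUtil::CrcHash(buf1, len1, h);
  ///   h = HashUtil::CrcHash(buf2, len2, h);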
80 | static uint32_t CrcHash(const void* data, int32_t nbytes, uint32_t hash) { |
81 | const uint8_t* p = reinterpret_cast<const uint8_t*>(data); |
82 | const uint8_t* end = p + nbytes; |
83 | |
    // Consume the input in 8-byte, then 4-byte, then 1-byte chunks. Comparing
    // remaining lengths (end - p) avoids forming out-of-range pointers for
    // inputs shorter than the chunk size.
    while (end - p >= 8) {
      hash = HW_crc32_u64(hash, *reinterpret_cast<const uint64_t*>(p));
      p += 8;
    }
    while (end - p >= 4) {
      hash = HW_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
      p += 4;
    }
    while (p < end) {
      hash = HW_crc32_u8(hash, *p);
      ++p;
    }
96 | |
    // The lower half of the CRC hash has poor uniformity, so swap the halves
    // for anyone who only uses the first several bits of the hash.
99 | hash = (hash << 16) | (hash >> 16); |
100 | return hash; |
101 | } |
102 | |
  /// A variant of CRC32 hashing that computes two independent running CRCs
  /// over interleaved halves of the input, producing a 64-bit integer.
  /// The result's quality should be improved by a finalization step.
  ///
  /// In addition to producing more bits of output, this should be about twice
  /// as fast as CrcHash on CPUs that can overlap several independent
  /// CRC computations.
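  ///
  /// Illustrative usage (a sketch): pack two 32-bit seeds into the 64-bit
  /// seed; 'seed_hi', 'seed_lo', 'buf' and 'len' are hypothetical, and the
  /// choice of finalizer is left to the caller as noted above.
  ///
  ///   uint64_t seed = (static_cast<uint64_t>(seed_hi) << 32) | seed_lo;
  ///   uint64_t h64 = HashUtil::DoubleCrcHash(buf, len, seed);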
110 | static uint64_t DoubleCrcHash(const void* data, int32_t nbytes, uint64_t hash) { |
111 | const uint8_t* p = reinterpret_cast<const uint8_t*>(data); |
112 | |
113 | uint32_t h1 = static_cast<uint32_t>(hash >> 32); |
114 | uint32_t h2 = static_cast<uint32_t>(hash); |
115 | |
116 | while (nbytes >= 16) { |
117 | h1 = HW_crc32_u64(h1, *reinterpret_cast<const uint64_t*>(p)); |
118 | h2 = HW_crc32_u64(h2, *reinterpret_cast<const uint64_t*>(p + 8)); |
119 | nbytes -= 16; |
120 | p += 16; |
121 | } |
122 | if (nbytes >= 8) { |
123 | h1 = HW_crc32_u32(h1, *reinterpret_cast<const uint32_t*>(p)); |
124 | h2 = HW_crc32_u32(h2, *reinterpret_cast<const uint32_t*>(p + 4)); |
125 | nbytes -= 8; |
126 | p += 8; |
127 | } |
128 | if (nbytes >= 4) { |
129 | h1 = HW_crc32_u16(h1, *reinterpret_cast<const uint16_t*>(p)); |
130 | h2 = HW_crc32_u16(h2, *reinterpret_cast<const uint16_t*>(p + 2)); |
131 | nbytes -= 4; |
132 | p += 4; |
133 | } |
    // Consume the remaining 0-3 tail bytes, starting from p[0]; cases
    // intentionally fall through.
    switch (nbytes) {
      case 3:
        h1 = HW_crc32_u8(h1, p[2]);
      // fallthrough
      case 2:
        h2 = HW_crc32_u8(h2, p[1]);
      // fallthrough
      case 1:
        h1 = HW_crc32_u8(h1, p[0]);
      // fallthrough
      case 0:
        break;
      default:
        assert(0);
    }
149 | |
150 | // A finalization step is recommended to mix up the result's bits |
151 | return (static_cast<uint64_t>(h1) << 32) + h2; |
152 | } |
153 | |
154 | /// CrcHash() specialized for 1-byte data |
155 | static inline uint32_t CrcHash1(const void* v, uint32_t hash) { |
156 | const uint8_t* s = reinterpret_cast<const uint8_t*>(v); |
157 | hash = HW_crc32_u8(hash, *s); |
158 | hash = (hash << 16) | (hash >> 16); |
159 | return hash; |
160 | } |
161 | |
162 | /// CrcHash() specialized for 2-byte data |
163 | static inline uint32_t CrcHash2(const void* v, uint32_t hash) { |
164 | const uint16_t* s = reinterpret_cast<const uint16_t*>(v); |
165 | hash = HW_crc32_u16(hash, *s); |
166 | hash = (hash << 16) | (hash >> 16); |
167 | return hash; |
168 | } |
169 | |
170 | /// CrcHash() specialized for 4-byte data |
171 | static inline uint32_t CrcHash4(const void* v, uint32_t hash) { |
172 | const uint32_t* p = reinterpret_cast<const uint32_t*>(v); |
173 | hash = HW_crc32_u32(hash, *p); |
174 | hash = (hash << 16) | (hash >> 16); |
175 | return hash; |
176 | } |
177 | |
178 | /// CrcHash() specialized for 8-byte data |
179 | static inline uint32_t CrcHash8(const void* v, uint32_t hash) { |
180 | const uint64_t* p = reinterpret_cast<const uint64_t*>(v); |
181 | hash = HW_crc32_u64(hash, *p); |
182 | hash = (hash << 16) | (hash >> 16); |
183 | return hash; |
184 | } |
185 | |
186 | /// CrcHash() specialized for 12-byte data |
187 | static inline uint32_t CrcHash12(const void* v, uint32_t hash) { |
188 | const uint64_t* p = reinterpret_cast<const uint64_t*>(v); |
189 | hash = HW_crc32_u64(hash, *p); |
190 | ++p; |
191 | hash = HW_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p)); |
192 | hash = (hash << 16) | (hash >> 16); |
193 | return hash; |
194 | } |
195 | |
196 | /// CrcHash() specialized for 16-byte data |
197 | static inline uint32_t CrcHash16(const void* v, uint32_t hash) { |
198 | const uint64_t* p = reinterpret_cast<const uint64_t*>(v); |
199 | hash = HW_crc32_u64(hash, *p); |
200 | ++p; |
201 | hash = HW_crc32_u64(hash, *p); |
202 | hash = (hash << 16) | (hash >> 16); |
203 | return hash; |
204 | } |
205 | |
206 | static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995; |
207 | static const int MURMUR_R = 47; |
208 | |
209 | /// Murmur2 hash implementation returning 64-bit hashes. |
210 | static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) { |
211 | uint64_t h = seed ^ (len * MURMUR_PRIME); |
212 | |
213 | const uint64_t* data = reinterpret_cast<const uint64_t*>(input); |
214 | const uint64_t* end = data + (len / sizeof(uint64_t)); |
215 | |
216 | while (data != end) { |
217 | uint64_t k = *data++; |
218 | k *= MURMUR_PRIME; |
219 | k ^= k >> MURMUR_R; |
220 | k *= MURMUR_PRIME; |
221 | h ^= k; |
222 | h *= MURMUR_PRIME; |
223 | } |
224 | |
    const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
    // Consume the remaining 0-7 tail bytes; cases intentionally fall through.
    switch (len & 7) {
227 | case 7: |
228 | h ^= uint64_t(data2[6]) << 48; |
229 | case 6: |
230 | h ^= uint64_t(data2[5]) << 40; |
231 | case 5: |
232 | h ^= uint64_t(data2[4]) << 32; |
233 | case 4: |
234 | h ^= uint64_t(data2[3]) << 24; |
235 | case 3: |
236 | h ^= uint64_t(data2[2]) << 16; |
237 | case 2: |
238 | h ^= uint64_t(data2[1]) << 8; |
239 | case 1: |
240 | h ^= uint64_t(data2[0]); |
241 | h *= MURMUR_PRIME; |
242 | } |
243 | |
244 | h ^= h >> MURMUR_R; |
245 | h *= MURMUR_PRIME; |
246 | h ^= h >> MURMUR_R; |
247 | return h; |
248 | } |
249 | |
250 | /// default values recommended by http://isthe.com/chongo/tech/comp/fnv/ |
251 | static const uint32_t FNV_PRIME = 0x01000193; // 16777619 |
252 | static const uint32_t FNV_SEED = 0x811C9DC5; // 2166136261 |
253 | static const uint64_t FNV64_PRIME = 1099511628211UL; |
254 | static const uint64_t FNV64_SEED = 14695981039346656037UL; |
255 | |
  /// Implementation of the Fowler-Noll-Vo hash function. This is not as performant
  /// as Boost's hash on int types (about 2x slower), but it distributes entropy
  /// across all output bits.
258 | /// For ints, boost just returns the value of the int which can be pathological. |
259 | /// For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000 |
260 | /// is taken on the hash, all values will collide to the same bucket. |
261 | /// For string values, Fnv is slightly faster than boost. |
262 | /// IMPORTANT: FNV hash suffers from poor diffusion of the least significant bit, |
263 | /// which can lead to poor results when input bytes are duplicated. |
264 | /// See FnvHash64to32() for how this can be mitigated. |
265 | static uint64_t FnvHash64(const void* data, int32_t bytes, uint64_t hash) { |
266 | const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data); |
267 | while (bytes--) { |
268 | hash = (*ptr ^ hash) * FNV64_PRIME; |
269 | ++ptr; |
270 | } |
271 | return hash; |
272 | } |
273 | |
274 | /// Return a 32-bit hash computed by invoking FNV-64 and folding the result to 32-bits. |
275 | /// This technique is recommended instead of FNV-32 since the LSB of an FNV hash is the |
276 | /// XOR of the LSBs of its input bytes, leading to poor results for duplicate inputs. |
277 | /// The input seed 'hash' is duplicated so the top half of the seed is not all zero. |
  /// Data length must be at least 1 byte: zero-length data should be handled
  /// separately, for example using HashCombine32 with a unique constant value, to
  /// avoid returning the hash argument. Zero-length data gives terrible results:
  /// FnvHash64 returns the seed unchanged, and the fold then XORs the two
  /// identical halves of the duplicated seed, cancelling all bits.
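  ///
  /// Illustrative zero-length handling (a sketch of the guidance above;
  /// 'kEmptyMarker' is a hypothetical caller-chosen constant):
  ///
  ///   uint32_t h = (len > 0) ? HashUtil::FnvHash64to32(data, len, seed)
  ///                          : HashUtil::HashCombine32(kEmptyMarker, seed);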
282 | static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) { |
283 | // IMPALA-2270: this function should never be used for zero-byte inputs. |
284 | DCHECK_GT(bytes, 0); |
285 | uint64_t hash_u64 = hash | (static_cast<uint64_t>(hash) << 32); |
286 | hash_u64 = FnvHash64(data, bytes, hash_u64); |
287 | return static_cast<uint32_t>((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF)); |
288 | } |
289 | |
  /// Hash template: dispatches at compile time between the hardware CRC path
  /// (hw == true) and MurmurHash2_64 (hw == false); see the specializations below.
291 | template <bool hw> |
292 | static inline int Hash(const void* data, int32_t bytes, uint32_t seed); |
293 | |
294 | /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio). |
295 | static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9; |
296 | |
297 | /// Combine hashes 'value' and 'seed' to get a new hash value. Similar to |
298 | /// boost::hash_combine(), but for uint32_t. This function should be used with a |
299 | /// constant first argument to update the hash value for zero-length values such as |
300 | /// NULL, boolean, and empty strings. |
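  ///
  /// Illustrative usage (a sketch): fold a NULL marker into a running hash;
  /// 'kNullMarker' is a hypothetical constant chosen by the caller.
  ///
  ///   hash = HashUtil::HashCombine32(kNullMarker, hash);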
301 | static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) { |
302 | return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2)); |
303 | } |
304 | |
305 | // Get 32 more bits of randomness from a 32-bit hash: |
306 | static inline uint32_t Rehash32to32(const uint32_t hash) { |
307 | // Constants generated by uuidgen(1) with the -r flag |
308 | static const uint64_t m = 0x7850f11ec6d14889ull, a = 0x6773610597ca4c63ull; |
    // This is strongly universal hashing following Dietzfelbinger's "Universal
    // hashing and k-wise independent random variables via integer arithmetic
    // without primes". As such, for any two distinct uint32_t values hash1 and
    // hash2, the probability (over the randomness of the constants) that any
    // subset of bit positions of Rehash32to32(hash1) equals the same subset of
    // bit positions of Rehash32to32(hash2) is minimal.
315 | return static_cast<uint32_t>((static_cast<uint64_t>(hash) * m + a) >> 32); |
316 | } |
317 | |
318 | static inline uint64_t Rehash32to64(const uint32_t hash) { |
319 | static const uint64_t m1 = 0x47b6137a44974d91ull, m2 = 0x8824ad5ba2b7289cull, |
320 | a1 = 0x705495c62df1424aull, a2 = 0x9efc49475c6bfb31ull; |
321 | const uint64_t hash1 = (static_cast<uint64_t>(hash) * m1 + a1) >> 32; |
322 | const uint64_t hash2 = (static_cast<uint64_t>(hash) * m2 + a2) >> 32; |
323 | return hash1 | (hash2 << 32); |
324 | } |
325 | }; |
326 | |
// Hardware-accelerated specialization.
template <>
inline int HashUtil::Hash<true>(const void* data, int32_t bytes, uint32_t seed) {
#ifdef ARROW_HAVE_ARM_CRC
  // ARM CRC instructions are optional, so their availability must be checked
  // at runtime; if they are absent, fall back to Murmur.
  if (!crc32c_runtime_check()) {
    return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
  }
#endif
  // Double CRC
  return static_cast<int>(HashUtil::DoubleCrcHash(data, bytes, seed));
}
340 | |
// Software specialization: always uses MurmurHash2_64.
342 | template <> |
343 | inline int HashUtil::Hash<false>(const void* data, int32_t bytes, uint32_t seed) { |
344 | return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed)); |
345 | } |
346 | |
347 | } // namespace arrow |
348 | |
349 | #endif // ARROW_UTIL_HASH_UTIL_H |
350 | |