1 | #pragma once |
2 | |
3 | /** SipHash is a fast cryptographic hash function for short strings. |
4 | * Taken from here: https://www.131002.net/siphash/ |
5 | * |
6 | * This is SipHash 2-4 variant. |
7 | * |
8 | * Two changes are made: |
9 | * - returns also 128 bits, not only 64; |
10 | * - done streaming (can be calculated in parts). |
11 | * |
12 | * On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL. |
13 | * (~ 700 MB/sec, 15 million strings per second) |
14 | */ |
15 | |
#include <common/Types.h>
#include <common/unaligned.h>
#include <cstring>
#include <string>
#include <type_traits>
#include <Core/Defines.h>
21 | |
/// Rotate the 64-bit value `x` left by `b` bits.
/// Used in this file only with the constant shift counts 13, 16, 17, 21 and 32.
#define ROTL(x, b) static_cast<UInt64>(((x) >> (64 - (b))) | ((x) << (b)))

/// One mixing round over the four state words.
/// Expands to a single statement that reads and writes variables named v0, v1, v2, v3,
/// which must be in scope at the point of expansion. The order of the steps matters.
#define SIPROUND \
    do \
    { \
        v0 += v1; \
        v1 = ROTL(v1, 13) ^ v0; \
        v0 = ROTL(v0, 32); \
        v2 += v3; \
        v3 = ROTL(v3, 16) ^ v2; \
        v0 += v3; \
        v3 = ROTL(v3, 21) ^ v0; \
        v2 += v1; \
        v1 = ROTL(v1, 17) ^ v2; \
        v2 = ROTL(v2, 32); \
    } while (0)
32 | |
33 | |
34 | class SipHash |
35 | { |
36 | private: |
37 | /// State. |
38 | UInt64 v0; |
39 | UInt64 v1; |
40 | UInt64 v2; |
41 | UInt64 v3; |
42 | |
43 | /// How many bytes have been processed. |
44 | UInt64 cnt; |
45 | |
46 | /// The current 8 bytes of input data. |
47 | union |
48 | { |
49 | UInt64 current_word; |
50 | UInt8 current_bytes[8]; |
51 | }; |
52 | |
53 | ALWAYS_INLINE void finalize() |
54 | { |
55 | /// In the last free byte, we write the remainder of the division by 256. |
56 | current_bytes[7] = cnt; |
57 | |
58 | v3 ^= current_word; |
59 | SIPROUND; |
60 | SIPROUND; |
61 | v0 ^= current_word; |
62 | |
63 | v2 ^= 0xff; |
64 | SIPROUND; |
65 | SIPROUND; |
66 | SIPROUND; |
67 | SIPROUND; |
68 | } |
69 | |
70 | public: |
71 | /// Arguments - seed. |
72 | SipHash(UInt64 k0 = 0, UInt64 k1 = 0) |
73 | { |
74 | /// Initialize the state with some random bytes and seed. |
75 | v0 = 0x736f6d6570736575ULL ^ k0; |
76 | v1 = 0x646f72616e646f6dULL ^ k1; |
77 | v2 = 0x6c7967656e657261ULL ^ k0; |
78 | v3 = 0x7465646279746573ULL ^ k1; |
79 | |
80 | cnt = 0; |
81 | current_word = 0; |
82 | } |
83 | |
84 | void update(const char * data, UInt64 size) |
85 | { |
86 | const char * end = data + size; |
87 | |
88 | /// We'll finish to process the remainder of the previous update, if any. |
89 | if (cnt & 7) |
90 | { |
91 | while (cnt & 7 && data < end) |
92 | { |
93 | current_bytes[cnt & 7] = *data; |
94 | ++data; |
95 | ++cnt; |
96 | } |
97 | |
98 | /// If we still do not have enough bytes to an 8-byte word. |
99 | if (cnt & 7) |
100 | return; |
101 | |
102 | v3 ^= current_word; |
103 | SIPROUND; |
104 | SIPROUND; |
105 | v0 ^= current_word; |
106 | } |
107 | |
108 | cnt += end - data; |
109 | |
110 | while (data + 8 <= end) |
111 | { |
112 | current_word = unalignedLoad<UInt64>(data); |
113 | |
114 | v3 ^= current_word; |
115 | SIPROUND; |
116 | SIPROUND; |
117 | v0 ^= current_word; |
118 | |
119 | data += 8; |
120 | } |
121 | |
122 | /// Pad the remainder, which is missing up to an 8-byte word. |
123 | current_word = 0; |
124 | switch (end - data) |
125 | { |
126 | case 7: current_bytes[6] = data[6]; [[fallthrough]]; |
127 | case 6: current_bytes[5] = data[5]; [[fallthrough]]; |
128 | case 5: current_bytes[4] = data[4]; [[fallthrough]]; |
129 | case 4: current_bytes[3] = data[3]; [[fallthrough]]; |
130 | case 3: current_bytes[2] = data[2]; [[fallthrough]]; |
131 | case 2: current_bytes[1] = data[1]; [[fallthrough]]; |
132 | case 1: current_bytes[0] = data[0]; [[fallthrough]]; |
133 | case 0: break; |
134 | } |
135 | } |
136 | |
137 | /// NOTE: std::has_unique_object_representations is only available since clang 6. As of Mar 2017 we still use clang 5 sometimes. |
138 | template <typename T> |
139 | std::enable_if_t<std::/*has_unique_object_representations_v*/is_standard_layout_v<T>, void> update(const T & x) |
140 | { |
141 | update(reinterpret_cast<const char *>(&x), sizeof(x)); |
142 | } |
143 | |
144 | void update(const std::string & x) |
145 | { |
146 | update(x.data(), x.length()); |
147 | } |
148 | |
149 | /// Get the result in some form. This can only be done once! |
150 | |
151 | void get128(char * out) |
152 | { |
153 | finalize(); |
154 | reinterpret_cast<UInt64 *>(out)[0] = v0 ^ v1; |
155 | reinterpret_cast<UInt64 *>(out)[1] = v2 ^ v3; |
156 | } |
157 | |
158 | /// template for avoiding 'unsigned long long' vs 'unsigned long' problem on old poco in macos |
159 | template <typename T> |
160 | ALWAYS_INLINE void get128(T & lo, T & hi) |
161 | { |
162 | static_assert(sizeof(T) == 8); |
163 | finalize(); |
164 | lo = v0 ^ v1; |
165 | hi = v2 ^ v3; |
166 | } |
167 | |
168 | UInt64 get64() |
169 | { |
170 | finalize(); |
171 | return v0 ^ v1 ^ v2 ^ v3; |
172 | } |
173 | }; |
174 | |
175 | |
176 | #undef ROTL |
177 | #undef SIPROUND |
178 | |
179 | #include <cstddef> |
180 | |
181 | inline void sipHash128(const char * data, const size_t size, char * out) |
182 | { |
183 | SipHash hash; |
184 | hash.update(data, size); |
185 | hash.get128(out); |
186 | } |
187 | |
188 | inline UInt64 sipHash64(const char * data, const size_t size) |
189 | { |
190 | SipHash hash; |
191 | hash.update(data, size); |
192 | return hash.get64(); |
193 | } |
194 | |
195 | template <typename T> |
196 | std::enable_if_t<std::/*has_unique_object_representations_v*/is_standard_layout_v<T>, UInt64> sipHash64(const T & x) |
197 | { |
198 | SipHash hash; |
199 | hash.update(x); |
200 | return hash.get64(); |
201 | } |
202 | |
203 | inline UInt64 sipHash64(const std::string & s) |
204 | { |
205 | return sipHash64(s.data(), s.size()); |
206 | } |
207 | |