| 1 | #pragma once |
| 2 | |
| 3 | /** SipHash is a fast cryptographic hash function for short strings. |
| 4 | * Taken from here: https://www.131002.net/siphash/ |
| 5 | * |
| 6 | * This is SipHash 2-4 variant. |
| 7 | * |
| 8 | * Two changes are made: |
| 9 | * - returns also 128 bits, not only 64; |
| 10 | * - done streaming (can be calculated in parts). |
| 11 | * |
| 12 | * On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL. |
| 13 | * (~ 700 MB/sec, 15 million strings per second) |
| 14 | */ |
| 15 | |
#include <cstring>
#include <string>
#include <type_traits>

#include <common/Types.h>
#include <common/unaligned.h>
#include <Core/Defines.h>
| 21 | |
/// Rotate the 64-bit value `x` left by `b` bits.
/// `b` is expected to be in the range 1..63: the expression also shifts right by (64 - b).
/// Both arguments are expanded twice, so they must be free of side effects.
#define ROTL(x, b) static_cast<UInt64>(((x) >> (64 - (b))) | ((x) << (b)))

/// One SipHash round. Operates in place on four UInt64 variables named v0, v1, v2, v3
/// that must be visible at the point of expansion. Wrapped in do/while so that the
/// expansion behaves as a single statement.
#define SIPROUND \
    do \
    { \
        v0 += v1; \
        v1 = ROTL(v1, 13); \
        v1 ^= v0; \
        v0 = ROTL(v0, 32); \
        \
        v2 += v3; \
        v3 = ROTL(v3, 16); \
        v3 ^= v2; \
        \
        v0 += v3; \
        v3 = ROTL(v3, 21); \
        v3 ^= v0; \
        \
        v2 += v1; \
        v1 = ROTL(v1, 17); \
        v1 ^= v2; \
        v2 = ROTL(v2, 32); \
    } while (0)
| 32 | |
| 33 | |
| 34 | class SipHash |
| 35 | { |
| 36 | private: |
| 37 | /// State. |
| 38 | UInt64 v0; |
| 39 | UInt64 v1; |
| 40 | UInt64 v2; |
| 41 | UInt64 v3; |
| 42 | |
| 43 | /// How many bytes have been processed. |
| 44 | UInt64 cnt; |
| 45 | |
| 46 | /// The current 8 bytes of input data. |
| 47 | union |
| 48 | { |
| 49 | UInt64 current_word; |
| 50 | UInt8 current_bytes[8]; |
| 51 | }; |
| 52 | |
| 53 | ALWAYS_INLINE void finalize() |
| 54 | { |
| 55 | /// In the last free byte, we write the remainder of the division by 256. |
| 56 | current_bytes[7] = cnt; |
| 57 | |
| 58 | v3 ^= current_word; |
| 59 | SIPROUND; |
| 60 | SIPROUND; |
| 61 | v0 ^= current_word; |
| 62 | |
| 63 | v2 ^= 0xff; |
| 64 | SIPROUND; |
| 65 | SIPROUND; |
| 66 | SIPROUND; |
| 67 | SIPROUND; |
| 68 | } |
| 69 | |
| 70 | public: |
| 71 | /// Arguments - seed. |
| 72 | SipHash(UInt64 k0 = 0, UInt64 k1 = 0) |
| 73 | { |
| 74 | /// Initialize the state with some random bytes and seed. |
| 75 | v0 = 0x736f6d6570736575ULL ^ k0; |
| 76 | v1 = 0x646f72616e646f6dULL ^ k1; |
| 77 | v2 = 0x6c7967656e657261ULL ^ k0; |
| 78 | v3 = 0x7465646279746573ULL ^ k1; |
| 79 | |
| 80 | cnt = 0; |
| 81 | current_word = 0; |
| 82 | } |
| 83 | |
| 84 | void update(const char * data, UInt64 size) |
| 85 | { |
| 86 | const char * end = data + size; |
| 87 | |
| 88 | /// We'll finish to process the remainder of the previous update, if any. |
| 89 | if (cnt & 7) |
| 90 | { |
| 91 | while (cnt & 7 && data < end) |
| 92 | { |
| 93 | current_bytes[cnt & 7] = *data; |
| 94 | ++data; |
| 95 | ++cnt; |
| 96 | } |
| 97 | |
| 98 | /// If we still do not have enough bytes to an 8-byte word. |
| 99 | if (cnt & 7) |
| 100 | return; |
| 101 | |
| 102 | v3 ^= current_word; |
| 103 | SIPROUND; |
| 104 | SIPROUND; |
| 105 | v0 ^= current_word; |
| 106 | } |
| 107 | |
| 108 | cnt += end - data; |
| 109 | |
| 110 | while (data + 8 <= end) |
| 111 | { |
| 112 | current_word = unalignedLoad<UInt64>(data); |
| 113 | |
| 114 | v3 ^= current_word; |
| 115 | SIPROUND; |
| 116 | SIPROUND; |
| 117 | v0 ^= current_word; |
| 118 | |
| 119 | data += 8; |
| 120 | } |
| 121 | |
| 122 | /// Pad the remainder, which is missing up to an 8-byte word. |
| 123 | current_word = 0; |
| 124 | switch (end - data) |
| 125 | { |
| 126 | case 7: current_bytes[6] = data[6]; [[fallthrough]]; |
| 127 | case 6: current_bytes[5] = data[5]; [[fallthrough]]; |
| 128 | case 5: current_bytes[4] = data[4]; [[fallthrough]]; |
| 129 | case 4: current_bytes[3] = data[3]; [[fallthrough]]; |
| 130 | case 3: current_bytes[2] = data[2]; [[fallthrough]]; |
| 131 | case 2: current_bytes[1] = data[1]; [[fallthrough]]; |
| 132 | case 1: current_bytes[0] = data[0]; [[fallthrough]]; |
| 133 | case 0: break; |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | /// NOTE: std::has_unique_object_representations is only available since clang 6. As of Mar 2017 we still use clang 5 sometimes. |
| 138 | template <typename T> |
| 139 | std::enable_if_t<std::/*has_unique_object_representations_v*/is_standard_layout_v<T>, void> update(const T & x) |
| 140 | { |
| 141 | update(reinterpret_cast<const char *>(&x), sizeof(x)); |
| 142 | } |
| 143 | |
| 144 | void update(const std::string & x) |
| 145 | { |
| 146 | update(x.data(), x.length()); |
| 147 | } |
| 148 | |
| 149 | /// Get the result in some form. This can only be done once! |
| 150 | |
| 151 | void get128(char * out) |
| 152 | { |
| 153 | finalize(); |
| 154 | reinterpret_cast<UInt64 *>(out)[0] = v0 ^ v1; |
| 155 | reinterpret_cast<UInt64 *>(out)[1] = v2 ^ v3; |
| 156 | } |
| 157 | |
| 158 | /// template for avoiding 'unsigned long long' vs 'unsigned long' problem on old poco in macos |
| 159 | template <typename T> |
| 160 | ALWAYS_INLINE void get128(T & lo, T & hi) |
| 161 | { |
| 162 | static_assert(sizeof(T) == 8); |
| 163 | finalize(); |
| 164 | lo = v0 ^ v1; |
| 165 | hi = v2 ^ v3; |
| 166 | } |
| 167 | |
| 168 | UInt64 get64() |
| 169 | { |
| 170 | finalize(); |
| 171 | return v0 ^ v1 ^ v2 ^ v3; |
| 172 | } |
| 173 | }; |
| 174 | |
| 175 | |
| 176 | #undef ROTL |
| 177 | #undef SIPROUND |
| 178 | |
| 179 | #include <cstddef> |
| 180 | |
| 181 | inline void sipHash128(const char * data, const size_t size, char * out) |
| 182 | { |
| 183 | SipHash hash; |
| 184 | hash.update(data, size); |
| 185 | hash.get128(out); |
| 186 | } |
| 187 | |
| 188 | inline UInt64 sipHash64(const char * data, const size_t size) |
| 189 | { |
| 190 | SipHash hash; |
| 191 | hash.update(data, size); |
| 192 | return hash.get64(); |
| 193 | } |
| 194 | |
| 195 | template <typename T> |
| 196 | std::enable_if_t<std::/*has_unique_object_representations_v*/is_standard_layout_v<T>, UInt64> sipHash64(const T & x) |
| 197 | { |
| 198 | SipHash hash; |
| 199 | hash.update(x); |
| 200 | return hash.get64(); |
| 201 | } |
| 202 | |
| 203 | inline UInt64 sipHash64(const std::string & s) |
| 204 | { |
| 205 | return sipHash64(s.data(), s.size()); |
| 206 | } |
| 207 | |