| 1 | /* |
| 2 | * Copyright 2012-present Facebook, Inc. |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | /** |
| 18 | * Compute 64-, 96-, and 128-bit Rabin fingerprints, as described in |
| 19 | * Michael O. Rabin (1981) |
| 20 | * Fingerprinting by Random Polynomials |
| 21 | * Center for Research in Computing Technology, Harvard University |
| 22 | * Tech Report TR-CSE-03-01 |
| 23 | * |
| 24 | * The implementation follows the optimization described in |
| 25 | * Andrei Z. Broder (1993) |
| 26 | * Some applications of Rabin's fingerprinting method |
| 27 | * |
| 28 | * extended for fingerprints larger than 64 bits, and modified to use |
| 29 | * 64-bit instead of 32-bit integers for computation. |
| 30 | * |
| 31 | * The precomputed tables are in Fingerprint.cpp. |
| 32 | * |
| 33 | * Benchmarked on 10/13/2009 on a 2.5GHz quad-core Xeon L5420, |
| 34 | * - Fingerprint<64>::update64() takes about 12ns |
| 35 | * - Fingerprint<96>::update64() takes about 30ns |
| 36 | * - Fingerprint<128>::update128() takes about 30ns |
| 37 | * (unsurprisingly, Fingerprint<96> and Fingerprint<128> take the |
| 38 | * same amount of time, as they both use 128-bit operations; the least |
| 39 | * significant 32 bits of Fingerprint<96> will always be 0) |
| 40 | * |
| 41 | * @author Tudor Bosman (tudorb@facebook.com) |
| 42 | */ |
| 43 | |
| 44 | #pragma once |
| 45 | |
| 46 | #include <array> |
| 47 | #include <cstdint> |
| 48 | |
| 49 | #include <folly/Range.h> |
| 50 | |
| 51 | namespace folly { |
| 52 | |
| 53 | namespace detail { |
| 54 | |
| 55 | constexpr size_t poly_size(size_t bits) { |
| 56 | return 1 + (bits - 1) / 64; |
| 57 | } |
| 58 | |
| 59 | template <size_t Deg> |
| 60 | using poly_table = |
| 61 | std::array<std::array<std::array<uint64_t, poly_size(Deg)>, 256>, 8>; |
| 62 | |
| 63 | template <int BITS> |
| 64 | struct FingerprintTable { |
| 65 | static const uint64_t poly[poly_size(BITS)]; |
| 66 | static const poly_table<BITS> table; |
| 67 | }; |
| 68 | |
| 69 | template <int BITS> |
| 70 | const uint64_t FingerprintTable<BITS>::poly[poly_size(BITS)] = {}; |
| 71 | template <int BITS> |
| 72 | const poly_table<BITS> FingerprintTable<BITS>::table = {}; |
| 73 | |
| 74 | #ifndef _MSC_VER |
| 75 | // MSVC 2015 can't handle these extern specialization declarations, |
| 76 | // but they aren't needed for things to work right, so we just don't |
| 77 | // declare them in the header for MSVC. |
| 78 | |
| 79 | #define FOLLY_DECLARE_FINGERPRINT_TABLES(BITS) \ |
| 80 | template <> \ |
| 81 | const uint64_t FingerprintTable<BITS>::poly[poly_size(BITS)]; \ |
| 82 | template <> \ |
| 83 | const poly_table<BITS> FingerprintTable<BITS>::table |
| 84 | |
| 85 | FOLLY_DECLARE_FINGERPRINT_TABLES(64); |
| 86 | FOLLY_DECLARE_FINGERPRINT_TABLES(96); |
| 87 | FOLLY_DECLARE_FINGERPRINT_TABLES(128); |
| 88 | |
| 89 | #undef FOLLY_DECLARE_FINGERPRINT_TABLES |
| 90 | #endif |
| 91 | |
| 92 | } // namespace detail |
| 93 | |
| 94 | /** |
| 95 | * Compute the Rabin fingerprint. |
| 96 | * |
| 97 | * TODO(tudorb): Extend this to allow removing values from the computed |
| 98 | * fingerprint (so we can fingerprint a sliding window, as in the Rabin-Karp |
| 99 | * string matching algorithm) |
| 100 | * |
| 101 | * update* methods return *this, so you can chain them together: |
| 102 | * Fingerprint<96>().update8(x).update(str).update64(val).write(output); |
| 103 | */ |
| 104 | template <int BITS> |
| 105 | class Fingerprint { |
| 106 | public: |
| 107 | Fingerprint() { |
| 108 | // Use a non-zero starting value. We'll use (1 << (BITS-1)) |
| 109 | fp_[0] = 1ULL << 63; |
| 110 | for (int i = 1; i < size(); i++) { |
| 111 | fp_[i] = 0; |
| 112 | } |
| 113 | } |
| 114 | |
| 115 | Fingerprint& update8(uint8_t v) { |
| 116 | uint8_t out = shlor8(v); |
| 117 | xortab(detail::FingerprintTable<BITS>::table[0][out]); |
| 118 | return *this; |
| 119 | } |
| 120 | |
| 121 | // update32 and update64 are convenience functions to update the fingerprint |
| 122 | // with 4 and 8 bytes at a time. They are faster than calling update8 |
| 123 | // in a loop. They process the bytes in big-endian order. |
| 124 | Fingerprint& update32(uint32_t v) { |
| 125 | uint32_t out = shlor32(v); |
| 126 | for (int i = 0; i < 4; i++) { |
| 127 | xortab(detail::FingerprintTable<BITS>::table[i][out & 0xff]); |
| 128 | out >>= 8; |
| 129 | } |
| 130 | return *this; |
| 131 | } |
| 132 | |
| 133 | Fingerprint& update64(uint64_t v) { |
| 134 | uint64_t out = shlor64(v); |
| 135 | for (int i = 0; i < 8; i++) { |
| 136 | xortab(detail::FingerprintTable<BITS>::table[i][out & 0xff]); |
| 137 | out >>= 8; |
| 138 | } |
| 139 | return *this; |
| 140 | } |
| 141 | |
| 142 | Fingerprint& update(StringPiece str) { |
| 143 | // TODO(tudorb): We could be smart and do update64 or update32 if aligned |
| 144 | for (auto c : str) { |
| 145 | update8(uint8_t(c)); |
| 146 | } |
| 147 | return *this; |
| 148 | } |
| 149 | |
| 150 | /** |
| 151 | * Return the number of uint64s needed to hold the fingerprint value. |
| 152 | */ |
| 153 | constexpr static int size() { |
| 154 | return detail::poly_size(BITS); |
| 155 | } |
| 156 | |
| 157 | /** |
| 158 | * Write the computed fingeprint to an array of size() uint64_t's. |
| 159 | * For Fingerprint<64>, size()==1; we write 64 bits in out[0] |
| 160 | * For Fingerprint<96>, size()==2; we write 64 bits in out[0] and |
| 161 | * the most significant 32 bits of out[1] |
| 162 | * For Fingerprint<128>, size()==2; we write 64 bits in out[0] and |
| 163 | * 64 bits in out[1]. |
| 164 | */ |
| 165 | void write(uint64_t* out) const { |
| 166 | for (int i = 0; i < size(); i++) { |
| 167 | out[i] = fp_[i]; |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | private: |
| 172 | // XOR the fingerprint with a value from one of the tables. |
| 173 | void xortab(std::array<uint64_t, detail::poly_size(BITS)> const& tab) { |
| 174 | for (int i = 0; i < size(); i++) { |
| 175 | fp_[i] ^= tab[i]; |
| 176 | } |
| 177 | } |
| 178 | |
| 179 | // Helper functions: shift the fingerprint value left by 8/32/64 bits, |
| 180 | // return the "out" value (the bits that were shifted out), and add "v" |
| 181 | // in the bits on the right. |
| 182 | uint8_t shlor8(uint8_t v); |
| 183 | uint32_t shlor32(uint32_t v); |
| 184 | uint64_t shlor64(uint64_t v); |
| 185 | |
| 186 | uint64_t fp_[detail::poly_size(BITS)]; |
| 187 | }; |
| 188 | |
| 189 | // Convenience functions |
| 190 | |
| 191 | /** |
| 192 | * Return the 64-bit Rabin fingerprint of a string. |
| 193 | */ |
| 194 | inline uint64_t fingerprint64(StringPiece str) { |
| 195 | uint64_t fp; |
| 196 | Fingerprint<64>().update(str).write(&fp); |
| 197 | return fp; |
| 198 | } |
| 199 | |
| 200 | /** |
| 201 | * Compute the 96-bit Rabin fingerprint of a string. |
| 202 | * Return the 64 most significant bits in *msb, and the 32 least significant |
| 203 | * bits in *lsb. |
| 204 | */ |
| 205 | inline void fingerprint96(StringPiece str, uint64_t* msb, uint32_t* lsb) { |
| 206 | uint64_t fp[2]; |
| 207 | Fingerprint<96>().update(str).write(fp); |
| 208 | *msb = fp[0]; |
| 209 | *lsb = (uint32_t)(fp[1] >> 32); |
| 210 | } |
| 211 | |
| 212 | /** |
| 213 | * Compute the 128-bit Rabin fingerprint of a string. |
| 214 | * Return the 64 most significant bits in *msb, and the 64 least significant |
| 215 | * bits in *lsb. |
| 216 | */ |
| 217 | inline void fingerprint128(StringPiece str, uint64_t* msb, uint64_t* lsb) { |
| 218 | uint64_t fp[2]; |
| 219 | Fingerprint<128>().update(str).write(fp); |
| 220 | *msb = fp[0]; |
| 221 | *lsb = fp[1]; |
| 222 | } |
| 223 | |
| 224 | template <> |
| 225 | inline uint8_t Fingerprint<64>::shlor8(uint8_t v) { |
| 226 | uint8_t out = (uint8_t)(fp_[0] >> 56); |
| 227 | fp_[0] = (fp_[0] << 8) | ((uint64_t)v); |
| 228 | return out; |
| 229 | } |
| 230 | |
| 231 | template <> |
| 232 | inline uint32_t Fingerprint<64>::shlor32(uint32_t v) { |
| 233 | uint32_t out = (uint32_t)(fp_[0] >> 32); |
| 234 | fp_[0] = (fp_[0] << 32) | ((uint64_t)v); |
| 235 | return out; |
| 236 | } |
| 237 | |
| 238 | template <> |
| 239 | inline uint64_t Fingerprint<64>::shlor64(uint64_t v) { |
| 240 | uint64_t out = fp_[0]; |
| 241 | fp_[0] = v; |
| 242 | return out; |
| 243 | } |
| 244 | |
| 245 | template <> |
| 246 | inline uint8_t Fingerprint<96>::shlor8(uint8_t v) { |
| 247 | uint8_t out = (uint8_t)(fp_[0] >> 56); |
| 248 | fp_[0] = (fp_[0] << 8) | (fp_[1] >> 56); |
| 249 | fp_[1] = (fp_[1] << 8) | ((uint64_t)v << 32); |
| 250 | return out; |
| 251 | } |
| 252 | |
| 253 | template <> |
| 254 | inline uint32_t Fingerprint<96>::shlor32(uint32_t v) { |
| 255 | uint32_t out = (uint32_t)(fp_[0] >> 32); |
| 256 | fp_[0] = (fp_[0] << 32) | (fp_[1] >> 32); |
| 257 | fp_[1] = ((uint64_t)v << 32); |
| 258 | return out; |
| 259 | } |
| 260 | |
| 261 | template <> |
| 262 | inline uint64_t Fingerprint<96>::shlor64(uint64_t v) { |
| 263 | uint64_t out = fp_[0]; |
| 264 | fp_[0] = fp_[1] | (v >> 32); |
| 265 | fp_[1] = v << 32; |
| 266 | return out; |
| 267 | } |
| 268 | |
| 269 | template <> |
| 270 | inline uint8_t Fingerprint<128>::shlor8(uint8_t v) { |
| 271 | uint8_t out = (uint8_t)(fp_[0] >> 56); |
| 272 | fp_[0] = (fp_[0] << 8) | (fp_[1] >> 56); |
| 273 | fp_[1] = (fp_[1] << 8) | ((uint64_t)v); |
| 274 | return out; |
| 275 | } |
| 276 | |
| 277 | template <> |
| 278 | inline uint32_t Fingerprint<128>::shlor32(uint32_t v) { |
| 279 | uint32_t out = (uint32_t)(fp_[0] >> 32); |
| 280 | fp_[0] = (fp_[0] << 32) | (fp_[1] >> 32); |
| 281 | fp_[1] = (fp_[1] << 32) | ((uint64_t)v); |
| 282 | return out; |
| 283 | } |
| 284 | |
| 285 | template <> |
| 286 | inline uint64_t Fingerprint<128>::shlor64(uint64_t v) { |
| 287 | uint64_t out = fp_[0]; |
| 288 | fp_[0] = fp_[1]; |
| 289 | fp_[1] = v; |
| 290 | return out; |
| 291 | } |
| 292 | |
| 293 | } // namespace folly |
| 294 | |