| 1 | /* |
| 2 | * crc32_impl.h |
| 3 | * |
| 4 | * Copyright 2016 Eric Biggers |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person |
| 7 | * obtaining a copy of this software and associated documentation |
| 8 | * files (the "Software"), to deal in the Software without |
| 9 | * restriction, including without limitation the rights to use, |
| 10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 11 | * copies of the Software, and to permit persons to whom the |
| 12 | * Software is furnished to do so, subject to the following |
| 13 | * conditions: |
| 14 | * |
| 15 | * The above copyright notice and this permission notice shall be |
| 16 | * included in all copies or substantial portions of the Software. |
| 17 | * |
| 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| 19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
| 20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| 21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
| 22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| 23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| 24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| 25 | * OTHER DEALINGS IN THE SOFTWARE. |
| 26 | */ |
| 27 | |
| 28 | /* |
| 29 | * CRC-32 folding with PCLMULQDQ. |
| 30 | * |
| 31 | * The basic idea is to repeatedly "fold" each 512 bits into the next |
| 32 | * 512 bits, producing an abbreviated message which is congruent the |
| 33 | * original message modulo the generator polynomial G(x). |
| 34 | * |
| 35 | * Folding each 512 bits is implemented as eight 64-bit folds, each of |
| 36 | * which uses one carryless multiplication instruction. It's expected |
| 37 | * that CPUs may be able to execute some of these multiplications in |
| 38 | * parallel. |
| 39 | * |
| 40 | * Explanation of "folding": let A(x) be 64 bits from the message, and |
| 41 | * let B(x) be 95 bits from a constant distance D later in the |
| 42 | * message. The relevant portion of the message can be written as: |
| 43 | * |
| 44 | * M(x) = A(x)*x^D + B(x) |
| 45 | * |
| 46 | * ... where + and * represent addition and multiplication, |
| 47 | * respectively, of polynomials over GF(2). Note that when |
| 48 | * implemented on a computer, these operations are equivalent to XOR |
| 49 | * and carryless multiplication, respectively. |
| 50 | * |
| 51 | * For the purpose of CRC calculation, only the remainder modulo the |
| 52 | * generator polynomial G(x) matters: |
| 53 | * |
| 54 | * M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x) |
| 55 | * |
| 56 | * Since the modulo operation can be applied anywhere in a sequence of |
| 57 | * additions and multiplications without affecting the result, this is |
| 58 | * equivalent to: |
| 59 | * |
| 60 | * M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x) |
| 61 | * |
| 62 | * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree |
| 63 | * 31, i.e. a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is |
| 64 | * equivalent to a carryless multiplication of a 64-bit quantity by a |
| 65 | * 32-bit quantity, producing a 95-bit product. Then, adding |
| 66 | * (XOR-ing) the product to B(x) produces a polynomial with the same |
| 67 | * length as B(x) but with the same remainder as 'A(x)*x^D + B(x)'. |
| 68 | * This is the basic fold operation with 64 bits. |
| 69 | * |
| 70 | * Note that the carryless multiplication instruction PCLMULQDQ |
| 71 | * actually takes two 64-bit inputs and produces a 127-bit product in |
| 72 | * the low-order bits of a 128-bit XMM register. This works fine, but |
| 73 | * care must be taken to account for "bit endianness". With the CRC |
| 74 | * version implemented here, bits are always ordered such that the |
| 75 | * lowest-order bit represents the coefficient of highest power of x |
| 76 | * and the highest-order bit represents the coefficient of the lowest |
| 77 | * power of x. This is backwards from the more intuitive order. |
| 78 | * Still, carryless multiplication works essentially the same either |
| 79 | * way. It just must be accounted for that when we XOR the 95-bit |
| 80 | * product in the low-order 95 bits of a 128-bit XMM register into |
| 81 | * 128-bits of later data held in another XMM register, we'll really |
| 82 | * be XOR-ing the product into the mathematically higher degree end of |
| 83 | * those later bits, not the lower degree end as may be expected. |
| 84 | * |
| 85 | * So given that caveat and the fact that we process 512 bits per |
| 86 | * iteration, the 'D' values we need for the two 64-bit halves of each |
| 87 | * 128 bits of data are: |
| 88 | * |
| 89 | * D = (512 + 95) - 64 for the higher-degree half of each 128 |
| 90 | * bits, i.e. the lower order bits in |
| 91 | * the XMM register |
| 92 | * |
| 93 | * D = (512 + 95) - 128 for the lower-degree half of each 128 |
| 94 | * bits, i.e. the higher order bits in |
| 95 | * the XMM register |
| 96 | * |
| 97 | * The required 'x^D mod G(x)' values were precomputed. |
| 98 | * |
| 99 | * When <= 512 bits remain in the message, we finish up by folding |
| 100 | * across smaller distances. This works similarly; the distance D is |
| 101 | * just different, so different constant multipliers must be used. |
| 102 | * Finally, once the remaining message is just 64 bits, it is is |
| 103 | * reduced to the CRC-32 using Barrett reduction (explained later). |
| 104 | * |
| 105 | * For more information see the original paper from Intel: "Fast CRC |
| 106 | * Computation for Generic Polynomials Using PCLMULQDQ |
| 107 | * Instruction" December 2009 |
| 108 | * http://www.intel.com/content/dam/www/public/us/en/documents/ |
| 109 | * white-papers/ |
| 110 | * fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf |
| 111 | */ |
| 112 | |
| 113 | #include <folly/hash/detail/ChecksumDetail.h> |
| 114 | |
| 115 | namespace folly { |
| 116 | namespace detail { |
| 117 | |
| 118 | #if FOLLY_SSE_PREREQ(4, 2) |
| 119 | |
| 120 | uint32_t |
| 121 | crc32_hw_aligned(uint32_t remainder, const __m128i* p, size_t vec_count) { |
| 122 | /* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */ |
| 123 | const __m128i multipliers_4 = _mm_set_epi32(0, 0x1D9513D7, 0, 0x8F352D95); |
| 124 | const __m128i multipliers_2 = _mm_set_epi32(0, 0x81256527, 0, 0xF1DA05AA); |
| 125 | const __m128i multipliers_1 = _mm_set_epi32(0, 0xCCAA009E, 0, 0xAE689191); |
| 126 | const __m128i final_multiplier = _mm_set_epi32(0, 0, 0, 0xB8BC6765); |
| 127 | const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); |
| 128 | const __m128i barrett_reduction_constants = |
| 129 | _mm_set_epi32(0x1, 0xDB710641, 0x1, 0xF7011641); |
| 130 | |
| 131 | const __m128i* const end = p + vec_count; |
| 132 | const __m128i* const end512 = p + (vec_count & ~3); |
| 133 | __m128i x0, x1, x2, x3; |
| 134 | |
| 135 | /* |
| 136 | * Account for the current 'remainder', i.e. the CRC of the part of |
| 137 | * the message already processed. Explanation: rewrite the message |
| 138 | * polynomial M(x) in terms of the first part A(x), the second part |
| 139 | * B(x), and the length of the second part in bits |B(x)| >= 32: |
| 140 | * |
| 141 | * M(x) = A(x)*x^|B(x)| + B(x) |
| 142 | * |
| 143 | * Then the CRC of M(x) is: |
| 144 | * |
| 145 | * CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x)) |
| 146 | * = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x)) |
| 147 | * = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x)) |
| 148 | * |
| 149 | * Note: all arithmetic is modulo G(x), the generator polynomial; that's |
| 150 | * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x). |
| 151 | * |
| 152 | * So the CRC of the full message is the CRC of the second part of the |
| 153 | * message where the first 32 bits of the second part of the message |
| 154 | * have been XOR'ed with the CRC of the first part of the message. |
| 155 | */ |
| 156 | x0 = *p++; |
| 157 | x0 = _mm_xor_si128(x0, _mm_set_epi32(0, 0, 0, remainder)); |
| 158 | |
| 159 | if (p > end512) /* only 128, 256, or 384 bits of input? */ |
| 160 | goto _128_bits_at_a_time; |
| 161 | x1 = *p++; |
| 162 | x2 = *p++; |
| 163 | x3 = *p++; |
| 164 | |
| 165 | /* Fold 512 bits at a time */ |
| 166 | for (; p != end512; p += 4) { |
| 167 | __m128i y0, y1, y2, y3; |
| 168 | |
| 169 | y0 = p[0]; |
| 170 | y1 = p[1]; |
| 171 | y2 = p[2]; |
| 172 | y3 = p[3]; |
| 173 | |
| 174 | /* |
| 175 | * Note: the immediate constant for PCLMULQDQ specifies which |
| 176 | * 64-bit halves of the 128-bit vectors to multiply: |
| 177 | * |
| 178 | * 0x00 means low halves (higher degree polynomial terms for us) |
| 179 | * 0x11 means high halves (lower degree polynomial terms for us) |
| 180 | */ |
| 181 | y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x00)); |
| 182 | y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x00)); |
| 183 | y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x00)); |
| 184 | y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x00)); |
| 185 | y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x11)); |
| 186 | y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x11)); |
| 187 | y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x11)); |
| 188 | y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x11)); |
| 189 | |
| 190 | x0 = y0; |
| 191 | x1 = y1; |
| 192 | x2 = y2; |
| 193 | x3 = y3; |
| 194 | } |
| 195 | |
| 196 | /* Fold 512 bits => 128 bits */ |
| 197 | x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x00)); |
| 198 | x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x00)); |
| 199 | x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x11)); |
| 200 | x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x11)); |
| 201 | x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x00)); |
| 202 | x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x11)); |
| 203 | x0 = x3; |
| 204 | |
| 205 | _128_bits_at_a_time: |
| 206 | while (p != end) { |
| 207 | /* Fold 128 bits into next 128 bits */ |
| 208 | x1 = *p++; |
| 209 | x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x00)); |
| 210 | x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x11)); |
| 211 | x0 = x1; |
| 212 | } |
| 213 | |
| 214 | /* Now there are just 128 bits left, stored in 'x0'. */ |
| 215 | |
| 216 | /* |
| 217 | * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, |
| 218 | * which is equivalent to multiplying by x^32. This is needed because |
| 219 | * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). |
| 220 | */ |
| 221 | x0 = _mm_xor_si128( |
| 222 | _mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, multipliers_1, 0x10)); |
| 223 | |
| 224 | /* Fold 96 => 64 bits */ |
| 225 | x0 = _mm_xor_si128( |
| 226 | _mm_srli_si128(x0, 4), |
| 227 | _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), final_multiplier, 0x00)); |
| 228 | |
| 229 | /* |
| 230 | * Finally, reduce 64 => 32 bits using Barrett reduction. |
| 231 | * |
| 232 | * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to |
| 233 | * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)): |
| 234 | * |
| 235 | * R(x) = (A(x)*x^32 + B(x)) mod G(x) |
| 236 | * = (A(x)*x^32) mod G(x) + B(x) |
| 237 | * |
| 238 | * Then, by the Division Algorithm there exists a unique q(x) such that: |
| 239 | * |
| 240 | * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x) |
| 241 | * |
| 242 | * Since the left-hand side is of maximum degree 31, the right-hand side |
| 243 | * must be too. This implies that we can apply 'mod x^32' to the |
| 244 | * right-hand side without changing its value: |
| 245 | * |
| 246 | * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32 |
| 247 | * |
| 248 | * Note that '+' is equivalent to '-' in polynomials over GF(2). |
| 249 | * |
| 250 | * We also know that: |
| 251 | * |
| 252 | * / A(x)*x^32 \ |
| 253 | * q(x) = floor ( --------- ) |
| 254 | * \ G(x) / |
| 255 | * |
| 256 | * To compute this efficiently, we can multiply the top and bottom by |
| 257 | * x^32 and move the division by G(x) to the top: |
| 258 | * |
| 259 | * / A(x) * floor(x^64 / G(x)) \ |
| 260 | * q(x) = floor ( ------------------------- ) |
| 261 | * \ x^32 / |
| 262 | * |
| 263 | * Note that floor(x^64 / G(x)) is a constant. |
| 264 | * |
| 265 | * So finally we have: |
| 266 | * |
| 267 | * / A(x) * floor(x^64 / G(x)) \ |
| 268 | * R(x) = B(x) + G(x)*floor ( ------------------------- ) |
| 269 | * \ x^32 / |
| 270 | */ |
| 271 | x1 = x0; |
| 272 | x0 = _mm_clmulepi64_si128( |
| 273 | _mm_and_si128(x0, mask32), barrett_reduction_constants, 0x00); |
| 274 | x0 = _mm_clmulepi64_si128( |
| 275 | _mm_and_si128(x0, mask32), barrett_reduction_constants, 0x10); |
| 276 | return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(x0, x1), 4)); |
| 277 | } |
| 278 | |
| 279 | #endif |
| 280 | } // namespace detail |
| 281 | } // namespace folly |
| 282 | |