1 | /* |
2 | * crc32_impl.h |
3 | * |
4 | * Copyright 2016 Eric Biggers |
5 | * |
6 | * Permission is hereby granted, free of charge, to any person |
7 | * obtaining a copy of this software and associated documentation |
8 | * files (the "Software"), to deal in the Software without |
9 | * restriction, including without limitation the rights to use, |
10 | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | * copies of the Software, and to permit persons to whom the |
12 | * Software is furnished to do so, subject to the following |
13 | * conditions: |
14 | * |
15 | * The above copyright notice and this permission notice shall be |
16 | * included in all copies or substantial portions of the Software. |
17 | * |
18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
20 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
22 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
23 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
24 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
25 | * OTHER DEALINGS IN THE SOFTWARE. |
26 | */ |
27 | |
28 | /* |
29 | * CRC-32 folding with PCLMULQDQ. |
30 | * |
31 | * The basic idea is to repeatedly "fold" each 512 bits into the next |
32 | * 512 bits, producing an abbreviated message which is congruent the |
33 | * original message modulo the generator polynomial G(x). |
34 | * |
35 | * Folding each 512 bits is implemented as eight 64-bit folds, each of |
36 | * which uses one carryless multiplication instruction. It's expected |
37 | * that CPUs may be able to execute some of these multiplications in |
38 | * parallel. |
39 | * |
40 | * Explanation of "folding": let A(x) be 64 bits from the message, and |
41 | * let B(x) be 95 bits from a constant distance D later in the |
42 | * message. The relevant portion of the message can be written as: |
43 | * |
44 | * M(x) = A(x)*x^D + B(x) |
45 | * |
46 | * ... where + and * represent addition and multiplication, |
47 | * respectively, of polynomials over GF(2). Note that when |
48 | * implemented on a computer, these operations are equivalent to XOR |
49 | * and carryless multiplication, respectively. |
50 | * |
51 | * For the purpose of CRC calculation, only the remainder modulo the |
52 | * generator polynomial G(x) matters: |
53 | * |
54 | * M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x) |
55 | * |
56 | * Since the modulo operation can be applied anywhere in a sequence of |
57 | * additions and multiplications without affecting the result, this is |
58 | * equivalent to: |
59 | * |
60 | * M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x) |
61 | * |
62 | * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree |
63 | * 31, i.e. a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is |
64 | * equivalent to a carryless multiplication of a 64-bit quantity by a |
65 | * 32-bit quantity, producing a 95-bit product. Then, adding |
66 | * (XOR-ing) the product to B(x) produces a polynomial with the same |
67 | * length as B(x) but with the same remainder as 'A(x)*x^D + B(x)'. |
68 | * This is the basic fold operation with 64 bits. |
69 | * |
70 | * Note that the carryless multiplication instruction PCLMULQDQ |
71 | * actually takes two 64-bit inputs and produces a 127-bit product in |
72 | * the low-order bits of a 128-bit XMM register. This works fine, but |
73 | * care must be taken to account for "bit endianness". With the CRC |
74 | * version implemented here, bits are always ordered such that the |
75 | * lowest-order bit represents the coefficient of highest power of x |
76 | * and the highest-order bit represents the coefficient of the lowest |
77 | * power of x. This is backwards from the more intuitive order. |
78 | * Still, carryless multiplication works essentially the same either |
79 | * way. It just must be accounted for that when we XOR the 95-bit |
80 | * product in the low-order 95 bits of a 128-bit XMM register into |
81 | * 128-bits of later data held in another XMM register, we'll really |
82 | * be XOR-ing the product into the mathematically higher degree end of |
83 | * those later bits, not the lower degree end as may be expected. |
84 | * |
85 | * So given that caveat and the fact that we process 512 bits per |
86 | * iteration, the 'D' values we need for the two 64-bit halves of each |
87 | * 128 bits of data are: |
88 | * |
89 | * D = (512 + 95) - 64 for the higher-degree half of each 128 |
90 | * bits, i.e. the lower order bits in |
91 | * the XMM register |
92 | * |
93 | * D = (512 + 95) - 128 for the lower-degree half of each 128 |
94 | * bits, i.e. the higher order bits in |
95 | * the XMM register |
96 | * |
97 | * The required 'x^D mod G(x)' values were precomputed. |
98 | * |
99 | * When <= 512 bits remain in the message, we finish up by folding |
100 | * across smaller distances. This works similarly; the distance D is |
101 | * just different, so different constant multipliers must be used. |
102 | * Finally, once the remaining message is just 64 bits, it is is |
103 | * reduced to the CRC-32 using Barrett reduction (explained later). |
104 | * |
105 | * For more information see the original paper from Intel: "Fast CRC |
106 | * Computation for Generic Polynomials Using PCLMULQDQ |
107 | * Instruction" December 2009 |
108 | * http://www.intel.com/content/dam/www/public/us/en/documents/ |
109 | * white-papers/ |
110 | * fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf |
111 | */ |
112 | |
113 | #include <folly/hash/detail/ChecksumDetail.h> |
114 | |
115 | namespace folly { |
116 | namespace detail { |
117 | |
118 | #if FOLLY_SSE_PREREQ(4, 2) |
119 | |
120 | uint32_t |
121 | crc32_hw_aligned(uint32_t remainder, const __m128i* p, size_t vec_count) { |
122 | /* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */ |
123 | const __m128i multipliers_4 = _mm_set_epi32(0, 0x1D9513D7, 0, 0x8F352D95); |
124 | const __m128i multipliers_2 = _mm_set_epi32(0, 0x81256527, 0, 0xF1DA05AA); |
125 | const __m128i multipliers_1 = _mm_set_epi32(0, 0xCCAA009E, 0, 0xAE689191); |
126 | const __m128i final_multiplier = _mm_set_epi32(0, 0, 0, 0xB8BC6765); |
127 | const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); |
128 | const __m128i barrett_reduction_constants = |
129 | _mm_set_epi32(0x1, 0xDB710641, 0x1, 0xF7011641); |
130 | |
131 | const __m128i* const end = p + vec_count; |
132 | const __m128i* const end512 = p + (vec_count & ~3); |
133 | __m128i x0, x1, x2, x3; |
134 | |
135 | /* |
136 | * Account for the current 'remainder', i.e. the CRC of the part of |
137 | * the message already processed. Explanation: rewrite the message |
138 | * polynomial M(x) in terms of the first part A(x), the second part |
139 | * B(x), and the length of the second part in bits |B(x)| >= 32: |
140 | * |
141 | * M(x) = A(x)*x^|B(x)| + B(x) |
142 | * |
143 | * Then the CRC of M(x) is: |
144 | * |
145 | * CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x)) |
146 | * = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x)) |
147 | * = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x)) |
148 | * |
149 | * Note: all arithmetic is modulo G(x), the generator polynomial; that's |
150 | * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x). |
151 | * |
152 | * So the CRC of the full message is the CRC of the second part of the |
153 | * message where the first 32 bits of the second part of the message |
154 | * have been XOR'ed with the CRC of the first part of the message. |
155 | */ |
156 | x0 = *p++; |
157 | x0 = _mm_xor_si128(x0, _mm_set_epi32(0, 0, 0, remainder)); |
158 | |
159 | if (p > end512) /* only 128, 256, or 384 bits of input? */ |
160 | goto _128_bits_at_a_time; |
161 | x1 = *p++; |
162 | x2 = *p++; |
163 | x3 = *p++; |
164 | |
165 | /* Fold 512 bits at a time */ |
166 | for (; p != end512; p += 4) { |
167 | __m128i y0, y1, y2, y3; |
168 | |
169 | y0 = p[0]; |
170 | y1 = p[1]; |
171 | y2 = p[2]; |
172 | y3 = p[3]; |
173 | |
174 | /* |
175 | * Note: the immediate constant for PCLMULQDQ specifies which |
176 | * 64-bit halves of the 128-bit vectors to multiply: |
177 | * |
178 | * 0x00 means low halves (higher degree polynomial terms for us) |
179 | * 0x11 means high halves (lower degree polynomial terms for us) |
180 | */ |
181 | y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x00)); |
182 | y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x00)); |
183 | y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x00)); |
184 | y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x00)); |
185 | y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x11)); |
186 | y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x11)); |
187 | y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x11)); |
188 | y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x11)); |
189 | |
190 | x0 = y0; |
191 | x1 = y1; |
192 | x2 = y2; |
193 | x3 = y3; |
194 | } |
195 | |
196 | /* Fold 512 bits => 128 bits */ |
197 | x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x00)); |
198 | x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x00)); |
199 | x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x11)); |
200 | x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x11)); |
201 | x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x00)); |
202 | x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x11)); |
203 | x0 = x3; |
204 | |
205 | _128_bits_at_a_time: |
206 | while (p != end) { |
207 | /* Fold 128 bits into next 128 bits */ |
208 | x1 = *p++; |
209 | x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x00)); |
210 | x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x11)); |
211 | x0 = x1; |
212 | } |
213 | |
214 | /* Now there are just 128 bits left, stored in 'x0'. */ |
215 | |
216 | /* |
217 | * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, |
218 | * which is equivalent to multiplying by x^32. This is needed because |
219 | * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). |
220 | */ |
221 | x0 = _mm_xor_si128( |
222 | _mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, multipliers_1, 0x10)); |
223 | |
224 | /* Fold 96 => 64 bits */ |
225 | x0 = _mm_xor_si128( |
226 | _mm_srli_si128(x0, 4), |
227 | _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), final_multiplier, 0x00)); |
228 | |
229 | /* |
230 | * Finally, reduce 64 => 32 bits using Barrett reduction. |
231 | * |
232 | * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to |
233 | * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)): |
234 | * |
235 | * R(x) = (A(x)*x^32 + B(x)) mod G(x) |
236 | * = (A(x)*x^32) mod G(x) + B(x) |
237 | * |
238 | * Then, by the Division Algorithm there exists a unique q(x) such that: |
239 | * |
240 | * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x) |
241 | * |
242 | * Since the left-hand side is of maximum degree 31, the right-hand side |
243 | * must be too. This implies that we can apply 'mod x^32' to the |
244 | * right-hand side without changing its value: |
245 | * |
246 | * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32 |
247 | * |
248 | * Note that '+' is equivalent to '-' in polynomials over GF(2). |
249 | * |
250 | * We also know that: |
251 | * |
252 | * / A(x)*x^32 \ |
253 | * q(x) = floor ( --------- ) |
254 | * \ G(x) / |
255 | * |
256 | * To compute this efficiently, we can multiply the top and bottom by |
257 | * x^32 and move the division by G(x) to the top: |
258 | * |
259 | * / A(x) * floor(x^64 / G(x)) \ |
260 | * q(x) = floor ( ------------------------- ) |
261 | * \ x^32 / |
262 | * |
263 | * Note that floor(x^64 / G(x)) is a constant. |
264 | * |
265 | * So finally we have: |
266 | * |
267 | * / A(x) * floor(x^64 / G(x)) \ |
268 | * R(x) = B(x) + G(x)*floor ( ------------------------- ) |
269 | * \ x^32 / |
270 | */ |
271 | x1 = x0; |
272 | x0 = _mm_clmulepi64_si128( |
273 | _mm_and_si128(x0, mask32), barrett_reduction_constants, 0x00); |
274 | x0 = _mm_clmulepi64_si128( |
275 | _mm_and_si128(x0, mask32), barrett_reduction_constants, 0x10); |
276 | return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(x0, x1), 4)); |
277 | } |
278 | |
279 | #endif |
280 | } // namespace detail |
281 | } // namespace folly |
282 | |