1/*
2 * crc32_impl.h
3 *
4 * Copyright 2016 Eric Biggers
5 *
6 * Permission is hereby granted, free of charge, to any person
7 * obtaining a copy of this software and associated documentation
8 * files (the "Software"), to deal in the Software without
9 * restriction, including without limitation the rights to use,
10 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following
13 * conditions:
14 *
15 * The above copyright notice and this permission notice shall be
16 * included in all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 * OTHER DEALINGS IN THE SOFTWARE.
26 */
27
28/*
29 * CRC-32 folding with PCLMULQDQ.
30 *
31 * The basic idea is to repeatedly "fold" each 512 bits into the next
32 * 512 bits, producing an abbreviated message which is congruent the
33 * original message modulo the generator polynomial G(x).
34 *
35 * Folding each 512 bits is implemented as eight 64-bit folds, each of
36 * which uses one carryless multiplication instruction. It's expected
37 * that CPUs may be able to execute some of these multiplications in
38 * parallel.
39 *
40 * Explanation of "folding": let A(x) be 64 bits from the message, and
41 * let B(x) be 95 bits from a constant distance D later in the
42 * message. The relevant portion of the message can be written as:
43 *
44 * M(x) = A(x)*x^D + B(x)
45 *
46 * ... where + and * represent addition and multiplication,
47 * respectively, of polynomials over GF(2). Note that when
48 * implemented on a computer, these operations are equivalent to XOR
49 * and carryless multiplication, respectively.
50 *
51 * For the purpose of CRC calculation, only the remainder modulo the
52 * generator polynomial G(x) matters:
53 *
54 * M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
55 *
56 * Since the modulo operation can be applied anywhere in a sequence of
57 * additions and multiplications without affecting the result, this is
58 * equivalent to:
59 *
60 * M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
61 *
62 * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree
63 * 31, i.e. a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is
64 * equivalent to a carryless multiplication of a 64-bit quantity by a
65 * 32-bit quantity, producing a 95-bit product. Then, adding
66 * (XOR-ing) the product to B(x) produces a polynomial with the same
67 * length as B(x) but with the same remainder as 'A(x)*x^D + B(x)'.
68 * This is the basic fold operation with 64 bits.
69 *
70 * Note that the carryless multiplication instruction PCLMULQDQ
71 * actually takes two 64-bit inputs and produces a 127-bit product in
72 * the low-order bits of a 128-bit XMM register. This works fine, but
73 * care must be taken to account for "bit endianness". With the CRC
74 * version implemented here, bits are always ordered such that the
75 * lowest-order bit represents the coefficient of highest power of x
76 * and the highest-order bit represents the coefficient of the lowest
77 * power of x. This is backwards from the more intuitive order.
78 * Still, carryless multiplication works essentially the same either
79 * way. It just must be accounted for that when we XOR the 95-bit
80 * product in the low-order 95 bits of a 128-bit XMM register into
81 * 128-bits of later data held in another XMM register, we'll really
82 * be XOR-ing the product into the mathematically higher degree end of
83 * those later bits, not the lower degree end as may be expected.
84 *
85 * So given that caveat and the fact that we process 512 bits per
86 * iteration, the 'D' values we need for the two 64-bit halves of each
87 * 128 bits of data are:
88 *
89 * D = (512 + 95) - 64 for the higher-degree half of each 128
90 * bits, i.e. the lower order bits in
91 * the XMM register
92 *
93 * D = (512 + 95) - 128 for the lower-degree half of each 128
94 * bits, i.e. the higher order bits in
95 * the XMM register
96 *
97 * The required 'x^D mod G(x)' values were precomputed.
98 *
99 * When <= 512 bits remain in the message, we finish up by folding
100 * across smaller distances. This works similarly; the distance D is
101 * just different, so different constant multipliers must be used.
102 * Finally, once the remaining message is just 64 bits, it is is
103 * reduced to the CRC-32 using Barrett reduction (explained later).
104 *
105 * For more information see the original paper from Intel: "Fast CRC
106 * Computation for Generic Polynomials Using PCLMULQDQ
107 * Instruction" December 2009
108 * http://www.intel.com/content/dam/www/public/us/en/documents/
109 * white-papers/
110 * fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
111 */
112
113#include <folly/hash/detail/ChecksumDetail.h>
114
115namespace folly {
116namespace detail {
117
118#if FOLLY_SSE_PREREQ(4, 2)
119
120uint32_t
121crc32_hw_aligned(uint32_t remainder, const __m128i* p, size_t vec_count) {
122 /* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */
123 const __m128i multipliers_4 = _mm_set_epi32(0, 0x1D9513D7, 0, 0x8F352D95);
124 const __m128i multipliers_2 = _mm_set_epi32(0, 0x81256527, 0, 0xF1DA05AA);
125 const __m128i multipliers_1 = _mm_set_epi32(0, 0xCCAA009E, 0, 0xAE689191);
126 const __m128i final_multiplier = _mm_set_epi32(0, 0, 0, 0xB8BC6765);
127 const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
128 const __m128i barrett_reduction_constants =
129 _mm_set_epi32(0x1, 0xDB710641, 0x1, 0xF7011641);
130
131 const __m128i* const end = p + vec_count;
132 const __m128i* const end512 = p + (vec_count & ~3);
133 __m128i x0, x1, x2, x3;
134
135 /*
136 * Account for the current 'remainder', i.e. the CRC of the part of
137 * the message already processed. Explanation: rewrite the message
138 * polynomial M(x) in terms of the first part A(x), the second part
139 * B(x), and the length of the second part in bits |B(x)| >= 32:
140 *
141 * M(x) = A(x)*x^|B(x)| + B(x)
142 *
143 * Then the CRC of M(x) is:
144 *
145 * CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
146 * = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
147 * = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
148 *
149 * Note: all arithmetic is modulo G(x), the generator polynomial; that's
150 * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
151 *
152 * So the CRC of the full message is the CRC of the second part of the
153 * message where the first 32 bits of the second part of the message
154 * have been XOR'ed with the CRC of the first part of the message.
155 */
156 x0 = *p++;
157 x0 = _mm_xor_si128(x0, _mm_set_epi32(0, 0, 0, remainder));
158
159 if (p > end512) /* only 128, 256, or 384 bits of input? */
160 goto _128_bits_at_a_time;
161 x1 = *p++;
162 x2 = *p++;
163 x3 = *p++;
164
165 /* Fold 512 bits at a time */
166 for (; p != end512; p += 4) {
167 __m128i y0, y1, y2, y3;
168
169 y0 = p[0];
170 y1 = p[1];
171 y2 = p[2];
172 y3 = p[3];
173
174 /*
175 * Note: the immediate constant for PCLMULQDQ specifies which
176 * 64-bit halves of the 128-bit vectors to multiply:
177 *
178 * 0x00 means low halves (higher degree polynomial terms for us)
179 * 0x11 means high halves (lower degree polynomial terms for us)
180 */
181 y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x00));
182 y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x00));
183 y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x00));
184 y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x00));
185 y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x11));
186 y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x11));
187 y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x11));
188 y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x11));
189
190 x0 = y0;
191 x1 = y1;
192 x2 = y2;
193 x3 = y3;
194 }
195
196 /* Fold 512 bits => 128 bits */
197 x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x00));
198 x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x00));
199 x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x11));
200 x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x11));
201 x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x00));
202 x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x11));
203 x0 = x3;
204
205_128_bits_at_a_time:
206 while (p != end) {
207 /* Fold 128 bits into next 128 bits */
208 x1 = *p++;
209 x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x00));
210 x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x11));
211 x0 = x1;
212 }
213
214 /* Now there are just 128 bits left, stored in 'x0'. */
215
216 /*
217 * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
218 * which is equivalent to multiplying by x^32. This is needed because
219 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
220 */
221 x0 = _mm_xor_si128(
222 _mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, multipliers_1, 0x10));
223
224 /* Fold 96 => 64 bits */
225 x0 = _mm_xor_si128(
226 _mm_srli_si128(x0, 4),
227 _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), final_multiplier, 0x00));
228
229 /*
230 * Finally, reduce 64 => 32 bits using Barrett reduction.
231 *
232 * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
233 * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
234 *
235 * R(x) = (A(x)*x^32 + B(x)) mod G(x)
236 * = (A(x)*x^32) mod G(x) + B(x)
237 *
238 * Then, by the Division Algorithm there exists a unique q(x) such that:
239 *
240 * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
241 *
242 * Since the left-hand side is of maximum degree 31, the right-hand side
243 * must be too. This implies that we can apply 'mod x^32' to the
244 * right-hand side without changing its value:
245 *
246 * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
247 *
248 * Note that '+' is equivalent to '-' in polynomials over GF(2).
249 *
250 * We also know that:
251 *
252 * / A(x)*x^32 \
253 * q(x) = floor ( --------- )
254 * \ G(x) /
255 *
256 * To compute this efficiently, we can multiply the top and bottom by
257 * x^32 and move the division by G(x) to the top:
258 *
259 * / A(x) * floor(x^64 / G(x)) \
260 * q(x) = floor ( ------------------------- )
261 * \ x^32 /
262 *
263 * Note that floor(x^64 / G(x)) is a constant.
264 *
265 * So finally we have:
266 *
267 * / A(x) * floor(x^64 / G(x)) \
268 * R(x) = B(x) + G(x)*floor ( ------------------------- )
269 * \ x^32 /
270 */
271 x1 = x0;
272 x0 = _mm_clmulepi64_si128(
273 _mm_and_si128(x0, mask32), barrett_reduction_constants, 0x00);
274 x0 = _mm_clmulepi64_si128(
275 _mm_and_si128(x0, mask32), barrett_reduction_constants, 0x10);
276 return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(x0, x1), 4));
277}
278
279#endif
280} // namespace detail
281} // namespace folly
282