ChecksumDetail.cpp source code [folly/hash/detail/ChecksumDetail.cpp]

1	/*
2	* crc32_impl.h
3	*
4	* Copyright 2016 Eric Biggers
5	*
6	* Permission is hereby granted, free of charge, to any person
7	* obtaining a copy of this software and associated documentation
8	* files (the "Software"), to deal in the Software without
9	* restriction, including without limitation the rights to use,
10	* copy, modify, merge, publish, distribute, sublicense, and/or sell
11	* copies of the Software, and to permit persons to whom the
12	* Software is furnished to do so, subject to the following
13	* conditions:
14	*
15	* The above copyright notice and this permission notice shall be
16	* included in all copies or substantial portions of the Software.
17	*
18	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20	* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22	* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23	* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25	* OTHER DEALINGS IN THE SOFTWARE.
26	*/
27
28	/*
29	* CRC-32 folding with PCLMULQDQ.
30	*
31	* The basic idea is to repeatedly "fold" each 512 bits into the next
32	* 512 bits, producing an abbreviated message which is congruent the
33	* original message modulo the generator polynomial G(x).
34	*
35	* Folding each 512 bits is implemented as eight 64-bit folds, each of
36	* which uses one carryless multiplication instruction. It's expected
37	* that CPUs may be able to execute some of these multiplications in
38	* parallel.
39	*
40	* Explanation of "folding": let A(x) be 64 bits from the message, and
41	* let B(x) be 95 bits from a constant distance D later in the
42	* message. The relevant portion of the message can be written as:
43	*
44	* M(x) = A(x)*x^D + B(x)
45	*
46	* ... where + and * represent addition and multiplication,
47	* respectively, of polynomials over GF(2). Note that when
48	* implemented on a computer, these operations are equivalent to XOR
49	* and carryless multiplication, respectively.
50	*
51	* For the purpose of CRC calculation, only the remainder modulo the
52	* generator polynomial G(x) matters:
53	*
54	* M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
55	*
56	* Since the modulo operation can be applied anywhere in a sequence of
57	* additions and multiplications without affecting the result, this is
58	* equivalent to:
59	*
60	* M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
61	*
62	* For any D, 'x^D mod G(x)' will be a polynomial with maximum degree
63	* 31, i.e. a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is
64	* equivalent to a carryless multiplication of a 64-bit quantity by a
65	* 32-bit quantity, producing a 95-bit product. Then, adding
66	* (XOR-ing) the product to B(x) produces a polynomial with the same
67	* length as B(x) but with the same remainder as 'A(x)*x^D + B(x)'.
68	* This is the basic fold operation with 64 bits.
69	*
70	* Note that the carryless multiplication instruction PCLMULQDQ
71	* actually takes two 64-bit inputs and produces a 127-bit product in
72	* the low-order bits of a 128-bit XMM register. This works fine, but
73	* care must be taken to account for "bit endianness". With the CRC
74	* version implemented here, bits are always ordered such that the
75	* lowest-order bit represents the coefficient of highest power of x
76	* and the highest-order bit represents the coefficient of the lowest
77	* power of x. This is backwards from the more intuitive order.
78	* Still, carryless multiplication works essentially the same either
79	* way. It just must be accounted for that when we XOR the 95-bit
80	* product in the low-order 95 bits of a 128-bit XMM register into
81	* 128-bits of later data held in another XMM register, we'll really
82	* be XOR-ing the product into the mathematically higher degree end of
83	* those later bits, not the lower degree end as may be expected.
84	*
85	* So given that caveat and the fact that we process 512 bits per
86	* iteration, the 'D' values we need for the two 64-bit halves of each
87	* 128 bits of data are:
88	*
89	* D = (512 + 95) - 64 for the higher-degree half of each 128
90	* bits, i.e. the lower order bits in
91	* the XMM register
92	*
93	* D = (512 + 95) - 128 for the lower-degree half of each 128
94	* bits, i.e. the higher order bits in
95	* the XMM register
96	*
97	* The required 'x^D mod G(x)' values were precomputed.
98	*
99	* When <= 512 bits remain in the message, we finish up by folding
100	* across smaller distances. This works similarly; the distance D is
101	* just different, so different constant multipliers must be used.
102	* Finally, once the remaining message is just 64 bits, it is is
103	* reduced to the CRC-32 using Barrett reduction (explained later).
104	*
105	* For more information see the original paper from Intel: "Fast CRC
106	* Computation for Generic Polynomials Using PCLMULQDQ
107	* Instruction" December 2009
108	* http://www.intel.com/content/dam/www/public/us/en/documents/
109	* white-papers/
110	* fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
111	*/
112
113	#include <folly/hash/detail/ChecksumDetail.h>
114
115	namespace folly {
116	namespace detail {
117
118	#if FOLLY_SSE_PREREQ(4, 2)
119
120	uint32_t
121	crc32_hw_aligned(uint32_t remainder, const __m128i* p, size_t vec_count) {
122	/ Constants precomputed by gen_crc32_multipliers.c. Do not edit! /
123	const __m128i multipliers_4 = _mm_set_epi32(`0`, `0x1D9513D7`, `0`, `0x8F352D95`);
124	const __m128i multipliers_2 = _mm_set_epi32(`0`, `0x81256527`, `0`, `0xF1DA05AA`);
125	const __m128i multipliers_1 = _mm_set_epi32(`0`, `0xCCAA009E`, `0`, `0xAE689191`);
126	const __m128i final_multiplier = _mm_set_epi32(`0`, `0`, `0`, `0xB8BC6765`);
127	const __m128i mask32 = _mm_set_epi32(`0`, `0`, `0`, `0xFFFFFFFF`);
128	const __m128i barrett_reduction_constants =
129	_mm_set_epi32(`0x1`, `0xDB710641`, `0x1`, `0xF7011641`);
130
131	const __m128i* const end = p + vec_count;
132	const __m128i* const end512 = p + (vec_count & ~`3`);
133	__m128i x0, x1, x2, x3;
134
135	/*
136	* Account for the current 'remainder', i.e. the CRC of the part of
137	* the message already processed. Explanation: rewrite the message
138	* polynomial M(x) in terms of the first part A(x), the second part
139	* B(x), and the length of the second part in bits \|B(x)\| >= 32:
140	*
141	* M(x) = A(x)*x^\|B(x)\| + B(x)
142	*
143	* Then the CRC of M(x) is:
144	*
145	* CRC(M(x)) = CRC(A(x)*x^\|B(x)\| + B(x))
146	* = CRC(A(x)x^32x^(\|B(x)\| - 32) + B(x))
147	* = CRC(CRC(A(x))*x^(\|B(x)\| - 32) + B(x))
148	*
149	* Note: all arithmetic is modulo G(x), the generator polynomial; that's
150	* why A(x)x^32 can be replaced with CRC(A(x)) = A(x)x^32 mod G(x).
151	*
152	* So the CRC of the full message is the CRC of the second part of the
153	* message where the first 32 bits of the second part of the message
154	* have been XOR'ed with the CRC of the first part of the message.
155	*/
156	x0 = *p++;
157	x0 = _mm_xor_si128(x0, _mm_set_epi32(`0`, `0`, `0`, remainder));
158
159	if (p > end512) / only 128, 256, or 384 bits of input? /
160	goto _128_bits_at_a_time;
161	x1 = *p++;
162	x2 = *p++;
163	x3 = *p++;
164
165	/ Fold 512 bits at a time /
166	for (; p != end512; p += `4`) {
167	__m128i y0, y1, y2, y3;
168
169	y0 = p[`0`];
170	y1 = p[`1`];
171	y2 = p[`2`];
172	y3 = p[`3`];
173
174	/*
175	* Note: the immediate constant for PCLMULQDQ specifies which
176	* 64-bit halves of the 128-bit vectors to multiply:
177	*
178	* 0x00 means low halves (higher degree polynomial terms for us)
179	* 0x11 means high halves (lower degree polynomial terms for us)
180	*/
181	y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, `0x00`));
182	y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, `0x00`));
183	y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, `0x00`));
184	y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, `0x00`));
185	y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, `0x11`));
186	y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, `0x11`));
187	y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, `0x11`));
188	y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, `0x11`));
189
190	x0 = y0;
191	x1 = y1;
192	x2 = y2;
193	x3 = y3;
194	}
195
196	/ Fold 512 bits => 128 bits /
197	x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, `0x00`));
198	x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, `0x00`));
199	x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, `0x11`));
200	x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, `0x11`));
201	x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, `0x00`));
202	x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, `0x11`));
203	x0 = x3;
204
205	_128_bits_at_a_time:
206	while (p != end) {
207	/ Fold 128 bits into next 128 bits /
208	x1 = *p++;
209	x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, `0x00`));
210	x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, `0x11`));
211	x0 = x1;
212	}
213
214	/ Now there are just 128 bits left, stored in 'x0'. /
215
216	/*
217	* Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
218	* which is equivalent to multiplying by x^32. This is needed because
219	* the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
220	*/
221	x0 = _mm_xor_si128(
222	_mm_srli_si128(x0, `8`), _mm_clmulepi64_si128(x0, multipliers_1, `0x10`));
223
224	/ Fold 96 => 64 bits /
225	x0 = _mm_xor_si128(
226	_mm_srli_si128(x0, `4`),
227	_mm_clmulepi64_si128(_mm_and_si128(x0, mask32), final_multiplier, `0x00`));
228
229	/*
230	* Finally, reduce 64 => 32 bits using Barrett reduction.
231	*
232	* Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
233	* compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
234	*
235	* R(x) = (A(x)*x^32 + B(x)) mod G(x)
236	* = (A(x)*x^32) mod G(x) + B(x)
237	*
238	* Then, by the Division Algorithm there exists a unique q(x) such that:
239	*
240	* A(x)x^32 mod G(x) = A(x)x^32 - q(x)*G(x)
241	*
242	* Since the left-hand side is of maximum degree 31, the right-hand side
243	* must be too. This implies that we can apply 'mod x^32' to the
244	* right-hand side without changing its value:
245	*
246	* (A(x)x^32 - q(x)G(x)) mod x^32 = q(x)*G(x) mod x^32
247	*
248	* Note that '+' is equivalent to '-' in polynomials over GF(2).
249	*
250	* We also know that:
251	*
252	* / A(x)*x^32 \
253	* q(x) = floor ( --------- )
254	* \ G(x) /
255	*
256	* To compute this efficiently, we can multiply the top and bottom by
257	* x^32 and move the division by G(x) to the top:
258	*
259	* / A(x) * floor(x^64 / G(x)) \
260	* q(x) = floor ( ------------------------- )
261	* \ x^32 /
262	*
263	* Note that floor(x^64 / G(x)) is a constant.
264	*
265	* So finally we have:
266	*
267	* / A(x) * floor(x^64 / G(x)) \
268	* R(x) = B(x) + G(x)*floor ( ------------------------- )
269	* \ x^32 /
270	*/
271	x1 = x0;
272	x0 = _mm_clmulepi64_si128(
273	_mm_and_si128(x0, mask32), barrett_reduction_constants, `0x00`);
274	x0 = _mm_clmulepi64_si128(
275	_mm_and_si128(x0, mask32), barrett_reduction_constants, `0x10`);
276	return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(x0, x1), `4`));
277	}
278
279	#endif
280	} // namespace detail
281	} // namespace folly
282

Browse the source code of folly/hash/detail/ChecksumDetail.cpp