1 | /* Copied from http://stackoverflow.com/a/17646775/1821055 |
2 | * with the following modifications: |
3 | * * remove test code |
4 | * * global hw/sw initialization to be called once per process |
5 | * * HW support is determined by configure's WITH_CRC32C_HW |
6 | * * Windows porting (no hardware support on Windows yet) |
7 | * |
8 | * FIXME: |
9 | * * Hardware support on Windows (MSVC assembler) |
10 | * * Hardware support on ARM |
11 | */ |
12 | |
13 | /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction |
14 | * Copyright (C) 2013 Mark Adler |
15 | * Version 1.1 1 Aug 2013 Mark Adler |
16 | */ |
17 | |
18 | /* |
19 | This software is provided 'as-is', without any express or implied |
20 | warranty. In no event will the author be held liable for any damages |
21 | arising from the use of this software. |
22 | |
23 | Permission is granted to anyone to use this software for any purpose, |
24 | including commercial applications, and to alter it and redistribute it |
25 | freely, subject to the following restrictions: |
26 | |
27 | 1. The origin of this software must not be misrepresented; you must not |
28 | claim that you wrote the original software. If you use this software |
29 | in a product, an acknowledgment in the product documentation would be |
30 | appreciated but is not required. |
31 | 2. Altered source versions must be plainly marked as such, and must not be |
32 | misrepresented as being the original software. |
33 | 3. This notice may not be removed or altered from any source distribution. |
34 | |
35 | Mark Adler |
36 | madler@alumni.caltech.edu |
37 | */ |
38 | |
39 | /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a |
40 | CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software |
41 | version is provided as a fall-back, as well as for speed comparisons. */ |
42 | |
43 | /* Version history: |
44 | 1.0 10 Feb 2013 First version |
45 | 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel |
46 | */ |
47 | |
48 | #include "rd.h" |
49 | |
50 | #include <stdio.h> |
51 | #include <stdlib.h> |
52 | #include <stdint.h> |
53 | #ifndef _MSC_VER |
54 | #include <unistd.h> |
55 | #endif |
56 | |
57 | #include "rdunittest.h" |
58 | #include "rdendian.h" |
59 | |
60 | #include "crc32c.h" |
61 | |
62 | /* CRC-32C (iSCSI) polynomial in reversed bit order. */ |
63 | #define POLY 0x82f63b78 |
64 | |
65 | /* Table for a quadword-at-a-time software crc. */ |
66 | static uint32_t crc32c_table[8][256]; |
67 | |
68 | /* Construct table for software CRC-32C calculation. */ |
69 | static void crc32c_init_sw(void) |
70 | { |
71 | uint32_t n, crc, k; |
72 | |
73 | for (n = 0; n < 256; n++) { |
74 | crc = n; |
75 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
76 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
77 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
78 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
79 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
80 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
81 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
82 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
83 | crc32c_table[0][n] = crc; |
84 | } |
85 | for (n = 0; n < 256; n++) { |
86 | crc = crc32c_table[0][n]; |
87 | for (k = 1; k < 8; k++) { |
88 | crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8); |
89 | crc32c_table[k][n] = crc; |
90 | } |
91 | } |
92 | } |
93 | |
94 | /* Table-driven software version as a fall-back. This is about 15 times slower |
95 | than using the hardware instructions. This assumes little-endian integers, |
96 | as is the case on Intel processors that the assembler code here is for. */ |
97 | static uint32_t crc32c_sw(uint32_t crci, const void *buf, size_t len) |
98 | { |
99 | const unsigned char *next = buf; |
100 | uint64_t crc; |
101 | |
102 | crc = crci ^ 0xffffffff; |
103 | while (len && ((uintptr_t)next & 7) != 0) { |
104 | crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); |
105 | len--; |
106 | } |
107 | while (len >= 8) { |
108 | #if defined(__sparc) || defined(__sparc__) || defined(__APPLE__) || defined(__mips__) || defined(__arm__) |
109 | /* Alignment-safe alternative. |
110 | * This is also needed on Apple to avoid compilation warnings for |
111 | * non-appearant alignment reasons. */ |
112 | uint64_t ncopy; |
113 | memcpy(&ncopy, next, sizeof(ncopy)); |
114 | crc ^= le64toh(ncopy); |
115 | #else |
116 | crc ^= le64toh(*(uint64_t *)next); |
117 | #endif |
118 | crc = crc32c_table[7][crc & 0xff] ^ |
119 | crc32c_table[6][(crc >> 8) & 0xff] ^ |
120 | crc32c_table[5][(crc >> 16) & 0xff] ^ |
121 | crc32c_table[4][(crc >> 24) & 0xff] ^ |
122 | crc32c_table[3][(crc >> 32) & 0xff] ^ |
123 | crc32c_table[2][(crc >> 40) & 0xff] ^ |
124 | crc32c_table[1][(crc >> 48) & 0xff] ^ |
125 | crc32c_table[0][crc >> 56]; |
126 | next += 8; |
127 | len -= 8; |
128 | } |
129 | while (len) { |
130 | crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); |
131 | len--; |
132 | } |
133 | return (uint32_t)crc ^ 0xffffffff; |
134 | } |
135 | |
136 | |
137 | #if WITH_CRC32C_HW |
138 | static int sse42; /* Cached SSE42 support */ |
139 | |
140 | /* Multiply a matrix times a vector over the Galois field of two elements, |
141 | GF(2). Each element is a bit in an unsigned integer. mat must have at |
142 | least as many entries as the power of two for most significant one bit in |
143 | vec. */ |
144 | static RD_INLINE uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) |
145 | { |
146 | uint32_t sum; |
147 | |
148 | sum = 0; |
149 | while (vec) { |
150 | if (vec & 1) |
151 | sum ^= *mat; |
152 | vec >>= 1; |
153 | mat++; |
154 | } |
155 | return sum; |
156 | } |
157 | |
158 | /* Multiply a matrix by itself over GF(2). Both mat and square must have 32 |
159 | rows. */ |
160 | static RD_INLINE void gf2_matrix_square(uint32_t *square, uint32_t *mat) |
161 | { |
162 | int n; |
163 | |
164 | for (n = 0; n < 32; n++) |
165 | square[n] = gf2_matrix_times(mat, mat[n]); |
166 | } |
167 | |
168 | /* Construct an operator to apply len zeros to a crc. len must be a power of |
169 | two. If len is not a power of two, then the result is the same as for the |
170 | largest power of two less than len. The result for len == 0 is the same as |
171 | for len == 1. A version of this routine could be easily written for any |
172 | len, but that is not needed for this application. */ |
173 | static void crc32c_zeros_op(uint32_t *even, size_t len) |
174 | { |
175 | int n; |
176 | uint32_t row; |
177 | uint32_t odd[32]; /* odd-power-of-two zeros operator */ |
178 | |
179 | /* put operator for one zero bit in odd */ |
180 | odd[0] = POLY; /* CRC-32C polynomial */ |
181 | row = 1; |
182 | for (n = 1; n < 32; n++) { |
183 | odd[n] = row; |
184 | row <<= 1; |
185 | } |
186 | |
187 | /* put operator for two zero bits in even */ |
188 | gf2_matrix_square(even, odd); |
189 | |
190 | /* put operator for four zero bits in odd */ |
191 | gf2_matrix_square(odd, even); |
192 | |
193 | /* first square will put the operator for one zero byte (eight zero bits), |
194 | in even -- next square puts operator for two zero bytes in odd, and so |
195 | on, until len has been rotated down to zero */ |
196 | do { |
197 | gf2_matrix_square(even, odd); |
198 | len >>= 1; |
199 | if (len == 0) |
200 | return; |
201 | gf2_matrix_square(odd, even); |
202 | len >>= 1; |
203 | } while (len); |
204 | |
205 | /* answer ended up in odd -- copy to even */ |
206 | for (n = 0; n < 32; n++) |
207 | even[n] = odd[n]; |
208 | } |
209 | |
210 | /* Take a length and build four lookup tables for applying the zeros operator |
211 | for that length, byte-by-byte on the operand. */ |
212 | static void crc32c_zeros(uint32_t zeros[][256], size_t len) |
213 | { |
214 | uint32_t n; |
215 | uint32_t op[32]; |
216 | |
217 | crc32c_zeros_op(op, len); |
218 | for (n = 0; n < 256; n++) { |
219 | zeros[0][n] = gf2_matrix_times(op, n); |
220 | zeros[1][n] = gf2_matrix_times(op, n << 8); |
221 | zeros[2][n] = gf2_matrix_times(op, n << 16); |
222 | zeros[3][n] = gf2_matrix_times(op, n << 24); |
223 | } |
224 | } |
225 | |
226 | /* Apply the zeros operator table to crc. */ |
227 | static RD_INLINE uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) |
228 | { |
229 | return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^ |
230 | zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24]; |
231 | } |
232 | |
233 | /* Block sizes for three-way parallel crc computation. LONG and SHORT must |
234 | both be powers of two. The associated string constants must be set |
235 | accordingly, for use in constructing the assembler instructions. */ |
236 | #define LONG 8192 |
237 | #define LONGx1 "8192" |
238 | #define LONGx2 "16384" |
239 | #define SHORT 256 |
240 | #define SHORTx1 "256" |
241 | #define SHORTx2 "512" |
242 | |
243 | /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */ |
244 | static uint32_t crc32c_long[4][256]; |
245 | static uint32_t crc32c_short[4][256]; |
246 | |
247 | /* Initialize tables for shifting crcs. */ |
248 | static void crc32c_init_hw(void) |
249 | { |
250 | crc32c_zeros(crc32c_long, LONG); |
251 | crc32c_zeros(crc32c_short, SHORT); |
252 | } |
253 | |
254 | /* Compute CRC-32C using the Intel hardware instruction. */ |
255 | static uint32_t crc32c_hw(uint32_t crc, const void *buf, size_t len) |
256 | { |
257 | const unsigned char *next = buf; |
258 | const unsigned char *end; |
259 | uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */ |
260 | |
261 | /* pre-process the crc */ |
262 | crc0 = crc ^ 0xffffffff; |
263 | |
264 | /* compute the crc for up to seven leading bytes to bring the data pointer |
265 | to an eight-byte boundary */ |
266 | while (len && ((uintptr_t)next & 7) != 0) { |
267 | __asm__("crc32b\t" "(%1), %0" |
268 | : "=r" (crc0) |
269 | : "r" (next), "0" (crc0)); |
270 | next++; |
271 | len--; |
272 | } |
273 | |
274 | /* compute the crc on sets of LONG*3 bytes, executing three independent crc |
275 | instructions, each on LONG bytes -- this is optimized for the Nehalem, |
276 | Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a |
277 | throughput of one crc per cycle, but a latency of three cycles */ |
278 | while (len >= LONG*3) { |
279 | crc1 = 0; |
280 | crc2 = 0; |
281 | end = next + LONG; |
282 | do { |
283 | __asm__("crc32q\t" "(%3), %0\n\t" |
284 | "crc32q\t" LONGx1 "(%3), %1\n\t" |
285 | "crc32q\t" LONGx2 "(%3), %2" |
286 | : "=r" (crc0), "=r" (crc1), "=r" (crc2) |
287 | : "r" (next), "0" (crc0), "1" (crc1), "2" (crc2)); |
288 | next += 8; |
289 | } while (next < end); |
290 | crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1; |
291 | crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2; |
292 | next += LONG*2; |
293 | len -= LONG*3; |
294 | } |
295 | |
296 | /* do the same thing, but now on SHORT*3 blocks for the remaining data less |
297 | than a LONG*3 block */ |
298 | while (len >= SHORT*3) { |
299 | crc1 = 0; |
300 | crc2 = 0; |
301 | end = next + SHORT; |
302 | do { |
303 | __asm__("crc32q\t" "(%3), %0\n\t" |
304 | "crc32q\t" SHORTx1 "(%3), %1\n\t" |
305 | "crc32q\t" SHORTx2 "(%3), %2" |
306 | : "=r" (crc0), "=r" (crc1), "=r" (crc2) |
307 | : "r" (next), "0" (crc0), "1" (crc1), "2" (crc2)); |
308 | next += 8; |
309 | } while (next < end); |
310 | crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1; |
311 | crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2; |
312 | next += SHORT*2; |
313 | len -= SHORT*3; |
314 | } |
315 | |
316 | /* compute the crc on the remaining eight-byte units less than a SHORT*3 |
317 | block */ |
318 | end = next + (len - (len & 7)); |
319 | while (next < end) { |
320 | __asm__("crc32q\t" "(%1), %0" |
321 | : "=r" (crc0) |
322 | : "r" (next), "0" (crc0)); |
323 | next += 8; |
324 | } |
325 | len &= 7; |
326 | |
327 | /* compute the crc for up to seven trailing bytes */ |
328 | while (len) { |
329 | __asm__("crc32b\t" "(%1), %0" |
330 | : "=r" (crc0) |
331 | : "r" (next), "0" (crc0)); |
332 | next++; |
333 | len--; |
334 | } |
335 | |
336 | /* return a post-processed crc */ |
337 | return (uint32_t)crc0 ^ 0xffffffff; |
338 | } |
339 | |
340 | /* Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors |
341 | introduced in November, 2008. This does not check for the existence of the |
342 | cpuid instruction itself, which was introduced on the 486SL in 1992, so this |
343 | will fail on earlier x86 processors. cpuid works on all Pentium and later |
344 | processors. */ |
345 | #define SSE42(have) \ |
346 | do { \ |
347 | uint32_t eax, ecx; \ |
348 | eax = 1; \ |
349 | __asm__("cpuid" \ |
350 | : "=c"(ecx) \ |
351 | : "a"(eax) \ |
352 | : "%ebx", "%edx"); \ |
353 | (have) = (ecx >> 20) & 1; \ |
354 | } while (0) |
355 | |
356 | #endif /* WITH_CRC32C_HW */ |
357 | |
358 | /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware |
359 | version. Otherwise, use the software version. */ |
360 | uint32_t crc32c(uint32_t crc, const void *buf, size_t len) |
361 | { |
362 | #if WITH_CRC32C_HW |
363 | if (sse42) |
364 | return crc32c_hw(crc, buf, len); |
365 | else |
366 | #endif |
367 | return crc32c_sw(crc, buf, len); |
368 | } |
369 | |
370 | |
371 | |
372 | |
373 | |
374 | |
375 | /** |
376 | * @brief Populate shift tables once |
377 | */ |
378 | void crc32c_global_init (void) { |
379 | #if WITH_CRC32C_HW |
380 | SSE42(sse42); |
381 | if (sse42) |
382 | crc32c_init_hw(); |
383 | else |
384 | #endif |
385 | crc32c_init_sw(); |
386 | } |
387 | |
388 | int unittest_crc32c (void) { |
389 | const char *buf = |
390 | " This software is provided 'as-is', without any express or implied\n" |
391 | " warranty. In no event will the author be held liable for any damages\n" |
392 | " arising from the use of this software.\n" |
393 | "\n" |
394 | " Permission is granted to anyone to use this software for any purpose,\n" |
395 | " including commercial applications, and to alter it and redistribute it\n" |
396 | " freely, subject to the following restrictions:\n" |
397 | "\n" |
398 | " 1. The origin of this software must not be misrepresented; you must not\n" |
399 | " claim that you wrote the original software. If you use this software\n" |
400 | " in a product, an acknowledgment in the product documentation would be\n" |
401 | " appreciated but is not required.\n" |
402 | " 2. Altered source versions must be plainly marked as such, and must not be\n" |
403 | " misrepresented as being the original software.\n" |
404 | " 3. This notice may not be removed or altered from any source distribution." ; |
405 | const uint32_t expected_crc = 0x7dcde113; |
406 | uint32_t crc; |
407 | const char *how; |
408 | |
409 | crc32c_global_init(); |
410 | |
411 | #if WITH_CRC32C_HW |
412 | if (sse42) |
413 | how = "hardware (SSE42)" ; |
414 | else |
415 | how = "software (SSE42 supported in build but not at runtime)" ; |
416 | #else |
417 | how = "software" ; |
418 | #endif |
419 | RD_UT_SAY("Calculate CRC32C using %s" , how); |
420 | |
421 | crc = crc32c(0, buf, strlen(buf)); |
422 | RD_UT_ASSERT(crc == expected_crc, |
423 | "Calculated CRC (%s) 0x%" PRIx32 |
424 | " not matching expected CRC 0x%" PRIx32, |
425 | how, crc, expected_crc); |
426 | |
427 | /* Verify software version too, regardless of which |
428 | * version was used above. */ |
429 | crc32c_init_sw(); |
430 | RD_UT_SAY("Calculate CRC32C using software" ); |
431 | crc = crc32c_sw(0, buf, strlen(buf)); |
432 | RD_UT_ASSERT(crc == expected_crc, |
433 | "Calculated CRC (software) 0x%" PRIx32 |
434 | " not matching expected CRC 0x%" PRIx32, |
435 | crc, expected_crc); |
436 | |
437 | RD_UT_PASS(); |
438 | } |
439 | |