1/* Copied from http://stackoverflow.com/a/17646775/1821055
2 * with the following modifications:
3 * * remove test code
4 * * global hw/sw initialization to be called once per process
5 * * HW support is determined by configure's WITH_CRC32C_HW
6 * * Windows porting (no hardware support on Windows yet)
7 *
8 * FIXME:
9 * * Hardware support on Windows (MSVC assembler)
10 * * Hardware support on ARM
11 */
12
13/* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
14 * Copyright (C) 2013 Mark Adler
15 * Version 1.1 1 Aug 2013 Mark Adler
16 */
17
18/*
19 This software is provided 'as-is', without any express or implied
20 warranty. In no event will the author be held liable for any damages
21 arising from the use of this software.
22
23 Permission is granted to anyone to use this software for any purpose,
24 including commercial applications, and to alter it and redistribute it
25 freely, subject to the following restrictions:
26
27 1. The origin of this software must not be misrepresented; you must not
28 claim that you wrote the original software. If you use this software
29 in a product, an acknowledgment in the product documentation would be
30 appreciated but is not required.
31 2. Altered source versions must be plainly marked as such, and must not be
32 misrepresented as being the original software.
33 3. This notice may not be removed or altered from any source distribution.
34
35 Mark Adler
36 madler@alumni.caltech.edu
37 */
38
39/* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
40 CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
41 version is provided as a fall-back, as well as for speed comparisons. */
42
43/* Version history:
44 1.0 10 Feb 2013 First version
45 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
46 */
47
48#include "rd.h"
49
50#include <stdio.h>
51#include <stdlib.h>
52#include <stdint.h>
53#ifndef _MSC_VER
54#include <unistd.h>
55#endif
56
57#include "rdunittest.h"
58#include "rdendian.h"
59
60#include "crc32c.h"
61
62/* CRC-32C (iSCSI) polynomial in reversed bit order. */
63#define POLY 0x82f63b78
64
65/* Table for a quadword-at-a-time software crc. */
66static uint32_t crc32c_table[8][256];
67
68/* Construct table for software CRC-32C calculation. */
69static void crc32c_init_sw(void)
70{
71 uint32_t n, crc, k;
72
73 for (n = 0; n < 256; n++) {
74 crc = n;
75 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
76 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
77 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
78 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
79 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
80 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
81 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
82 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
83 crc32c_table[0][n] = crc;
84 }
85 for (n = 0; n < 256; n++) {
86 crc = crc32c_table[0][n];
87 for (k = 1; k < 8; k++) {
88 crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8);
89 crc32c_table[k][n] = crc;
90 }
91 }
92}
93
94/* Table-driven software version as a fall-back. This is about 15 times slower
95 than using the hardware instructions. This assumes little-endian integers,
96 as is the case on Intel processors that the assembler code here is for. */
97static uint32_t crc32c_sw(uint32_t crci, const void *buf, size_t len)
98{
99 const unsigned char *next = buf;
100 uint64_t crc;
101
102 crc = crci ^ 0xffffffff;
103 while (len && ((uintptr_t)next & 7) != 0) {
104 crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
105 len--;
106 }
107 while (len >= 8) {
108#if defined(__sparc) || defined(__sparc__) || defined(__APPLE__) || defined(__mips__) || defined(__arm__)
109 /* Alignment-safe alternative.
110 * This is also needed on Apple to avoid compilation warnings for
111 * non-appearant alignment reasons. */
112 uint64_t ncopy;
113 memcpy(&ncopy, next, sizeof(ncopy));
114 crc ^= le64toh(ncopy);
115#else
116 crc ^= le64toh(*(uint64_t *)next);
117#endif
118 crc = crc32c_table[7][crc & 0xff] ^
119 crc32c_table[6][(crc >> 8) & 0xff] ^
120 crc32c_table[5][(crc >> 16) & 0xff] ^
121 crc32c_table[4][(crc >> 24) & 0xff] ^
122 crc32c_table[3][(crc >> 32) & 0xff] ^
123 crc32c_table[2][(crc >> 40) & 0xff] ^
124 crc32c_table[1][(crc >> 48) & 0xff] ^
125 crc32c_table[0][crc >> 56];
126 next += 8;
127 len -= 8;
128 }
129 while (len) {
130 crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
131 len--;
132 }
133 return (uint32_t)crc ^ 0xffffffff;
134}
135
136
137#if WITH_CRC32C_HW
138static int sse42; /* Cached SSE42 support */
139
140/* Multiply a matrix times a vector over the Galois field of two elements,
141 GF(2). Each element is a bit in an unsigned integer. mat must have at
142 least as many entries as the power of two for most significant one bit in
143 vec. */
144static RD_INLINE uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec)
145{
146 uint32_t sum;
147
148 sum = 0;
149 while (vec) {
150 if (vec & 1)
151 sum ^= *mat;
152 vec >>= 1;
153 mat++;
154 }
155 return sum;
156}
157
158/* Multiply a matrix by itself over GF(2). Both mat and square must have 32
159 rows. */
160static RD_INLINE void gf2_matrix_square(uint32_t *square, uint32_t *mat)
161{
162 int n;
163
164 for (n = 0; n < 32; n++)
165 square[n] = gf2_matrix_times(mat, mat[n]);
166}
167
168/* Construct an operator to apply len zeros to a crc. len must be a power of
169 two. If len is not a power of two, then the result is the same as for the
170 largest power of two less than len. The result for len == 0 is the same as
171 for len == 1. A version of this routine could be easily written for any
172 len, but that is not needed for this application. */
173static void crc32c_zeros_op(uint32_t *even, size_t len)
174{
175 int n;
176 uint32_t row;
177 uint32_t odd[32]; /* odd-power-of-two zeros operator */
178
179 /* put operator for one zero bit in odd */
180 odd[0] = POLY; /* CRC-32C polynomial */
181 row = 1;
182 for (n = 1; n < 32; n++) {
183 odd[n] = row;
184 row <<= 1;
185 }
186
187 /* put operator for two zero bits in even */
188 gf2_matrix_square(even, odd);
189
190 /* put operator for four zero bits in odd */
191 gf2_matrix_square(odd, even);
192
193 /* first square will put the operator for one zero byte (eight zero bits),
194 in even -- next square puts operator for two zero bytes in odd, and so
195 on, until len has been rotated down to zero */
196 do {
197 gf2_matrix_square(even, odd);
198 len >>= 1;
199 if (len == 0)
200 return;
201 gf2_matrix_square(odd, even);
202 len >>= 1;
203 } while (len);
204
205 /* answer ended up in odd -- copy to even */
206 for (n = 0; n < 32; n++)
207 even[n] = odd[n];
208}
209
210/* Take a length and build four lookup tables for applying the zeros operator
211 for that length, byte-by-byte on the operand. */
212static void crc32c_zeros(uint32_t zeros[][256], size_t len)
213{
214 uint32_t n;
215 uint32_t op[32];
216
217 crc32c_zeros_op(op, len);
218 for (n = 0; n < 256; n++) {
219 zeros[0][n] = gf2_matrix_times(op, n);
220 zeros[1][n] = gf2_matrix_times(op, n << 8);
221 zeros[2][n] = gf2_matrix_times(op, n << 16);
222 zeros[3][n] = gf2_matrix_times(op, n << 24);
223 }
224}
225
226/* Apply the zeros operator table to crc. */
227static RD_INLINE uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc)
228{
229 return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
230 zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
231}
232
233/* Block sizes for three-way parallel crc computation. LONG and SHORT must
234 both be powers of two. The associated string constants must be set
235 accordingly, for use in constructing the assembler instructions. */
236#define LONG 8192
237#define LONGx1 "8192"
238#define LONGx2 "16384"
239#define SHORT 256
240#define SHORTx1 "256"
241#define SHORTx2 "512"
242
243/* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
244static uint32_t crc32c_long[4][256];
245static uint32_t crc32c_short[4][256];
246
247/* Initialize tables for shifting crcs. */
248static void crc32c_init_hw(void)
249{
250 crc32c_zeros(crc32c_long, LONG);
251 crc32c_zeros(crc32c_short, SHORT);
252}
253
254/* Compute CRC-32C using the Intel hardware instruction. */
255static uint32_t crc32c_hw(uint32_t crc, const void *buf, size_t len)
256{
257 const unsigned char *next = buf;
258 const unsigned char *end;
259 uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */
260
261 /* pre-process the crc */
262 crc0 = crc ^ 0xffffffff;
263
264 /* compute the crc for up to seven leading bytes to bring the data pointer
265 to an eight-byte boundary */
266 while (len && ((uintptr_t)next & 7) != 0) {
267 __asm__("crc32b\t" "(%1), %0"
268 : "=r"(crc0)
269 : "r"(next), "0"(crc0));
270 next++;
271 len--;
272 }
273
274 /* compute the crc on sets of LONG*3 bytes, executing three independent crc
275 instructions, each on LONG bytes -- this is optimized for the Nehalem,
276 Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
277 throughput of one crc per cycle, but a latency of three cycles */
278 while (len >= LONG*3) {
279 crc1 = 0;
280 crc2 = 0;
281 end = next + LONG;
282 do {
283 __asm__("crc32q\t" "(%3), %0\n\t"
284 "crc32q\t" LONGx1 "(%3), %1\n\t"
285 "crc32q\t" LONGx2 "(%3), %2"
286 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
287 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
288 next += 8;
289 } while (next < end);
290 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
291 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
292 next += LONG*2;
293 len -= LONG*3;
294 }
295
296 /* do the same thing, but now on SHORT*3 blocks for the remaining data less
297 than a LONG*3 block */
298 while (len >= SHORT*3) {
299 crc1 = 0;
300 crc2 = 0;
301 end = next + SHORT;
302 do {
303 __asm__("crc32q\t" "(%3), %0\n\t"
304 "crc32q\t" SHORTx1 "(%3), %1\n\t"
305 "crc32q\t" SHORTx2 "(%3), %2"
306 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
307 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
308 next += 8;
309 } while (next < end);
310 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
311 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
312 next += SHORT*2;
313 len -= SHORT*3;
314 }
315
316 /* compute the crc on the remaining eight-byte units less than a SHORT*3
317 block */
318 end = next + (len - (len & 7));
319 while (next < end) {
320 __asm__("crc32q\t" "(%1), %0"
321 : "=r"(crc0)
322 : "r"(next), "0"(crc0));
323 next += 8;
324 }
325 len &= 7;
326
327 /* compute the crc for up to seven trailing bytes */
328 while (len) {
329 __asm__("crc32b\t" "(%1), %0"
330 : "=r"(crc0)
331 : "r"(next), "0"(crc0));
332 next++;
333 len--;
334 }
335
336 /* return a post-processed crc */
337 return (uint32_t)crc0 ^ 0xffffffff;
338}
339
340/* Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors
341 introduced in November, 2008. This does not check for the existence of the
342 cpuid instruction itself, which was introduced on the 486SL in 1992, so this
343 will fail on earlier x86 processors. cpuid works on all Pentium and later
344 processors. */
345#define SSE42(have) \
346 do { \
347 uint32_t eax, ecx; \
348 eax = 1; \
349 __asm__("cpuid" \
350 : "=c"(ecx) \
351 : "a"(eax) \
352 : "%ebx", "%edx"); \
353 (have) = (ecx >> 20) & 1; \
354 } while (0)
355
356#endif /* WITH_CRC32C_HW */
357
358/* Compute a CRC-32C. If the crc32 instruction is available, use the hardware
359 version. Otherwise, use the software version. */
360uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
361{
362#if WITH_CRC32C_HW
363 if (sse42)
364 return crc32c_hw(crc, buf, len);
365 else
366#endif
367 return crc32c_sw(crc, buf, len);
368}
369
370
371
372
373
374
375/**
376 * @brief Populate shift tables once
377 */
378void crc32c_global_init (void) {
379#if WITH_CRC32C_HW
380 SSE42(sse42);
381 if (sse42)
382 crc32c_init_hw();
383 else
384#endif
385 crc32c_init_sw();
386}
387
388int unittest_crc32c (void) {
389 const char *buf =
390" This software is provided 'as-is', without any express or implied\n"
391" warranty. In no event will the author be held liable for any damages\n"
392" arising from the use of this software.\n"
393"\n"
394" Permission is granted to anyone to use this software for any purpose,\n"
395" including commercial applications, and to alter it and redistribute it\n"
396" freely, subject to the following restrictions:\n"
397"\n"
398" 1. The origin of this software must not be misrepresented; you must not\n"
399" claim that you wrote the original software. If you use this software\n"
400" in a product, an acknowledgment in the product documentation would be\n"
401" appreciated but is not required.\n"
402" 2. Altered source versions must be plainly marked as such, and must not be\n"
403" misrepresented as being the original software.\n"
404" 3. This notice may not be removed or altered from any source distribution.";
405 const uint32_t expected_crc = 0x7dcde113;
406 uint32_t crc;
407 const char *how;
408
409 crc32c_global_init();
410
411#if WITH_CRC32C_HW
412 if (sse42)
413 how = "hardware (SSE42)";
414 else
415 how = "software (SSE42 supported in build but not at runtime)";
416#else
417 how = "software";
418#endif
419 RD_UT_SAY("Calculate CRC32C using %s", how);
420
421 crc = crc32c(0, buf, strlen(buf));
422 RD_UT_ASSERT(crc == expected_crc,
423 "Calculated CRC (%s) 0x%"PRIx32
424 " not matching expected CRC 0x%"PRIx32,
425 how, crc, expected_crc);
426
427 /* Verify software version too, regardless of which
428 * version was used above. */
429 crc32c_init_sw();
430 RD_UT_SAY("Calculate CRC32C using software");
431 crc = crc32c_sw(0, buf, strlen(buf));
432 RD_UT_ASSERT(crc == expected_crc,
433 "Calculated CRC (software) 0x%"PRIx32
434 " not matching expected CRC 0x%"PRIx32,
435 crc, expected_crc);
436
437 RD_UT_PASS();
438}
439