| 1 | /* Copied from http://stackoverflow.com/a/17646775/1821055 |
| 2 | * with the following modifications: |
| 3 | * * remove test code |
| 4 | * * global hw/sw initialization to be called once per process |
| 5 | * * HW support is determined by configure's WITH_CRC32C_HW |
| 6 | * * Windows porting (no hardware support on Windows yet) |
| 7 | * |
| 8 | * FIXME: |
| 9 | * * Hardware support on Windows (MSVC assembler) |
| 10 | * * Hardware support on ARM |
| 11 | */ |
| 12 | |
| 13 | /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction |
| 14 | * Copyright (C) 2013 Mark Adler |
| 15 | * Version 1.1 1 Aug 2013 Mark Adler |
| 16 | */ |
| 17 | |
| 18 | /* |
| 19 | This software is provided 'as-is', without any express or implied |
| 20 | warranty. In no event will the author be held liable for any damages |
| 21 | arising from the use of this software. |
| 22 | |
| 23 | Permission is granted to anyone to use this software for any purpose, |
| 24 | including commercial applications, and to alter it and redistribute it |
| 25 | freely, subject to the following restrictions: |
| 26 | |
| 27 | 1. The origin of this software must not be misrepresented; you must not |
| 28 | claim that you wrote the original software. If you use this software |
| 29 | in a product, an acknowledgment in the product documentation would be |
| 30 | appreciated but is not required. |
| 31 | 2. Altered source versions must be plainly marked as such, and must not be |
| 32 | misrepresented as being the original software. |
| 33 | 3. This notice may not be removed or altered from any source distribution. |
| 34 | |
| 35 | Mark Adler |
| 36 | madler@alumni.caltech.edu |
| 37 | */ |
| 38 | |
| 39 | /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a |
| 40 | CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software |
| 41 | version is provided as a fall-back, as well as for speed comparisons. */ |
| 42 | |
| 43 | /* Version history: |
| 44 | 1.0 10 Feb 2013 First version |
| 45 | 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel |
| 46 | */ |
| 47 | |
| 48 | #include "rd.h" |
| 49 | |
| 50 | #include <stdio.h> |
| 51 | #include <stdlib.h> |
| 52 | #include <stdint.h> |
| 53 | #ifndef _MSC_VER |
| 54 | #include <unistd.h> |
| 55 | #endif |
| 56 | |
| 57 | #include "rdunittest.h" |
| 58 | #include "rdendian.h" |
| 59 | |
| 60 | #include "crc32c.h" |
| 61 | |
| 62 | /* CRC-32C (iSCSI) polynomial in reversed bit order. */ |
| 63 | #define POLY 0x82f63b78 |
| 64 | |
| 65 | /* Table for a quadword-at-a-time software crc. */ |
| 66 | static uint32_t crc32c_table[8][256]; |
| 67 | |
| 68 | /* Construct table for software CRC-32C calculation. */ |
| 69 | static void crc32c_init_sw(void) |
| 70 | { |
| 71 | uint32_t n, crc, k; |
| 72 | |
| 73 | for (n = 0; n < 256; n++) { |
| 74 | crc = n; |
| 75 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
| 76 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
| 77 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
| 78 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
| 79 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
| 80 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
| 81 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
| 82 | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
| 83 | crc32c_table[0][n] = crc; |
| 84 | } |
| 85 | for (n = 0; n < 256; n++) { |
| 86 | crc = crc32c_table[0][n]; |
| 87 | for (k = 1; k < 8; k++) { |
| 88 | crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8); |
| 89 | crc32c_table[k][n] = crc; |
| 90 | } |
| 91 | } |
| 92 | } |
| 93 | |
| 94 | /* Table-driven software version as a fall-back. This is about 15 times slower |
| 95 | than using the hardware instructions. This assumes little-endian integers, |
| 96 | as is the case on Intel processors that the assembler code here is for. */ |
| 97 | static uint32_t crc32c_sw(uint32_t crci, const void *buf, size_t len) |
| 98 | { |
| 99 | const unsigned char *next = buf; |
| 100 | uint64_t crc; |
| 101 | |
| 102 | crc = crci ^ 0xffffffff; |
| 103 | while (len && ((uintptr_t)next & 7) != 0) { |
| 104 | crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); |
| 105 | len--; |
| 106 | } |
| 107 | while (len >= 8) { |
| 108 | #if defined(__sparc) || defined(__sparc__) || defined(__APPLE__) || defined(__mips__) || defined(__arm__) |
| 109 | /* Alignment-safe alternative. |
| 110 | * This is also needed on Apple to avoid compilation warnings for |
| 111 | * non-appearant alignment reasons. */ |
| 112 | uint64_t ncopy; |
| 113 | memcpy(&ncopy, next, sizeof(ncopy)); |
| 114 | crc ^= le64toh(ncopy); |
| 115 | #else |
| 116 | crc ^= le64toh(*(uint64_t *)next); |
| 117 | #endif |
| 118 | crc = crc32c_table[7][crc & 0xff] ^ |
| 119 | crc32c_table[6][(crc >> 8) & 0xff] ^ |
| 120 | crc32c_table[5][(crc >> 16) & 0xff] ^ |
| 121 | crc32c_table[4][(crc >> 24) & 0xff] ^ |
| 122 | crc32c_table[3][(crc >> 32) & 0xff] ^ |
| 123 | crc32c_table[2][(crc >> 40) & 0xff] ^ |
| 124 | crc32c_table[1][(crc >> 48) & 0xff] ^ |
| 125 | crc32c_table[0][crc >> 56]; |
| 126 | next += 8; |
| 127 | len -= 8; |
| 128 | } |
| 129 | while (len) { |
| 130 | crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); |
| 131 | len--; |
| 132 | } |
| 133 | return (uint32_t)crc ^ 0xffffffff; |
| 134 | } |
| 135 | |
| 136 | |
| 137 | #if WITH_CRC32C_HW |
| 138 | static int sse42; /* Cached SSE42 support */ |
| 139 | |
| 140 | /* Multiply a matrix times a vector over the Galois field of two elements, |
| 141 | GF(2). Each element is a bit in an unsigned integer. mat must have at |
| 142 | least as many entries as the power of two for most significant one bit in |
| 143 | vec. */ |
| 144 | static RD_INLINE uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) |
| 145 | { |
| 146 | uint32_t sum; |
| 147 | |
| 148 | sum = 0; |
| 149 | while (vec) { |
| 150 | if (vec & 1) |
| 151 | sum ^= *mat; |
| 152 | vec >>= 1; |
| 153 | mat++; |
| 154 | } |
| 155 | return sum; |
| 156 | } |
| 157 | |
| 158 | /* Multiply a matrix by itself over GF(2). Both mat and square must have 32 |
| 159 | rows. */ |
| 160 | static RD_INLINE void gf2_matrix_square(uint32_t *square, uint32_t *mat) |
| 161 | { |
| 162 | int n; |
| 163 | |
| 164 | for (n = 0; n < 32; n++) |
| 165 | square[n] = gf2_matrix_times(mat, mat[n]); |
| 166 | } |
| 167 | |
| 168 | /* Construct an operator to apply len zeros to a crc. len must be a power of |
| 169 | two. If len is not a power of two, then the result is the same as for the |
| 170 | largest power of two less than len. The result for len == 0 is the same as |
| 171 | for len == 1. A version of this routine could be easily written for any |
| 172 | len, but that is not needed for this application. */ |
| 173 | static void crc32c_zeros_op(uint32_t *even, size_t len) |
| 174 | { |
| 175 | int n; |
| 176 | uint32_t row; |
| 177 | uint32_t odd[32]; /* odd-power-of-two zeros operator */ |
| 178 | |
| 179 | /* put operator for one zero bit in odd */ |
| 180 | odd[0] = POLY; /* CRC-32C polynomial */ |
| 181 | row = 1; |
| 182 | for (n = 1; n < 32; n++) { |
| 183 | odd[n] = row; |
| 184 | row <<= 1; |
| 185 | } |
| 186 | |
| 187 | /* put operator for two zero bits in even */ |
| 188 | gf2_matrix_square(even, odd); |
| 189 | |
| 190 | /* put operator for four zero bits in odd */ |
| 191 | gf2_matrix_square(odd, even); |
| 192 | |
| 193 | /* first square will put the operator for one zero byte (eight zero bits), |
| 194 | in even -- next square puts operator for two zero bytes in odd, and so |
| 195 | on, until len has been rotated down to zero */ |
| 196 | do { |
| 197 | gf2_matrix_square(even, odd); |
| 198 | len >>= 1; |
| 199 | if (len == 0) |
| 200 | return; |
| 201 | gf2_matrix_square(odd, even); |
| 202 | len >>= 1; |
| 203 | } while (len); |
| 204 | |
| 205 | /* answer ended up in odd -- copy to even */ |
| 206 | for (n = 0; n < 32; n++) |
| 207 | even[n] = odd[n]; |
| 208 | } |
| 209 | |
| 210 | /* Take a length and build four lookup tables for applying the zeros operator |
| 211 | for that length, byte-by-byte on the operand. */ |
| 212 | static void crc32c_zeros(uint32_t zeros[][256], size_t len) |
| 213 | { |
| 214 | uint32_t n; |
| 215 | uint32_t op[32]; |
| 216 | |
| 217 | crc32c_zeros_op(op, len); |
| 218 | for (n = 0; n < 256; n++) { |
| 219 | zeros[0][n] = gf2_matrix_times(op, n); |
| 220 | zeros[1][n] = gf2_matrix_times(op, n << 8); |
| 221 | zeros[2][n] = gf2_matrix_times(op, n << 16); |
| 222 | zeros[3][n] = gf2_matrix_times(op, n << 24); |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | /* Apply the zeros operator table to crc. */ |
| 227 | static RD_INLINE uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) |
| 228 | { |
| 229 | return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^ |
| 230 | zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24]; |
| 231 | } |
| 232 | |
| 233 | /* Block sizes for three-way parallel crc computation. LONG and SHORT must |
| 234 | both be powers of two. The associated string constants must be set |
| 235 | accordingly, for use in constructing the assembler instructions. */ |
| 236 | #define LONG 8192 |
| 237 | #define LONGx1 "8192" |
| 238 | #define LONGx2 "16384" |
| 239 | #define SHORT 256 |
| 240 | #define SHORTx1 "256" |
| 241 | #define SHORTx2 "512" |
| 242 | |
| 243 | /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */ |
| 244 | static uint32_t crc32c_long[4][256]; |
| 245 | static uint32_t crc32c_short[4][256]; |
| 246 | |
| 247 | /* Initialize tables for shifting crcs. */ |
| 248 | static void crc32c_init_hw(void) |
| 249 | { |
| 250 | crc32c_zeros(crc32c_long, LONG); |
| 251 | crc32c_zeros(crc32c_short, SHORT); |
| 252 | } |
| 253 | |
| 254 | /* Compute CRC-32C using the Intel hardware instruction. */ |
| 255 | static uint32_t crc32c_hw(uint32_t crc, const void *buf, size_t len) |
| 256 | { |
| 257 | const unsigned char *next = buf; |
| 258 | const unsigned char *end; |
| 259 | uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */ |
| 260 | |
| 261 | /* pre-process the crc */ |
| 262 | crc0 = crc ^ 0xffffffff; |
| 263 | |
| 264 | /* compute the crc for up to seven leading bytes to bring the data pointer |
| 265 | to an eight-byte boundary */ |
| 266 | while (len && ((uintptr_t)next & 7) != 0) { |
| 267 | __asm__("crc32b\t" "(%1), %0" |
| 268 | : "=r" (crc0) |
| 269 | : "r" (next), "0" (crc0)); |
| 270 | next++; |
| 271 | len--; |
| 272 | } |
| 273 | |
| 274 | /* compute the crc on sets of LONG*3 bytes, executing three independent crc |
| 275 | instructions, each on LONG bytes -- this is optimized for the Nehalem, |
| 276 | Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a |
| 277 | throughput of one crc per cycle, but a latency of three cycles */ |
| 278 | while (len >= LONG*3) { |
| 279 | crc1 = 0; |
| 280 | crc2 = 0; |
| 281 | end = next + LONG; |
| 282 | do { |
| 283 | __asm__("crc32q\t" "(%3), %0\n\t" |
| 284 | "crc32q\t" LONGx1 "(%3), %1\n\t" |
| 285 | "crc32q\t" LONGx2 "(%3), %2" |
| 286 | : "=r" (crc0), "=r" (crc1), "=r" (crc2) |
| 287 | : "r" (next), "0" (crc0), "1" (crc1), "2" (crc2)); |
| 288 | next += 8; |
| 289 | } while (next < end); |
| 290 | crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1; |
| 291 | crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2; |
| 292 | next += LONG*2; |
| 293 | len -= LONG*3; |
| 294 | } |
| 295 | |
| 296 | /* do the same thing, but now on SHORT*3 blocks for the remaining data less |
| 297 | than a LONG*3 block */ |
| 298 | while (len >= SHORT*3) { |
| 299 | crc1 = 0; |
| 300 | crc2 = 0; |
| 301 | end = next + SHORT; |
| 302 | do { |
| 303 | __asm__("crc32q\t" "(%3), %0\n\t" |
| 304 | "crc32q\t" SHORTx1 "(%3), %1\n\t" |
| 305 | "crc32q\t" SHORTx2 "(%3), %2" |
| 306 | : "=r" (crc0), "=r" (crc1), "=r" (crc2) |
| 307 | : "r" (next), "0" (crc0), "1" (crc1), "2" (crc2)); |
| 308 | next += 8; |
| 309 | } while (next < end); |
| 310 | crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1; |
| 311 | crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2; |
| 312 | next += SHORT*2; |
| 313 | len -= SHORT*3; |
| 314 | } |
| 315 | |
| 316 | /* compute the crc on the remaining eight-byte units less than a SHORT*3 |
| 317 | block */ |
| 318 | end = next + (len - (len & 7)); |
| 319 | while (next < end) { |
| 320 | __asm__("crc32q\t" "(%1), %0" |
| 321 | : "=r" (crc0) |
| 322 | : "r" (next), "0" (crc0)); |
| 323 | next += 8; |
| 324 | } |
| 325 | len &= 7; |
| 326 | |
| 327 | /* compute the crc for up to seven trailing bytes */ |
| 328 | while (len) { |
| 329 | __asm__("crc32b\t" "(%1), %0" |
| 330 | : "=r" (crc0) |
| 331 | : "r" (next), "0" (crc0)); |
| 332 | next++; |
| 333 | len--; |
| 334 | } |
| 335 | |
| 336 | /* return a post-processed crc */ |
| 337 | return (uint32_t)crc0 ^ 0xffffffff; |
| 338 | } |
| 339 | |
| 340 | /* Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors |
| 341 | introduced in November, 2008. This does not check for the existence of the |
| 342 | cpuid instruction itself, which was introduced on the 486SL in 1992, so this |
| 343 | will fail on earlier x86 processors. cpuid works on all Pentium and later |
| 344 | processors. */ |
| 345 | #define SSE42(have) \ |
| 346 | do { \ |
| 347 | uint32_t eax, ecx; \ |
| 348 | eax = 1; \ |
| 349 | __asm__("cpuid" \ |
| 350 | : "=c"(ecx) \ |
| 351 | : "a"(eax) \ |
| 352 | : "%ebx", "%edx"); \ |
| 353 | (have) = (ecx >> 20) & 1; \ |
| 354 | } while (0) |
| 355 | |
| 356 | #endif /* WITH_CRC32C_HW */ |
| 357 | |
| 358 | /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware |
| 359 | version. Otherwise, use the software version. */ |
| 360 | uint32_t crc32c(uint32_t crc, const void *buf, size_t len) |
| 361 | { |
| 362 | #if WITH_CRC32C_HW |
| 363 | if (sse42) |
| 364 | return crc32c_hw(crc, buf, len); |
| 365 | else |
| 366 | #endif |
| 367 | return crc32c_sw(crc, buf, len); |
| 368 | } |
| 369 | |
| 370 | |
| 371 | |
| 372 | |
| 373 | |
| 374 | |
| 375 | /** |
| 376 | * @brief Populate shift tables once |
| 377 | */ |
| 378 | void crc32c_global_init (void) { |
| 379 | #if WITH_CRC32C_HW |
| 380 | SSE42(sse42); |
| 381 | if (sse42) |
| 382 | crc32c_init_hw(); |
| 383 | else |
| 384 | #endif |
| 385 | crc32c_init_sw(); |
| 386 | } |
| 387 | |
| 388 | int unittest_crc32c (void) { |
| 389 | const char *buf = |
| 390 | " This software is provided 'as-is', without any express or implied\n" |
| 391 | " warranty. In no event will the author be held liable for any damages\n" |
| 392 | " arising from the use of this software.\n" |
| 393 | "\n" |
| 394 | " Permission is granted to anyone to use this software for any purpose,\n" |
| 395 | " including commercial applications, and to alter it and redistribute it\n" |
| 396 | " freely, subject to the following restrictions:\n" |
| 397 | "\n" |
| 398 | " 1. The origin of this software must not be misrepresented; you must not\n" |
| 399 | " claim that you wrote the original software. If you use this software\n" |
| 400 | " in a product, an acknowledgment in the product documentation would be\n" |
| 401 | " appreciated but is not required.\n" |
| 402 | " 2. Altered source versions must be plainly marked as such, and must not be\n" |
| 403 | " misrepresented as being the original software.\n" |
| 404 | " 3. This notice may not be removed or altered from any source distribution." ; |
| 405 | const uint32_t expected_crc = 0x7dcde113; |
| 406 | uint32_t crc; |
| 407 | const char *how; |
| 408 | |
| 409 | crc32c_global_init(); |
| 410 | |
| 411 | #if WITH_CRC32C_HW |
| 412 | if (sse42) |
| 413 | how = "hardware (SSE42)" ; |
| 414 | else |
| 415 | how = "software (SSE42 supported in build but not at runtime)" ; |
| 416 | #else |
| 417 | how = "software" ; |
| 418 | #endif |
| 419 | RD_UT_SAY("Calculate CRC32C using %s" , how); |
| 420 | |
| 421 | crc = crc32c(0, buf, strlen(buf)); |
| 422 | RD_UT_ASSERT(crc == expected_crc, |
| 423 | "Calculated CRC (%s) 0x%" PRIx32 |
| 424 | " not matching expected CRC 0x%" PRIx32, |
| 425 | how, crc, expected_crc); |
| 426 | |
| 427 | /* Verify software version too, regardless of which |
| 428 | * version was used above. */ |
| 429 | crc32c_init_sw(); |
| 430 | RD_UT_SAY("Calculate CRC32C using software" ); |
| 431 | crc = crc32c_sw(0, buf, strlen(buf)); |
| 432 | RD_UT_ASSERT(crc == expected_crc, |
| 433 | "Calculated CRC (software) 0x%" PRIx32 |
| 434 | " not matching expected CRC 0x%" PRIx32, |
| 435 | crc, expected_crc); |
| 436 | |
| 437 | RD_UT_PASS(); |
| 438 | } |
| 439 | |