1 | /* crc32.c -- compute the CRC-32 of a data stream |
2 | * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler |
3 | * For conditions of distribution and use, see copyright notice in zlib.h |
4 | * |
5 | * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster |
6 | * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing |
7 | * tables for updating the shift register in one step with three exclusive-ors |
8 | * instead of four steps with four exclusive-ors. This results in about a |
9 | * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. |
10 | */ |
11 | |
12 | /* @(#) $Id$ */ |
13 | |
14 | /* |
15 | Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore |
16 | protection on the static variables used to control the first-use generation |
17 | of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should |
18 | first call get_crc_table() to initialize the tables before allowing more than |
19 | one thread to use crc32(). |
20 | |
21 | DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h. |
22 | */ |
23 | |
24 | #ifdef MAKECRCH |
25 | # include <stdio.h> |
26 | # ifndef DYNAMIC_CRC_TABLE |
27 | # define DYNAMIC_CRC_TABLE |
28 | # endif /* !DYNAMIC_CRC_TABLE */ |
29 | #endif /* MAKECRCH */ |
30 | |
31 | #include "deflate.h" |
32 | #include "x86.h" |
33 | #include "zutil.h" /* for STDC and FAR definitions */ |
34 | |
35 | #if defined(CRC32_SIMD_SSE42_PCLMUL) |
36 | #include "crc32_simd.h" |
37 | #elif defined(CRC32_ARMV8_CRC32) |
38 | #include "arm_features.h" |
39 | #include "crc32_simd.h" |
40 | #endif |
41 | |
42 | /* Definitions for doing the crc four data bytes at a time. */ |
43 | #if !defined(NOBYFOUR) && defined(Z_U4) |
44 | # define BYFOUR |
45 | #endif |
46 | #ifdef BYFOUR |
47 | local unsigned long crc32_little OF((unsigned long, |
48 | const unsigned char FAR *, z_size_t)); |
49 | local unsigned long crc32_big OF((unsigned long, |
50 | const unsigned char FAR *, z_size_t)); |
51 | # define TBLS 8 |
52 | #else |
53 | # define TBLS 1 |
54 | #endif /* BYFOUR */ |
55 | |
56 | /* Local functions for crc concatenation */ |
57 | local unsigned long gf2_matrix_times OF((unsigned long *mat, |
58 | unsigned long vec)); |
59 | local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat)); |
60 | local uLong crc32_combine_ OF((uLong crc1, uLong crc2, z_off64_t len2)); |
61 | |
62 | |
63 | #ifdef DYNAMIC_CRC_TABLE |
64 | |
65 | local volatile int crc_table_empty = 1; |
66 | local z_crc_t FAR crc_table[TBLS][256]; |
67 | local void make_crc_table OF((void)); |
68 | #ifdef MAKECRCH |
69 | local void write_table OF((FILE *, const z_crc_t FAR *)); |
70 | #endif /* MAKECRCH */ |
71 | /* |
72 | Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: |
73 | x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. |
74 | |
75 | Polynomials over GF(2) are represented in binary, one bit per coefficient, |
76 | with the lowest powers in the most significant bit. Then adding polynomials |
77 | is just exclusive-or, and multiplying a polynomial by x is a right shift by |
78 | one. If we call the above polynomial p, and represent a byte as the |
79 | polynomial q, also with the lowest power in the most significant bit (so the |
80 | byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p, |
81 | where a mod b means the remainder after dividing a by b. |
82 | |
83 | This calculation is done using the shift-register method of multiplying and |
84 | taking the remainder. The register is initialized to zero, and for each |
85 | incoming bit, x^32 is added mod p to the register if the bit is a one (where |
86 | x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by |
87 | x (which is shifting right by one and adding x^32 mod p if the bit shifted |
88 | out is a one). We start with the highest power (least significant bit) of |
89 | q and repeat for all eight bits of q. |
90 | |
91 | The first table is simply the CRC of all possible eight bit values. This is |
92 | all the information needed to generate CRCs on data a byte at a time for all |
93 | combinations of CRC register values and incoming bytes. The remaining tables |
94 | allow for word-at-a-time CRC calculation for both big-endian and little- |
95 | endian machines, where a word is four bytes. |
96 | */ |
97 | local void make_crc_table() |
98 | { |
99 | z_crc_t c; |
100 | int n, k; |
101 | z_crc_t poly; /* polynomial exclusive-or pattern */ |
102 | /* terms of polynomial defining this crc (except x^32): */ |
103 | static volatile int first = 1; /* flag to limit concurrent making */ |
104 | static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26}; |
105 | |
106 | /* See if another task is already doing this (not thread-safe, but better |
107 | than nothing -- significantly reduces duration of vulnerability in |
108 | case the advice about DYNAMIC_CRC_TABLE is ignored) */ |
109 | if (first) { |
110 | first = 0; |
111 | |
112 | /* make exclusive-or pattern from polynomial (0xedb88320UL) */ |
113 | poly = 0; |
114 | for (n = 0; n < (int)(sizeof(p)/sizeof(unsigned char)); n++) |
115 | poly |= (z_crc_t)1 << (31 - p[n]); |
116 | |
117 | /* generate a crc for every 8-bit value */ |
118 | for (n = 0; n < 256; n++) { |
119 | c = (z_crc_t)n; |
120 | for (k = 0; k < 8; k++) |
121 | c = c & 1 ? poly ^ (c >> 1) : c >> 1; |
122 | crc_table[0][n] = c; |
123 | } |
124 | |
125 | #ifdef BYFOUR |
126 | /* generate crc for each value followed by one, two, and three zeros, |
127 | and then the byte reversal of those as well as the first table */ |
128 | for (n = 0; n < 256; n++) { |
129 | c = crc_table[0][n]; |
130 | crc_table[4][n] = ZSWAP32(c); |
131 | for (k = 1; k < 4; k++) { |
132 | c = crc_table[0][c & 0xff] ^ (c >> 8); |
133 | crc_table[k][n] = c; |
134 | crc_table[k + 4][n] = ZSWAP32(c); |
135 | } |
136 | } |
137 | #endif /* BYFOUR */ |
138 | |
139 | crc_table_empty = 0; |
140 | } |
141 | else { /* not first */ |
142 | /* wait for the other guy to finish (not efficient, but rare) */ |
143 | while (crc_table_empty) |
144 | ; |
145 | } |
146 | |
147 | #ifdef MAKECRCH |
148 | /* write out CRC tables to crc32.h */ |
149 | { |
150 | FILE *out; |
151 | |
152 | out = fopen("crc32.h" , "w" ); |
153 | if (out == NULL) return; |
154 | fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n" ); |
155 | fprintf(out, " * Generated automatically by crc32.c\n */\n\n" ); |
156 | fprintf(out, "local const z_crc_t FAR " ); |
157 | fprintf(out, "crc_table[TBLS][256] =\n{\n {\n" ); |
158 | write_table(out, crc_table[0]); |
159 | # ifdef BYFOUR |
160 | fprintf(out, "#ifdef BYFOUR\n" ); |
161 | for (k = 1; k < 8; k++) { |
162 | fprintf(out, " },\n {\n" ); |
163 | write_table(out, crc_table[k]); |
164 | } |
165 | fprintf(out, "#endif\n" ); |
166 | # endif /* BYFOUR */ |
167 | fprintf(out, " }\n};\n" ); |
168 | fclose(out); |
169 | } |
170 | #endif /* MAKECRCH */ |
171 | } |
172 | |
173 | #ifdef MAKECRCH |
174 | local void write_table(out, table) |
175 | FILE *out; |
176 | const z_crc_t FAR *table; |
177 | { |
178 | int n; |
179 | |
180 | for (n = 0; n < 256; n++) |
181 | fprintf(out, "%s0x%08lxUL%s" , n % 5 ? "" : " " , |
182 | (unsigned long)(table[n]), |
183 | n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", " )); |
184 | } |
185 | #endif /* MAKECRCH */ |
186 | |
187 | #else /* !DYNAMIC_CRC_TABLE */ |
188 | /* ======================================================================== |
189 | * Tables of CRC-32s of all single-byte values, made by make_crc_table(). |
190 | */ |
191 | #include "crc32.h" |
192 | #endif /* DYNAMIC_CRC_TABLE */ |
193 | |
194 | /* ========================================================================= |
195 | * This function can be used by asm versions of crc32() |
196 | */ |
197 | const z_crc_t FAR * ZEXPORT get_crc_table() |
198 | { |
199 | #ifdef DYNAMIC_CRC_TABLE |
200 | if (crc_table_empty) |
201 | make_crc_table(); |
202 | #endif /* DYNAMIC_CRC_TABLE */ |
203 | return (const z_crc_t FAR *)crc_table; |
204 | } |
205 | |
206 | /* ========================================================================= */ |
207 | #define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) |
208 | #define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 |
209 | |
210 | /* ========================================================================= */ |
211 | unsigned long ZEXPORT crc32_z(crc, buf, len) |
212 | unsigned long crc; |
213 | const unsigned char FAR *buf; |
214 | z_size_t len; |
215 | { |
216 | /* |
217 | * zlib convention is to call crc32(0, NULL, 0); before making |
218 | * calls to crc32(). So this is a good, early (and infrequent) |
219 | * place to cache CPU features if needed for those later, more |
220 | * interesting crc32() calls. |
221 | */ |
222 | #if defined(CRC32_SIMD_SSE42_PCLMUL) |
223 | /* |
224 | * Use x86 sse4.2+pclmul SIMD to compute the crc32. Since this |
225 | * routine can be freely used, check CPU features here. |
226 | */ |
227 | if (buf == Z_NULL) { |
228 | if (!len) /* Assume user is calling crc32(0, NULL, 0); */ |
229 | x86_check_features(); |
230 | return 0UL; |
231 | } |
232 | |
233 | if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) { |
234 | /* crc32 16-byte chunks */ |
235 | z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK; |
236 | crc = ~crc32_sse42_simd_(buf, chunk_size, ~(uint32_t)crc); |
237 | /* check remaining data */ |
238 | len -= chunk_size; |
239 | if (!len) |
240 | return crc; |
241 | /* Fall into the default crc32 for the remaining data. */ |
242 | buf += chunk_size; |
243 | } |
244 | #else |
245 | if (buf == Z_NULL) { |
246 | return 0UL; |
247 | } |
248 | #endif /* CRC32_SIMD_SSE42_PCLMUL */ |
249 | |
250 | #ifdef DYNAMIC_CRC_TABLE |
251 | if (crc_table_empty) |
252 | make_crc_table(); |
253 | #endif /* DYNAMIC_CRC_TABLE */ |
254 | |
255 | #ifdef BYFOUR |
256 | if (sizeof(void *) == sizeof(ptrdiff_t)) { |
257 | z_crc_t endian; |
258 | |
259 | endian = 1; |
260 | if (*((unsigned char *)(&endian))) |
261 | return crc32_little(crc, buf, len); |
262 | else |
263 | return crc32_big(crc, buf, len); |
264 | } |
265 | #endif /* BYFOUR */ |
266 | crc = crc ^ 0xffffffffUL; |
267 | while (len >= 8) { |
268 | DO8; |
269 | len -= 8; |
270 | } |
271 | if (len) do { |
272 | DO1; |
273 | } while (--len); |
274 | return crc ^ 0xffffffffUL; |
275 | } |
276 | |
277 | /* ========================================================================= */ |
278 | unsigned long ZEXPORT crc32(crc, buf, len) |
279 | unsigned long crc; |
280 | const unsigned char FAR *buf; |
281 | uInt len; |
282 | { |
283 | #if defined(CRC32_ARMV8_CRC32) |
284 | /* We got to verify ARM CPU features, so exploit the common usage pattern |
285 | * of calling this function with Z_NULL for an initial valid crc value. |
286 | * This allows to cache the result of the feature check and avoid extraneous |
287 | * function calls. |
288 | * TODO: try to move this to crc32_z if we don't loose performance on ARM. |
289 | */ |
290 | if (buf == Z_NULL) { |
291 | if (!len) /* Assume user is calling crc32(0, NULL, 0); */ |
292 | arm_check_features(); |
293 | return 0UL; |
294 | } |
295 | |
296 | if (arm_cpu_enable_crc32) |
297 | return armv8_crc32_little(crc, buf, len); |
298 | #endif |
299 | return crc32_z(crc, buf, len); |
300 | } |
301 | |
302 | #ifdef BYFOUR |
303 | |
304 | /* |
305 | This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit |
306 | integer pointer type. This violates the strict aliasing rule, where a |
307 | compiler can assume, for optimization purposes, that two pointers to |
308 | fundamentally different types won't ever point to the same memory. This can |
309 | manifest as a problem only if one of the pointers is written to. This code |
310 | only reads from those pointers. So long as this code remains isolated in |
311 | this compilation unit, there won't be a problem. For this reason, this code |
312 | should not be copied and pasted into a compilation unit in which other code |
313 | writes to the buffer that is passed to these routines. |
314 | */ |
315 | |
316 | /* ========================================================================= */ |
317 | #define DOLIT4 c ^= *buf4++; \ |
318 | c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ |
319 | crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] |
320 | #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 |
321 | |
322 | /* ========================================================================= */ |
323 | local unsigned long crc32_little(crc, buf, len) |
324 | unsigned long crc; |
325 | const unsigned char FAR *buf; |
326 | z_size_t len; |
327 | { |
328 | register z_crc_t c; |
329 | register const z_crc_t FAR *buf4; |
330 | |
331 | c = (z_crc_t)crc; |
332 | c = ~c; |
333 | while (len && ((ptrdiff_t)buf & 3)) { |
334 | c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); |
335 | len--; |
336 | } |
337 | |
338 | buf4 = (const z_crc_t FAR *)(const void FAR *)buf; |
339 | while (len >= 32) { |
340 | DOLIT32; |
341 | len -= 32; |
342 | } |
343 | while (len >= 4) { |
344 | DOLIT4; |
345 | len -= 4; |
346 | } |
347 | buf = (const unsigned char FAR *)buf4; |
348 | |
349 | if (len) do { |
350 | c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); |
351 | } while (--len); |
352 | c = ~c; |
353 | return (unsigned long)c; |
354 | } |
355 | |
356 | /* ========================================================================= */ |
357 | #define DOBIG4 c ^= *buf4++; \ |
358 | c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ |
359 | crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] |
360 | #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 |
361 | |
362 | /* ========================================================================= */ |
363 | local unsigned long crc32_big(crc, buf, len) |
364 | unsigned long crc; |
365 | const unsigned char FAR *buf; |
366 | z_size_t len; |
367 | { |
368 | register z_crc_t c; |
369 | register const z_crc_t FAR *buf4; |
370 | |
371 | c = ZSWAP32((z_crc_t)crc); |
372 | c = ~c; |
373 | while (len && ((ptrdiff_t)buf & 3)) { |
374 | c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); |
375 | len--; |
376 | } |
377 | |
378 | buf4 = (const z_crc_t FAR *)(const void FAR *)buf; |
379 | while (len >= 32) { |
380 | DOBIG32; |
381 | len -= 32; |
382 | } |
383 | while (len >= 4) { |
384 | DOBIG4; |
385 | len -= 4; |
386 | } |
387 | buf = (const unsigned char FAR *)buf4; |
388 | |
389 | if (len) do { |
390 | c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); |
391 | } while (--len); |
392 | c = ~c; |
393 | return (unsigned long)(ZSWAP32(c)); |
394 | } |
395 | |
396 | #endif /* BYFOUR */ |
397 | |
398 | #define GF2_DIM 32 /* dimension of GF(2) vectors (length of CRC) */ |
399 | |
400 | /* ========================================================================= */ |
401 | local unsigned long gf2_matrix_times(mat, vec) |
402 | unsigned long *mat; |
403 | unsigned long vec; |
404 | { |
405 | unsigned long sum; |
406 | |
407 | sum = 0; |
408 | while (vec) { |
409 | if (vec & 1) |
410 | sum ^= *mat; |
411 | vec >>= 1; |
412 | mat++; |
413 | } |
414 | return sum; |
415 | } |
416 | |
417 | /* ========================================================================= */ |
418 | local void gf2_matrix_square(square, mat) |
419 | unsigned long *square; |
420 | unsigned long *mat; |
421 | { |
422 | int n; |
423 | |
424 | for (n = 0; n < GF2_DIM; n++) |
425 | square[n] = gf2_matrix_times(mat, mat[n]); |
426 | } |
427 | |
428 | /* ========================================================================= */ |
429 | local uLong crc32_combine_(crc1, crc2, len2) |
430 | uLong crc1; |
431 | uLong crc2; |
432 | z_off64_t len2; |
433 | { |
434 | int n; |
435 | unsigned long row; |
436 | unsigned long even[GF2_DIM]; /* even-power-of-two zeros operator */ |
437 | unsigned long odd[GF2_DIM]; /* odd-power-of-two zeros operator */ |
438 | |
439 | /* degenerate case (also disallow negative lengths) */ |
440 | if (len2 <= 0) |
441 | return crc1; |
442 | |
443 | /* put operator for one zero bit in odd */ |
444 | odd[0] = 0xedb88320UL; /* CRC-32 polynomial */ |
445 | row = 1; |
446 | for (n = 1; n < GF2_DIM; n++) { |
447 | odd[n] = row; |
448 | row <<= 1; |
449 | } |
450 | |
451 | /* put operator for two zero bits in even */ |
452 | gf2_matrix_square(even, odd); |
453 | |
454 | /* put operator for four zero bits in odd */ |
455 | gf2_matrix_square(odd, even); |
456 | |
457 | /* apply len2 zeros to crc1 (first square will put the operator for one |
458 | zero byte, eight zero bits, in even) */ |
459 | do { |
460 | /* apply zeros operator for this bit of len2 */ |
461 | gf2_matrix_square(even, odd); |
462 | if (len2 & 1) |
463 | crc1 = gf2_matrix_times(even, crc1); |
464 | len2 >>= 1; |
465 | |
466 | /* if no more bits set, then done */ |
467 | if (len2 == 0) |
468 | break; |
469 | |
470 | /* another iteration of the loop with odd and even swapped */ |
471 | gf2_matrix_square(odd, even); |
472 | if (len2 & 1) |
473 | crc1 = gf2_matrix_times(odd, crc1); |
474 | len2 >>= 1; |
475 | |
476 | /* if no more bits set, then done */ |
477 | } while (len2 != 0); |
478 | |
479 | /* return combined crc */ |
480 | crc1 ^= crc2; |
481 | return crc1; |
482 | } |
483 | |
484 | /* ========================================================================= */ |
485 | uLong ZEXPORT crc32_combine(crc1, crc2, len2) |
486 | uLong crc1; |
487 | uLong crc2; |
488 | z_off_t len2; |
489 | { |
490 | return crc32_combine_(crc1, crc2, len2); |
491 | } |
492 | |
493 | uLong ZEXPORT crc32_combine64(crc1, crc2, len2) |
494 | uLong crc1; |
495 | uLong crc2; |
496 | z_off64_t len2; |
497 | { |
498 | return crc32_combine_(crc1, crc2, len2); |
499 | } |
500 | |
501 | ZLIB_INTERNAL void crc_reset(deflate_state *const s) |
502 | { |
503 | if (x86_cpu_enable_simd) { |
504 | crc_fold_init(s); |
505 | return; |
506 | } |
507 | s->strm->adler = crc32(0L, Z_NULL, 0); |
508 | } |
509 | |
510 | ZLIB_INTERNAL void crc_finalize(deflate_state *const s) |
511 | { |
512 | if (x86_cpu_enable_simd) |
513 | s->strm->adler = crc_fold_512to32(s); |
514 | } |
515 | |
516 | ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size) |
517 | { |
518 | if (x86_cpu_enable_simd) { |
519 | crc_fold_copy(strm->state, dst, strm->next_in, size); |
520 | return; |
521 | } |
522 | zmemcpy(dst, strm->next_in, size); |
523 | strm->adler = crc32(strm->adler, dst, size); |
524 | } |
525 | |