crc32c.c source code [ClickHouse/contrib/librdkafka/src/crc32c.c]

1	/ Copied from http://stackoverflow.com/a/17646775/1821055*
2	* with the following modifications:
3	* * remove test code
4	* * global hw/sw initialization to be called once per process
5	* * HW support is determined by configure's WITH_CRC32C_HW
6	* * Windows porting (no hardware support on Windows yet)
7	*
8	* FIXME:
9	* * Hardware support on Windows (MSVC assembler)
10	* * Hardware support on ARM
11	*/
12
13	/ crc32c.c -- compute CRC-32C using the Intel crc32 instruction*
14	* Copyright (C) 2013 Mark Adler
15	* Version 1.1 1 Aug 2013 Mark Adler
16	*/
17
18	/*
19	This software is provided 'as-is', without any express or implied
20	warranty. In no event will the author be held liable for any damages
21	arising from the use of this software.
22
23	Permission is granted to anyone to use this software for any purpose,
24	including commercial applications, and to alter it and redistribute it
25	freely, subject to the following restrictions:
26
27	1. The origin of this software must not be misrepresented; you must not
28	claim that you wrote the original software. If you use this software
29	in a product, an acknowledgment in the product documentation would be
30	appreciated but is not required.
31	2. Altered source versions must be plainly marked as such, and must not be
32	misrepresented as being the original software.
33	3. This notice may not be removed or altered from any source distribution.
34
35	Mark Adler
36	madler@alumni.caltech.edu
37	*/
38
39	/ Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a*
40	CRC-32C, not* the CRC-32 used by Ethernet and zip, gzip, etc. A software*
41	version is provided as a fall-back, as well as for speed comparisons. /*
42
43	/ Version history:*
44	1.0 10 Feb 2013 First version
45	1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
46	*/
47
48	#include "rd.h"
49
50	#include <stdio.h>
51	#include <stdlib.h>
52	#include <stdint.h>
53	#ifndef _MSC_VER
54	#include <unistd.h>
55	#endif
56
57	#include "rdunittest.h"
58	#include "rdendian.h"
59
60	#include "crc32c.h"
61
62	/ CRC-32C (iSCSI) polynomial in reversed bit order. /
63	#define POLY 0x82f63b78
64
65	/ Table for a quadword-at-a-time software crc. /
66	static uint32_t crc32c_table[`8`][`256`];
67
68	/ Construct table for software CRC-32C calculation. /
69	static void crc32c_init_sw(void)
70	{
71	uint32_t n, crc, k;
72
73	for (n = `0`; n < `256`; n++) {
74	crc = n;
75	crc = crc & `1` ? (crc >> `1`) ^ POLY : crc >> `1`;
76	crc = crc & `1` ? (crc >> `1`) ^ POLY : crc >> `1`;
77	crc = crc & `1` ? (crc >> `1`) ^ POLY : crc >> `1`;
78	crc = crc & `1` ? (crc >> `1`) ^ POLY : crc >> `1`;
79	crc = crc & `1` ? (crc >> `1`) ^ POLY : crc >> `1`;
80	crc = crc & `1` ? (crc >> `1`) ^ POLY : crc >> `1`;
81	crc = crc & `1` ? (crc >> `1`) ^ POLY : crc >> `1`;
82	crc = crc & `1` ? (crc >> `1`) ^ POLY : crc >> `1`;
83	crc32c_table[`0`][n] = crc;
84	}
85	for (n = `0`; n < `256`; n++) {
86	crc = crc32c_table[`0`][n];
87	for (k = `1`; k < `8`; k++) {
88	crc = crc32c_table[`0`][crc & `0xff`] ^ (crc >> `8`);
89	crc32c_table[k][n] = crc;
90	}
91	}
92	}
93
94	/ Table-driven software version as a fall-back. This is about 15 times slower*
95	than using the hardware instructions. This assumes little-endian integers,
96	as is the case on Intel processors that the assembler code here is for. /*
97	static uint32_t crc32c_sw(uint32_t crci, const void *buf, size_t len)
98	{
99	const unsigned char *next = buf;
100	uint64_t crc;
101
102	crc = crci ^ `0xffffffff`;
103	while (len && ((uintptr_t)next & `7`) != `0`) {
104	crc = crc32c_table[`0`][(crc ^ *next++) & `0xff`] ^ (crc >> `8`);
105	len--;
106	}
107	while (len >= `8`) {
108	#if defined(__sparc) \|\| defined(__sparc__) \|\| defined(__APPLE__) \|\| defined(__mips__) \|\| defined(__arm__)
109	/ Alignment-safe alternative.*
110	* This is also needed on Apple to avoid compilation warnings for
111	* non-appearant alignment reasons. */
112	uint64_t ncopy;
113	memcpy(&ncopy, next, sizeof(ncopy));
114	crc ^= le64toh(ncopy);
115	#else
116	crc ^= le64toh((uint64_t )next);
117	#endif
118	crc = crc32c_table[`7`][crc & `0xff`] ^
119	crc32c_table[`6`][(crc >> `8`) & `0xff`] ^
120	crc32c_table[`5`][(crc >> `16`) & `0xff`] ^
121	crc32c_table[`4`][(crc >> `24`) & `0xff`] ^
122	crc32c_table[`3`][(crc >> `32`) & `0xff`] ^
123	crc32c_table[`2`][(crc >> `40`) & `0xff`] ^
124	crc32c_table[`1`][(crc >> `48`) & `0xff`] ^
125	crc32c_table[`0`][crc >> `56`];
126	next += `8`;
127	len -= `8`;
128	}
129	while (len) {
130	crc = crc32c_table[`0`][(crc ^ *next++) & `0xff`] ^ (crc >> `8`);
131	len--;
132	}
133	return (uint32_t)crc ^ `0xffffffff`;
134	}
135
136
137	#if WITH_CRC32C_HW
138	static int sse42; / Cached SSE42 support /
139
140	/ Multiply a matrix times a vector over the Galois field of two elements,*
141	GF(2). Each element is a bit in an unsigned integer. mat must have at
142	least as many entries as the power of two for most significant one bit in
143	vec. /*
144	static RD_INLINE uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec)
145	{
146	uint32_t sum;
147
148	sum = `0`;
149	while (vec) {
150	if (vec & `1`)
151	sum ^= *mat;
152	vec >>= `1`;
153	mat++;
154	}
155	return sum;
156	}
157
158	/ Multiply a matrix by itself over GF(2). Both mat and square must have 32*
159	rows. /*
160	static RD_INLINE void gf2_matrix_square(uint32_t square, uint32_t mat)
161	{
162	int n;
163
164	for (n = `0`; n < `32`; n++)
165	square[n] = gf2_matrix_times(mat, mat[n]);
166	}
167
168	/ Construct an operator to apply len zeros to a crc. len must be a power of*
169	two. If len is not a power of two, then the result is the same as for the
170	largest power of two less than len. The result for len == 0 is the same as
171	for len == 1. A version of this routine could be easily written for any
172	len, but that is not needed for this application. /*
173	static void crc32c_zeros_op(uint32_t *even, size_t len)
174	{
175	int n;
176	uint32_t row;
177	uint32_t odd[`32`]; / odd-power-of-two zeros operator /
178
179	/ put operator for one zero bit in odd /
180	odd[`0`] = POLY; / CRC-32C polynomial /
181	row = `1`;
182	for (n = `1`; n < `32`; n++) {
183	odd[n] = row;
184	row <<= `1`;
185	}
186
187	/ put operator for two zero bits in even /
188	gf2_matrix_square(even, odd);
189
190	/ put operator for four zero bits in odd /
191	gf2_matrix_square(odd, even);
192
193	/ first square will put the operator for one zero byte (eight zero bits),*
194	in even -- next square puts operator for two zero bytes in odd, and so
195	on, until len has been rotated down to zero /*
196	do {
197	gf2_matrix_square(even, odd);
198	len >>= `1`;
199	if (len == `0`)
200	return;
201	gf2_matrix_square(odd, even);
202	len >>= `1`;
203	} while (len);
204
205	/ answer ended up in odd -- copy to even /
206	for (n = `0`; n < `32`; n++)
207	even[n] = odd[n];
208	}
209
210	/ Take a length and build four lookup tables for applying the zeros operator*
211	for that length, byte-by-byte on the operand. /*
212	static void crc32c_zeros(uint32_t zeros[][`256`], size_t len)
213	{
214	uint32_t n;
215	uint32_t op[`32`];
216
217	crc32c_zeros_op(op, len);
218	for (n = `0`; n < `256`; n++) {
219	zeros[`0`][n] = gf2_matrix_times(op, n);
220	zeros[`1`][n] = gf2_matrix_times(op, n << `8`);
221	zeros[`2`][n] = gf2_matrix_times(op, n << `16`);
222	zeros[`3`][n] = gf2_matrix_times(op, n << `24`);
223	}
224	}
225
226	/ Apply the zeros operator table to crc. /
227	static RD_INLINE uint32_t crc32c_shift(uint32_t zeros[][`256`], uint32_t crc)
228	{
229	return zeros[`0`][crc & `0xff`] ^ zeros[`1`][(crc >> `8`) & `0xff`] ^
230	zeros[`2`][(crc >> `16`) & `0xff`] ^ zeros[`3`][crc >> `24`];
231	}
232
233	/ Block sizes for three-way parallel crc computation. LONG and SHORT must*
234	both be powers of two. The associated string constants must be set
235	accordingly, for use in constructing the assembler instructions. /*
236	#define LONG 8192
237	#define LONGx1 "8192"
238	#define LONGx2 "16384"
239	#define SHORT 256
240	#define SHORTx1 "256"
241	#define SHORTx2 "512"
242
243	/ Tables for hardware crc that shift a crc by LONG and SHORT zeros. /
244	static uint32_t crc32c_long[`4`][`256`];
245	static uint32_t crc32c_short[`4`][`256`];
246
247	/ Initialize tables for shifting crcs. /
248	static void crc32c_init_hw(void)
249	{
250	crc32c_zeros(crc32c_long, LONG);
251	crc32c_zeros(crc32c_short, SHORT);
252	}
253
254	/ Compute CRC-32C using the Intel hardware instruction. /
255	static uint32_t crc32c_hw(uint32_t crc, const void *buf, size_t len)
256	{
257	const unsigned char *next = buf;
258	const unsigned char *end;
259	uint64_t crc0, crc1, crc2; / need to be 64 bits for crc32q /
260
261	/ pre-process the crc /
262	crc0 = crc ^ `0xffffffff`;
263
264	/ compute the crc for up to seven leading bytes to bring the data pointer*
265	to an eight-byte boundary /*
266	while (len && ((uintptr_t)next & `7`) != `0`) {
267	__asm__("crc32b\t" "(%1), %0"
268	: "=r"(crc0)
269	: "r"(next), "0"(crc0));
270	next++;
271	len--;
272	}
273
274	/ compute the crc on sets of LONG3 bytes, executing three independent crc
275	instructions, each on LONG bytes -- this is optimized for the Nehalem,
276	Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
277	throughput of one crc per cycle, but a latency of three cycles /*
278	while (len >= LONG*`3`) {
279	crc1 = `0`;
280	crc2 = `0`;
281	end = next + LONG;
282	do {
283	__asm__("crc32q\t" "(%3), %0\n\t"
284	"crc32q\t" LONGx1 "(%3), %1\n\t"
285	"crc32q\t" LONGx2 "(%3), %2"
286	: "=r"(crc0), "=r"(crc1), "=r"(crc2)
287	: "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
288	next += `8`;
289	} while (next < end);
290	crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
291	crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
292	next += LONG*`2`;
293	len -= LONG*`3`;
294	}
295
296	/ do the same thing, but now on SHORT3 blocks for the remaining data less
297	than a LONG3 block /
298	while (len >= SHORT*`3`) {
299	crc1 = `0`;
300	crc2 = `0`;
301	end = next + SHORT;
302	do {
303	__asm__("crc32q\t" "(%3), %0\n\t"
304	"crc32q\t" SHORTx1 "(%3), %1\n\t"
305	"crc32q\t" SHORTx2 "(%3), %2"
306	: "=r"(crc0), "=r"(crc1), "=r"(crc2)
307	: "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
308	next += `8`;
309	} while (next < end);
310	crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
311	crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
312	next += SHORT*`2`;
313	len -= SHORT*`3`;
314	}
315
316	/ compute the crc on the remaining eight-byte units less than a SHORT3
317	block /*
318	end = next + (len - (len & `7`));
319	while (next < end) {
320	__asm__("crc32q\t" "(%1), %0"
321	: "=r"(crc0)
322	: "r"(next), "0"(crc0));
323	next += `8`;
324	}
325	len &= `7`;
326
327	/ compute the crc for up to seven trailing bytes /
328	while (len) {
329	__asm__("crc32b\t" "(%1), %0"
330	: "=r"(crc0)
331	: "r"(next), "0"(crc0));
332	next++;
333	len--;
334	}
335
336	/ return a post-processed crc /
337	return (uint32_t)crc0 ^ `0xffffffff`;
338	}
339
340	/ Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors*
341	introduced in November, 2008. This does not check for the existence of the
342	cpuid instruction itself, which was introduced on the 486SL in 1992, so this
343	will fail on earlier x86 processors. cpuid works on all Pentium and later
344	processors. /*
345	#define SSE42(have) \
346	do { \
347	uint32_t eax, ecx; \
348	eax = 1; \
349	__asm__("cpuid" \
350	: "=c"(ecx) \
351	: "a"(eax) \
352	: "%ebx", "%edx"); \
353	(have) = (ecx >> 20) & 1; \
354	} while (0)
355
356	#endif /* WITH_CRC32C_HW */
357
358	/ Compute a CRC-32C. If the crc32 instruction is available, use the hardware*
359	version. Otherwise, use the software version. /*
360	uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
361	{
362	#if WITH_CRC32C_HW
363	if (sse42)
364	return crc32c_hw(crc, buf, len);
365	else
366	#endif
367	return crc32c_sw(crc, buf, len);
368	}
369
370
371
372
373
374
375	/**
376	* @brief Populate shift tables once
377	*/
378	void crc32c_global_init (void) {
379	#if WITH_CRC32C_HW
380	SSE42(sse42);
381	if (sse42)
382	crc32c_init_hw();
383	else
384	#endif
385	crc32c_init_sw();
386	}
387
388	int unittest_crc32c (void) {
389	const char *buf =
390	" This software is provided 'as-is', without any express or implied\n"
391	" warranty. In no event will the author be held liable for any damages\n"
392	" arising from the use of this software.\n"
393	"\n"
394	" Permission is granted to anyone to use this software for any purpose,\n"
395	" including commercial applications, and to alter it and redistribute it\n"
396	" freely, subject to the following restrictions:\n"
397	"\n"
398	" 1. The origin of this software must not be misrepresented; you must not\n"
399	" claim that you wrote the original software. If you use this software\n"
400	" in a product, an acknowledgment in the product documentation would be\n"
401	" appreciated but is not required.\n"
402	" 2. Altered source versions must be plainly marked as such, and must not be\n"
403	" misrepresented as being the original software.\n"
404	" 3. This notice may not be removed or altered from any source distribution.";
405	const uint32_t expected_crc = `0x7dcde113`;
406	uint32_t crc;
407	const char *how;
408
409	crc32c_global_init();
410
411	#if WITH_CRC32C_HW
412	if (sse42)
413	how = "hardware (SSE42)";
414	else
415	how = "software (SSE42 supported in build but not at runtime)";
416	#else
417	how = "software";
418	#endif
419	RD_UT_SAY("Calculate CRC32C using %s", how);
420
421	crc = crc32c(`0`, buf, strlen(buf));
422	RD_UT_ASSERT(crc == expected_crc,
423	"Calculated CRC (%s) 0x%"PRIx32
424	" not matching expected CRC 0x%"PRIx32,
425	how, crc, expected_crc);
426
427	/ Verify software version too, regardless of which*
428	* version was used above. */
429	crc32c_init_sw();
430	RD_UT_SAY("Calculate CRC32C using software");
431	crc = crc32c_sw(`0`, buf, strlen(buf));
432	RD_UT_ASSERT(crc == expected_crc,
433	"Calculated CRC (software) 0x%"PRIx32
434	" not matching expected CRC 0x%"PRIx32,
435	crc, expected_crc);
436
437	RD_UT_PASS();
438	}
439

Browse the source code of ClickHouse/contrib/librdkafka/src/crc32c.c