1 | /* |
2 | * Copyright 2010-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"). |
5 | * You may not use this file except in compliance with the License. |
6 | * A copy of the License is located at |
7 | * |
8 | * http://aws.amazon.com/apache2.0 |
9 | * |
10 | * or in the "license" file accompanying this file. This file is distributed |
11 | * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either |
12 | * express or implied. See the License for the specific language governing |
13 | * permissions and limitations under the License. |
14 | */ |
15 | |
16 | #include <aws/checksums/private/cpuid.h> |
17 | #include <aws/checksums/private/crc_priv.h> |
18 | |
/* This implementation is only for 64-bit architectures and, when building with GCC,
 * release builds only. When building with Clang, it is used in both debug and release builds. */
21 | #if defined(__x86_64__) && !(defined(__GNUC__) && defined(DEBUG_BUILD)) |
22 | |
23 | # define LIKELY(x) __builtin_expect((x), 1) |
24 | # define UNLIKELY(x) __builtin_expect((x), 0) |
25 | |
26 | /* |
27 | * Factored out common inline asm for folding crc0,crc1,crc2 stripes in rcx, r11, r10 using |
28 | * the specified Magic Constants K1 and K2. |
29 | * Assumes rcx, r11, r10 contain crc0, crc1, crc2 that need folding |
30 | * Utilizes xmm1, xmm2, xmm3, xmm4 as well as clobbering r8, r9, r11 |
31 | * Result is placed in ecx |
32 | */ |
33 | # define FOLD_K1K2(NAME, K1, K2) \ |
34 | "fold_k1k2_" #NAME "_%=: \n" \ |
35 | "mov " #K1 ", %%r8d # Magic K1 constant \n" \ |
36 | "mov " #K2 ", %%r9d # Magic K2 constant \n" \ |
37 | "movq %%rcx, %%xmm1 # crc0 into lower dword of xmm1 \n" \ |
38 | "movq %%r8, %%xmm3 # K1 into lower dword of xmm3 \n" \ |
39 | "movq %%r11, %%xmm2 # crc1 into lower dword of xmm2 \n" \ |
40 | "movq %%r9, %%xmm4 # K2 into lower dword of xmm4 \n" \ |
41 | "pclmulqdq $0x00, %%xmm3, %%xmm1 # Multiply crc0 by K1 \n" \ |
42 | "pclmulqdq $0x00, %%xmm4, %%xmm2 # Multiply crc1 by K2 \n" \ |
43 | "xor %%rcx, %%rcx # \n" \ |
44 | "xor %%r11, %%r11 # \n" \ |
45 | "movq %%xmm1, %%r8 # \n" \ |
46 | "movq %%xmm2, %%r9 # \n" \ |
47 | "crc32q %%r8, %%rcx # folding crc0 \n" \ |
48 | "crc32q %%r9, %%r11 # folding crc1 \n" \ |
49 | "xor %%r10d, %%ecx # combine crc2 and crc0 \n" \ |
50 | "xor %%r11d, %%ecx # combine crc1 and crc0 \n" |
51 | |
52 | /** |
53 | * Private (static) function. |
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, operating on three stripes of the block in parallel (three 8-byte reads, 24 bytes per step). The
 * results are folded together using CLMUL. This function is optimized for exactly 256-byte blocks, ideally aligned
 * on 8-byte memory addresses. It MUST be passed a
57 | * pointer to input data that is exactly 256 bytes in length. Note: this function does NOT invert bits of the input crc |
58 | * or return value. |
59 | */ |
60 | static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) { |
61 | asm volatile("enter_256_%=:" |
62 | |
63 | "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n" |
64 | "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n" |
65 | |
66 | "crc32q 0(%[in]), %%rcx # crc0 \n" |
67 | "crc32q 88(%[in]), %%r11 # crc1 \n" |
68 | "crc32q 176(%[in]), %%r10 # crc2 \n" |
69 | |
70 | "crc32q 8(%[in]), %%rcx # crc0 \n" |
71 | "crc32q 96(%[in]), %%r11 # crc1 \n" |
72 | "crc32q 184(%[in]), %%r10 # crc2 \n" |
73 | |
74 | "crc32q 16(%[in]), %%rcx # crc0 \n" |
75 | "crc32q 104(%[in]), %%r11 # crc1 \n" |
76 | "crc32q 192(%[in]), %%r10 # crc2 \n" |
77 | |
78 | "crc32q 24(%[in]), %%rcx # crc0 \n" |
79 | "crc32q 112(%[in]), %%r11 # crc1 \n" |
80 | "crc32q 200(%[in]), %%r10 # crc2 \n" |
81 | |
82 | "crc32q 32(%[in]), %%rcx # crc0 \n" |
83 | "crc32q 120(%[in]), %%r11 # crc1 \n" |
84 | "crc32q 208(%[in]), %%r10 # crc2 \n" |
85 | |
86 | "crc32q 40(%[in]), %%rcx # crc0 \n" |
87 | "crc32q 128(%[in]), %%r11 # crc1 \n" |
88 | "crc32q 216(%[in]), %%r10 # crc2 \n" |
89 | |
90 | "crc32q 48(%[in]), %%rcx # crc0 \n" |
91 | "crc32q 136(%[in]), %%r11 # crc1 \n" |
92 | "crc32q 224(%[in]), %%r10 # crc2 \n" |
93 | |
94 | "crc32q 56(%[in]), %%rcx # crc0 \n" |
95 | "crc32q 144(%[in]), %%r11 # crc1 \n" |
96 | "crc32q 232(%[in]), %%r10 # crc2 \n" |
97 | |
98 | "crc32q 64(%[in]), %%rcx # crc0 \n" |
99 | "crc32q 152(%[in]), %%r11 # crc1 \n" |
100 | "crc32q 240(%[in]), %%r10 # crc2 \n" |
101 | |
102 | "crc32q 72(%[in]), %%rcx # crc0 \n" |
103 | "crc32q 160(%[in]), %%r11 # crc1 \n" |
104 | "crc32q 248(%[in]), %%r10 # crc2 \n" |
105 | |
106 | "crc32q 80(%[in]), %%rcx # crc0 \n" |
107 | "crc32q 168(%[in]), %%r11 # crc2 \n" |
108 | |
109 | FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */ |
110 | |
                 /* output registers
                    [crc] is both an input and an output, so it is marked read/write (i.e. "+c") */
                 : [crc] "+c" (crc)

                 /* input registers */
                 : [in] "d" (input)
117 | |
                 /* additional clobbered registers; "memory" because the asm reads the input buffer */
                 : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc", "memory");
120 | return crc; |
121 | } |
122 | |
123 | /** |
124 | * Private (static) function. |
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, operating on three stripes of the block in parallel (three 8-byte reads, 24 bytes per step). The
 * results are folded together using CLMUL. This function is optimized for exactly 1024-byte blocks, ideally aligned
 * on 8-byte memory addresses. It MUST be passed a
128 | * pointer to input data that is exactly 1024 bytes in length. Note: this function does NOT invert bits of the input crc |
129 | * or return value. |
130 | */ |
131 | static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) { |
132 | asm volatile( |
133 | "enter_1024_%=:" |
134 | |
135 | "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n" |
136 | "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n" |
137 | |
138 | "mov $5, %%r8d # Loop 5 times through 64 byte chunks in 3 parallel stripes \n" |
139 | |
140 | "loop_1024_%=:" |
141 | |
142 | "prefetcht0 128(%[in]) # \n" |
143 | "prefetcht0 472(%[in]) # \n" |
144 | "prefetcht0 808(%[in]) # \n" |
145 | |
146 | "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n" |
147 | "crc32q 344(%[in]), %%r11 # crc1: stripe1 \n" |
148 | "crc32q 680(%[in]), %%r10 # crc2: stripe2 \n" |
149 | |
150 | "crc32q 8(%[in]), %%rcx # crc0 \n" |
151 | "crc32q 352(%[in]), %%r11 # crc1 \n" |
152 | "crc32q 688(%[in]), %%r10 # crc2 \n" |
153 | |
154 | "crc32q 16(%[in]), %%rcx # crc0 \n" |
155 | "crc32q 360(%[in]), %%r11 # crc1 \n" |
156 | "crc32q 696(%[in]), %%r10 # crc2 \n" |
157 | |
158 | "crc32q 24(%[in]), %%rcx # crc0 \n" |
159 | "crc32q 368(%[in]), %%r11 # crc1 \n" |
160 | "crc32q 704(%[in]), %%r10 # crc2 \n" |
161 | |
162 | "crc32q 32(%[in]), %%rcx # crc0 \n" |
163 | "crc32q 376(%[in]), %%r11 # crc1 \n" |
164 | "crc32q 712(%[in]), %%r10 # crc2 \n" |
165 | |
166 | "crc32q 40(%[in]), %%rcx # crc0 \n" |
167 | "crc32q 384(%[in]), %%r11 # crc1 \n" |
168 | "crc32q 720(%[in]), %%r10 # crc2 \n" |
169 | |
170 | "crc32q 48(%[in]), %%rcx # crc0 \n" |
171 | "crc32q 392(%[in]), %%r11 # crc1 \n" |
172 | "crc32q 728(%[in]), %%r10 # crc2 \n" |
173 | |
174 | "crc32q 56(%[in]), %%rcx # crc0 \n" |
175 | "crc32q 400(%[in]), %%r11 # crc1 \n" |
176 | "crc32q 736(%[in]), %%r10 # crc2 \n" |
177 | |
178 | "add $64, %[in] # \n" |
179 | "sub $1, %%r8d # \n" |
180 | "jnz loop_1024_%= # \n" |
181 | |
182 | "crc32q 0(%[in]), %%rcx # crc0 \n" |
183 | "crc32q 344(%[in]), %%r11 # crc1 \n" |
184 | "crc32q 680(%[in]), %%r10 # crc2 \n" |
185 | |
186 | "crc32q 8(%[in]), %%rcx # crc0 \n" |
187 | "crc32q 352(%[in]), %%r11 # crc1 \n" |
188 | "crc32q 688(%[in]), %%r10 # crc2 \n" |
189 | |
190 | "crc32q 16(%[in]), %%rcx # crc0 \n" |
191 | "crc32q 696(%[in]), %%r10 # crc2 \n" |
192 | |
193 | FOLD_K1K2( |
194 | 1024, |
195 | $0xe417f38a, |
        $0x8f158014) /* Magic constants used to fold the crc stripes into ecx */

        /* output registers
           [crc] is both an input and an output, so it is marked read/write (i.e. "+c");
           the register holding [in] is advanced by the add instruction, so it is also
           marked read/write (i.e. "+d") to tell gcc about the clobber */
        : [crc] "+c" (crc), [in] "+d" (input)

        /* no separate input registers; crc and input are the read/write operands above */
        :
207 | |
        /* additional clobbered registers
           "cc" is the flags - we add and sub, so the flags are clobbered;
           "memory" because the asm reads the input buffer */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc", "memory");
211 | return crc; |
212 | } |
213 | |
214 | /** |
215 | * Private (static) function. |
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, operating on three stripes of the block in parallel (three 8-byte reads, 24 bytes per step). The
 * results are folded together using CLMUL. This function is optimized for exactly 3072-byte blocks, ideally aligned
 * on 8-byte memory addresses. It MUST be passed a
219 | * pointer to input data that is exactly 3072 bytes in length. Note: this function does NOT invert bits of the input crc |
220 | * or return value. |
221 | */ |
222 | static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) { |
223 | asm volatile( |
224 | "enter_3072_%=:" |
225 | |
226 | "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n" |
227 | "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n" |
228 | |
229 | "mov $16, %%r8d # Loop 16 times through 64 byte chunks in 3 parallel stripes \n" |
230 | |
231 | "loop_3072_%=:" |
232 | |
233 | "prefetcht0 128(%[in]) # \n" |
234 | "prefetcht0 1152(%[in]) # \n" |
235 | "prefetcht0 2176(%[in]) # \n" |
236 | |
237 | "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n" |
238 | "crc32q 1024(%[in]), %%r11 # crc1: stripe1 \n" |
239 | "crc32q 2048(%[in]), %%r10 # crc2: stripe2 \n" |
240 | |
241 | "crc32q 8(%[in]), %%rcx # crc0: stripe0 \n" |
242 | "crc32q 1032(%[in]), %%r11 # crc1: stripe1 \n" |
243 | "crc32q 2056(%[in]), %%r10 # crc2: stripe2 \n" |
244 | |
245 | "crc32q 16(%[in]), %%rcx # crc0: stripe0 \n" |
246 | "crc32q 1040(%[in]), %%r11 # crc1: stripe1 \n" |
247 | "crc32q 2064(%[in]), %%r10 # crc2: stripe2 \n" |
248 | |
249 | "crc32q 24(%[in]), %%rcx # crc0: stripe0 \n" |
250 | "crc32q 1048(%[in]), %%r11 # crc1: stripe1 \n" |
251 | "crc32q 2072(%[in]), %%r10 # crc2: stripe2 \n" |
252 | |
253 | "crc32q 32(%[in]), %%rcx # crc0: stripe0 \n" |
254 | "crc32q 1056(%[in]), %%r11 # crc1: stripe1 \n" |
255 | "crc32q 2080(%[in]), %%r10 # crc2: stripe2 \n" |
256 | |
257 | "crc32q 40(%[in]), %%rcx # crc0: stripe0 \n" |
258 | "crc32q 1064(%[in]), %%r11 # crc1: stripe1 \n" |
259 | "crc32q 2088(%[in]), %%r10 # crc2: stripe2 \n" |
260 | |
261 | "crc32q 48(%[in]), %%rcx # crc0: stripe0 \n" |
262 | "crc32q 1072(%[in]), %%r11 # crc1: stripe1 \n" |
263 | "crc32q 2096(%[in]), %%r10 # crc2: stripe2 \n" |
264 | |
265 | "crc32q 56(%[in]), %%rcx # crc0: stripe0 \n" |
266 | "crc32q 1080(%[in]), %%r11 # crc1: stripe1 \n" |
267 | "crc32q 2104(%[in]), %%r10 # crc2: stripe2 \n" |
268 | |
269 | "add $64, %[in] # \n" |
270 | "sub $1, %%r8d # \n" |
271 | "jnz loop_3072_%= # \n" |
272 | |
273 | FOLD_K1K2( |
274 | 3072, |
275 | $0xa51b6135, |
        $0x170076fa) /* Magic constants used to fold the crc stripes into ecx */

        /* output registers
           [crc] is both an input and an output, so it is marked read/write (i.e. "+c");
           the register holding [in] is advanced by the add instruction, so it is also
           marked read/write (i.e. "+d") to tell gcc about the clobber */
        : [crc] "+c" (crc), [in] "+d" (input)

        /* no separate input registers; crc and input are the read/write operands above */
        :
287 | |
        /* additional clobbered registers
           "cc" is the flags - we add and sub, so the flags are clobbered;
           "memory" because the asm reads the input buffer */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc", "memory");
291 | |
292 | return crc; |
293 | } |
294 | |
295 | static int detection_performed = 0; |
296 | static int detected_clmul = 0; |
297 | |
298 | /* |
299 | * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and |
300 | * PCLMULQDQ machine instructions (if present). |
301 | * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. |
302 | * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent |
303 | * call. |
304 | */ |
305 | uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { |
306 | |
307 | if (UNLIKELY(!detection_performed)) { |
308 | detected_clmul = aws_checksums_is_clmul_present(); |
        /* Simply set the flag to skip HW detection next time.
           Not using memory barriers since the worst that can
           happen is a fallback to the non-HW-accelerated code. */
312 | detection_performed = 1; |
313 | } |
314 | |
315 | uint32_t crc = ~previousCrc32; |
316 | |
317 | /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ |
318 | if (UNLIKELY(length < 8)) { |
319 | while (length-- > 0) { |
320 | asm("loop_small_%=: CRC32B (%[in]), %[crc]" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
321 | input++; |
322 | } |
323 | return ~crc; |
324 | } |
325 | |
326 | /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ |
    int input_alignment = (uintptr_t)input & 0x7;
328 | |
329 | /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ |
330 | int leading = (8 - input_alignment) & 0x7; |
331 | |
332 | /* reduce the length by the leading unaligned bytes we are about to process */ |
333 | length -= leading; |
334 | |
335 | /* spin through the leading unaligned input bytes (if any) one-by-one */ |
336 | while (leading-- > 0) { |
337 | asm("loop_leading_%=: CRC32B (%[in]), %[crc]" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
338 | input++; |
339 | } |
340 | |
    /* LIKELY hints that hardware CLMUL support is the expected path */
342 | if (LIKELY(detected_clmul)) { |
343 | |
344 | while (LIKELY(length >= 3072)) { |
345 | /* Compute crc32c on each block, chaining each crc result */ |
346 | crc = s_crc32c_sse42_clmul_3072(input, crc); |
347 | input += 3072; |
348 | length -= 3072; |
349 | } |
350 | while (LIKELY(length >= 1024)) { |
351 | /* Compute crc32c on each block, chaining each crc result */ |
352 | crc = s_crc32c_sse42_clmul_1024(input, crc); |
353 | input += 1024; |
354 | length -= 1024; |
355 | } |
356 | while (LIKELY(length >= 256)) { |
357 | /* Compute crc32c on each block, chaining each crc result */ |
358 | crc = s_crc32c_sse42_clmul_256(input, crc); |
359 | input += 256; |
360 | length -= 256; |
361 | } |
362 | } |
363 | |
364 | /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ |
365 | while (LIKELY(length >= 8)) { |
366 | /* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction */ |
367 | asm volatile("loop_8_%=: CRC32Q (%[in]), %%rcx" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
368 | input += 8; |
369 | length -= 8; |
370 | } |
371 | |
372 | /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ |
373 | while (length-- > 0) { |
374 | asm volatile("loop_trailing_%=: CRC32B (%[in]), %[crc]" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
375 | input++; |
376 | } |
377 | |
378 | return ~crc; |
379 | } |
380 | |
381 | #elif !defined(_MSC_VER) && !defined(__arm__) && !defined(__aarch64__) |
382 | |
383 | /* don't call this without first checking that it is supported. */ |
384 | uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { |
385 | return 0; |
386 | } |
387 | |
388 | #endif |
389 | |