/*
 * Copyright 2010-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 * http://aws.amazon.com/apache2.0
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

#include <aws/checksums/private/cpuid.h>
#include <aws/checksums/private/crc_priv.h>

/* This implementation is only for 64-bit architectures and, when compiling with GCC, only for release builds.
 * With Clang it is used for both debug and release builds. */
#if defined(__x86_64__) && !(defined(__GNUC__) && defined(DEBUG_BUILD))

# define LIKELY(x) __builtin_expect((x), 1)
# define UNLIKELY(x) __builtin_expect((x), 0)

/*
 * Factored-out common inline asm for folding the crc0, crc1, crc2 stripes held in rcx, r11, r10, using
 * the specified magic constants K1 and K2.
 * Assumes rcx, r11, r10 contain crc0, crc1, crc2 in need of folding.
 * Utilizes xmm1, xmm2, xmm3, xmm4 and clobbers r8, r9, r11.
 * The result is placed in ecx.
 */
# define FOLD_K1K2(NAME, K1, K2) \
    "fold_k1k2_" #NAME "_%=: \n" \
    "mov " #K1 ", %%r8d # Magic K1 constant \n" \
    "mov " #K2 ", %%r9d # Magic K2 constant \n" \
    "movq %%rcx, %%xmm1 # crc0 into lower dword of xmm1 \n" \
    "movq %%r8, %%xmm3 # K1 into lower dword of xmm3 \n" \
    "movq %%r11, %%xmm2 # crc1 into lower dword of xmm2 \n" \
    "movq %%r9, %%xmm4 # K2 into lower dword of xmm4 \n" \
    "pclmulqdq $0x00, %%xmm3, %%xmm1 # Multiply crc0 by K1 \n" \
    "pclmulqdq $0x00, %%xmm4, %%xmm2 # Multiply crc1 by K2 \n" \
    "xor %%rcx, %%rcx # \n" \
    "xor %%r11, %%r11 # \n" \
    "movq %%xmm1, %%r8 # \n" \
    "movq %%xmm2, %%r9 # \n" \
    "crc32q %%r8, %%rcx # folding crc0 \n" \
    "crc32q %%r9, %%r11 # folding crc1 \n" \
    "xor %%r10d, %%ecx # combine crc2 and crc0 \n" \
    "xor %%r11d, %%ecx # combine crc1 and crc0 \n"
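
/*
 * Editor's illustrative sketch (not part of the original implementation): the same fold step as
 * FOLD_K1K2, expressed with compiler intrinsics instead of inline asm. Compiled out with #if 0
 * because building it would additionally require -msse4.2 and -mpclmul; the function name and
 * shape are hypothetical.
 */
#if 0
# include <nmmintrin.h> /* SSE4.2: _mm_crc32_u64 */
# include <wmmintrin.h> /* PCLMUL: _mm_clmulepi64_si128 */

static inline uint32_t s_fold_k1k2_sketch(uint32_t crc0, uint32_t crc1, uint32_t crc2, uint32_t k1, uint32_t k2) {
    /* Carry-less multiply each 32-bit stripe CRC by its magic constant; each product fits in 64 bits. */
    __m128i p0 = _mm_clmulepi64_si128(_mm_cvtsi64_si128(crc0), _mm_cvtsi64_si128(k1), 0x00);
    __m128i p1 = _mm_clmulepi64_si128(_mm_cvtsi64_si128(crc1), _mm_cvtsi64_si128(k2), 0x00);
    /* Reduce each 64-bit product back to 32 bits with a zero-seeded crc32q, then XOR the stripes together. */
    uint32_t f0 = (uint32_t)_mm_crc32_u64(0, (uint64_t)_mm_cvtsi128_si64(p0));
    uint32_t f1 = (uint32_t)_mm_crc32_u64(0, (uint64_t)_mm_cvtsi128_si64(p1));
    return f0 ^ f1 ^ crc2;
}
#endif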

/**
 * Private (static) function.
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction by operating on 24-byte stripes in parallel. The results are folded together using CLMUL. This function
 * is optimized for exactly 256 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a
 * pointer to input data that is exactly 256 bytes in length. Note: this function does NOT invert bits of the input crc
 * or return value.
 */
static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) {
    asm volatile("enter_256_%=:"

                 "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
                 "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"

                 "crc32q 0(%[in]), %%rcx # crc0 \n"
                 "crc32q 88(%[in]), %%r11 # crc1 \n"
                 "crc32q 176(%[in]), %%r10 # crc2 \n"

                 "crc32q 8(%[in]), %%rcx # crc0 \n"
                 "crc32q 96(%[in]), %%r11 # crc1 \n"
                 "crc32q 184(%[in]), %%r10 # crc2 \n"

                 "crc32q 16(%[in]), %%rcx # crc0 \n"
                 "crc32q 104(%[in]), %%r11 # crc1 \n"
                 "crc32q 192(%[in]), %%r10 # crc2 \n"

                 "crc32q 24(%[in]), %%rcx # crc0 \n"
                 "crc32q 112(%[in]), %%r11 # crc1 \n"
                 "crc32q 200(%[in]), %%r10 # crc2 \n"

                 "crc32q 32(%[in]), %%rcx # crc0 \n"
                 "crc32q 120(%[in]), %%r11 # crc1 \n"
                 "crc32q 208(%[in]), %%r10 # crc2 \n"

                 "crc32q 40(%[in]), %%rcx # crc0 \n"
                 "crc32q 128(%[in]), %%r11 # crc1 \n"
                 "crc32q 216(%[in]), %%r10 # crc2 \n"

                 "crc32q 48(%[in]), %%rcx # crc0 \n"
                 "crc32q 136(%[in]), %%r11 # crc1 \n"
                 "crc32q 224(%[in]), %%r10 # crc2 \n"

                 "crc32q 56(%[in]), %%rcx # crc0 \n"
                 "crc32q 144(%[in]), %%r11 # crc1 \n"
                 "crc32q 232(%[in]), %%r10 # crc2 \n"

                 "crc32q 64(%[in]), %%rcx # crc0 \n"
                 "crc32q 152(%[in]), %%r11 # crc1 \n"
                 "crc32q 240(%[in]), %%r10 # crc2 \n"

                 "crc32q 72(%[in]), %%rcx # crc0 \n"
                 "crc32q 160(%[in]), %%r11 # crc1 \n"
                 "crc32q 248(%[in]), %%r10 # crc2 \n"

                 "crc32q 80(%[in]), %%rcx # crc0 \n"
                 "crc32q 168(%[in]), %%r11 # crc1 \n"

                 FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */

                 /* output registers
                    [crc] is an input and output so it is marked read/write (i.e. "+c") */
                 : "+c"(crc)

                 /* input registers */
                 : [crc] "c"(crc), [in] "d"(input)

                 /* additional clobbered registers */
                 : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
    return crc;
}
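
/*
 * Editor's illustrative sketch (not part of the original implementation): a portable bit-at-a-time
 * CRC32c using the reflected Castagnoli polynomial 0x82F63B78, which is exactly what the CRC32B/CRC32Q
 * instructions compute per byte. Like the hardware kernels in this file, it does NOT invert the input
 * crc or the result, so it can cross-check them directly, e.g.
 * s_crc32c_reference(buf, 256, crc) == s_crc32c_sse42_clmul_256(buf, crc).
 * Compiled out with #if 0; the function name is hypothetical.
 */
#if 0
static uint32_t s_crc32c_reference(const uint8_t *input, size_t length, uint32_t crc) {
    while (length--) {
        crc ^= *input++; /* fold the next byte into the low bits */
        for (int bit = 0; bit < 8; ++bit) {
            /* shift one bit out; XOR in the polynomial iff that bit was set */
            crc = (crc >> 1) ^ ((crc & 1u) * 0x82F63B78u);
        }
    }
    return crc;
}
#endif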

/**
 * Private (static) function.
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction by operating on 24-byte stripes in parallel. The results are folded together using CLMUL. This function
 * is optimized for exactly 1024 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a
 * pointer to input data that is exactly 1024 bytes in length. Note: this function does NOT invert bits of the input crc
 * or return value.
 */
static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) {
    asm volatile(
        "enter_1024_%=:"

        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"

        "mov $5, %%r8d # Loop 5 times through 64 byte chunks in 3 parallel stripes \n"

        "loop_1024_%=:"

        "prefetcht0 128(%[in]) # \n"
        "prefetcht0 472(%[in]) # \n"
        "prefetcht0 808(%[in]) # \n"

        "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 344(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 680(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 8(%[in]), %%rcx # crc0 \n"
        "crc32q 352(%[in]), %%r11 # crc1 \n"
        "crc32q 688(%[in]), %%r10 # crc2 \n"

        "crc32q 16(%[in]), %%rcx # crc0 \n"
        "crc32q 360(%[in]), %%r11 # crc1 \n"
        "crc32q 696(%[in]), %%r10 # crc2 \n"

        "crc32q 24(%[in]), %%rcx # crc0 \n"
        "crc32q 368(%[in]), %%r11 # crc1 \n"
        "crc32q 704(%[in]), %%r10 # crc2 \n"

        "crc32q 32(%[in]), %%rcx # crc0 \n"
        "crc32q 376(%[in]), %%r11 # crc1 \n"
        "crc32q 712(%[in]), %%r10 # crc2 \n"

        "crc32q 40(%[in]), %%rcx # crc0 \n"
        "crc32q 384(%[in]), %%r11 # crc1 \n"
        "crc32q 720(%[in]), %%r10 # crc2 \n"

        "crc32q 48(%[in]), %%rcx # crc0 \n"
        "crc32q 392(%[in]), %%r11 # crc1 \n"
        "crc32q 728(%[in]), %%r10 # crc2 \n"

        "crc32q 56(%[in]), %%rcx # crc0 \n"
        "crc32q 400(%[in]), %%r11 # crc1 \n"
        "crc32q 736(%[in]), %%r10 # crc2 \n"

        "add $64, %[in] # \n"
        "sub $1, %%r8d # \n"
        "jnz loop_1024_%= # \n"

        "crc32q 0(%[in]), %%rcx # crc0 \n"
        "crc32q 344(%[in]), %%r11 # crc1 \n"
        "crc32q 680(%[in]), %%r10 # crc2 \n"

        "crc32q 8(%[in]), %%rcx # crc0 \n"
        "crc32q 352(%[in]), %%r11 # crc1 \n"
        "crc32q 688(%[in]), %%r10 # crc2 \n"

        "crc32q 16(%[in]), %%rcx # crc0 \n"
        "crc32q 696(%[in]), %%r10 # crc2 \n"
        FOLD_K1K2(
            1024,
            $0xe417f38a,
            $0x8f158014) /* Magic Constants used to fold crc stripes into ecx

                            output registers
                            [crc] is an input and output so it is marked read/write (i.e. "+c")
                            we clobber the register for [input] (via add instruction) so we must also
                            tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber */
        : "+c"(crc), "+d"(input)

        /* input registers */
        /* the numeric values match the position of the output registers */
        : [crc] "c"(crc), [in] "d"(input)

        /* additional clobbered registers */
        /* "cc" is the flags - we add and sub, so the flags are also clobbered */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
    return crc;
}
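
/*
 * Editor's illustrative sketch (not part of the original implementation): the 1024-byte kernel's
 * triple-stripe structure in plain C using the _mm_crc32_u64 intrinsic. Three independent crc32q
 * dependency chains keep the CRC unit busy; afterwards crc0 and crc1 are folded forward over the
 * bytes that followed their stripes with the same CLMUL step sketched in s_fold_k1k2_sketch()
 * above. Stripe lengths (344/336/344 bytes) and fold constants match s_crc32c_sse42_clmul_1024.
 * Compiled out with #if 0; assumes 8-byte-aligned input like the real kernel, and the function
 * name is hypothetical.
 */
#if 0
static uint32_t s_crc32c_striped_1024_sketch(const uint8_t *input, uint32_t crc) {
    uint64_t crc0 = crc; /* stripe0: bytes 0..343 */
    uint64_t crc1 = 0;   /* stripe1: bytes 344..679 */
    uint64_t crc2 = 0;   /* stripe2: bytes 680..1023 */
    for (size_t i = 0; i < 336; i += 8) {
        crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)(input + i));
        crc1 = _mm_crc32_u64(crc1, *(const uint64_t *)(input + 344 + i));
        crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(input + 680 + i));
    }
    /* stripe0 and stripe2 are one quad longer than stripe1 (344 vs 336 bytes),
       mirroring the tail of the asm loop above */
    crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)(input + 336));
    crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(input + 1016));
    return s_fold_k1k2_sketch((uint32_t)crc0, (uint32_t)crc1, (uint32_t)crc2, 0xe417f38a, 0x8f158014);
}
#endif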

/**
 * Private (static) function.
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction by operating on 24-byte stripes in parallel. The results are folded together using CLMUL. This function
 * is optimized for exactly 3072 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a
 * pointer to input data that is exactly 3072 bytes in length. Note: this function does NOT invert bits of the input crc
 * or return value.
 */
static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) {
    asm volatile(
        "enter_3072_%=:"

        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"

        "mov $16, %%r8d # Loop 16 times through 64 byte chunks in 3 parallel stripes \n"

        "loop_3072_%=:"

        "prefetcht0 128(%[in]) # \n"
        "prefetcht0 1152(%[in]) # \n"
        "prefetcht0 2176(%[in]) # \n"

        "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1024(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2048(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 8(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1032(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2056(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 16(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1040(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2064(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 24(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1048(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2072(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 32(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1056(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2080(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 40(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1064(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2088(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 48(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1072(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2096(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 56(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1080(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2104(%[in]), %%r10 # crc2: stripe2 \n"

        "add $64, %[in] # \n"
        "sub $1, %%r8d # \n"
        "jnz loop_3072_%= # \n"
        FOLD_K1K2(
            3072,
            $0xa51b6135,
            $0x170076fa) /* Magic Constants used to fold crc stripes into ecx

                            output registers
                            [crc] is an input and output so it is marked read/write (i.e. "+c")
                            we clobber the register for [input] (via add instruction) so we must also
                            tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber */
        : "+c"(crc), "+d"(input)

        /* input registers
           the numeric values match the position of the output registers */
        : [crc] "c"(crc), [in] "d"(input)

        /* additional clobbered registers
           "cc" is the flags - we add and sub, so the flags are also clobbered */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");

    return crc;
}

static int detection_performed = 0;
static int detected_clmul = 0;

/*
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and
 * PCLMULQDQ machine instructions (if present).
 * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction.
 * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
 * call.
 */
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {

    if (UNLIKELY(!detection_performed)) {
        detected_clmul = aws_checksums_is_clmul_present();
        /* Simply set the flag to true to skip HW detection next time.
           Not using memory barriers since the worst that can
           happen is a fallback to the non HW accelerated code. */
        detection_performed = 1;
    }

    uint32_t crc = ~previousCrc32;

    /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
    if (UNLIKELY(length < 8)) {
        while (length-- > 0) {
            asm("loop_small_%=: CRC32B (%[in]), %[crc]" : "+c"(crc) : [crc] "c"(crc), [in] "r"(input));
            input++;
        }
        return ~crc;
    }

    /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
    int input_alignment = (unsigned long int)input & 0x7;

    /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
    int leading = (8 - input_alignment) & 0x7;

    /* reduce the length by the leading unaligned bytes we are about to process */
    length -= leading;

    /* spin through the leading unaligned input bytes (if any) one-by-one */
    while (leading-- > 0) {
        asm("loop_leading_%=: CRC32B (%[in]), %[crc]" : "+c"(crc) : [crc] "c"(crc), [in] "r"(input));
        input++;
    }

    /* Using likely to keep this code inlined */
    if (LIKELY(detected_clmul)) {

        while (LIKELY(length >= 3072)) {
            /* Compute crc32c on each block, chaining each crc result */
            crc = s_crc32c_sse42_clmul_3072(input, crc);
            input += 3072;
            length -= 3072;
        }
        while (LIKELY(length >= 1024)) {
            /* Compute crc32c on each block, chaining each crc result */
            crc = s_crc32c_sse42_clmul_1024(input, crc);
            input += 1024;
            length -= 1024;
        }
        while (LIKELY(length >= 256)) {
            /* Compute crc32c on each block, chaining each crc result */
            crc = s_crc32c_sse42_clmul_256(input, crc);
            input += 256;
            length -= 256;
        }
    }

    /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
    while (LIKELY(length >= 8)) {
        /* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction */
        asm volatile("loop_8_%=: CRC32Q (%[in]), %%rcx" : "+c"(crc) : [crc] "c"(crc), [in] "r"(input));
        input += 8;
        length -= 8;
    }

    /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
    while (length-- > 0) {
        asm volatile("loop_trailing_%=: CRC32B (%[in]), %[crc]" : "+c"(crc) : [crc] "c"(crc), [in] "r"(input));
        input++;
    }

    return ~crc;
}
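
/*
 * Editor's illustrative usage sketch (not part of the original implementation): seed with 0 for a
 * one-shot CRC, or feed the previous result back in as previousCrc32 to checksum streamed data;
 * the invert-in/invert-out convention above makes the two produce identical results. Compiled out
 * with #if 0; the function name is hypothetical.
 */
#if 0
static void s_crc32c_hw_usage_example(void) {
    uint8_t buf[4096] = {0};

    /* one-shot: pass 0 as the initial CRC */
    uint32_t one_shot = aws_checksums_crc32c_hw(buf, (int)sizeof(buf), 0);

    /* streaming: chain the running CRC through successive calls */
    uint32_t running = aws_checksums_crc32c_hw(buf, 2048, 0);
    running = aws_checksums_crc32c_hw(buf + 2048, 2048, running);

    /* one_shot == running */
    (void)one_shot;
    (void)running;
}
#endif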

#elif !defined(_MSC_VER) && !defined(__arm__) && !defined(__aarch64__)

/* don't call this without first checking that it is supported. */
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    return 0;
}

#endif