1 | /* |
2 | * Copyright 2010-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"). |
5 | * You may not use this file except in compliance with the License. |
6 | * A copy of the License is located at |
7 | * |
8 | * http://aws.amazon.com/apache2.0 |
9 | * |
10 | * or in the "license" file accompanying this file. This file is distributed |
11 | * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either |
12 | * express or implied. See the License for the specific language governing |
13 | * permissions and limitations under the License. |
14 | */ |
15 | |
16 | #include <aws/checksums/private/cpuid.h> |
17 | #include <aws/checksums/private/crc_priv.h> |
18 | |
/* This implementation is only for 64-bit architectures and, when building with GCC,
 * release builds only. When building with Clang, it is used in both debug and release builds. */
21 | #if defined(__x86_64__) && !(defined(__GNUC__) && defined(DEBUG_BUILD)) |
22 | |
23 | # define LIKELY(x) __builtin_expect((x), 1) |
24 | # define UNLIKELY(x) __builtin_expect((x), 0) |
25 | |
26 | /* |
27 | * Factored out common inline asm for folding crc0,crc1,crc2 stripes in rcx, r11, r10 using |
28 | * the specified Magic Constants K1 and K2. |
29 | * Assumes rcx, r11, r10 contain crc0, crc1, crc2 that need folding |
30 | * Utilizes xmm1, xmm2, xmm3, xmm4 as well as clobbering r8, r9, r11 |
31 | * Result is placed in ecx |
32 | */ |
33 | # define FOLD_K1K2(NAME, K1, K2) \ |
34 | "fold_k1k2_" #NAME "_%=: \n" \ |
35 | "mov " #K1 ", %%r8d # Magic K1 constant \n" \ |
36 | "mov " #K2 ", %%r9d # Magic K2 constant \n" \ |
37 | "movq %%rcx, %%xmm1 # crc0 into lower dword of xmm1 \n" \ |
38 | "movq %%r8, %%xmm3 # K1 into lower dword of xmm3 \n" \ |
39 | "movq %%r11, %%xmm2 # crc1 into lower dword of xmm2 \n" \ |
40 | "movq %%r9, %%xmm4 # K2 into lower dword of xmm4 \n" \ |
41 | "pclmulqdq $0x00, %%xmm3, %%xmm1 # Multiply crc0 by K1 \n" \ |
42 | "pclmulqdq $0x00, %%xmm4, %%xmm2 # Multiply crc1 by K2 \n" \ |
43 | "xor %%rcx, %%rcx # \n" \ |
44 | "xor %%r11, %%r11 # \n" \ |
45 | "movq %%xmm1, %%r8 # \n" \ |
46 | "movq %%xmm2, %%r9 # \n" \ |
47 | "crc32q %%r8, %%rcx # folding crc0 \n" \ |
48 | "crc32q %%r9, %%r11 # folding crc1 \n" \ |
49 | "xor %%r10d, %%ecx # combine crc2 and crc0 \n" \ |
50 | "xor %%r11d, %%ecx # combine crc1 and crc0 \n" |
51 | |
52 | /** |
53 | * Private (static) function. |
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, operating on three stripes of the block in parallel (three 8-byte reads, 24 bytes per step). The
 * results are folded together using CLMUL. This function is optimized for exactly 256-byte blocks, ideally aligned
 * on 8-byte memory addresses. It MUST be passed a
57 | * pointer to input data that is exactly 256 bytes in length. Note: this function does NOT invert bits of the input crc |
58 | * or return value. |
59 | */ |
60 | static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) { |
61 | asm volatile("enter_256_%=:" |
62 | |
63 | "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n" |
64 | "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n" |
65 | |
66 | "crc32q 0(%[in]), %%rcx # crc0 \n" |
67 | "crc32q 88(%[in]), %%r11 # crc1 \n" |
68 | "crc32q 176(%[in]), %%r10 # crc2 \n" |
69 | |
70 | "crc32q 8(%[in]), %%rcx # crc0 \n" |
71 | "crc32q 96(%[in]), %%r11 # crc1 \n" |
72 | "crc32q 184(%[in]), %%r10 # crc2 \n" |
73 | |
74 | "crc32q 16(%[in]), %%rcx # crc0 \n" |
75 | "crc32q 104(%[in]), %%r11 # crc1 \n" |
76 | "crc32q 192(%[in]), %%r10 # crc2 \n" |
77 | |
78 | "crc32q 24(%[in]), %%rcx # crc0 \n" |
79 | "crc32q 112(%[in]), %%r11 # crc1 \n" |
80 | "crc32q 200(%[in]), %%r10 # crc2 \n" |
81 | |
82 | "crc32q 32(%[in]), %%rcx # crc0 \n" |
83 | "crc32q 120(%[in]), %%r11 # crc1 \n" |
84 | "crc32q 208(%[in]), %%r10 # crc2 \n" |
85 | |
86 | "crc32q 40(%[in]), %%rcx # crc0 \n" |
87 | "crc32q 128(%[in]), %%r11 # crc1 \n" |
88 | "crc32q 216(%[in]), %%r10 # crc2 \n" |
89 | |
90 | "crc32q 48(%[in]), %%rcx # crc0 \n" |
91 | "crc32q 136(%[in]), %%r11 # crc1 \n" |
92 | "crc32q 224(%[in]), %%r10 # crc2 \n" |
93 | |
94 | "crc32q 56(%[in]), %%rcx # crc0 \n" |
95 | "crc32q 144(%[in]), %%r11 # crc1 \n" |
96 | "crc32q 232(%[in]), %%r10 # crc2 \n" |
97 | |
98 | "crc32q 64(%[in]), %%rcx # crc0 \n" |
99 | "crc32q 152(%[in]), %%r11 # crc1 \n" |
100 | "crc32q 240(%[in]), %%r10 # crc2 \n" |
101 | |
102 | "crc32q 72(%[in]), %%rcx # crc0 \n" |
103 | "crc32q 160(%[in]), %%r11 # crc1 \n" |
104 | "crc32q 248(%[in]), %%r10 # crc2 \n" |
105 | |
106 | "crc32q 80(%[in]), %%rcx # crc0 \n" |
107 | "crc32q 168(%[in]), %%r11 # crc2 \n" |
108 | |
109 | FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */ |
110 | |
                 /* output registers
                    [crc] is both an input and an output, so it is marked read/write (i.e. "+c") */
                 : [crc] "+c" (crc)

                 /* input registers */
                 : [in] "d" (input)
117 | |
                 /* additional clobbered registers; "memory" because the asm reads the input buffer */
                 : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc", "memory");
120 | return crc; |
121 | } |
122 | |
123 | /** |
124 | * Private (static) function. |
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, operating on three stripes of the block in parallel (three 8-byte reads, 24 bytes per step). The
 * results are folded together using CLMUL. This function is optimized for exactly 1024-byte blocks, ideally aligned
 * on 8-byte memory addresses. It MUST be passed a
128 | * pointer to input data that is exactly 1024 bytes in length. Note: this function does NOT invert bits of the input crc |
129 | * or return value. |
130 | */ |
131 | static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) { |
132 | asm volatile( |
133 | "enter_1024_%=:" |
134 | |
135 | "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n" |
136 | "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n" |
137 | |
138 | "mov $5, %%r8d # Loop 5 times through 64 byte chunks in 3 parallel stripes \n" |
139 | |
140 | "loop_1024_%=:" |
141 | |
142 | "prefetcht0 128(%[in]) # \n" |
143 | "prefetcht0 472(%[in]) # \n" |
144 | "prefetcht0 808(%[in]) # \n" |
145 | |
146 | "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n" |
147 | "crc32q 344(%[in]), %%r11 # crc1: stripe1 \n" |
148 | "crc32q 680(%[in]), %%r10 # crc2: stripe2 \n" |
149 | |
150 | "crc32q 8(%[in]), %%rcx # crc0 \n" |
151 | "crc32q 352(%[in]), %%r11 # crc1 \n" |
152 | "crc32q 688(%[in]), %%r10 # crc2 \n" |
153 | |
154 | "crc32q 16(%[in]), %%rcx # crc0 \n" |
155 | "crc32q 360(%[in]), %%r11 # crc1 \n" |
156 | "crc32q 696(%[in]), %%r10 # crc2 \n" |
157 | |
158 | "crc32q 24(%[in]), %%rcx # crc0 \n" |
159 | "crc32q 368(%[in]), %%r11 # crc1 \n" |
160 | "crc32q 704(%[in]), %%r10 # crc2 \n" |
161 | |
162 | "crc32q 32(%[in]), %%rcx # crc0 \n" |
163 | "crc32q 376(%[in]), %%r11 # crc1 \n" |
164 | "crc32q 712(%[in]), %%r10 # crc2 \n" |
165 | |
166 | "crc32q 40(%[in]), %%rcx # crc0 \n" |
167 | "crc32q 384(%[in]), %%r11 # crc1 \n" |
168 | "crc32q 720(%[in]), %%r10 # crc2 \n" |
169 | |
170 | "crc32q 48(%[in]), %%rcx # crc0 \n" |
171 | "crc32q 392(%[in]), %%r11 # crc1 \n" |
172 | "crc32q 728(%[in]), %%r10 # crc2 \n" |
173 | |
174 | "crc32q 56(%[in]), %%rcx # crc0 \n" |
175 | "crc32q 400(%[in]), %%r11 # crc1 \n" |
176 | "crc32q 736(%[in]), %%r10 # crc2 \n" |
177 | |
178 | "add $64, %[in] # \n" |
179 | "sub $1, %%r8d # \n" |
180 | "jnz loop_1024_%= # \n" |
181 | |
182 | "crc32q 0(%[in]), %%rcx # crc0 \n" |
183 | "crc32q 344(%[in]), %%r11 # crc1 \n" |
184 | "crc32q 680(%[in]), %%r10 # crc2 \n" |
185 | |
186 | "crc32q 8(%[in]), %%rcx # crc0 \n" |
187 | "crc32q 352(%[in]), %%r11 # crc1 \n" |
188 | "crc32q 688(%[in]), %%r10 # crc2 \n" |
189 | |
190 | "crc32q 16(%[in]), %%rcx # crc0 \n" |
191 | "crc32q 696(%[in]), %%r10 # crc2 \n" |
192 | |
193 | FOLD_K1K2( |
194 | 1024, |
195 | $0xe417f38a, |
        $0x8f158014) /* Magic constants used to fold the crc stripes into ecx */

        /* output registers
           [crc] is both an input and an output, so it is marked read/write (i.e. "+c");
           the register holding [in] is advanced by the add instruction, so it is also
           marked read/write (i.e. "+d") to tell gcc about the clobber */
        : [crc] "+c" (crc), [in] "+d" (input)

        /* no separate input registers; crc and input are the read/write operands above */
        :
207 | |
        /* additional clobbered registers
           "cc" is the flags - we add and sub, so the flags are clobbered;
           "memory" because the asm reads the input buffer */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc", "memory");
211 | return crc; |
212 | } |
213 | |
214 | /** |
215 | * Private (static) function. |
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, operating on three stripes of the block in parallel (three 8-byte reads, 24 bytes per step). The
 * results are folded together using CLMUL. This function is optimized for exactly 3072-byte blocks, ideally aligned
 * on 8-byte memory addresses. It MUST be passed a
219 | * pointer to input data that is exactly 3072 bytes in length. Note: this function does NOT invert bits of the input crc |
220 | * or return value. |
221 | */ |
222 | static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) { |
223 | asm volatile( |
224 | "enter_3072_%=:" |
225 | |
226 | "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n" |
227 | "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n" |
228 | |
229 | "mov $16, %%r8d # Loop 16 times through 64 byte chunks in 3 parallel stripes \n" |
230 | |
231 | "loop_3072_%=:" |
232 | |
233 | "prefetcht0 128(%[in]) # \n" |
234 | "prefetcht0 1152(%[in]) # \n" |
235 | "prefetcht0 2176(%[in]) # \n" |
236 | |
237 | "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n" |
238 | "crc32q 1024(%[in]), %%r11 # crc1: stripe1 \n" |
239 | "crc32q 2048(%[in]), %%r10 # crc2: stripe2 \n" |
240 | |
241 | "crc32q 8(%[in]), %%rcx # crc0: stripe0 \n" |
242 | "crc32q 1032(%[in]), %%r11 # crc1: stripe1 \n" |
243 | "crc32q 2056(%[in]), %%r10 # crc2: stripe2 \n" |
244 | |
245 | "crc32q 16(%[in]), %%rcx # crc0: stripe0 \n" |
246 | "crc32q 1040(%[in]), %%r11 # crc1: stripe1 \n" |
247 | "crc32q 2064(%[in]), %%r10 # crc2: stripe2 \n" |
248 | |
249 | "crc32q 24(%[in]), %%rcx # crc0: stripe0 \n" |
250 | "crc32q 1048(%[in]), %%r11 # crc1: stripe1 \n" |
251 | "crc32q 2072(%[in]), %%r10 # crc2: stripe2 \n" |
252 | |
253 | "crc32q 32(%[in]), %%rcx # crc0: stripe0 \n" |
254 | "crc32q 1056(%[in]), %%r11 # crc1: stripe1 \n" |
255 | "crc32q 2080(%[in]), %%r10 # crc2: stripe2 \n" |
256 | |
257 | "crc32q 40(%[in]), %%rcx # crc0: stripe0 \n" |
258 | "crc32q 1064(%[in]), %%r11 # crc1: stripe1 \n" |
259 | "crc32q 2088(%[in]), %%r10 # crc2: stripe2 \n" |
260 | |
261 | "crc32q 48(%[in]), %%rcx # crc0: stripe0 \n" |
262 | "crc32q 1072(%[in]), %%r11 # crc1: stripe1 \n" |
263 | "crc32q 2096(%[in]), %%r10 # crc2: stripe2 \n" |
264 | |
265 | "crc32q 56(%[in]), %%rcx # crc0: stripe0 \n" |
266 | "crc32q 1080(%[in]), %%r11 # crc1: stripe1 \n" |
267 | "crc32q 2104(%[in]), %%r10 # crc2: stripe2 \n" |
268 | |
269 | "add $64, %[in] # \n" |
270 | "sub $1, %%r8d # \n" |
271 | "jnz loop_3072_%= # \n" |
272 | |
273 | FOLD_K1K2( |
274 | 3072, |
275 | $0xa51b6135, |
        $0x170076fa) /* Magic constants used to fold the crc stripes into ecx */

        /* output registers
           [crc] is both an input and an output, so it is marked read/write (i.e. "+c");
           the register holding [in] is advanced by the add instruction, so it is also
           marked read/write (i.e. "+d") to tell gcc about the clobber */
        : [crc] "+c" (crc), [in] "+d" (input)

        /* no separate input registers; crc and input are the read/write operands above */
        :
287 | |
        /* additional clobbered registers
           "cc" is the flags - we add and sub, so the flags are clobbered;
           "memory" because the asm reads the input buffer */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc", "memory");
291 | |
292 | return crc; |
293 | } |
294 | |
295 | static int detection_performed = 0; |
296 | static int detected_clmul = 0; |
297 | |
298 | /* |
299 | * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and |
300 | * PCLMULQDQ machine instructions (if present). |
301 | * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. |
302 | * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent |
303 | * call. |
304 | */ |
305 | uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { |
306 | |
307 | if (UNLIKELY(!detection_performed)) { |
308 | detected_clmul = aws_checksums_is_clmul_present(); |
        /* Simply set the flag to skip HW detection next time.
           Not using memory barriers since the worst that can
           happen is a fallback to the non-HW-accelerated code. */
312 | detection_performed = 1; |
313 | } |
314 | |
315 | uint32_t crc = ~previousCrc32; |
316 | |
317 | /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ |
318 | if (UNLIKELY(length < 8)) { |
319 | while (length-- > 0) { |
320 | asm("loop_small_%=: CRC32B (%[in]), %[crc]" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
321 | input++; |
322 | } |
323 | return ~crc; |
324 | } |
325 | |
326 | /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ |
    int input_alignment = (uintptr_t)input & 0x7;
328 | |
329 | /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ |
330 | int leading = (8 - input_alignment) & 0x7; |
331 | |
332 | /* reduce the length by the leading unaligned bytes we are about to process */ |
333 | length -= leading; |
334 | |
335 | /* spin through the leading unaligned input bytes (if any) one-by-one */ |
336 | while (leading-- > 0) { |
337 | asm("loop_leading_%=: CRC32B (%[in]), %[crc]" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
338 | input++; |
339 | } |
340 | |
    /* LIKELY hints that hardware CLMUL support is the expected path */
342 | if (LIKELY(detected_clmul)) { |
343 | |
344 | while (LIKELY(length >= 3072)) { |
345 | /* Compute crc32c on each block, chaining each crc result */ |
346 | crc = s_crc32c_sse42_clmul_3072(input, crc); |
347 | input += 3072; |
348 | length -= 3072; |
349 | } |
350 | while (LIKELY(length >= 1024)) { |
351 | /* Compute crc32c on each block, chaining each crc result */ |
352 | crc = s_crc32c_sse42_clmul_1024(input, crc); |
353 | input += 1024; |
354 | length -= 1024; |
355 | } |
356 | while (LIKELY(length >= 256)) { |
357 | /* Compute crc32c on each block, chaining each crc result */ |
358 | crc = s_crc32c_sse42_clmul_256(input, crc); |
359 | input += 256; |
360 | length -= 256; |
361 | } |
362 | } |
363 | |
364 | /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ |
365 | while (LIKELY(length >= 8)) { |
366 | /* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction */ |
367 | asm volatile("loop_8_%=: CRC32Q (%[in]), %%rcx" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
368 | input += 8; |
369 | length -= 8; |
370 | } |
371 | |
372 | /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ |
373 | while (length-- > 0) { |
374 | asm volatile("loop_trailing_%=: CRC32B (%[in]), %[crc]" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
375 | input++; |
376 | } |
377 | |
378 | return ~crc; |
379 | } |
380 | |
381 | #elif !defined(_MSC_VER) && !defined(__arm__) && !defined(__aarch64__) |
382 | |
383 | /* don't call this without first checking that it is supported. */ |
384 | uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { |
385 | return 0; |
386 | } |
387 | |
388 | #endif |
389 | |