| 1 | /* |
| 2 | * Copyright 2010-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"). |
| 5 | * You may not use this file except in compliance with the License. |
| 6 | * A copy of the License is located at |
| 7 | * |
| 8 | * http://aws.amazon.com/apache2.0 |
| 9 | * |
| 10 | * or in the "license" file accompanying this file. This file is distributed |
| 11 | * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either |
| 12 | * express or implied. See the License for the specific language governing |
| 13 | * permissions and limitations under the License. |
| 14 | */ |
| 15 | |
| 16 | #include <aws/checksums/private/cpuid.h> |
| 17 | #include <aws/checksums/private/crc_priv.h> |
| 18 | |
| 19 | /*this implementation is only for 64 bit arch and (if on GCC, release mode). |
| 20 | * If using clang, this will run for both debug and release.*/ |
| 21 | #if defined(__x86_64__) && !(defined(__GNUC__) && defined(DEBUG_BUILD)) |
| 22 | |
| 23 | # define LIKELY(x) __builtin_expect((x), 1) |
| 24 | # define UNLIKELY(x) __builtin_expect((x), 0) |
| 25 | |
| 26 | /* |
| 27 | * Factored out common inline asm for folding crc0,crc1,crc2 stripes in rcx, r11, r10 using |
| 28 | * the specified Magic Constants K1 and K2. |
| 29 | * Assumes rcx, r11, r10 contain crc0, crc1, crc2 that need folding |
| 30 | * Utilizes xmm1, xmm2, xmm3, xmm4 as well as clobbering r8, r9, r11 |
| 31 | * Result is placed in ecx |
| 32 | */ |
| 33 | # define FOLD_K1K2(NAME, K1, K2) \ |
| 34 | "fold_k1k2_" #NAME "_%=: \n" \ |
| 35 | "mov " #K1 ", %%r8d # Magic K1 constant \n" \ |
| 36 | "mov " #K2 ", %%r9d # Magic K2 constant \n" \ |
| 37 | "movq %%rcx, %%xmm1 # crc0 into lower dword of xmm1 \n" \ |
| 38 | "movq %%r8, %%xmm3 # K1 into lower dword of xmm3 \n" \ |
| 39 | "movq %%r11, %%xmm2 # crc1 into lower dword of xmm2 \n" \ |
| 40 | "movq %%r9, %%xmm4 # K2 into lower dword of xmm4 \n" \ |
| 41 | "pclmulqdq $0x00, %%xmm3, %%xmm1 # Multiply crc0 by K1 \n" \ |
| 42 | "pclmulqdq $0x00, %%xmm4, %%xmm2 # Multiply crc1 by K2 \n" \ |
| 43 | "xor %%rcx, %%rcx # \n" \ |
| 44 | "xor %%r11, %%r11 # \n" \ |
| 45 | "movq %%xmm1, %%r8 # \n" \ |
| 46 | "movq %%xmm2, %%r9 # \n" \ |
| 47 | "crc32q %%r8, %%rcx # folding crc0 \n" \ |
| 48 | "crc32q %%r9, %%r11 # folding crc1 \n" \ |
| 49 | "xor %%r10d, %%ecx # combine crc2 and crc0 \n" \ |
| 50 | "xor %%r11d, %%ecx # combine crc1 and crc0 \n" |
| 51 | |
| 52 | /** |
| 53 | * Private (static) function. |
| 54 | * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine |
| 55 | * instruction by operating on 24-byte stripes in parallel. The results are folded together using CLMUL. This function |
| 56 | * is optimized for exactly 256 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a |
| 57 | * pointer to input data that is exactly 256 bytes in length. Note: this function does NOT invert bits of the input crc |
| 58 | * or return value. |
| 59 | */ |
| 60 | static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) { |
| 61 | asm volatile("enter_256_%=:" |
| 62 | |
| 63 | "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n" |
| 64 | "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n" |
| 65 | |
| 66 | "crc32q 0(%[in]), %%rcx # crc0 \n" |
| 67 | "crc32q 88(%[in]), %%r11 # crc1 \n" |
| 68 | "crc32q 176(%[in]), %%r10 # crc2 \n" |
| 69 | |
| 70 | "crc32q 8(%[in]), %%rcx # crc0 \n" |
| 71 | "crc32q 96(%[in]), %%r11 # crc1 \n" |
| 72 | "crc32q 184(%[in]), %%r10 # crc2 \n" |
| 73 | |
| 74 | "crc32q 16(%[in]), %%rcx # crc0 \n" |
| 75 | "crc32q 104(%[in]), %%r11 # crc1 \n" |
| 76 | "crc32q 192(%[in]), %%r10 # crc2 \n" |
| 77 | |
| 78 | "crc32q 24(%[in]), %%rcx # crc0 \n" |
| 79 | "crc32q 112(%[in]), %%r11 # crc1 \n" |
| 80 | "crc32q 200(%[in]), %%r10 # crc2 \n" |
| 81 | |
| 82 | "crc32q 32(%[in]), %%rcx # crc0 \n" |
| 83 | "crc32q 120(%[in]), %%r11 # crc1 \n" |
| 84 | "crc32q 208(%[in]), %%r10 # crc2 \n" |
| 85 | |
| 86 | "crc32q 40(%[in]), %%rcx # crc0 \n" |
| 87 | "crc32q 128(%[in]), %%r11 # crc1 \n" |
| 88 | "crc32q 216(%[in]), %%r10 # crc2 \n" |
| 89 | |
| 90 | "crc32q 48(%[in]), %%rcx # crc0 \n" |
| 91 | "crc32q 136(%[in]), %%r11 # crc1 \n" |
| 92 | "crc32q 224(%[in]), %%r10 # crc2 \n" |
| 93 | |
| 94 | "crc32q 56(%[in]), %%rcx # crc0 \n" |
| 95 | "crc32q 144(%[in]), %%r11 # crc1 \n" |
| 96 | "crc32q 232(%[in]), %%r10 # crc2 \n" |
| 97 | |
| 98 | "crc32q 64(%[in]), %%rcx # crc0 \n" |
| 99 | "crc32q 152(%[in]), %%r11 # crc1 \n" |
| 100 | "crc32q 240(%[in]), %%r10 # crc2 \n" |
| 101 | |
| 102 | "crc32q 72(%[in]), %%rcx # crc0 \n" |
| 103 | "crc32q 160(%[in]), %%r11 # crc1 \n" |
| 104 | "crc32q 248(%[in]), %%r10 # crc2 \n" |
| 105 | |
| 106 | "crc32q 80(%[in]), %%rcx # crc0 \n" |
| 107 | "crc32q 168(%[in]), %%r11 # crc2 \n" |
| 108 | |
| 109 | FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */ |
| 110 | |
| 111 | /* output registers |
| 112 | [crc] is an input and and output so it is marked read/write (i.e. "+c")*/ |
| 113 | : "+c" (crc) |
| 114 | |
| 115 | /* input registers */ |
| 116 | : [crc] "c" (crc), [in] "d" (input) |
| 117 | |
| 118 | /* additional clobbered registers */ |
| 119 | : "%r8" , "%r9" , "%r11" , "%r10" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "cc" ); |
| 120 | return crc; |
| 121 | } |
| 122 | |
| 123 | /** |
| 124 | * Private (static) function. |
| 125 | * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine |
| 126 | * instruction by operating on 3 24-byte stripes in parallel. The results are folded together using CLMUL. This function |
| 127 | * is optimized for exactly 1024 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a |
| 128 | * pointer to input data that is exactly 1024 bytes in length. Note: this function does NOT invert bits of the input crc |
| 129 | * or return value. |
| 130 | */ |
| 131 | static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) { |
| 132 | asm volatile( |
| 133 | "enter_1024_%=:" |
| 134 | |
| 135 | "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n" |
| 136 | "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n" |
| 137 | |
| 138 | "mov $5, %%r8d # Loop 5 times through 64 byte chunks in 3 parallel stripes \n" |
| 139 | |
| 140 | "loop_1024_%=:" |
| 141 | |
| 142 | "prefetcht0 128(%[in]) # \n" |
| 143 | "prefetcht0 472(%[in]) # \n" |
| 144 | "prefetcht0 808(%[in]) # \n" |
| 145 | |
| 146 | "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n" |
| 147 | "crc32q 344(%[in]), %%r11 # crc1: stripe1 \n" |
| 148 | "crc32q 680(%[in]), %%r10 # crc2: stripe2 \n" |
| 149 | |
| 150 | "crc32q 8(%[in]), %%rcx # crc0 \n" |
| 151 | "crc32q 352(%[in]), %%r11 # crc1 \n" |
| 152 | "crc32q 688(%[in]), %%r10 # crc2 \n" |
| 153 | |
| 154 | "crc32q 16(%[in]), %%rcx # crc0 \n" |
| 155 | "crc32q 360(%[in]), %%r11 # crc1 \n" |
| 156 | "crc32q 696(%[in]), %%r10 # crc2 \n" |
| 157 | |
| 158 | "crc32q 24(%[in]), %%rcx # crc0 \n" |
| 159 | "crc32q 368(%[in]), %%r11 # crc1 \n" |
| 160 | "crc32q 704(%[in]), %%r10 # crc2 \n" |
| 161 | |
| 162 | "crc32q 32(%[in]), %%rcx # crc0 \n" |
| 163 | "crc32q 376(%[in]), %%r11 # crc1 \n" |
| 164 | "crc32q 712(%[in]), %%r10 # crc2 \n" |
| 165 | |
| 166 | "crc32q 40(%[in]), %%rcx # crc0 \n" |
| 167 | "crc32q 384(%[in]), %%r11 # crc1 \n" |
| 168 | "crc32q 720(%[in]), %%r10 # crc2 \n" |
| 169 | |
| 170 | "crc32q 48(%[in]), %%rcx # crc0 \n" |
| 171 | "crc32q 392(%[in]), %%r11 # crc1 \n" |
| 172 | "crc32q 728(%[in]), %%r10 # crc2 \n" |
| 173 | |
| 174 | "crc32q 56(%[in]), %%rcx # crc0 \n" |
| 175 | "crc32q 400(%[in]), %%r11 # crc1 \n" |
| 176 | "crc32q 736(%[in]), %%r10 # crc2 \n" |
| 177 | |
| 178 | "add $64, %[in] # \n" |
| 179 | "sub $1, %%r8d # \n" |
| 180 | "jnz loop_1024_%= # \n" |
| 181 | |
| 182 | "crc32q 0(%[in]), %%rcx # crc0 \n" |
| 183 | "crc32q 344(%[in]), %%r11 # crc1 \n" |
| 184 | "crc32q 680(%[in]), %%r10 # crc2 \n" |
| 185 | |
| 186 | "crc32q 8(%[in]), %%rcx # crc0 \n" |
| 187 | "crc32q 352(%[in]), %%r11 # crc1 \n" |
| 188 | "crc32q 688(%[in]), %%r10 # crc2 \n" |
| 189 | |
| 190 | "crc32q 16(%[in]), %%rcx # crc0 \n" |
| 191 | "crc32q 696(%[in]), %%r10 # crc2 \n" |
| 192 | |
| 193 | FOLD_K1K2( |
| 194 | 1024, |
| 195 | $0xe417f38a, |
| 196 | $0x8f158014) /* Magic Constants used to fold crc stripes into ecx |
| 197 | |
| 198 | output registers |
| 199 | [crc] is an input and and output so it is marked read/write (i.e. "+c") |
| 200 | we clobber the register for [input] (via add instruction) so we must also |
| 201 | tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber */ |
| 202 | : "+c" (crc), "+d" (input) |
| 203 | |
| 204 | /* input registers */ |
| 205 | /* the numeric values match the position of the output registers */ |
| 206 | : [crc] "c" (crc), [in] "d" (input) |
| 207 | |
| 208 | /* additional clobbered registers */ |
| 209 | /* "cc" is the flags - we add and sub, so the flags are also clobbered */ |
| 210 | : "%r8" , "%r9" , "%r11" , "%r10" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "cc" ); |
| 211 | return crc; |
| 212 | } |
| 213 | |
| 214 | /** |
| 215 | * Private (static) function. |
| 216 | * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine |
| 217 | * instruction by operating on 24-byte stripes in parallel. The results are folded together using CLMUL. This function |
| 218 | * is optimized for exactly 3072 byte blocks that are best aligned on 8-byte memory addresses. It MUST be passed a |
| 219 | * pointer to input data that is exactly 3072 bytes in length. Note: this function does NOT invert bits of the input crc |
| 220 | * or return value. |
| 221 | */ |
| 222 | static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) { |
| 223 | asm volatile( |
| 224 | "enter_3072_%=:" |
| 225 | |
| 226 | "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n" |
| 227 | "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n" |
| 228 | |
| 229 | "mov $16, %%r8d # Loop 16 times through 64 byte chunks in 3 parallel stripes \n" |
| 230 | |
| 231 | "loop_3072_%=:" |
| 232 | |
| 233 | "prefetcht0 128(%[in]) # \n" |
| 234 | "prefetcht0 1152(%[in]) # \n" |
| 235 | "prefetcht0 2176(%[in]) # \n" |
| 236 | |
| 237 | "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n" |
| 238 | "crc32q 1024(%[in]), %%r11 # crc1: stripe1 \n" |
| 239 | "crc32q 2048(%[in]), %%r10 # crc2: stripe2 \n" |
| 240 | |
| 241 | "crc32q 8(%[in]), %%rcx # crc0: stripe0 \n" |
| 242 | "crc32q 1032(%[in]), %%r11 # crc1: stripe1 \n" |
| 243 | "crc32q 2056(%[in]), %%r10 # crc2: stripe2 \n" |
| 244 | |
| 245 | "crc32q 16(%[in]), %%rcx # crc0: stripe0 \n" |
| 246 | "crc32q 1040(%[in]), %%r11 # crc1: stripe1 \n" |
| 247 | "crc32q 2064(%[in]), %%r10 # crc2: stripe2 \n" |
| 248 | |
| 249 | "crc32q 24(%[in]), %%rcx # crc0: stripe0 \n" |
| 250 | "crc32q 1048(%[in]), %%r11 # crc1: stripe1 \n" |
| 251 | "crc32q 2072(%[in]), %%r10 # crc2: stripe2 \n" |
| 252 | |
| 253 | "crc32q 32(%[in]), %%rcx # crc0: stripe0 \n" |
| 254 | "crc32q 1056(%[in]), %%r11 # crc1: stripe1 \n" |
| 255 | "crc32q 2080(%[in]), %%r10 # crc2: stripe2 \n" |
| 256 | |
| 257 | "crc32q 40(%[in]), %%rcx # crc0: stripe0 \n" |
| 258 | "crc32q 1064(%[in]), %%r11 # crc1: stripe1 \n" |
| 259 | "crc32q 2088(%[in]), %%r10 # crc2: stripe2 \n" |
| 260 | |
| 261 | "crc32q 48(%[in]), %%rcx # crc0: stripe0 \n" |
| 262 | "crc32q 1072(%[in]), %%r11 # crc1: stripe1 \n" |
| 263 | "crc32q 2096(%[in]), %%r10 # crc2: stripe2 \n" |
| 264 | |
| 265 | "crc32q 56(%[in]), %%rcx # crc0: stripe0 \n" |
| 266 | "crc32q 1080(%[in]), %%r11 # crc1: stripe1 \n" |
| 267 | "crc32q 2104(%[in]), %%r10 # crc2: stripe2 \n" |
| 268 | |
| 269 | "add $64, %[in] # \n" |
| 270 | "sub $1, %%r8d # \n" |
| 271 | "jnz loop_3072_%= # \n" |
| 272 | |
| 273 | FOLD_K1K2( |
| 274 | 3072, |
| 275 | $0xa51b6135, |
| 276 | $0x170076fa) /* Magic Constants used to fold crc stripes into ecx |
| 277 | |
| 278 | output registers |
| 279 | [crc] is an input and and output so it is marked read/write (i.e. "+c") |
| 280 | we clobber the register for [input] (via add instruction) so we must also |
| 281 | tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber*/ |
| 282 | : "+c" (crc), "+d" (input) |
| 283 | |
| 284 | /* input registers |
| 285 | the numeric values match the position of the output registers */ |
| 286 | : [crc] "c" (crc), [in] "d" (input) |
| 287 | |
| 288 | /* additional clobbered registers |
| 289 | "cc" is the flags - we add and sub, so the flags are also clobbered */ |
| 290 | : "%r8" , "%r9" , "%r11" , "%r10" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "cc" ); |
| 291 | |
| 292 | return crc; |
| 293 | } |
| 294 | |
| 295 | static int detection_performed = 0; |
| 296 | static int detected_clmul = 0; |
| 297 | |
| 298 | /* |
| 299 | * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and |
| 300 | * PCLMULQDQ machine instructions (if present). |
| 301 | * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. |
| 302 | * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent |
| 303 | * call. |
| 304 | */ |
| 305 | uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { |
| 306 | |
| 307 | if (UNLIKELY(!detection_performed)) { |
| 308 | detected_clmul = aws_checksums_is_clmul_present(); |
| 309 | /* Simply setting the flag true to skip HW detection next time |
| 310 | Not using memory barriers since the worst that can |
| 311 | happen is a fallback to the non HW accelerated code. */ |
| 312 | detection_performed = 1; |
| 313 | } |
| 314 | |
| 315 | uint32_t crc = ~previousCrc32; |
| 316 | |
| 317 | /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ |
| 318 | if (UNLIKELY(length < 8)) { |
| 319 | while (length-- > 0) { |
| 320 | asm("loop_small_%=: CRC32B (%[in]), %[crc]" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
| 321 | input++; |
| 322 | } |
| 323 | return ~crc; |
| 324 | } |
| 325 | |
| 326 | /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ |
| 327 | int input_alignment = (unsigned long int)input & 0x7; |
| 328 | |
| 329 | /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ |
| 330 | int leading = (8 - input_alignment) & 0x7; |
| 331 | |
| 332 | /* reduce the length by the leading unaligned bytes we are about to process */ |
| 333 | length -= leading; |
| 334 | |
| 335 | /* spin through the leading unaligned input bytes (if any) one-by-one */ |
| 336 | while (leading-- > 0) { |
| 337 | asm("loop_leading_%=: CRC32B (%[in]), %[crc]" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
| 338 | input++; |
| 339 | } |
| 340 | |
| 341 | /* Using likely to keep this code inlined */ |
| 342 | if (LIKELY(detected_clmul)) { |
| 343 | |
| 344 | while (LIKELY(length >= 3072)) { |
| 345 | /* Compute crc32c on each block, chaining each crc result */ |
| 346 | crc = s_crc32c_sse42_clmul_3072(input, crc); |
| 347 | input += 3072; |
| 348 | length -= 3072; |
| 349 | } |
| 350 | while (LIKELY(length >= 1024)) { |
| 351 | /* Compute crc32c on each block, chaining each crc result */ |
| 352 | crc = s_crc32c_sse42_clmul_1024(input, crc); |
| 353 | input += 1024; |
| 354 | length -= 1024; |
| 355 | } |
| 356 | while (LIKELY(length >= 256)) { |
| 357 | /* Compute crc32c on each block, chaining each crc result */ |
| 358 | crc = s_crc32c_sse42_clmul_256(input, crc); |
| 359 | input += 256; |
| 360 | length -= 256; |
| 361 | } |
| 362 | } |
| 363 | |
| 364 | /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ |
| 365 | while (LIKELY(length >= 8)) { |
| 366 | /* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction */ |
| 367 | asm volatile("loop_8_%=: CRC32Q (%[in]), %%rcx" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
| 368 | input += 8; |
| 369 | length -= 8; |
| 370 | } |
| 371 | |
| 372 | /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ |
| 373 | while (length-- > 0) { |
| 374 | asm volatile("loop_trailing_%=: CRC32B (%[in]), %[crc]" : "+c" (crc) : [crc] "c" (crc), [in] "r" (input)); |
| 375 | input++; |
| 376 | } |
| 377 | |
| 378 | return ~crc; |
| 379 | } |
| 380 | |
| 381 | #elif !defined(_MSC_VER) && !defined(__arm__) && !defined(__aarch64__) |
| 382 | |
| 383 | /* don't call this without first checking that it is supported. */ |
| 384 | uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { |
| 385 | return 0; |
| 386 | } |
| 387 | |
| 388 | #endif |
| 389 | |