/******************************************************
Copyright (c) 2017 Percona LLC and/or its affiliates.

CRC32 using Intel's PCLMUL instruction.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

*******************************************************/

/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

# define U64_C(c) (c ## UL)

typedef uint32_t u32;
typedef uint16_t u16;
typedef uint64_t u64;
#ifndef byte
typedef uint8_t byte;
#endif

# define _gcry_bswap32 __builtin_bswap32

#if __GNUC__ >= 4 && defined(__x86_64__) && defined(HAVE_CLMUL_INSTRUCTION)

#if defined(_GCRY_GCC_VERSION) && _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
# pragma GCC target("no-sse")
#endif


#define ALIGNED_16 __attribute__ ((aligned (16)))


struct u16_unaligned_s
{
  u16 a;
} __attribute__((packed, aligned (1), may_alias));


/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
 * functions. */
struct crc32_consts_s
{
  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
  u64 k[6];
  /* my_p: { floor(x^64 / P(x)), P(x) } */
  u64 my_p[2];
};
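
/* Note on naming (added commentary): the fold constants are conventionally
 * numbered k1..k6 (1-based), which is why the asm blocks below load them as
 * consts->k[1 - 1] ("k1k2"), consts->k[3 - 1] ("k3k4") and
 * consts->k[5 - 1] ("k5"); the trailing zero in k[] pads the "k5" slot to a
 * full 128-bit lane. */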


/* CLMUL constants for CRC32 and CRC32RFC1510. */
static const struct crc32_consts_s crc32_consts ALIGNED_16 =
{
  { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
    U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
    U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
    U64_C(0x163cd6124), 0                   /* y = 2 */
  },
  { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
    U64_C(0x1f7011641), U64_C(0x1db710641)
  }
};
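
/* Reference helper (added commentary, not part of the original source): a
 * minimal bitwise reflected CRC-32 over the IEEE 802.3 polynomial
 * (reflected form 0xedb88320) that the PCLMUL path can be cross-checked
 * against.  The crc32_bitwise_ref name and the CRC32_PCLMUL_SELF_TEST guard
 * are illustrative assumptions, not existing symbols.  Like the PCLMUL code
 * below, it neither pre- nor post-inverts *pcrc; any inversion convention is
 * left to the caller. */
#ifdef CRC32_PCLMUL_SELF_TEST
static void
crc32_bitwise_ref (u32 *pcrc, const byte *inbuf, size_t inlen)
{
  u32 crc = *pcrc;

  while (inlen--)
    {
      int i;

      /* Reflected CRC-32 update: shift right, conditionally xor the
         reflected polynomial. */
      crc ^= *inbuf++;
      for (i = 0; i < 8; i++)
        crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320UL : 0);
    }

  *pcrc = crc;
}
#endif /* CRC32_PCLMUL_SELF_TEST */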

/* Common constants for CRC32 algorithms. */
static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 =
  {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 =
  {
    { U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */
    { U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) },
    { U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) },
    { U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) },
    { U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) },
    { U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) },
    { U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */
  };
static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 =
  {
    { U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */
    { U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) },
    { U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */
  };
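
/* Usage note for the tables above (added commentary): pshufb writes zero for
 * any selector byte with its top bit set, so crc32_refl_shuf_shift indexed
 * at offset inlen yields shuffle masks that shift a partial block into place
 * while zero-padding the rest.  crc32_partial_fold_input_mask[inlen] keeps
 * only the valid tail bytes of the final, unaligned 16-byte load, and the
 * crc32_merge*_shuf tables splice two overlapping 8-byte (or 4-byte) loads
 * into one contiguous 9..15-byte (or 5..7-byte) value for the short-input
 * path. */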

/* PCLMUL functions for reflected CRC32. */
static inline void
crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
                      const struct crc32_consts_s *consts)
{
  if (inlen >= 8 * 16)
    {
      asm volatile ("movd %[crc], %%xmm4\n\t"
                    "movdqu %[inbuf_0], %%xmm0\n\t"
                    "movdqu %[inbuf_1], %%xmm1\n\t"
                    "movdqu %[inbuf_2], %%xmm2\n\t"
                    "movdqu %[inbuf_3], %%xmm3\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    : [inbuf_0] "m" (inbuf[0 * 16]),
                      [inbuf_1] "m" (inbuf[1 * 16]),
                      [inbuf_2] "m" (inbuf[2 * 16]),
                      [inbuf_3] "m" (inbuf[3 * 16]),
                      [crc] "m" (*pcrc)
                    );

      inbuf += 4 * 16;
      inlen -= 4 * 16;

      asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
                    :
                    : [k1k2] "m" (consts->k[1 - 1])
                    );

      /* Fold by 4. */
      while (inlen >= 4 * 16)
        {
          asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
                        "movdqa %%xmm0, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm0\n\t"
                        "pxor %%xmm6, %%xmm0\n\t"

                        "movdqu %[inbuf_1], %%xmm5\n\t"
                        "movdqa %%xmm1, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm1\n\t"
                        "pxor %%xmm6, %%xmm1\n\t"

                        "movdqu %[inbuf_2], %%xmm5\n\t"
                        "movdqa %%xmm2, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm2\n\t"
                        "pxor %%xmm6, %%xmm2\n\t"

                        "movdqu %[inbuf_3], %%xmm5\n\t"
                        "movdqa %%xmm3, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm3\n\t"
                        "pxor %%xmm6, %%xmm3\n\t"
                        :
                        : [inbuf_0] "m" (inbuf[0 * 16]),
                          [inbuf_1] "m" (inbuf[1 * 16]),
                          [inbuf_2] "m" (inbuf[2 * 16]),
                          [inbuf_3] "m" (inbuf[3 * 16])
                        );

          inbuf += 4 * 16;
          inlen -= 4 * 16;
        }

      asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      /* Fold 4 to 1. */

      asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm3, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    :
                    );
    }
  else
    {
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [inbuf] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      inbuf += 16;
      inlen -= 16;
    }

  /* Fold by 1. */
  if (inlen >= 16)
    {
      while (inlen >= 16)
        {
          /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
          asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
                        "movdqa %%xmm0, %%xmm1\n\t"
                        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                        "pxor %%xmm2, %%xmm0\n\t"
                        "pxor %%xmm1, %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );

          inbuf += 16;
          inlen -= 16;
        }
    }

  /* Partial fold. */
  if (inlen)
    {
      /* Load last input and add padding zeros. */
      asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t"
                    "movdqu %[shl_shuf], %%xmm4\n\t"
                    "movdqu %[mask], %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"
                    "movdqu %[inbuf], %%xmm4\n\t"
                    "pshufb %%xmm3, %%xmm1\n\t"
                    "pand %%xmm4, %%xmm2\n\t"
                    "por %%xmm1, %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    :
                    : [inbuf] "m" (*(inbuf - 16 + inlen)),
                      [mask] "m" (crc32_partial_fold_input_mask[inlen]),
                      [shl_shuf] "m" (crc32_refl_shuf_shift[inlen]),
                      [shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16])
                    );

      inbuf += inlen;
      inlen -= inlen;
    }

  /* Final fold. */
  asm volatile (/* reduce 128-bits to 96-bits */
                "movdqa %%xmm0, %%xmm1\n\t"
                "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                "psrldq $8, %%xmm1\n\t"
                "pxor %%xmm1, %%xmm0\n\t"

                /* reduce 96-bits to 64-bits */
                "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
                "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
                "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

                /* barrett reduction */
                "pshufd $0xf3, %%xmm0, %%xmm1\n\t"    /* [00][00][x>>32][00] */
                "pslldq $4, %%xmm0\n\t"               /* [??][x>>32][??][??] */
                "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pxor %%xmm1, %%xmm0\n\t"

                /* store CRC */
                "pextrd $2, %%xmm0, %[out]\n\t"
                : [out] "=m" (*pcrc)
                : [k5] "m" (consts->k[5 - 1])
                );
}
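
/* Background note (added commentary): the folding above uses the identity
 *
 *   M(x) * x^T mod P(x) = ((M(x) mod P(x)) * (x^T mod P(x))) mod P(x),
 *
 * so a 128-bit chunk that sits T bits ahead of the accumulator can be
 * carry-lessly multiplied (PCLMULQDQ) by the precomputed constant
 * x^T mod P(x) and XORed into it.  The k[] constants cover the shift
 * distances needed by the fold-by-4, fold-by-1 and final-reduction steps,
 * while my_p = { floor(x^64 / P(x)), P(x) } drives the closing Barrett
 * reduction from 64 bits down to the 32-bit CRC. */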

static inline void
crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
                              const struct crc32_consts_s *consts)
{
  if (inlen < 4)
    {
      u32 crc = *pcrc;
      u32 data;

      asm volatile ("movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [my_p] "m" (consts->my_p[0])
                    );

      if (inlen == 1)
        {
          data = inbuf[0];
          data ^= crc;
          data <<= 24;
          crc >>= 8;
        }
      else if (inlen == 2)
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data ^= crc;
          data <<= 16;
          crc >>= 16;
        }
      else
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data |= inbuf[2] << 16;
          data ^= crc;
          data <<= 8;
          crc >>= 24;
        }

      /* Barrett reduction */
      asm volatile ("movd %[in], %%xmm0\n\t"
                    "movd %[crc], %%xmm1\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "psllq $32, %%xmm1\n\t"
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "rm" (data),
                      [crc] "rm" (crc)
                    );
    }
  else if (inlen == 4)
    {
      /* Barrett reduction */
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movd %[in], %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0])
                    );
    }
  else
    {
      asm volatile ("movdqu %[shuf], %%xmm4\n\t"
                    "movd %[crc], %%xmm1\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    :
                    : [shuf] "m" (crc32_refl_shuf_shift[inlen]),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0]),
                      [k3k4] "m" (consts->k[3 - 1])
                    );

      if (inlen >= 8)
        {
          asm volatile ("movq %[inbuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );
          if (inlen > 8)
            {
              asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
                            "movq %[inbuf_tail], %%xmm2\n\t"
                            "punpcklqdq %%xmm2, %%xmm0\n\t"
                            "pshufb %[merge_shuf], %%xmm0\n\t"
                            :
                            : [inbuf_tail] "m" (inbuf[inlen - 8]),
                              [merge_shuf] "m"
                                (*crc32_merge9to15_shuf[inlen - 9])
                            );
            }
        }
      else
        {
          asm volatile ("movd %[inbuf], %%xmm0\n\t"
                        "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
                        "pshufb %[merge_shuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf),
                          [inbuf_tail] "m" (inbuf[inlen - 4]),
                          [merge_shuf] "m"
                            (*crc32_merge5to7_shuf[inlen - 5])
                        );
        }

      /* Final fold. */
      asm volatile ("pxor %%xmm1, %%xmm0\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"

                    /* reduce 128-bits to 96-bits */
                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                    "psrldq $8, %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */

                    /* reduce 96-bits to 64-bits */
                    "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
                    "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
                    "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

                    /* barrett reduction */
                    "pshufd $0xf3, %%xmm0, %%xmm1\n\t"    /* [00][00][x>>32][00] */
                    "pslldq $4, %%xmm0\n\t"               /* [??][x>>32][??][??] */
                    "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    /* store CRC */
                    "pextrd $2, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [k5] "m" (consts->k[5 - 1])
                    );
    }
}

void
crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
{
  const struct crc32_consts_s *consts = &crc32_consts;
#if defined(__x86_64__) && defined(__WIN64__)
  char win64tmp[2 * 16];

  /* XMM6-XMM7 need to be restored after use. */
  asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
                "movdqu %%xmm7, 1*16(%0)\n\t"
                :
                : "r" (win64tmp)
                : "memory" );
#endif

  if (!inlen)
    return;

  if (inlen >= 16)
    crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
  else
    crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);

#if defined(__x86_64__) && defined(__WIN64__)
  /* Restore used registers. */
  asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
               "movdqu 1*16(%0), %%xmm7\n\t"
               :
               : "r" (win64tmp)
               : "memory" );
#endif
}
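
/* Example usage (added for illustration; the CRC32_PCLMUL_EXAMPLE guard is a
 * hypothetical macro, and this path additionally assumes the CPU supports
 * PCLMULQDQ).  The caller keeps the running CRC pre-inverted and
 * post-inverts it at the end, the usual reflected CRC-32 convention; under
 * that convention the check string "123456789" is expected to give
 * 0xcbf43926. */
#ifdef CRC32_PCLMUL_EXAMPLE
int
main (void)
{
  static const byte buf[] = "123456789";
  u32 crc = 0xffffffffUL;   /* pre-inverted initial value */

  crc32_intel_pclmul (&crc, buf, sizeof(buf) - 1);
  crc ^= 0xffffffffUL;      /* post-inversion */

  printf ("crc32 = 0x%08x\n", crc);
  return crc != 0xcbf43926UL;
}
#endif /* CRC32_PCLMUL_EXAMPLE */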

#endif