/******************************************************
Copyright (c) 2017 Percona LLC and/or its affiliates.

CRC32 using Intel's PCLMUL instruction.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

*******************************************************/

/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

# define U64_C(c) (c ## UL)

typedef uint32_t u32;
typedef uint16_t u16;
typedef uint64_t u64;
#ifndef byte
typedef uint8_t byte;
#endif

# define _gcry_bswap32 __builtin_bswap32

#if __GNUC__ >= 4 && defined(__x86_64__) && defined(HAVE_CLMUL_INSTRUCTION)

#if defined(_GCRY_GCC_VERSION) && _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
# pragma GCC target("no-sse")
#endif


#define ALIGNED_16 __attribute__ ((aligned (16)))


struct u16_unaligned_s
{
  u16 a;
} __attribute__((packed, aligned (1), may_alias));


/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
 * functions. */
struct crc32_consts_s
{
  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
  u64 k[6];
  /* my_p: { floor(x^64 / P(x)), P(x) } */
  u64 my_p[2];
};
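
/* The code below uses these constants as follows: k[0]:k[1] ("k1k2") fold
 * four 128-bit blocks at a time, k[2]:k[3] ("k3k4") fold one block at a time
 * and provide the 128->96 bit reduction constant, k[4] ("k5") reduces 96
 * bits to 64, and my_p[] supplies the Barrett reduction constants that
 * produce the final 32-bit CRC. */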


/* CLMUL constants for CRC32 and CRC32RFC1510. */
static const struct crc32_consts_s crc32_consts ALIGNED_16 =
{
  { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
    U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
    U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
    U64_C(0x163cd6124), 0                   /* y = 2 */
  },
  { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
    U64_C(0x1f7011641), U64_C(0x1db710641)
  }
};

/* Common constants for CRC32 algorithms. */
static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 =
  {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 =
  {
    { U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */
    { U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) },
    { U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) },
    { U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) },
    { U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) },
    { U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) },
    { U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */
  };
static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 =
  {
    { U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */
    { U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) },
    { U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */
  };
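
/* The tables above support the partial-block paths below:
 * crc32_refl_shuf_shift supplies PSHUFB patterns that shift register
 * contents by a variable byte count (0xff entries produce zero bytes),
 * crc32_partial_fold_input_mask keeps only the bytes belonging to the final
 * partial block, and the merge tables combine a head and a tail load into
 * one contiguous 16-byte value for 9..15 and 5..7 byte inputs. */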

/* PCLMUL functions for reflected CRC32. */
static inline void
crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
                      const struct crc32_consts_s *consts)
{
  if (inlen >= 8 * 16)
    {
      asm volatile ("movd %[crc], %%xmm4\n\t"
                    "movdqu %[inbuf_0], %%xmm0\n\t"
                    "movdqu %[inbuf_1], %%xmm1\n\t"
                    "movdqu %[inbuf_2], %%xmm2\n\t"
                    "movdqu %[inbuf_3], %%xmm3\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    : [inbuf_0] "m" (inbuf[0 * 16]),
                      [inbuf_1] "m" (inbuf[1 * 16]),
                      [inbuf_2] "m" (inbuf[2 * 16]),
                      [inbuf_3] "m" (inbuf[3 * 16]),
                      [crc] "m" (*pcrc)
                    );

      inbuf += 4 * 16;
      inlen -= 4 * 16;

      asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
                    :
                    : [k1k2] "m" (consts->k[1 - 1])
                    );

      /* Fold by 4. */
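      /* Each pass carries the four 128-bit accumulators forward across the
         next 512 bits of input: the low half of each accumulator is
         multiplied by x^(32*17) mod P(x) and the high half by x^(32*15)
         mod P(x) (the k1:k2 pair kept in XMM4), and both products are
         XORed together with the corresponding new input block. */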
      while (inlen >= 4 * 16)
        {
          asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
                        "movdqa %%xmm0, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm0\n\t"
                        "pxor %%xmm6, %%xmm0\n\t"

                        "movdqu %[inbuf_1], %%xmm5\n\t"
                        "movdqa %%xmm1, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm1\n\t"
                        "pxor %%xmm6, %%xmm1\n\t"

                        "movdqu %[inbuf_2], %%xmm5\n\t"
                        "movdqa %%xmm2, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm2\n\t"
                        "pxor %%xmm6, %%xmm2\n\t"

                        "movdqu %[inbuf_3], %%xmm5\n\t"
                        "movdqa %%xmm3, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm3\n\t"
                        "pxor %%xmm6, %%xmm3\n\t"
                        :
                        : [inbuf_0] "m" (inbuf[0 * 16]),
                          [inbuf_1] "m" (inbuf[1 * 16]),
                          [inbuf_2] "m" (inbuf[2 * 16]),
                          [inbuf_3] "m" (inbuf[3 * 16])
                        );

          inbuf += 4 * 16;
          inlen -= 4 * 16;
        }

      asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      /* Fold 4 to 1. */

      asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm3, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    :
                    );
    }
  else
    {
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [inbuf] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      inbuf += 16;
      inlen -= 16;
    }

  /* Fold by 1. */
  if (inlen >= 16)
    {
      while (inlen >= 16)
        {
          /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
          asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
                        "movdqa %%xmm0, %%xmm1\n\t"
                        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                        "pxor %%xmm2, %%xmm0\n\t"
                        "pxor %%xmm1, %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );

          inbuf += 16;
          inlen -= 16;
        }
    }

  /* Partial fold. */
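  /* Fewer than 16 bytes remain: load the 16 bytes ending at the end of the
     buffer, mask off the bytes that were already processed, merge in the
     bytes shifted out of the accumulator, and fold once more so the tail is
     absorbed without reading past the buffer. */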
  if (inlen)
    {
      /* Load last input and add padding zeros. */
      asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t"
                    "movdqu %[shl_shuf], %%xmm4\n\t"
                    "movdqu %[mask], %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"
                    "movdqu %[inbuf], %%xmm4\n\t"
                    "pshufb %%xmm3, %%xmm1\n\t"
                    "pand %%xmm4, %%xmm2\n\t"
                    "por %%xmm1, %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    :
                    : [inbuf] "m" (*(inbuf - 16 + inlen)),
                      [mask] "m" (crc32_partial_fold_input_mask[inlen]),
                      [shl_shuf] "m" (crc32_refl_shuf_shift[inlen]),
                      [shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16])
                    );

      inbuf += inlen;
      inlen -= inlen;
    }

  /* Final fold. */
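  /* XMM0 now holds the message folded down to a single 128-bit value.  It is
     reduced to 96 and then 64 bits using k4 and k5, and Barrett reduction
     (two carry-less multiplications against floor(x^64 / P(x)) and P(x) from
     my_p[]) replaces the final division by P(x), leaving the 32-bit CRC in
     dword 2 of XMM0. */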
  asm volatile (/* reduce 128-bits to 96-bits */
                "movdqa %%xmm0, %%xmm1\n\t"
                "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                "psrldq $8, %%xmm1\n\t"
                "pxor %%xmm1, %%xmm0\n\t"

                /* reduce 96-bits to 64-bits */
                "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
                "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
                "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

                /* barrett reduction */
                "pshufd $0xf3, %%xmm0, %%xmm1\n\t"   /* [00][00][x>>32][00] */
                "pslldq $4, %%xmm0\n\t"              /* [??][x>>32][??][??] */
                "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pxor %%xmm1, %%xmm0\n\t"

                /* store CRC */
                "pextrd $2, %%xmm0, %[out]\n\t"
                : [out] "=m" (*pcrc)
                : [k5] "m" (consts->k[5 - 1])
                );
}

static inline void
crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
                              const struct crc32_consts_s *consts)
{
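  /* Three cases: 1-3 bytes are combined with the CRC using scalar shifts and
     a single Barrett reduction step, exactly 4 bytes need only the Barrett
     step, and 5-15 bytes are first merged into one 16-byte vector with the
     shuffle tables and then reduced like a full block. */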
  if (inlen < 4)
    {
      u32 crc = *pcrc;
      u32 data;

      asm volatile ("movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [my_p] "m" (consts->my_p[0])
                    );

      if (inlen == 1)
        {
          data = inbuf[0];
          data ^= crc;
          data <<= 24;
          crc >>= 8;
        }
      else if (inlen == 2)
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data ^= crc;
          data <<= 16;
          crc >>= 16;
        }
      else
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data |= inbuf[2] << 16;
          data ^= crc;
          data <<= 8;
          crc >>= 24;
        }

      /* Barrett reduction */
      asm volatile ("movd %[in], %%xmm0\n\t"
                    "movd %[crc], %%xmm1\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "psllq $32, %%xmm1\n\t"
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "rm" (data),
                      [crc] "rm" (crc)
                    );
    }
  else if (inlen == 4)
    {
      /* Barrett reduction */
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movd %[in], %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0])
                    );
    }
  else
    {
      asm volatile ("movdqu %[shuf], %%xmm4\n\t"
                    "movd %[crc], %%xmm1\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    :
                    : [shuf] "m" (crc32_refl_shuf_shift[inlen]),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0]),
                      [k3k4] "m" (consts->k[3 - 1])
                    );

      if (inlen >= 8)
        {
          asm volatile ("movq %[inbuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );
          if (inlen > 8)
            {
              asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
                            "movq %[inbuf_tail], %%xmm2\n\t"
                            "punpcklqdq %%xmm2, %%xmm0\n\t"
                            "pshufb %[merge_shuf], %%xmm0\n\t"
                            :
                            : [inbuf_tail] "m" (inbuf[inlen - 8]),
                              [merge_shuf] "m"
                              (*crc32_merge9to15_shuf[inlen - 9])
                            );
            }
        }
      else
        {
          asm volatile ("movd %[inbuf], %%xmm0\n\t"
                        "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
                        "pshufb %[merge_shuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf),
                          [inbuf_tail] "m" (inbuf[inlen - 4]),
                          [merge_shuf] "m"
                          (*crc32_merge5to7_shuf[inlen - 5])
                        );
        }

      /* Final fold. */
      asm volatile ("pxor %%xmm1, %%xmm0\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"

                    /* reduce 128-bits to 96-bits */
                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                    "psrldq $8, %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"            /* top 32-bit are zero */

                    /* reduce 96-bits to 64-bits */
                    "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
                    "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
                    "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

                    /* barrett reduction */
                    "pshufd $0xf3, %%xmm0, %%xmm1\n\t"   /* [00][00][x>>32][00] */
                    "pslldq $4, %%xmm0\n\t"              /* [??][x>>32][??][??] */
                    "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    /* store CRC */
                    "pextrd $2, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [k5] "m" (consts->k[5 - 1])
                    );
    }
}

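/* Entry point: update the reflected CRC32 state at *pcrc over the inlen
 * bytes at inbuf, using the bulk path for inputs of 16 bytes or more.  The
 * CPU must support PCLMULQDQ; runtime detection is left to the caller.
 *
 * A minimal usage sketch (assuming the caller handles the conventional
 * CRC32 pre- and post-inversion itself, as the wrappers around this file
 * typically do):
 *
 *   u32 crc = 0xffffffffU;              // standard CRC32 initial state (~0)
 *   crc32_intel_pclmul (&crc, buf, len);
 *   crc ^= 0xffffffffU;                 // final inversion
 */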
void
crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
{
  const struct crc32_consts_s *consts = &crc32_consts;
#if defined(__x86_64__) && defined(__WIN64__)
  char win64tmp[2 * 16];

  /* XMM6-XMM7 need to be restored after use. */
  asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
                "movdqu %%xmm7, 1*16(%0)\n\t"
                :
                : "r" (win64tmp)
                : "memory");
#endif

  if (!inlen)
    return;

  if (inlen >= 16)
    crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
  else
    crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);

#if defined(__x86_64__) && defined(__WIN64__)
  /* Restore used registers. */
  asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
               "movdqu 1*16(%0), %%xmm7\n\t"
               :
               : "r" (win64tmp)
               : "memory");
#endif
}

#endif