/* crc32_simd.c
 *
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#include "crc32_simd.h"

#if defined(CRC32_SIMD_SSE42_PCLMUL)

/*
 * crc32_sse42_simd_(): compute the crc32 of the buffer, whose length must
 * be at least 64 bytes and a multiple of 16. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */
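
/*
 * Note: unlike armv8_crc32_little() below, this routine applies no ~crc
 * pre/post-conditioning itself; the caller is assumed to pass in an
 * already-conditioned crc and to condition the returned value as needed.
 */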

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>

uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1, k2, k3, etc.,
     * and the CRC32+Barrett polynomials given at the end of the paper.
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
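
    /*
     * In outline: each k constant is x^N mod P(x) (bit-reflected) for the
     * shift distance N used by the corresponding folding step below, while
     * poly[0] is the bit-reflected CRC-32 polynomial P(x) itself and
     * poly[1] is the Barrett constant mu = floor(x^64 / P(x)).
     */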

    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;

    /*
     * There's at least one block of 64.
     */
    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));

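    /*
     * CRC-32 is linear over GF(2), so the incoming crc state can be folded
     * in by XORing it into the low 32 bits of the first block.
     */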
    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));

    x0 = _mm_load_si128((__m128i *)k1k2);

    buf += 64;
    len -= 64;

    /*
     * Parallel fold blocks of 64, if any.
     */
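    /*
     * Each iteration folds the four accumulators forward by 64 bytes: the
     * low and high 64-bit halves of x1..x4 are carry-less multiplied by k1
     * and k2 (selected from x0 by the 0x00 and 0x11 immediates), and the
     * two products are XORed with the next 64 bytes of input.
     */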
    while (len >= 64)
    {
        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);

        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);

        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));

        x1 = _mm_xor_si128(x1, x5);
        x2 = _mm_xor_si128(x2, x6);
        x3 = _mm_xor_si128(x3, x7);
        x4 = _mm_xor_si128(x4, x8);

        x1 = _mm_xor_si128(x1, y5);
        x2 = _mm_xor_si128(x2, y6);
        x3 = _mm_xor_si128(x3, y7);
        x4 = _mm_xor_si128(x4, y8);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold into 128-bits.
     */
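    /*
     * The three folds below use the k3/k4 pair (a 16-byte shift) to merge
     * x1 successively into x2, x3 and x4, leaving a single 128-bit
     * accumulator in x1.
     */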
    x0 = _mm_load_si128((__m128i *)k3k4);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x2);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x3);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x4);
    x1 = _mm_xor_si128(x1, x5);

    /*
     * Single fold blocks of 16, if any.
     */
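    /*
     * Each remaining 16-byte block is folded in the same way: x1 is
     * shifted forward by 16 bytes via k3/k4 and XORed with the next block.
     */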
    while (len >= 16)
    {
        x2 = _mm_loadu_si128((__m128i *)buf);

        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x1 = _mm_xor_si128(x1, x2);
        x1 = _mm_xor_si128(x1, x5);

        buf += 16;
        len -= 16;
    }

    /*
     * Fold 128-bits to 64-bits.
     */
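    /*
     * In outline: the low 64 bits of x1 are folded onto its high 64 bits
     * via k4, then the low 32 bits of that result are folded once more via
     * k5 (x3 is a mask selecting the low 32 bits of each 64-bit lane),
     * leaving a 64-bit remainder for the Barrett step below.
     */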
    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
    x1 = _mm_srli_si128(x1, 8);
    x1 = _mm_xor_si128(x1, x2);

    x0 = _mm_loadl_epi64((__m128i*)k5k0);

    x2 = _mm_srli_si128(x1, 4);
    x1 = _mm_and_si128(x1, x3);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Barrett reduce to 32-bits.
     */
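    /*
     * The Barrett step estimates the quotient by multiplying the low 32
     * bits of the remainder by mu (poly[1]), masks it back to 32 bits,
     * multiplies by P(x) (poly[0]), and XORs the product into x1 to cancel
     * the low-order bits.
     */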
    x0 = _mm_load_si128((__m128i*)poly);

    x2 = _mm_and_si128(x1, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
    x2 = _mm_and_si128(x2, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Return the crc32.
     */
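    /*
     * After the reduction the 32-bit CRC sits in bits 32..63 of x1, i.e.
     * the second 32-bit lane, hence the extract index of 1.
     */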
    return _mm_extract_epi32(x1, 1);
}

#elif defined(CRC32_ARMV8_CRC32)

/* CRC32 checksums using ARMv8-a crypto instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */

#if defined(__clang__)
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 * (Namely, mixing and matching different module-level targets makes ThinLTO
 * warn, and Android defaults to armv7-a. This restriction does not apply to
 * function-level `target`s, however.)
 *
 * Since we only need four crc intrinsics, and since clang's implementations
 * of those are just wrappers around compiler builtins, it's simplest to
 * #define those builtins directly. If this #define list grows too much (or
 * we depend on an intrinsic that isn't a trivial wrapper), we may have to
 * find a better way to go about this.
 *
 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 * feature for this target (ignoring feature)." This appears to be a harmless
 * bug in clang.
 */
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw

#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
#else // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif // defined(__aarch64__)

#elif defined(__GNUC__)
/* For GCC, the CRC extensions are set at module level, so ThinLTO is not
 * allowed. We can just include arm_acle.h.
 */
#include <arm_acle.h>
#define TARGET_ARMV8_WITH_CRC
#else // !defined(__clang__) && !defined(__GNUC__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif

TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
                                          const unsigned char *buf,
                                          z_size_t len)
{
    uint32_t c = (uint32_t) ~crc;

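    /*
     * Consume leading bytes until buf is 8-byte aligned, so the 64-bit
     * loads in the loops below are naturally aligned.
     */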
    while (len && ((uintptr_t)buf & 7)) {
        c = __crc32b(c, *buf++);
        --len;
    }

    const uint64_t *buf8 = (const uint64_t *)buf;

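    /*
     * Main loop: eight __crc32d steps consume 64 bytes per iteration. The
     * updates form a serial dependency chain through c, so the unrolling
     * mainly amortizes loop overhead rather than adding instruction-level
     * parallelism.
     */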
    while (len >= 64) {
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);

        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        len -= 64;
    }

    while (len >= 8) {
        c = __crc32d(c, *buf8++);
        len -= 8;
    }

    buf = (const unsigned char *)buf8;

    while (len--) {
        c = __crc32b(c, *buf++);
    }

    return ~c;
}

#endif