crc32_simd.c source code [engine/third_party/zlib/crc32_simd.c]

/* crc32_simd.c
 *
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */
7
8	#include "crc32_simd.h"
9
10	#if defined(CRC32_SIMD_SSE42_PCLMUL)
11
12	/*
13	* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
14	* length must be at least 64, and a multiple of 16. Based on:
15	*
16	* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
17	* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
18	*/
19
20	#include <emmintrin.h>
21	#include <smmintrin.h>
22	#include <wmmintrin.h>
23
24	uint32_t ZLIB_INTERNAL crc32_sse42_simd_( / SSE4.2+PCLMUL /
25	const unsigned char *buf,
26	z_size_t len,
27	uint32_t crc)
28	{
29	/*
30	* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
31	* the CRC32+Barrett polynomials given at the end of the paper.
32	*/
33	static const uint64_t zalign(`16`) k1k2[] = { `0x0154442bd4`, `0x01c6e41596` };
34	static const uint64_t zalign(`16`) k3k4[] = { `0x01751997d0`, `0x00ccaa009e` };
35	static const uint64_t zalign(`16`) k5k0[] = { `0x0163cd6124`, `0x0000000000` };
36	static const uint64_t zalign(`16`) poly[] = { `0x01db710641`, `0x01f7011641` };
37
38	__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
39
40	/*
41	* There's at least one block of 64.
42	*/
43	x1 = _mm_loadu_si128((__m128i *)(buf + `0x00`));
44	x2 = _mm_loadu_si128((__m128i *)(buf + `0x10`));
45	x3 = _mm_loadu_si128((__m128i *)(buf + `0x20`));
46	x4 = _mm_loadu_si128((__m128i *)(buf + `0x30`));
47
48	x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
49
50	x0 = _mm_load_si128((__m128i *)k1k2);
51
52	buf += `64`;
53	len -= `64`;
54
55	/*
56	* Parallel fold blocks of 64, if any.
57	*/
58	while (len >= `64`)
59	{
60	x5 = _mm_clmulepi64_si128(x1, x0, `0x00`);
61	x6 = _mm_clmulepi64_si128(x2, x0, `0x00`);
62	x7 = _mm_clmulepi64_si128(x3, x0, `0x00`);
63	x8 = _mm_clmulepi64_si128(x4, x0, `0x00`);
64
65	x1 = _mm_clmulepi64_si128(x1, x0, `0x11`);
66	x2 = _mm_clmulepi64_si128(x2, x0, `0x11`);
67	x3 = _mm_clmulepi64_si128(x3, x0, `0x11`);
68	x4 = _mm_clmulepi64_si128(x4, x0, `0x11`);
69
70	y5 = _mm_loadu_si128((__m128i *)(buf + `0x00`));
71	y6 = _mm_loadu_si128((__m128i *)(buf + `0x10`));
72	y7 = _mm_loadu_si128((__m128i *)(buf + `0x20`));
73	y8 = _mm_loadu_si128((__m128i *)(buf + `0x30`));
74
75	x1 = _mm_xor_si128(x1, x5);
76	x2 = _mm_xor_si128(x2, x6);
77	x3 = _mm_xor_si128(x3, x7);
78	x4 = _mm_xor_si128(x4, x8);
79
80	x1 = _mm_xor_si128(x1, y5);
81	x2 = _mm_xor_si128(x2, y6);
82	x3 = _mm_xor_si128(x3, y7);
83	x4 = _mm_xor_si128(x4, y8);
84
85	buf += `64`;
86	len -= `64`;
87	}
88
89	/*
90	* Fold into 128-bits.
91	*/
92	x0 = _mm_load_si128((__m128i *)k3k4);
93
94	x5 = _mm_clmulepi64_si128(x1, x0, `0x00`);
95	x1 = _mm_clmulepi64_si128(x1, x0, `0x11`);
96	x1 = _mm_xor_si128(x1, x2);
97	x1 = _mm_xor_si128(x1, x5);
98
99	x5 = _mm_clmulepi64_si128(x1, x0, `0x00`);
100	x1 = _mm_clmulepi64_si128(x1, x0, `0x11`);
101	x1 = _mm_xor_si128(x1, x3);
102	x1 = _mm_xor_si128(x1, x5);
103
104	x5 = _mm_clmulepi64_si128(x1, x0, `0x00`);
105	x1 = _mm_clmulepi64_si128(x1, x0, `0x11`);
106	x1 = _mm_xor_si128(x1, x4);
107	x1 = _mm_xor_si128(x1, x5);
108
109	/*
110	* Single fold blocks of 16, if any.
111	*/
112	while (len >= `16`)
113	{
114	x2 = _mm_loadu_si128((__m128i *)buf);
115
116	x5 = _mm_clmulepi64_si128(x1, x0, `0x00`);
117	x1 = _mm_clmulepi64_si128(x1, x0, `0x11`);
118	x1 = _mm_xor_si128(x1, x2);
119	x1 = _mm_xor_si128(x1, x5);
120
121	buf += `16`;
122	len -= `16`;
123	}
124
125	/*
126	* Fold 128-bits to 64-bits.
127	*/
128	x2 = _mm_clmulepi64_si128(x1, x0, `0x10`);
129	x3 = _mm_setr_epi32(~`0`, `0`, ~`0`, `0`);
130	x1 = _mm_srli_si128(x1, `8`);
131	x1 = _mm_xor_si128(x1, x2);
132
133	x0 = _mm_loadl_epi64((__m128i*)k5k0);
134
135	x2 = _mm_srli_si128(x1, `4`);
136	x1 = _mm_and_si128(x1, x3);
137	x1 = _mm_clmulepi64_si128(x1, x0, `0x00`);
138	x1 = _mm_xor_si128(x1, x2);
139
140	/*
141	* Barret reduce to 32-bits.
142	*/
143	x0 = _mm_load_si128((__m128i*)poly);
144
145	x2 = _mm_and_si128(x1, x3);
146	x2 = _mm_clmulepi64_si128(x2, x0, `0x10`);
147	x2 = _mm_and_si128(x2, x3);
148	x2 = _mm_clmulepi64_si128(x2, x0, `0x00`);
149	x1 = _mm_xor_si128(x1, x2);
150
151	/*
152	* Return the crc32.
153	*/
154	return _mm_extract_epi32(x1, `1`);
155	}
156
157	#elif defined(CRC32_ARMV8_CRC32)
158
/* CRC32 checksums using ARMv8-a crypto instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */
163
164	#if defined(__clang__)
165	/ CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an*
166	* armv8 target, which is incompatible with ThinLTO optimizations on Android.
167	* (Namely, mixing and matching different module-level targets makes ThinLTO
168	* warn, and Android defaults to armv7-a. This restriction does not apply to
169	* function-level `target`s, however.)
170	*
171	* Since we only need four crc intrinsics, and since clang's implementation of
172	* those are just wrappers around compiler builtins, it's simplest to #define
173	* those builtins directly. If this #define list grows too much (or we depend on
174	* an intrinsic that isn't a trivial wrapper), we may have to find a better way
175	* to go about this.
176	*
177	* NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
178	* feature for this target (ignoring feature)." This appears to be a harmless
179	* bug in clang.
180	*/
181	#define __crc32b __builtin_arm_crc32b
182	#define __crc32d __builtin_arm_crc32d
183	#define __crc32w __builtin_arm_crc32w
184	#define __crc32cw __builtin_arm_crc32cw
185
186	#if defined(__aarch64__)
187	#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
188	#else // !defined(__aarch64__)
189	#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
190	#endif // defined(__aarch64__)
191
192	#elif defined(__GNUC__)
193	/ For GCC, we are setting CRC extensions at module level, so ThinLTO is not*
194	* allowed. We can just include arm_acle.h.
195	*/
196	#include <arm_acle.h>
197	#define TARGET_ARMV8_WITH_CRC
198	#else // !defined(__GNUC__) && !defined(_aarch64__)
199	#error ARM CRC32 SIMD extensions only supported for Clang and GCC
200	#endif
201
202	TARGET_ARMV8_WITH_CRC
203	uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
204	const unsigned char *buf,
205	z_size_t len)
206	{
207	uint32_t c = (uint32_t) ~crc;
208
209	while (len && ((uintptr_t)buf & `7`)) {
210	c = __crc32b(c, *buf++);
211	--len;
212	}
213
214	const uint64_t buf8 = (const* uint64_t *)buf;
215
216	while (len >= `64`) {
217	c = __crc32d(c, *buf8++);
218	c = __crc32d(c, *buf8++);
219	c = __crc32d(c, *buf8++);
220	c = __crc32d(c, *buf8++);
221
222	c = __crc32d(c, *buf8++);
223	c = __crc32d(c, *buf8++);
224	c = __crc32d(c, *buf8++);
225	c = __crc32d(c, *buf8++);
226	len -= `64`;
227	}
228
229	while (len >= `8`) {
230	c = __crc32d(c, *buf8++);
231	len -= `8`;
232	}
233
234	buf = (const unsigned char *)buf8;
235
236	while (len--) {
237	c = __crc32b(c, *buf++);
238	}
239
240	return ~c;
241	}
242
243	#endif
244

Browse the source code of engine/third_party/zlib/crc32_simd.c