1/*
2 * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
3 * This software is provided 'as-is', without any express or implied
4 * warranty. In no event will the author be held liable for any damages
5 * arising from the use of this software.
6 * Permission is granted to anyone to use this software for any purpose,
7 * including commercial applications, and to alter it and redistribute it
8 * freely, subject to the following restrictions:
9 * 1. The origin of this software must not be misrepresented; you must not
10 * claim that you wrote the original software. If you use this software
11 * in a product, an acknowledgment in the product documentation would be
12 * appreciated but is not required.
13 * 2. Altered source versions must be plainly marked as such, and must not be
14 * misrepresented as being the original software.
15 * 3. This notice may not be removed or altered from any source distribution.
16 * Ferry Toth
17 * ftoth@exalondelft.nl
18 *
19 * https://github.com/htot/crc32c
20 *
21 * Modified by Facebook
22 *
23 * Original intel whitepaper:
24 * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
25 * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
26 *
27 * 32-bit support dropped
28 * use intrinsics instead of inline asm
29 * other code cleanup
30 */
31
32#include <stdexcept>
33
34#include <folly/hash/detail/ChecksumDetail.h>
35
36#include <folly/CppAttributes.h>
37
38#include <boost/preprocessor/arithmetic/add.hpp>
39#include <boost/preprocessor/arithmetic/sub.hpp>
40#include <boost/preprocessor/repetition/repeat_from_to.hpp>
41
42namespace folly {
43namespace detail {
44
45#if defined(FOLLY_X64) && FOLLY_SSE_PREREQ(4, 2)
46
47namespace crc32_detail {
48
49#define CRCtriplet(crc, buf, offset) \
50 crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
51 crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \
52 crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); \
53 FOLLY_FALLTHROUGH;
54
55#define CRCduplet(crc, buf, offset) \
56 crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
57 crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset));
58
59#define CRCsinglet(crc, buf, offset) \
60 crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); \
61 FOLLY_FALLTHROUGH;
62
63#define CASEREPEAT_TRIPLET(unused, count, total) \
64 case BOOST_PP_ADD(1, BOOST_PP_SUB(total, count)): \
65 CRCtriplet(crc, next, -BOOST_PP_ADD(1, BOOST_PP_SUB(total, count)));
66
67#define CASEREPEAT_SINGLET(unused, count, total) \
68 case BOOST_PP_SUB(total, count): \
69 CRCsinglet(crc0, next, -BOOST_PP_SUB(total, count) * 8);
70
71// Numbers taken directly from intel whitepaper.
72// clang-format off
73const uint64_t clmul_constants[] = {
74 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6,
75 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e,
76 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da,
77 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8,
78 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296,
79 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2,
80 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6,
81 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092,
82 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0,
83 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456,
84 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e,
85 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a,
86 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574,
87 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832,
88 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124,
89 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86,
90 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e,
91 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a,
92 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46,
93 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a,
94 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a,
95 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4,
96 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56,
97 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2,
98 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c,
99 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac,
100 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64,
101 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e,
102 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c,
103 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28,
104 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26,
105 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c,
106 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c,
107 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c,
108 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4,
109 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844,
110 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c,
111 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730,
112 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c,
113 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2,
114 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2,
115 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e,
116 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a,
117 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a,
118 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a,
119 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768,
120 0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4,
121 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c,
122 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba,
123 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312,
124 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544,
125 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a,
126 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e,
127 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a,
128 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c,
129 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a,
130 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6,
131 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca,
132 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888,
133 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e,
134 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528,
135 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a,
136 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e,
137 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa,
138};
139// clang-format on
140
141/*
142 * CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well
143 * chosen constant and xor's these with the remaining CRC.
144 */
145uint64_t CombineCRC(
146 size_t block_size,
147 uint64_t crc0,
148 uint64_t crc1,
149 uint64_t crc2,
150 const uint64_t* next2) {
151 const auto multiplier =
152 *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1);
153 const auto crc0_xmm = _mm_set_epi64x(0, crc0);
154 const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00);
155 const auto crc1_xmm = _mm_set_epi64x(0, crc1);
156 const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10);
157 const auto res = _mm_xor_si128(res0, res1);
158 crc0 = _mm_cvtsi128_si64(res);
159 crc0 = crc0 ^ *((uint64_t*)next2 - 1);
160 crc2 = _mm_crc32_u64(crc2, crc0);
161 return crc2;
162}
163
164// Generates a block that will crc up to 7 bytes of unaligned data.
165// Always inline to avoid overhead on small crc sizes.
166FOLLY_ALWAYS_INLINE void align_to_8(
167 size_t align,
168 uint64_t& crc0, // crc so far, updated on return
169 const unsigned char*& next) { // next data pointer, updated on return
170 uint32_t crc32bit = static_cast<uint32_t>(crc0);
171 if (align & 0x04) {
172 crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next);
173 next += sizeof(uint32_t);
174 }
175 if (align & 0x02) {
176 crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next);
177 next += sizeof(uint16_t);
178 }
179 if (align & 0x01) {
180 crc32bit = _mm_crc32_u8(crc32bit, *(next));
181 next++;
182 }
183 crc0 = crc32bit;
184}
185
186// The main loop for large crc sizes. Generates three crc32c
187// streams, of varying block sizes, using a duff's device.
188void triplet_loop(
189 size_t block_size,
190 uint64_t& crc0, // crc so far, updated on return
191 const unsigned char*& next, // next data pointer, updated on return
192 size_t n) { // block count
193 uint64_t crc1 = 0, crc2 = 0;
194 // points to the first byte of the next block
195 const uint64_t* next0 = (uint64_t*)next + block_size;
196 const uint64_t* next1 = next0 + block_size;
197 const uint64_t* next2 = next1 + block_size;
198
199 // Use Duff's device, a for() loop inside a switch()
200 // statement. This needs to execute at least once, round len
201 // down to nearest triplet multiple
202 switch (block_size) {
203 case 128:
204 do {
205 // jumps here for a full block of len 128
206 CRCtriplet(crc, next, -128);
207
208 // Generates case statements from 127 to 2 of form:
209 // case 127:
210 // CRCtriplet(crc, next, -127);
211 //
212 // MSVC has a max preprocessor expansion depth of 255, which is
213 // exceeded if this is a single statement.
214 BOOST_PP_REPEAT_FROM_TO(0, 64, CASEREPEAT_TRIPLET, 126);
215 BOOST_PP_REPEAT_FROM_TO(0, 62, CASEREPEAT_TRIPLET, 62);
216
217 // For the last byte, the three crc32c streams must be combined
218 // using carry-less multiplication.
219 case 1:
220 CRCduplet(crc, next, -1); // the final triplet is actually only 2
221 crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2);
222 if (--n > 0) {
223 crc1 = crc2 = 0;
224 block_size = 128;
225 // points to the first byte of the next block
226 next0 = next2 + 128;
227 next1 = next0 + 128; // from here on all blocks are 128 long
228 next2 = next1 + 128;
229 }
230 FOLLY_FALLTHROUGH;
231 case 0:;
232 } while (n > 0);
233 }
234
235 next = (const unsigned char*)next2;
236}
237
238} // namespace crc32_detail
239
240/* Compute CRC-32C using the Intel hardware instruction. */
241FOLLY_TARGET_ATTRIBUTE("sse4.2")
242uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
243 const unsigned char* next = (const unsigned char*)buf;
244 size_t count;
245 uint64_t crc0;
246 crc0 = crc;
247
248 if (len >= 8) {
249 // if len > 216 then align and use triplets
250 if (len > 216) {
251 size_t align = (8 - (uintptr_t)next) & 7;
252 crc32_detail::align_to_8(align, crc0, next);
253 len -= align;
254
255 count = len / 24; // number of triplets
256 len %= 24; // bytes remaining
257 size_t n = count >> 7; // #blocks = first block + full blocks
258 size_t block_size = count & 127;
259 if (block_size == 0) {
260 block_size = 128;
261 } else {
262 n++;
263 }
264
265 // This is a separate function call mainly to stop
266 // clang from spilling registers.
267 crc32_detail::triplet_loop(block_size, crc0, next, n);
268 }
269
270 size_t count2 = len >> 3;
271 len = len & 7;
272 next += (count2 * 8);
273
274 // Generates a duff device for the last 128 bytes of aligned data.
275 switch (count2) {
276 // Generates case statements of the form:
277 // case 27:
278 // CRCsinglet(crc0, next, -27 * 8);
279 BOOST_PP_REPEAT_FROM_TO(0, 27, CASEREPEAT_SINGLET, 27);
280 case 0:;
281 }
282 }
283
284 // compute the crc for up to seven trailing bytes
285 crc32_detail::align_to_8(len, crc0, next);
286 return (uint32_t)crc0;
287}
288
289#else
290
291uint32_t
292crc32c_hw(const uint8_t* /* buf */, size_t /* len */, uint32_t /* crc */) {
293 throw std::runtime_error("crc32_hw is not implemented on this platform");
294}
295
296#endif
297
298} // namespace detail
299} // namespace folly
300