1 | /* |
2 | * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands |
3 | * This software is provided 'as-is', without any express or implied |
4 | * warranty. In no event will the author be held liable for any damages |
5 | * arising from the use of this software. |
6 | * Permission is granted to anyone to use this software for any purpose, |
7 | * including commercial applications, and to alter it and redistribute it |
8 | * freely, subject to the following restrictions: |
9 | * 1. The origin of this software must not be misrepresented; you must not |
10 | * claim that you wrote the original software. If you use this software |
11 | * in a product, an acknowledgment in the product documentation would be |
12 | * appreciated but is not required. |
13 | * 2. Altered source versions must be plainly marked as such, and must not be |
14 | * misrepresented as being the original software. |
15 | * 3. This notice may not be removed or altered from any source distribution. |
16 | * Ferry Toth |
17 | * ftoth@exalondelft.nl |
18 | * |
19 | * https://github.com/htot/crc32c |
20 | * |
21 | * Modified by Facebook |
22 | * |
23 | * Original intel whitepaper: |
24 | * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" |
25 | * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf |
26 | * |
27 | * 32-bit support dropped |
28 | * use intrinsics instead of inline asm |
29 | * other code cleanup |
30 | */ |
31 | |
32 | #include <stdexcept> |
33 | |
34 | #include <folly/hash/detail/ChecksumDetail.h> |
35 | |
36 | #include <folly/CppAttributes.h> |
37 | |
38 | #include <boost/preprocessor/arithmetic/add.hpp> |
39 | #include <boost/preprocessor/arithmetic/sub.hpp> |
40 | #include <boost/preprocessor/repetition/repeat_from_to.hpp> |
41 | |
42 | namespace folly { |
43 | namespace detail { |
44 | |
45 | #if defined(FOLLY_X64) && FOLLY_SSE_PREREQ(4, 2) |
46 | |
47 | namespace crc32_detail { |
48 | |
49 | #define CRCtriplet(crc, buf, offset) \ |
50 | crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ |
51 | crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \ |
52 | crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); \ |
53 | FOLLY_FALLTHROUGH; |
54 | |
55 | #define CRCduplet(crc, buf, offset) \ |
56 | crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ |
57 | crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); |
58 | |
59 | #define CRCsinglet(crc, buf, offset) \ |
60 | crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); \ |
61 | FOLLY_FALLTHROUGH; |
62 | |
63 | #define CASEREPEAT_TRIPLET(unused, count, total) \ |
64 | case BOOST_PP_ADD(1, BOOST_PP_SUB(total, count)): \ |
65 | CRCtriplet(crc, next, -BOOST_PP_ADD(1, BOOST_PP_SUB(total, count))); |
66 | |
67 | #define CASEREPEAT_SINGLET(unused, count, total) \ |
68 | case BOOST_PP_SUB(total, count): \ |
69 | CRCsinglet(crc0, next, -BOOST_PP_SUB(total, count) * 8); |
70 | |
71 | // Numbers taken directly from intel whitepaper. |
72 | // clang-format off |
73 | const uint64_t clmul_constants[] = { |
74 | 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6, |
75 | 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e, |
76 | 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da, |
77 | 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8, |
78 | 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296, |
79 | 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2, |
80 | 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6, |
81 | 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092, |
82 | 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0, |
83 | 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456, |
84 | 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e, |
85 | 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a, |
86 | 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574, |
87 | 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832, |
88 | 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124, |
89 | 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86, |
90 | 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e, |
91 | 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a, |
92 | 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46, |
93 | 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a, |
94 | 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a, |
95 | 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4, |
96 | 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56, |
97 | 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2, |
98 | 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c, |
99 | 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac, |
100 | 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64, |
101 | 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e, |
102 | 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c, |
103 | 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28, |
104 | 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26, |
105 | 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c, |
106 | 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c, |
107 | 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c, |
108 | 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4, |
109 | 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844, |
110 | 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c, |
111 | 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730, |
112 | 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c, |
113 | 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2, |
114 | 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2, |
115 | 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e, |
116 | 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a, |
117 | 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a, |
118 | 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a, |
119 | 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768, |
120 | 0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4, |
121 | 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c, |
122 | 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba, |
123 | 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312, |
124 | 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544, |
125 | 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a, |
126 | 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e, |
127 | 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a, |
128 | 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c, |
129 | 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a, |
130 | 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6, |
131 | 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca, |
132 | 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888, |
133 | 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e, |
134 | 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528, |
135 | 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a, |
136 | 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e, |
137 | 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa, |
138 | }; |
139 | // clang-format on |
140 | |
141 | /* |
142 | * CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well |
143 | * chosen constant and xor's these with the remaining CRC. |
144 | */ |
145 | uint64_t CombineCRC( |
146 | size_t block_size, |
147 | uint64_t crc0, |
148 | uint64_t crc1, |
149 | uint64_t crc2, |
150 | const uint64_t* next2) { |
151 | const auto multiplier = |
152 | *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1); |
153 | const auto crc0_xmm = _mm_set_epi64x(0, crc0); |
154 | const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00); |
155 | const auto crc1_xmm = _mm_set_epi64x(0, crc1); |
156 | const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10); |
157 | const auto res = _mm_xor_si128(res0, res1); |
158 | crc0 = _mm_cvtsi128_si64(res); |
159 | crc0 = crc0 ^ *((uint64_t*)next2 - 1); |
160 | crc2 = _mm_crc32_u64(crc2, crc0); |
161 | return crc2; |
162 | } |
163 | |
164 | // Generates a block that will crc up to 7 bytes of unaligned data. |
165 | // Always inline to avoid overhead on small crc sizes. |
166 | FOLLY_ALWAYS_INLINE void align_to_8( |
167 | size_t align, |
168 | uint64_t& crc0, // crc so far, updated on return |
169 | const unsigned char*& next) { // next data pointer, updated on return |
170 | uint32_t crc32bit = static_cast<uint32_t>(crc0); |
171 | if (align & 0x04) { |
172 | crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next); |
173 | next += sizeof(uint32_t); |
174 | } |
175 | if (align & 0x02) { |
176 | crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next); |
177 | next += sizeof(uint16_t); |
178 | } |
179 | if (align & 0x01) { |
180 | crc32bit = _mm_crc32_u8(crc32bit, *(next)); |
181 | next++; |
182 | } |
183 | crc0 = crc32bit; |
184 | } |
185 | |
186 | // The main loop for large crc sizes. Generates three crc32c |
187 | // streams, of varying block sizes, using a duff's device. |
188 | void triplet_loop( |
189 | size_t block_size, |
190 | uint64_t& crc0, // crc so far, updated on return |
191 | const unsigned char*& next, // next data pointer, updated on return |
192 | size_t n) { // block count |
193 | uint64_t crc1 = 0, crc2 = 0; |
194 | // points to the first byte of the next block |
195 | const uint64_t* next0 = (uint64_t*)next + block_size; |
196 | const uint64_t* next1 = next0 + block_size; |
197 | const uint64_t* next2 = next1 + block_size; |
198 | |
199 | // Use Duff's device, a for() loop inside a switch() |
200 | // statement. This needs to execute at least once, round len |
201 | // down to nearest triplet multiple |
202 | switch (block_size) { |
203 | case 128: |
204 | do { |
205 | // jumps here for a full block of len 128 |
206 | CRCtriplet(crc, next, -128); |
207 | |
208 | // Generates case statements from 127 to 2 of form: |
209 | // case 127: |
210 | // CRCtriplet(crc, next, -127); |
211 | // |
212 | // MSVC has a max preprocessor expansion depth of 255, which is |
213 | // exceeded if this is a single statement. |
214 | BOOST_PP_REPEAT_FROM_TO(0, 64, CASEREPEAT_TRIPLET, 126); |
215 | BOOST_PP_REPEAT_FROM_TO(0, 62, CASEREPEAT_TRIPLET, 62); |
216 | |
217 | // For the last byte, the three crc32c streams must be combined |
218 | // using carry-less multiplication. |
219 | case 1: |
220 | CRCduplet(crc, next, -1); // the final triplet is actually only 2 |
221 | crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2); |
222 | if (--n > 0) { |
223 | crc1 = crc2 = 0; |
224 | block_size = 128; |
225 | // points to the first byte of the next block |
226 | next0 = next2 + 128; |
227 | next1 = next0 + 128; // from here on all blocks are 128 long |
228 | next2 = next1 + 128; |
229 | } |
230 | FOLLY_FALLTHROUGH; |
231 | case 0:; |
232 | } while (n > 0); |
233 | } |
234 | |
235 | next = (const unsigned char*)next2; |
236 | } |
237 | |
238 | } // namespace crc32_detail |
239 | |
240 | /* Compute CRC-32C using the Intel hardware instruction. */ |
241 | FOLLY_TARGET_ATTRIBUTE("sse4.2" ) |
242 | uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) { |
243 | const unsigned char* next = (const unsigned char*)buf; |
244 | size_t count; |
245 | uint64_t crc0; |
246 | crc0 = crc; |
247 | |
248 | if (len >= 8) { |
249 | // if len > 216 then align and use triplets |
250 | if (len > 216) { |
251 | size_t align = (8 - (uintptr_t)next) & 7; |
252 | crc32_detail::align_to_8(align, crc0, next); |
253 | len -= align; |
254 | |
255 | count = len / 24; // number of triplets |
256 | len %= 24; // bytes remaining |
257 | size_t n = count >> 7; // #blocks = first block + full blocks |
258 | size_t block_size = count & 127; |
259 | if (block_size == 0) { |
260 | block_size = 128; |
261 | } else { |
262 | n++; |
263 | } |
264 | |
265 | // This is a separate function call mainly to stop |
266 | // clang from spilling registers. |
267 | crc32_detail::triplet_loop(block_size, crc0, next, n); |
268 | } |
269 | |
270 | size_t count2 = len >> 3; |
271 | len = len & 7; |
272 | next += (count2 * 8); |
273 | |
274 | // Generates a duff device for the last 128 bytes of aligned data. |
275 | switch (count2) { |
276 | // Generates case statements of the form: |
277 | // case 27: |
278 | // CRCsinglet(crc0, next, -27 * 8); |
279 | BOOST_PP_REPEAT_FROM_TO(0, 27, CASEREPEAT_SINGLET, 27); |
280 | case 0:; |
281 | } |
282 | } |
283 | |
284 | // compute the crc for up to seven trailing bytes |
285 | crc32_detail::align_to_8(len, crc0, next); |
286 | return (uint32_t)crc0; |
287 | } |
288 | |
289 | #else |
290 | |
291 | uint32_t |
292 | crc32c_hw(const uint8_t* /* buf */, size_t /* len */, uint32_t /* crc */) { |
293 | throw std::runtime_error("crc32_hw is not implemented on this platform" ); |
294 | } |
295 | |
296 | #endif |
297 | |
298 | } // namespace detail |
299 | } // namespace folly |
300 | |