/* crc32_simd.c
 *
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#include "crc32_simd.h"

#if defined(CRC32_SIMD_SSE42_PCLMUL)

/*
 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 64, and a multiple of 16. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>

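/*
 * Illustrative caller sketch (not part of the original file): this routine
 * requires len >= 64 and a multiple of 16, so a dispatcher would typically
 * carve off the largest such prefix and finish the tail with a scalar CRC.
 * The chunk rounding and the crc32_generic fallback below are hypothetical:
 *
 *     z_size_t chunk = len & ~(z_size_t)15;    // round down to 16 bytes
 *     if (chunk >= 64) {
 *         crc = ~crc32_sse42_simd_(buf, chunk, ~crc);  // note ~ conditioning
 *         buf += chunk;
 *         len -= chunk;
 *     }
 *     crc = crc32_generic(crc, buf, len);      // hypothetical scalar tail
 */
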
uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1, k2, k3, etc. and
     * the CRC32+Barrett polynomials given at the end of the paper.
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
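    /*
     * Per the paper's scheme (our reading): k1/k2 fold a 128-bit accumulator
     * across 64 bytes of input, k3/k4 fold across 16 bytes, k5 drives the
     * final 128- to 64-bit reduction, and poly holds the bit-reflected
     * CRC-32 polynomial together with its Barrett constant floor(x^64 / P).
     */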
    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;

    /*
     * There's at least one block of 64.
     */
    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));

    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));

    x0 = _mm_load_si128((__m128i *)k1k2);

    buf += 64;
    len -= 64;

    /*
     * Parallel fold blocks of 64, if any.
     */
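    /*
     * Each iteration multiplies the low and high qwords of the four
     * accumulators by k1 and k2 (carry-less multiplies), then XORs both
     * partial products and the next 64 bytes of input back in: the 4-way
     * folding step from the paper.
     */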
    while (len >= 64)
    {
        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);

        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);

        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));

        x1 = _mm_xor_si128(x1, x5);
        x2 = _mm_xor_si128(x2, x6);
        x3 = _mm_xor_si128(x3, x7);
        x4 = _mm_xor_si128(x4, x8);

        x1 = _mm_xor_si128(x1, y5);
        x2 = _mm_xor_si128(x2, y6);
        x3 = _mm_xor_si128(x3, y7);
        x4 = _mm_xor_si128(x4, y8);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold into 128-bits.
     */
    x0 = _mm_load_si128((__m128i *)k3k4);

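    /*
     * The three blocks below merge x1..x4 into one 128-bit value: each step
     * folds the running value forward 16 bytes with k3/k4 and XORs in the
     * next accumulator.
     */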
    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x2);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x3);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x4);
    x1 = _mm_xor_si128(x1, x5);

    /*
     * Single fold blocks of 16, if any.
     */
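    /*
     * The same k3/k4 fold as above, applied once per remaining 16-byte
     * block.
     */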
    while (len >= 16)
    {
        x2 = _mm_loadu_si128((__m128i *)buf);

        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x1 = _mm_xor_si128(x1, x2);
        x1 = _mm_xor_si128(x1, x5);

        buf += 16;
        len -= 16;
    }

    /*
     * Fold 128-bits to 64-bits.
     */
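    /*
     * Fold one qword of x1 onto the other with k4 (the high half of x0),
     * reducing the 128-bit value toward 96 significant bits, then complete
     * the reduction to 64 bits with k5. x3 is a mask selecting the low
     * 32 bits of each qword.
     */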
    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
    x1 = _mm_srli_si128(x1, 8);
    x1 = _mm_xor_si128(x1, x2);

    x0 = _mm_loadl_epi64((__m128i*)k5k0);

    x2 = _mm_srli_si128(x1, 4);
    x1 = _mm_and_si128(x1, x3);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Barrett reduce to 32-bits.
     */
    x0 = _mm_load_si128((__m128i*)poly);

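    /*
     * Barrett reduction, as given at the end of the paper: with the
     * bit-reflected polynomial P in the low half of poly and
     * mu = floor(x^64 / P) in the high half, compute T1 = (R mod x^32) * mu
     * and T2 = (T1 mod x^32) * P; R ^ T2 then carries the 32-bit CRC in its
     * second dword.
     */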
    x2 = _mm_and_si128(x1, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
    x2 = _mm_and_si128(x2, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Return the crc32.
     */
    return _mm_extract_epi32(x1, 1);
}

#elif defined(CRC32_ARMV8_CRC32)

/* CRC32 checksums using ARMv8-a crypto instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */

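/* Note: the ACLE __crc32{b,w,d} intrinsics used below consume one byte, word,
 * or doubleword of input per instruction with the CRC-32 polynomial
 * 0x04C11DB7; __crc32cw is the CRC-32C (Castagnoli) variant.
 */
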
#if defined(__clang__)
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 * (Namely, mixing and matching different module-level targets makes ThinLTO
 * warn, and Android defaults to armv7-a. This restriction does not apply to
 * function-level `target`s, however.)
 *
 * Since we only need four crc intrinsics, and since clang's implementations
 * of those are just wrappers around compiler builtins, it's simplest to
 * #define those builtins directly. If this #define list grows too much (or we
 * depend on an intrinsic that isn't a trivial wrapper), we may have to find a
 * better way to go about this.
 *
 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 * feature for this target (ignoring feature)." This appears to be a harmless
 * bug in clang.
 */
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw

#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
#else  // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif  // defined(__aarch64__)

#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
 * allowed. We can just include arm_acle.h.
 */
#include <arm_acle.h>
#define TARGET_ARMV8_WITH_CRC
#else  // !defined(__GNUC__) && !defined(__clang__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif

TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
                                          const unsigned char *buf,
                                          z_size_t len)
{
    uint32_t c = (uint32_t) ~crc;

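    /* Consume single bytes until buf is 8-byte aligned, so that the 64-bit
     * loads in the loops below are aligned. */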
    while (len && ((uintptr_t)buf & 7)) {
        c = __crc32b(c, *buf++);
        --len;
    }

    const uint64_t *buf8 = (const uint64_t *)buf;

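    /* 64 bytes per iteration; the eight __crc32d results form one dependency
     * chain through c, so unrolling mainly amortizes the loop overhead. */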
    while (len >= 64) {
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);

        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        len -= 64;
    }

    while (len >= 8) {
        c = __crc32d(c, *buf8++);
        len -= 8;
    }

    buf = (const unsigned char *)buf8;

    while (len--) {
        c = __crc32b(c, *buf++);
    }

    return ~c;
}
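
/*
 * Illustrative use (hypothetical dispatcher, not part of this file): a crc32
 * front end would typically gate on a CPU feature check, e.g.
 *
 *     if (arm_cpu_enable_crc32)    // hypothetical feature flag
 *         return armv8_crc32_little(crc, buf, len);
 *
 * Unlike the SSE4.2 path, this routine performs the ~crc pre- and
 * post-conditioning itself and accepts any buffer length.
 */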

#endif