/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkChecksum_opts_DEFINED
#define SkChecksum_opts_DEFINED

#include "include/core/SkTypes.h"
#include "include/private/SkChecksum.h"
#include "src/core/SkUtils.h"  // sk_unaligned_load

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_CRC32)
    #include <arm_acle.h>
#endif

namespace SK_OPTS_NS {
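
// All four implementations below share the same contract:
//     hash_fn(data, bytes, seed) -> uint32_t
// An illustrative direct call (in practice Skia reaches these through
// runtime-dispatched function pointers, so calling hash_fn() directly
// like this is an assumption shown only for clarity):
//
//     const char msg[] = "hello, world";
//     uint32_t h = SK_OPTS_NS::hash_fn(msg, sizeof(msg) - 1, /*seed=*/0);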

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
    // This is not a CRC32. It's Just A Hash that uses those instructions because they're fast.
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
        auto data = (const uint8_t*)vdata;

        // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
        uint64_t hash = seed;
        if (bytes >= 24) {
            // We'll create 3 independent hashes, each using _mm_crc32_u64()
            // to hash 8 bytes per step. Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
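            // For example, a 48-byte input is striped across the lanes like so:
            //     a hashes bytes [ 0, 8) then [24,32)
            //     b hashes bytes [ 8,16) then [32,40)
            //     c hashes bytes [16,24) then [40,48)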
            uint64_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = _mm_crc32_u64(a, sk_unaligned_load<uint64_t>(data+ 0));
                b = _mm_crc32_u64(b, sk_unaligned_load<uint64_t>(data+ 8));
                c = _mm_crc32_u64(c, sk_unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
            hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c));
        }

        SkASSERT(bytes < 24);
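        // If 16-23 bytes remain, consume 8 of them here; the (bytes & 8) step
        // below then takes the next 8, so at most 7 bytes reach the final tail.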
        if (bytes >= 16) {
            hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
            bytes -= 8;
            data  += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
            data += 8;
        }

        // The remaining _mm_crc32_u*() instructions operate on 32-bit registers.
        // We don't lose anything here: only the bottom 32 bits were ever populated.
        auto hash32 = (uint32_t)hash;

        if (bytes & 4) {
            hash32 = _mm_crc32_u32(hash32, sk_unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash32 = _mm_crc32_u16(hash32, sk_unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash32 = _mm_crc32_u8(hash32, sk_unaligned_load<uint8_t>(data));
        }
        return hash32;
    }

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        if (bytes >= 12) {
            // We'll create 3 independent hashes, each using _mm_crc32_u32()
            // to hash 4 bytes per step. Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/12;
            while (steps --> 0) {
                a = _mm_crc32_u32(a, sk_unaligned_load<uint32_t>(data+0));
                b = _mm_crc32_u32(b, sk_unaligned_load<uint32_t>(data+4));
                c = _mm_crc32_u32(c, sk_unaligned_load<uint32_t>(data+8));
                data += 12;
            }
            bytes %= 12;
            hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c));
        }

        SkASSERT(bytes < 12);
        if (bytes >= 8) {
            hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
            bytes -= 4;
            data  += 4;
        }

        SkASSERT(bytes < 8);
        if (bytes & 4) {
            hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = _mm_crc32_u16(hash, sk_unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = _mm_crc32_u8(hash, sk_unaligned_load<uint8_t>(data));
        }
        return hash;
    }

#elif defined(SK_ARM_HAS_CRC32)
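    // ARM version of the above, using the ARMv8 CRC32 instructions (<arm_acle.h>).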
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;
        if (bytes >= 24) {
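            // Same strategy as the x86-64 version: 3 independent lanes hashing
            // 8 bytes per step, so the three __crc32d calls can run in parallel.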
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = __crc32d(a, sk_unaligned_load<uint64_t>(data+ 0));
                b = __crc32d(b, sk_unaligned_load<uint64_t>(data+ 8));
                c = __crc32d(c, sk_unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
            hash = __crc32w(a, __crc32w(b, c));
        }

        SkASSERT(bytes < 24);
        if (bytes >= 16) {
            hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
            bytes -= 8;
            data  += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
            data += 8;
        }
        if (bytes & 4) {
            hash = __crc32w(hash, sk_unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = __crc32h(hash, sk_unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = __crc32b(hash, sk_unaligned_load<uint8_t>(data));
        }
        return hash;
    }

#else
    // This is Murmur3, the portable fallback for CPUs without CRC32 instructions.
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        size_t original_bytes = bytes;

        // Handle 4 bytes at a time while possible.
        while (bytes >= 4) {
            uint32_t k = sk_unaligned_load<uint32_t>(data);
            k *= 0xcc9e2d51;
            k = (k << 15) | (k >> 17);
            k *= 0x1b873593;

            hash ^= k;
            hash = (hash << 13) | (hash >> 19);
            hash *= 5;
            hash += 0xe6546b64;

            bytes -= 4;
            data  += 4;
        }

        // Handle last 0-3 bytes.
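        // (E.g. after a 7-byte input, the loop above consumed bytes [0,4), so
        // this switch packs bytes 6, 5, 4 into bits 23:16, 15:8, 7:0 of k.)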
        uint32_t k = 0;
        switch (bytes & 3) {
            case 3: k ^= data[2] << 16; [[fallthrough]];
            case 2: k ^= data[1] <<  8; [[fallthrough]];
            case 1: k ^= data[0] <<  0;
                    k *= 0xcc9e2d51;
                    k = (k << 15) | (k >> 17);
                    k *= 0x1b873593;
                    hash ^= k;
        }

        hash ^= original_bytes;
        return SkChecksum::Mix(hash);
    }
#endif

} // namespace SK_OPTS_NS

#endif//SkChecksum_opts_DEFINED