1 | /* |
2 | * Copyright 2016 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #ifndef SkChecksum_opts_DEFINED |
9 | #define SkChecksum_opts_DEFINED |
10 | |
11 | #include "include/core/SkTypes.h" |
12 | #include "include/private/SkChecksum.h" |
13 | #include "src/core/SkUtils.h" // sk_unaligned_load |
14 | |
15 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 |
16 | #include <immintrin.h> |
17 | #elif defined(SK_ARM_HAS_CRC32) |
18 | #include <arm_acle.h> |
19 | #endif |
20 | |
21 | namespace SK_OPTS_NS { |
22 | |
23 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64)) |
24 | // This is not a CRC32. It's Just A Hash that uses those instructions because they're fast. |
25 | /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) { |
26 | auto data = (const uint8_t*)vdata; |
27 | |
28 | // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while. |
29 | uint64_t hash = seed; |
30 | if (bytes >= 24) { |
31 | // We'll create 3 independent hashes, each using _mm_crc32_u64() |
32 | // to hash 8 bytes per step. Both 3 and independent are important: |
33 | // we can execute 3 of these instructions in parallel on a single core. |
34 | uint64_t a = hash, |
35 | b = hash, |
36 | c = hash; |
37 | size_t steps = bytes/24; |
38 | while (steps --> 0) { |
39 | a = _mm_crc32_u64(a, sk_unaligned_load<uint64_t>(data+ 0)); |
40 | b = _mm_crc32_u64(b, sk_unaligned_load<uint64_t>(data+ 8)); |
41 | c = _mm_crc32_u64(c, sk_unaligned_load<uint64_t>(data+16)); |
42 | data += 24; |
43 | } |
44 | bytes %= 24; |
45 | hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c)); |
46 | } |
47 | |
48 | SkASSERT(bytes < 24); |
49 | if (bytes >= 16) { |
50 | hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data)); |
51 | bytes -= 8; |
52 | data += 8; |
53 | } |
54 | |
55 | SkASSERT(bytes < 16); |
56 | if (bytes & 8) { |
57 | hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data)); |
58 | data += 8; |
59 | } |
60 | |
61 | // The remainder of these _mm_crc32_u*() operate on a 32-bit register. |
62 | // We don't lose anything here: only the bottom 32-bits were populated. |
63 | auto hash32 = (uint32_t)hash; |
64 | |
65 | if (bytes & 4) { |
66 | hash32 = _mm_crc32_u32(hash32, sk_unaligned_load<uint32_t>(data)); |
67 | data += 4; |
68 | } |
69 | if (bytes & 2) { |
70 | hash32 = _mm_crc32_u16(hash32, sk_unaligned_load<uint16_t>(data)); |
71 | data += 2; |
72 | } |
73 | if (bytes & 1) { |
74 | hash32 = _mm_crc32_u8(hash32, sk_unaligned_load<uint8_t>(data)); |
75 | } |
76 | return hash32; |
77 | } |
78 | |
79 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 |
80 | // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64(). |
81 | /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { |
82 | auto data = (const uint8_t*)vdata; |
83 | |
84 | if (bytes >= 12) { |
85 | // We'll create 3 independent hashes, each using _mm_crc32_u32() |
86 | // to hash 4 bytes per step. Both 3 and independent are important: |
87 | // we can execute 3 of these instructions in parallel on a single core. |
88 | uint32_t a = hash, |
89 | b = hash, |
90 | c = hash; |
91 | size_t steps = bytes/12; |
92 | while (steps --> 0) { |
93 | a = _mm_crc32_u32(a, sk_unaligned_load<uint32_t>(data+0)); |
94 | b = _mm_crc32_u32(b, sk_unaligned_load<uint32_t>(data+4)); |
95 | c = _mm_crc32_u32(c, sk_unaligned_load<uint32_t>(data+8)); |
96 | data += 12; |
97 | } |
98 | bytes %= 12; |
99 | hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c)); |
100 | } |
101 | |
102 | SkASSERT(bytes < 12); |
103 | if (bytes >= 8) { |
104 | hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data)); |
105 | bytes -= 4; |
106 | data += 4; |
107 | } |
108 | |
109 | SkASSERT(bytes < 8); |
110 | if (bytes & 4) { |
111 | hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data)); |
112 | data += 4; |
113 | } |
114 | if (bytes & 2) { |
115 | hash = _mm_crc32_u16(hash, sk_unaligned_load<uint16_t>(data)); |
116 | data += 2; |
117 | } |
118 | if (bytes & 1) { |
119 | hash = _mm_crc32_u8(hash, sk_unaligned_load<uint8_t>(data)); |
120 | } |
121 | return hash; |
122 | } |
123 | |
124 | #elif defined(SK_ARM_HAS_CRC32) |
125 | /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { |
126 | auto data = (const uint8_t*)vdata; |
127 | if (bytes >= 24) { |
128 | uint32_t a = hash, |
129 | b = hash, |
130 | c = hash; |
131 | size_t steps = bytes/24; |
132 | while (steps --> 0) { |
133 | a = __crc32d(a, sk_unaligned_load<uint64_t>(data+ 0)); |
134 | b = __crc32d(b, sk_unaligned_load<uint64_t>(data+ 8)); |
135 | c = __crc32d(c, sk_unaligned_load<uint64_t>(data+16)); |
136 | data += 24; |
137 | } |
138 | bytes %= 24; |
139 | hash = __crc32w(a, __crc32w(b, c)); |
140 | } |
141 | |
142 | SkASSERT(bytes < 24); |
143 | if (bytes >= 16) { |
144 | hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data)); |
145 | bytes -= 8; |
146 | data += 8; |
147 | } |
148 | |
149 | SkASSERT(bytes < 16); |
150 | if (bytes & 8) { |
151 | hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data)); |
152 | data += 8; |
153 | } |
154 | if (bytes & 4) { |
155 | hash = __crc32w(hash, sk_unaligned_load<uint32_t>(data)); |
156 | data += 4; |
157 | } |
158 | if (bytes & 2) { |
159 | hash = __crc32h(hash, sk_unaligned_load<uint16_t>(data)); |
160 | data += 2; |
161 | } |
162 | if (bytes & 1) { |
163 | hash = __crc32b(hash, sk_unaligned_load<uint8_t>(data)); |
164 | } |
165 | return hash; |
166 | } |
167 | |
168 | #else |
169 | // This is Murmur3. |
170 | /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) { |
171 | auto data = (const uint8_t*)vdata; |
172 | |
173 | size_t original_bytes = bytes; |
174 | |
175 | // Handle 4 bytes at a time while possible. |
176 | while (bytes >= 4) { |
177 | uint32_t k = sk_unaligned_load<uint32_t>(data); |
178 | k *= 0xcc9e2d51; |
179 | k = (k << 15) | (k >> 17); |
180 | k *= 0x1b873593; |
181 | |
182 | hash ^= k; |
183 | hash = (hash << 13) | (hash >> 19); |
184 | hash *= 5; |
185 | hash += 0xe6546b64; |
186 | |
187 | bytes -= 4; |
188 | data += 4; |
189 | } |
190 | |
191 | // Handle last 0-3 bytes. |
192 | uint32_t k = 0; |
193 | switch (bytes & 3) { |
194 | case 3: k ^= data[2] << 16; [[fallthrough]]; |
195 | case 2: k ^= data[1] << 8; [[fallthrough]]; |
196 | case 1: k ^= data[0] << 0; |
197 | k *= 0xcc9e2d51; |
198 | k = (k << 15) | (k >> 17); |
199 | k *= 0x1b873593; |
200 | hash ^= k; |
201 | } |
202 | |
203 | hash ^= original_bytes; |
204 | return SkChecksum::Mix(hash); |
205 | } |
206 | #endif |
207 | |
208 | } // namespace SK_OPTS_NS |
209 | |
210 | #endif//SkChecksum_opts_DEFINED |
211 | |