/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkChecksum_opts_DEFINED
#define SkChecksum_opts_DEFINED

#include "include/core/SkTypes.h"
#include "include/private/SkChecksum.h"
#include "src/core/SkUtils.h"  // sk_unaligned_load

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_CRC32)
    #include <arm_acle.h>
#endif

namespace SK_OPTS_NS {

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
    // This is not a CRC32. It's Just A Hash that uses those instructions because they're fast.
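    // The _mm_crc32_u64() intrinsic is only available when targeting x86-64,
    // hence the extra architecture check above.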
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
        auto data = (const uint8_t*)vdata;

        // _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
        uint64_t hash = seed;
        if (bytes >= 24) {
            // We'll create 3 independent hashes, each using _mm_crc32_u64()
            // to hash 8 bytes per step. Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
            uint64_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = _mm_crc32_u64(a, sk_unaligned_load<uint64_t>(data+ 0));
                b = _mm_crc32_u64(b, sk_unaligned_load<uint64_t>(data+ 8));
                c = _mm_crc32_u64(c, sk_unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
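            // Merge the three independent lanes back into a single value. Since this is
            // Just A Hash rather than a real CRC, any cheap mix of a, b, and c will do.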
            hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c));
        }

        SkASSERT(bytes < 24);
        if (bytes >= 16) {
            hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
            bytes -= 8;
            data += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
            data += 8;
        }

        // The remaining _mm_crc32_u*() calls operate on a 32-bit register.
        // We don't lose anything here: only the bottom 32 bits of hash were populated.
        auto hash32 = (uint32_t)hash;

        if (bytes & 4) {
            hash32 = _mm_crc32_u32(hash32, sk_unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash32 = _mm_crc32_u16(hash32, sk_unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash32 = _mm_crc32_u8(hash32, sk_unaligned_load<uint8_t>(data));
        }
        return hash32;
    }

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
    // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        if (bytes >= 12) {
            // We'll create 3 independent hashes, each using _mm_crc32_u32()
            // to hash 4 bytes per step. Both 3 and independent are important:
            // we can execute 3 of these instructions in parallel on a single core.
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/12;
            while (steps --> 0) {
                a = _mm_crc32_u32(a, sk_unaligned_load<uint32_t>(data+0));
                b = _mm_crc32_u32(b, sk_unaligned_load<uint32_t>(data+4));
                c = _mm_crc32_u32(c, sk_unaligned_load<uint32_t>(data+8));
                data += 12;
            }
            bytes %= 12;
            hash = _mm_crc32_u32(a, _mm_crc32_u32(b, c));
        }

        SkASSERT(bytes < 12);
        if (bytes >= 8) {
            hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
            bytes -= 4;
            data += 4;
        }

        SkASSERT(bytes < 8);
        if (bytes & 4) {
            hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = _mm_crc32_u16(hash, sk_unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = _mm_crc32_u8(hash, sk_unaligned_load<uint8_t>(data));
        }
        return hash;
    }

#elif defined(SK_ARM_HAS_CRC32)
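    // ARMv8 CRC32 version. Same structure as the 64-bit x86 path above: three
    // independent __crc32d() lanes over 24-byte chunks, then a shrinking tail.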
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;
        if (bytes >= 24) {
            uint32_t a = hash,
                     b = hash,
                     c = hash;
            size_t steps = bytes/24;
            while (steps --> 0) {
                a = __crc32d(a, sk_unaligned_load<uint64_t>(data+ 0));
                b = __crc32d(b, sk_unaligned_load<uint64_t>(data+ 8));
                c = __crc32d(c, sk_unaligned_load<uint64_t>(data+16));
                data += 24;
            }
            bytes %= 24;
            hash = __crc32w(a, __crc32w(b, c));
        }

        SkASSERT(bytes < 24);
        if (bytes >= 16) {
            hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
            bytes -= 8;
            data += 8;
        }

        SkASSERT(bytes < 16);
        if (bytes & 8) {
            hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
            data += 8;
        }
        if (bytes & 4) {
            hash = __crc32w(hash, sk_unaligned_load<uint32_t>(data));
            data += 4;
        }
        if (bytes & 2) {
            hash = __crc32h(hash, sk_unaligned_load<uint16_t>(data));
            data += 2;
        }
        if (bytes & 1) {
            hash = __crc32b(hash, sk_unaligned_load<uint8_t>(data));
        }
        return hash;
    }

#else
    // This is Murmur3.
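    // Specifically: the 32-bit MurmurHash3 body and tail, with SkChecksum::Mix()
    // supplying the final avalanche step.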
    /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
        auto data = (const uint8_t*)vdata;

        size_t original_bytes = bytes;

        // Handle 4 bytes at a time while possible.
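        // (0xcc9e2d51, 0x1b873593, the 15- and 13-bit rotates, and hash*5 + 0xe6546b64
        // below are the standard MurmurHash3 x86_32 round constants.)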
        while (bytes >= 4) {
            uint32_t k = sk_unaligned_load<uint32_t>(data);
            k *= 0xcc9e2d51;
            k = (k << 15) | (k >> 17);
            k *= 0x1b873593;

            hash ^= k;
            hash = (hash << 13) | (hash >> 19);
            hash *= 5;
            hash += 0xe6546b64;

            bytes -= 4;
            data += 4;
        }

        // Handle last 0-3 bytes.
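        // Each case below deliberately falls through to the next, accumulating the
        // trailing 1-3 bytes into k before mixing it in.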
        uint32_t k = 0;
        switch (bytes & 3) {
            case 3: k ^= data[2] << 16; [[fallthrough]];
            case 2: k ^= data[1] <<  8; [[fallthrough]];
            case 1: k ^= data[0] <<  0;
                    k *= 0xcc9e2d51;
                    k = (k << 15) | (k >> 17);
                    k *= 0x1b873593;
                    hash ^= k;
        }

        hash ^= original_bytes;
        return SkChecksum::Mix(hash);
    }
#endif
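
// Whichever branch was compiled above, hash_fn() has the same shape:
//
//     uint32_t hash_fn(const void* data, size_t bytes, uint32_t seed);
//
// A minimal usage sketch (illustrative only; Skia typically reaches this function
// through a function pointer selected at runtime rather than calling it directly):
//
//     const char msg[] = "hello, world";
//     uint32_t h = SK_OPTS_NS::hash_fn(msg, sizeof(msg) - 1, /*seed=*/0);
//
// Because the implementations differ per CPU, the resulting values should not be
// persisted or compared across machines.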

} // namespace SK_OPTS_NS

#endif//SkChecksum_opts_DEFINED