/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
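
/*
 * The functable at the bottom of this file starts out filled with *_stub
 * entries; each stub patches its own slot with the best implementation for
 * the running CPU before forwarding the call (see the note above the stub
 * functions below).
 */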

#include "zbuild.h"
#include "zendian.h"
#include "deflate.h"
#include "deflate_p.h"

#include "functable.h"

#ifdef X86_FEATURES
# include "fallback_builtins.h"
#endif

/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42_CRC_HASH
extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE_CRC_HASH)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif

/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42_CRC_HASH
extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE_CRC_HASH)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif

/* slide_hash */
#ifdef X86_SSE2
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
#elif defined(POWER8_VSX_SLIDEHASH)
void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
void slide_hash_avx2(deflate_state *s);
#endif

/* adler32 */
extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
#ifdef ARM_NEON_ADLER32
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef POWER8_VSX_ADLER32
extern uint32_t adler32_power8(uint32_t adler, const unsigned char *buf, size_t len);
#endif

/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2_CHUNKSET
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_AVX_CHUNKSET
extern uint32_t chunksize_avx(void);
extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON_CHUNKSET
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif

/* CRC32 */
Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);

#ifdef ARM_ACLE_CRC_HASH
extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
#endif

#if BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
#elif BYTE_ORDER == BIG_ENDIAN
extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
#endif

/* compare258 */
extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED_OK
extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED64_OK
extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
#endif
#endif

/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED_OK
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
#endif
#endif

Z_INTERNAL Z_TLS struct functable_s functable;

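/* Run the architecture-specific feature detection once and cache the result.
 * This is where the x86_cpu_has_*, arm_cpu_has_* and power_cpu_has_* flags
 * consulted by the stubs below get populated. */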
Z_INTERNAL void cpu_check_features(void)
{
    static int features_checked = 0;
    if (features_checked)
        return;
#if defined(X86_FEATURES)
    x86_check_features();
#elif defined(ARM_FEATURES)
    arm_check_features();
#elif defined(POWER_FEATURES)
    power_check_features();
#endif
    features_checked = 1;
}

/* stub functions */
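/* Each stub follows the same pattern, shown here for insert_string:
 *
 *   1. Install the portable C implementation as the default.
 *   2. Run cpu_check_features() so the CPU feature flags are meaningful
 *      (not every stub repeats this step).
 *   3. If a SIMD variant was compiled in and the CPU supports it, replace
 *      the table entry with that variant.
 *   4. Call through the table entry, so even this first call already uses
 *      the selected implementation.
 */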
Z_INTERNAL void insert_string_stub(deflate_state *const s, const uint32_t str, uint32_t count) {
    // Initialize default
    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
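    /* Note: unlike insert_string_stub above, this stub does not call
     * cpu_check_features() itself and presumably relies on detection having
     * already run via another stub; the feature flags are zero-initialized,
     * so the portable C version is picked if that has not happened yet. */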
    functable.quick_insert_string = &quick_insert_string_c;

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

Z_INTERNAL void slide_hash_stub(deflate_state *s) {

    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunksize = &chunksize_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif

    return functable.chunksize();
}

Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy = &chunkcopy_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif

    return functable.chunkcopy(out, from, len);
}

Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
    // Initialize default
    functable.chunkcopy_safe = &chunkcopy_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy_safe = &chunkcopy_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy_safe = &chunkcopy_safe_neon;
#endif

    return functable.chunkcopy_safe(out, from, len, safe);
}

Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkunroll = &chunkunroll_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif

    return functable.chunkunroll(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset = &chunkmemset_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif

    return functable.chunkmemset(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset_safe = &chunkmemset_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
    int32_t use_byfour = sizeof(void *) == sizeof(ptrdiff_t);
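    /* use_byfour selects the word-at-a-time crc32_little/crc32_big path
     * below; the pointer-size check is expected to hold on all supported
     * targets, so that path is normally taken. */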

    Assert(sizeof(uint64_t) >= sizeof(size_t),
           "crc32_z takes size_t but internally we have a uint64_t len");
    /* return a function pointer for optimized arches here after a capability test */

    functable.crc32 = &crc32_generic;
    cpu_check_features();

    if (use_byfour) {
#if BYTE_ORDER == LITTLE_ENDIAN
        functable.crc32 = crc32_little;
# if defined(ARM_ACLE_CRC_HASH)
        if (arm_cpu_has_crc32)
            functable.crc32 = crc32_acle;
# endif
#elif BYTE_ORDER == BIG_ENDIAN
        functable.crc32 = crc32_big;
#else
# error No endian defined
#endif
    }

    return functable.crc32(crc, buf, len);
}

Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {

    functable.compare258 = &compare258_c;

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare258 = &compare258_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.compare258 = &compare258_unaligned_32;
# else
    functable.compare258 = &compare258_unaligned_16;
# endif
# ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.compare258 = &compare258_unaligned_sse4;
# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare258 = &compare258_unaligned_avx2;
# endif
#endif

    return functable.compare258(src0, src1);
}

Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {

    functable.longest_match = &longest_match_c;

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
# else
    functable.longest_match = &longest_match_unaligned_16;
# endif
# ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.longest_match = &longest_match_unaligned_sse4;
# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_unaligned_avx2;
# endif
#endif

    return functable.longest_match(s, cur_match);
}

/* functable init */
Z_INTERNAL Z_TLS struct functable_s functable = {
    insert_string_stub,
    quick_insert_string_stub,
    adler32_stub,
    crc32_stub,
    slide_hash_stub,
    compare258_stub,
    longest_match_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkcopy_safe_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub
};
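
/*
 * Usage sketch (illustrative): callers in the library go through the table
 * rather than naming an implementation directly, e.g.
 *
 *     uint32_t adler = functable.adler32(1, buf, len);
 *
 * The first such call lands in adler32_stub(), which installs the best
 * adler32 variant for the current CPU; later calls dispatch straight to it.
 */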