/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "deflate.h"
#include "deflate_p.h"

#include "functable.h"

#ifdef X86_FEATURES
# include "fallback_builtins.h"
#endif

/* insert_string */
extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
#ifdef X86_SSE42_CRC_HASH
extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
#elif defined(ARM_ACLE_CRC_HASH)
extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
#endif

/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
#ifdef X86_SSE42_CRC_HASH
extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
#elif defined(ARM_ACLE_CRC_HASH)
extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
#endif

/* slide_hash */
#ifdef X86_SSE2
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
#elif defined(POWER8_VSX_SLIDEHASH)
void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
void slide_hash_avx2(deflate_state *s);
#endif

/* adler32 */
extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
#ifdef ARM_NEON_ADLER32
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef POWER8_VSX_ADLER32
extern uint32_t adler32_power8(uint32_t adler, const unsigned char *buf, size_t len);
#endif

/* memory chunking */
extern uint32_t chunksize_c(void);
extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#ifdef X86_SSE2_CHUNKSET
extern uint32_t chunksize_sse2(void);
extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef X86_AVX_CHUNKSET
extern uint32_t chunksize_avx(void);
extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif
#ifdef ARM_NEON_CHUNKSET
extern uint32_t chunksize_neon(void);
extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
#endif

/* CRC32 */
Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);

#ifdef ARM_ACLE_CRC_HASH
extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
#endif

#if BYTE_ORDER == LITTLE_ENDIAN
extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
#elif BYTE_ORDER == BIG_ENDIAN
extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
#endif

/* compare258 */
extern uint32_t compare258_c(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED_OK
extern uint32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1);
extern uint32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1);
#ifdef UNALIGNED64_OK
extern uint32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
#endif
#endif

/* longest_match */
extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED_OK
extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
#ifdef UNALIGNED64_OK
extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
#endif
#ifdef X86_SSE42_CMP_STR
extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
#endif
#endif

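/* Each entry of the functable below initially points at one of the *_stub
 * functions further down. On first use a stub detects the CPU's capabilities,
 * replaces its table entry with the best implementation that was compiled in,
 * and then forwards the call, so later calls through the table (for example
 * functable.adler32(adler, buf, len)) go straight to the optimized routine. */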
Z_INTERNAL Z_TLS struct functable_s functable;

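/* Run the platform-specific CPU feature detection once; repeated calls return
 * immediately, so the stubs below can call this unconditionally. */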
Z_INTERNAL void cpu_check_features(void)
{
    static int features_checked = 0;
    if (features_checked)
        return;
#if defined(X86_FEATURES)
    x86_check_features();
#elif defined(ARM_FEATURES)
    arm_check_features();
#elif defined(POWER_FEATURES)
    power_check_features();
#endif
    features_checked = 1;
}

/* stub functions */
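/* Each stub follows the same pattern: install the portable C implementation as
 * the default, upgrade the table entry when a better variant was compiled in
 * and the running CPU supports it (later checks win, so e.g. AVX2 overrides
 * SSE2), then forward the original arguments through the updated pointer. */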
Z_INTERNAL void insert_string_stub(deflate_state *const s, const uint32_t str, uint32_t count) {
    // Initialize default

    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

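/* quick_insert_string_stub selects between the same C, SSE4.2 and ACLE hashing
 * back ends as insert_string_stub, but for the Pos-returning single-position
 * insert. */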
Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    functable.quick_insert_string = &quick_insert_string_c;

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

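/* slide_hash: on x86-64 targets (and when X86_NOCHECK_SSE2 is defined) SSE2 is
 * part of the baseline, so the runtime check around the SSE2 assignment is
 * compiled out. */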
Z_INTERNAL void slide_hash_stub(deflate_state *s) {

    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

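/* adler32: ARM_NOCHECK_NEON means the build guarantees NEON, so the runtime
 * arm_cpu_has_neon check is compiled out, mirroring the slide_hash stub above. */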
Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

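/* The chunk* stubs pick matching SSE2/AVX/NEON implementations as a group;
 * only chunksize_stub calls cpu_check_features() here, the other chunk* stubs
 * rely on feature flags that have already been populated. */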
Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunksize = &chunksize_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif

    return functable.chunksize();
}

Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy = &chunkcopy_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif

    return functable.chunkcopy(out, from, len);
}

Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
    // Initialize default
    functable.chunkcopy_safe = &chunkcopy_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy_safe = &chunkcopy_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy_safe = &chunkcopy_safe_neon;
#endif

    return functable.chunkcopy_safe(out, from, len, safe);
}

Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkunroll = &chunkunroll_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif

    return functable.chunkunroll(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset = &chunkmemset_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif

    return functable.chunkmemset(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset_safe = &chunkmemset_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

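/* crc32: the word-at-a-time table-driven implementations are only used when a
 * pointer and ptrdiff_t have the same size; the little- or big-endian variant
 * is chosen at compile time, and the ARM ACLE CRC32 instruction variant is
 * installed when the CPU reports support for it. */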
Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
    int32_t use_byfour = sizeof(void *) == sizeof(ptrdiff_t);

    Assert(sizeof(uint64_t) >= sizeof(size_t),
           "crc32_z takes size_t but internally we have a uint64_t len");
    /* return a function pointer for optimized arches here after a capability test */

    functable.crc32 = &crc32_generic;
    cpu_check_features();

    if (use_byfour) {
#if BYTE_ORDER == LITTLE_ENDIAN
        functable.crc32 = crc32_little;
# if defined(ARM_ACLE_CRC_HASH)
        if (arm_cpu_has_crc32)
            functable.crc32 = crc32_acle;
# endif
#elif BYTE_ORDER == BIG_ENDIAN
        functable.crc32 = crc32_big;
#else
# error No endian defined
#endif
    }

    return functable.crc32(crc, buf, len);
}

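/* compare258/longest_match: pick the widest unaligned-access variant the
 * platform and compiler support (the 64-bit variant needs HAVE_BUILTIN_CTZLL,
 * the 32-bit one HAVE_BUILTIN_CTZ), then upgrade to the SSE4.2 or AVX2
 * versions when the CPU has them. */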
Z_INTERNAL uint32_t compare258_stub(const unsigned char *src0, const unsigned char *src1) {

    functable.compare258 = &compare258_c;

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare258 = &compare258_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.compare258 = &compare258_unaligned_32;
# else
    functable.compare258 = &compare258_unaligned_16;
# endif
# ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.compare258 = &compare258_unaligned_sse4;
# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare258 = &compare258_unaligned_avx2;
# endif
#endif

    return functable.compare258(src0, src1);
}

Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {

    functable.longest_match = &longest_match_c;

#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
# else
    functable.longest_match = &longest_match_unaligned_16;
# endif
# ifdef X86_SSE42_CMP_STR
    if (x86_cpu_has_sse42)
        functable.longest_match = &longest_match_unaligned_sse4;
# endif
# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_unaligned_avx2;
# endif
#endif

    return functable.longest_match(s, cur_match);
}

/* functable init */
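/* Every entry starts out pointing at its stub, so the first call through each
 * pointer performs the feature-based selection above and rewrites the entry. */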
Z_INTERNAL Z_TLS struct functable_s functable = {
    insert_string_stub,
    quick_insert_string_stub,
    adler32_stub,
    crc32_stub,
    slide_hash_stub,
    compare258_stub,
    longest_match_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkcopy_safe_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub
};