1 | /* chunkset_neon.c -- NEON inline functions to copy small data chunks. |
2 | * For conditions of distribution and use, see copyright notice in zlib.h |
3 | */ |
4 | |
5 | #ifdef ARM_NEON_CHUNKSET |
6 | #ifdef _M_ARM64 |
7 | # include <arm64_neon.h> |
8 | #else |
9 | # include <arm_neon.h> |
10 | #endif |
11 | #include "../../zbuild.h" |
12 | #include "../../zutil.h" |
13 | |
/* A chunk is one 128-bit NEON register viewed as sixteen unsigned bytes. */
typedef uint8x16_t chunk_t;

/* Chunk width in bytes; matches the 16 byte lanes of a uint8x16_t. */
#define CHUNK_SIZE 16

/* Announce that this file supplies chunkmemset_1/2/4/8 (defined below).
 * NOTE(review): presumably tested by chunkset_tpl.h to select these helpers;
 * confirm in the template. */
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
22 | |
23 | static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) { |
24 | *chunk = vld1q_dup_u8(from); |
25 | } |
26 | |
27 | static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { |
28 | uint16_t tmp; |
29 | memcpy(dest: &tmp, src: from, n: 2); |
30 | *chunk = vreinterpretq_u8_u16(p0: vdupq_n_u16(p0: tmp)); |
31 | } |
32 | |
33 | static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { |
34 | uint32_t tmp; |
35 | memcpy(dest: &tmp, src: from, n: 4); |
36 | *chunk = vreinterpretq_u8_u32(p0: vdupq_n_u32(p0: tmp)); |
37 | } |
38 | |
39 | static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { |
40 | uint64_t tmp; |
41 | memcpy(dest: &tmp, src: from, n: 8); |
42 | *chunk = vreinterpretq_u8_u64(p0: vdupq_n_u64(p0: tmp)); |
43 | } |
44 | |
/* Map the generic CHUNK* names onto NEON-suffixed identifiers.
 * NOTE(review): presumably chunkset_tpl.h (included further down) defines its
 * functions under these names, so this build gets *_neon symbols; confirm in
 * the template. */
#define CHUNKSIZE chunksize_neon
#define CHUNKCOPY chunkcopy_neon
#define CHUNKCOPY_SAFE chunkcopy_safe_neon
#define CHUNKUNROLL chunkunroll_neon
#define CHUNKMEMSET chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
51 | |
52 | static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { |
53 | *chunk = vld1q_u8(s); |
54 | } |
55 | |
56 | static inline void storechunk(uint8_t *out, chunk_t *chunk) { |
57 | vst1q_u8(out, *chunk); |
58 | } |
59 | |
60 | #include "chunkset_tpl.h" |
61 | |
62 | #endif |
63 | |