/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
4
5#ifdef ARM_NEON_CHUNKSET
6#ifdef _M_ARM64
7# include <arm64_neon.h>
8#else
9# include <arm_neon.h>
10#endif
11#include "../../zbuild.h"
12#include "../../zutil.h"
13
14typedef uint8x16_t chunk_t;
15
/* Number of bytes in one chunk (chunk_t is a 16-byte vector). */
#define CHUNK_SIZE 16

/* Mark the pattern widths (1, 2, 4 and 8 bytes) for which this file defines
 * a chunkmemset_N helper.
 * NOTE(review): presumably tested by chunkset_tpl.h -- confirm in that header.
 */
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
22
23static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
24 *chunk = vld1q_dup_u8(from);
25}
26
27static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
28 uint16_t tmp;
29 memcpy(dest: &tmp, src: from, n: 2);
30 *chunk = vreinterpretq_u8_u16(p0: vdupq_n_u16(p0: tmp));
31}
32
33static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
34 uint32_t tmp;
35 memcpy(dest: &tmp, src: from, n: 4);
36 *chunk = vreinterpretq_u8_u32(p0: vdupq_n_u32(p0: tmp));
37}
38
39static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
40 uint64_t tmp;
41 memcpy(dest: &tmp, src: from, n: 8);
42 *chunk = vreinterpretq_u8_u64(p0: vdupq_n_u64(p0: tmp));
43}
44
/* NEON-specific names for the generic chunk routines.
 * NOTE(review): presumably chunkset_tpl.h, included later in this file, uses
 * these macros to name the functions it defines -- confirm in that header.
 */
#define CHUNKSIZE        chunksize_neon
#define CHUNKCOPY        chunkcopy_neon
#define CHUNKCOPY_SAFE   chunkcopy_safe_neon
#define CHUNKUNROLL      chunkunroll_neon
#define CHUNKMEMSET      chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
51
52static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
53 *chunk = vld1q_u8(s);
54}
55
56static inline void storechunk(uint8_t *out, chunk_t *chunk) {
57 vst1q_u8(out, *chunk);
58}
59
60#include "chunkset_tpl.h"
61
62#endif
63