1/* memcopy.h -- inline functions to copy small data chunks.
2 * For conditions of distribution and use, see copyright notice in zlib.h
3 */
4#ifndef MEMCOPY_H_
5 #define MEMCOPY_H_
6
#include <stdint.h>
#include <string.h>

#include "zendian.h"
8
/* Load 64 bits from IN, normalize them to little-endian byte order, and
   shift the value left by BITS so the bytes land at bit offset BITS. */
static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
    uint64_t value;
    memcpy(&value, in, sizeof(value));

#if BYTE_ORDER != LITTLE_ENDIAN
    value = ZSWAP64(value);
#endif
    return value << bits;
}
20
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
/* One "chunk" is a 128-bit NEON vector (16 bytes). */
typedef uint8x16_t inffast_chunk_t;
#define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
#endif

#if defined(X86_SSE2)
#include <immintrin.h>
/* One "chunk" is a 128-bit SSE2 vector (16 bytes). */
typedef __m128i inffast_chunk_t;
#define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
#endif
32
33 #ifdef INFFAST_CHUNKSIZE
34/*
35 Ask the compiler to perform a wide, unaligned load with an machine
36 instruction appropriate for the inffast_chunk_t type.
37 */
38static inline inffast_chunk_t loadchunk(unsigned char const* s) {
39 inffast_chunk_t c;
40 memcpy(&c, s, sizeof(c));
41 return c;
42}
43
44/*
45 Ask the compiler to perform a wide, unaligned store with an machine
46 instruction appropriate for the inffast_chunk_t type.
47 */
48static inline void storechunk(unsigned char* d, inffast_chunk_t c) {
49 memcpy(d, &c, sizeof(c));
50}
51
52/*
53 Behave like memcpy, but assume that it's OK to overwrite at least
54 INFFAST_CHUNKSIZE bytes of output even if the length is shorter than this,
55 that the length is non-zero, and that `from` lags `out` by at least
56 INFFAST_CHUNKSIZE bytes (or that they don't overlap at all or simply that
57 the distance is less than the length of the copy).
58
59 Aside from better memory bus utilisation, this means that short copies
60 (INFFAST_CHUNKSIZE bytes or fewer) will fall straight through the loop
61 without iteration, which will hopefully make the branch prediction more
62 reliable.
63 */
64static inline unsigned char* chunkcopy(unsigned char *out, unsigned char const *from, unsigned len) {
65 --len;
66 storechunk(out, loadchunk(from));
67 out += (len % INFFAST_CHUNKSIZE) + 1;
68 from += (len % INFFAST_CHUNKSIZE) + 1;
69 len /= INFFAST_CHUNKSIZE;
70 while (len > 0) {
71 storechunk(out, loadchunk(from));
72 out += INFFAST_CHUNKSIZE;
73 from += INFFAST_CHUNKSIZE;
74 --len;
75 }
76 return out;
77}
78
79/*
80 Behave like chunkcopy, but avoid writing beyond of legal output.
81 */
82static inline unsigned char* chunkcopysafe(unsigned char *out, unsigned char const *from, unsigned len,
83 unsigned char *safe) {
84 if ((safe - out) < (ptrdiff_t)INFFAST_CHUNKSIZE) {
85 if (len & 8) {
86 memcpy(out, from, 8);
87 out += 8;
88 from += 8;
89 }
90 if (len & 4) {
91 memcpy(out, from, 4);
92 out += 4;
93 from += 4;
94 }
95 if (len & 2) {
96 memcpy(out, from, 2);
97 out += 2;
98 from += 2;
99 }
100 if (len & 1) {
101 *out++ = *from++;
102 }
103 return out;
104 }
105 return chunkcopy(out, from, len);
106}
107
108/*
109 Perform short copies until distance can be rewritten as being at least
110 INFFAST_CHUNKSIZE.
111
112 This assumes that it's OK to overwrite at least the first
113 2*INFFAST_CHUNKSIZE bytes of output even if the copy is shorter than this.
114 This assumption holds because inflate_fast() starts every iteration with at
115 least 258 bytes of output space available (258 being the maximum length
116 output from a single token; see inflate_fast()'s assumptions below).
117 */
118static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, unsigned *len) {
119 unsigned char const *from = out - *dist;
120 while (*dist < *len && *dist < INFFAST_CHUNKSIZE) {
121 storechunk(out, loadchunk(from));
122 out += *dist;
123 *len -= *dist;
124 *dist += *dist;
125 }
126 return out;
127}
128
/* Broadcast the single byte at FROM across one whole chunk. */
static inline inffast_chunk_t chunkmemset_1(unsigned char *from) {
#if defined(X86_SSE2)
    int8_t c;
    memcpy(&c, from, sizeof(c));
    return _mm_set1_epi8(c);
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    return vld1q_dup_u8(from);
#endif
}
138
/* Broadcast the 2-byte pattern at FROM across one whole chunk.
   The memcpy performs a safe unaligned 16-bit load. */
static inline inffast_chunk_t chunkmemset_2(unsigned char *from) {
    int16_t c;
    memcpy(&c, from, sizeof(c));
#if defined(X86_SSE2)
    return _mm_set1_epi16(c);
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    return vreinterpretq_u8_s16(vdupq_n_s16(c));
#endif
}
148
/* Broadcast the 4-byte pattern at FROM across one whole chunk.
   The memcpy performs a safe unaligned 32-bit load. */
static inline inffast_chunk_t chunkmemset_4(unsigned char *from) {
    int32_t c;
    memcpy(&c, from, sizeof(c));
#if defined(X86_SSE2)
    return _mm_set1_epi32(c);
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    return vreinterpretq_u8_s32(vdupq_n_s32(c));
#endif
}
158
/* Broadcast the 8-byte pattern at FROM across one whole chunk
   (the pattern fills the chunk exactly twice). */
static inline inffast_chunk_t chunkmemset_8(unsigned char *from) {
#if defined(X86_SSE2)
    int64_t c;
    memcpy(&c, from, sizeof(c));
    return _mm_set1_epi64x(c);
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    return vcombine_u8(vld1_u8(from), vld1_u8(from));
#endif
}
168
169 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
/* Fill OUT with LEN bytes of the 3-byte pattern at FROM using NEON
   interleaved stores.  DIST is the pattern period (3 here) and is only
   forwarded to the scalar fallbacks.  Return OUT + LEN. */
static inline unsigned char *chunkmemset_3(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
    uint8x8x3_t chunks;
    unsigned sz = sizeof(chunks);  /* 24 bytes per interleaved store */
    if (len < sz) {
        /* Too short for the vector path: widen the distance, then chunkcopy. */
        out = chunkunroll(out, &dist, &len);
        return chunkcopy(out, out - dist, len);
    }

    /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes:
       chunks[0] = {a,a,a,a,a,a,a,a}
       chunks[1] = {b,b,b,b,b,b,b,b}
       chunks[2] = {c,c,c,c,c,c,c,c}. */
    chunks = vld3_dup_u8(from);

    unsigned rem = len % sz;
    len -= rem;
    while (len) {
        /* Store "a,b,c, ..., a,b,c". */
        vst3_u8(out, chunks);
        out += sz;
        len -= sz;
    }

    if (!rem)
        return out;

    /* Last, deal with the case when LEN is not a multiple of SZ. */
    out = chunkunroll(out, &dist, &rem);
    return chunkcopy(out, out - dist, rem);
}
200 #endif
201
202 #if defined(__aarch64__) || defined(_M_ARM64)
/* Fill OUT with LEN bytes of the 6-byte pattern at FROM using NEON
   interleaved 16-bit stores (AArch64 only).  DIST is the pattern period (6)
   and is only forwarded to the scalar fallbacks.  Return OUT + LEN. */
static inline unsigned char *chunkmemset_6(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
    uint16x8x3_t chunks;
    unsigned sz = sizeof(chunks);  /* 48 bytes per interleaved store */
    if (len < sz) {
        /* Too short for the vector path: widen the distance, then chunkcopy. */
        out = chunkunroll(out, &dist, &len);
        return chunkcopy(out, out - dist, len);
    }

    /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes:
       chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab}
       chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd}
       chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}.
       NOTE(review): the (unsigned short *) casts rely on unaligned 16-bit
       access being permitted, which holds on AArch64. */
    chunks = vld3q_dup_u16((unsigned short *)from);

    unsigned rem = len % sz;
    len -= rem;
    while (len) {
        /* Store "ab,cd,ef, ..., ab,cd,ef". */
        vst3q_u16((unsigned short *)out, chunks);
        out += sz;
        len -= sz;
    }

    if (!rem)
        return out;

    /* Last, deal with the case when LEN is not a multiple of SZ. */
    out = chunkunroll(out, &dist, &rem);
    return chunkcopy(out, out - dist, rem);
}
233 #endif
234
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */
static inline unsigned char *chunkmemset(unsigned char *out, unsigned dist, unsigned len) {
    /* Debug performance related issues when len < sizeof(uint64_t):
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
    Assert(dist > 0, "cannot have a distance 0");

    unsigned char *from = out - dist;
    inffast_chunk_t chunk;
    unsigned sz = sizeof(chunk);
    /* Too short for whole-chunk stores: replicate byte by byte.
       (Byte-wise copy is exactly what a small DIST needs.) */
    if (len < sz) {
        do {
            *out++ = *from++;
            --len;
        } while (len != 0);
        return out;
    }

    /* For pattern periods that divide the chunk size, build one chunk filled
       with the pattern and store it repeatedly below. */
    switch (dist) {
    case 1: {
        chunk = chunkmemset_1(from);
        break;
    }
    case 2: {
        chunk = chunkmemset_2(from);
        break;
    }
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    case 3:
        /* Period 3 does not divide the chunk size; NEON has a dedicated path. */
        return chunkmemset_3(out, from, dist, len);
#endif
    case 4: {
        chunk = chunkmemset_4(from);
        break;
    }
#if defined(__aarch64__) || defined(_M_ARM64)
    case 6:
        /* Period 6 does not divide the chunk size; AArch64 has a dedicated path. */
        return chunkmemset_6(out, from, dist, len);
#endif
    case 8: {
        chunk = chunkmemset_8(from);
        break;
    }
    case 16:
        /* Pattern is exactly one chunk wide (INFFAST_CHUNKSIZE == 16 for
           both the SSE2 and NEON chunk types). */
        memcpy(&chunk, from, sz);
        break;

    default:
        /* Irregular period: widen the distance, then fall back to chunkcopy. */
        out = chunkunroll(out, &dist, &len);
        return chunkcopy(out, out - dist, len);
    }

    unsigned rem = len % sz;
    len -= rem;
    while (len) {
        memcpy(out, &chunk, sz);
        out += sz;
        len -= sz;
    }

    /* Last, deal with the case when LEN is not a multiple of SZ. */
    if (rem)
        memcpy(out, &chunk, rem);
    out += rem;
    return out;
}
300
301static inline unsigned char* chunkmemsetsafe(unsigned char *out, unsigned dist, unsigned len, unsigned left) {
302 if (left < (unsigned)(3 * INFFAST_CHUNKSIZE)) {
303 while (len > 0) {
304 *out = *(out - dist);
305 out++;
306 --len;
307 }
308 return out;
309 }
310
311 return chunkmemset(out, dist, len);
312}
313
314 #else /* INFFAST_CHUNKSIZE */
315
/* Copy exactly one byte from FROM to OUT. Return OUT + 1. */
static inline unsigned char *copy_1_bytes(unsigned char *out, unsigned char *from) {
    out[0] = from[0];
    return out + 1;
}
320
/* Copy exactly two bytes from FROM to OUT via a staging variable, so the
   call is safe even when the two regions overlap. Return OUT + 2. */
static inline unsigned char *copy_2_bytes(unsigned char *out, unsigned char *from) {
    uint16_t staging;
    memcpy(&staging, from, 2);
    memcpy(out, &staging, 2);
    return out + 2;
}
328
/* Copy exactly three bytes from FROM to OUT as 1 + 2. Return OUT + 3.
   The first byte is written before bytes 1..2 are read, which matters
   when the regions overlap. */
static inline unsigned char *copy_3_bytes(unsigned char *out, unsigned char *from) {
    return copy_2_bytes(copy_1_bytes(out, from), from + 1);
}
333
/* Copy exactly four bytes from FROM to OUT via a staging variable, so the
   call is safe even when the two regions overlap. Return OUT + 4. */
static inline unsigned char *copy_4_bytes(unsigned char *out, unsigned char *from) {
    uint32_t staging;
    memcpy(&staging, from, 4);
    memcpy(out, &staging, 4);
    return out + 4;
}
341
/* Copy exactly five bytes from FROM to OUT as 1 + 4. Return OUT + 5.
   Write order (low byte first) is preserved for overlapping regions. */
static inline unsigned char *copy_5_bytes(unsigned char *out, unsigned char *from) {
    return copy_4_bytes(copy_1_bytes(out, from), from + 1);
}
346
/* Copy exactly six bytes from FROM to OUT as 2 + 4. Return OUT + 6.
   Write order (low bytes first) is preserved for overlapping regions. */
static inline unsigned char *copy_6_bytes(unsigned char *out, unsigned char *from) {
    return copy_4_bytes(copy_2_bytes(out, from), from + 2);
}
351
/* Copy exactly seven bytes from FROM to OUT as 3 + 4. Return OUT + 7.
   Write order (low bytes first) is preserved for overlapping regions. */
static inline unsigned char *copy_7_bytes(unsigned char *out, unsigned char *from) {
    return copy_4_bytes(copy_3_bytes(out, from), from + 3);
}
356
/* Copy exactly eight bytes from FROM to OUT via a staging variable, so the
   call is safe even when the two regions overlap. Return OUT + 8. */
static inline unsigned char *copy_8_bytes(unsigned char *out, unsigned char *from) {
    uint64_t staging;
    memcpy(&staging, from, 8);
    memcpy(out, &staging, 8);
    return out + 8;
}
364
/* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
static inline unsigned char *copy_bytes(unsigned char *out, unsigned char *from, unsigned len) {
    Assert(len < 8, "copy_bytes should be called with less than 8 bytes");

#ifndef UNALIGNED_OK
    /* Forward byte-by-byte copy (also correct when FROM trails OUT). */
    for (unsigned i = 0; i < len; i++) {
        out[i] = from[i];
    }
    return out + len;
#else
    /* Dispatch to the fixed-size copy matching LEN. */
    if (len == 0)
        return out;
    if (len == 1)
        return copy_1_bytes(out, from);
    if (len == 2)
        return copy_2_bytes(out, from);
    if (len == 3)
        return copy_3_bytes(out, from);
    if (len == 4)
        return copy_4_bytes(out, from);
    if (len == 5)
        return copy_5_bytes(out, from);
    if (len == 6)
        return copy_6_bytes(out, from);
    if (len == 7)
        return copy_7_bytes(out, from);

    Assert(0, "should not happen");
    return out;
#endif /* UNALIGNED_OK */
}
399
/* Copy LEN bytes (7 or fewer) of the DIST-byte pattern at FROM (= OUT - DIST)
   into OUT, replicating the pattern when DIST < LEN. Return OUT + LEN. */
static inline unsigned char *set_bytes(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
    Assert(len < 8, "set_bytes should be called with less than 8 bytes");

#ifndef UNALIGNED_OK
    (void)dist;
    while (len--) {
        *out++ = *from++;
    }
    return out;
#else
    /* Pattern at least as long as the copy: no replication needed. */
    if (dist >= len)
        return copy_bytes(out, from, len);

    switch (dist) {
    case 6:
        Assert(len == 7, "len should be exactly 7");
        out = copy_6_bytes(out, from);
        return copy_1_bytes(out, from);

    case 5:
        Assert(len == 6 || len == 7, "len should be either 6 or 7");
        out = copy_5_bytes(out, from);
        return copy_bytes(out, from, len - 5);

    case 4:
        Assert(len == 5 || len == 6 || len == 7, "len should be either 5, 6, or 7");
        out = copy_4_bytes(out, from);
        return copy_bytes(out, from, len - 4);

    case 3:
        Assert(4 <= len && len <= 7, "len should be between 4 and 7");
        out = copy_3_bytes(out, from);
        switch (len) {
        case 7:
            return copy_4_bytes(out, from);
        case 6:
            return copy_3_bytes(out, from);
        case 5:
            return copy_2_bytes(out, from);
        case 4:
            return copy_1_bytes(out, from);
        default:
            Assert(0, "should not happen");
            break;
        }
        /* Defensive: without this break, an invalid LEN in a release build
           (where Assert is a no-op) would fall through into case 2. */
        break;

    case 2:
        Assert(3 <= len && len <= 7, "len should be between 3 and 7");
        out = copy_2_bytes(out, from);
        switch (len) {
        case 7:
            out = copy_4_bytes(out, from);
            out = copy_1_bytes(out, from);
            return out;
        case 6:
            out = copy_4_bytes(out, from);
            return out;
        case 5:
            out = copy_2_bytes(out, from);
            out = copy_1_bytes(out, from);
            return out;
        case 4:
            out = copy_2_bytes(out, from);
            return out;
        case 3:
            out = copy_1_bytes(out, from);
            return out;
        default:
            Assert(0, "should not happen");
            break;
        }
        /* Defensive: without this break, an invalid LEN in a release build
           (where Assert is a no-op) would fall through into case 1. */
        break;

    case 1: {
        /* Braces give the declaration its own scope; a case label may not
           reliably precede a declaration otherwise. */
        Assert(2 <= len && len <= 7, "len should be between 2 and 7");
        unsigned char c = *from;
        switch (len) {
        case 7:
            memset(out, c, 7);
            return out + 7;
        case 6:
            memset(out, c, 6);
            return out + 6;
        case 5:
            memset(out, c, 5);
            return out + 5;
        case 4:
            memset(out, c, 4);
            return out + 4;
        case 3:
            memset(out, c, 3);
            return out + 3;
        case 2:
            memset(out, c, 2);
            return out + 2;
        default:
            Assert(0, "should not happen");
            break;
        }
        break;
    }
    }
    return out;
#endif /* UNALIGNED_OK */
}
503
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN.
   Requires LEN >= 8; may rewrite the first few bytes twice (harmless). */
static inline unsigned char *chunk_memcpy(unsigned char *out, unsigned char *from, unsigned len) {
    unsigned sz = sizeof(uint64_t);
    Assert(len >= sz, "chunk_memcpy should be called on larger chunks");

    /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
    copy_8_bytes(out, from);

    /* Advance past LEN % 8 bytes so the remaining length is a multiple of 8;
       LEN now counts 8-byte units. */
    unsigned rem = len % sz;
    len /= sz;
    out += rem;
    from += rem;

    /* Peel LEN % 8 eight-byte copies, Duff's-device style: every case
       intentionally falls through to the next. */
    unsigned by8 = len % sz;
    len -= by8;
    switch (by8) {
    case 7:
        out = copy_8_bytes(out, from);
        from += sz;
        /* fallthrough */
    case 6:
        out = copy_8_bytes(out, from);
        from += sz;
        /* fallthrough */
    case 5:
        out = copy_8_bytes(out, from);
        from += sz;
        /* fallthrough */
    case 4:
        out = copy_8_bytes(out, from);
        from += sz;
        /* fallthrough */
    case 3:
        out = copy_8_bytes(out, from);
        from += sz;
        /* fallthrough */
    case 2:
        out = copy_8_bytes(out, from);
        from += sz;
        /* fallthrough */
    case 1:
        out = copy_8_bytes(out, from);
        from += sz;
    }

    /* Main loop: 8 eight-byte copies (64 bytes) per iteration. */
    while (len) {
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;

        len -= 8;
    }

    return out;
}
566
/* Memset LEN bytes in OUT with the value at OUT - 1. Return OUT + LEN.
   Requires LEN >= 8; the first 8-byte memset may overlap the next one. */
static inline unsigned char *byte_memset(unsigned char *out, unsigned len) {
    unsigned sz = sizeof(uint64_t);
    Assert(len >= sz, "byte_memset should be called on larger chunks");

    unsigned char *from = out - 1;
    unsigned char c = *from;

    /* First, deal with the case when LEN is not a multiple of SZ. */
    memset(out, c, sz);
    unsigned rem = len % sz;
    len /= sz;  /* LEN now counts 8-byte units */
    out += rem;

    /* Peel LEN % 8 eight-byte memsets, Duff's-device style: every case
       intentionally falls through to the next. */
    unsigned by8 = len % 8;
    len -= by8;
    switch (by8) {
    case 7:
        memset(out, c, sz);
        out += sz;
        /* fallthrough */
    case 6:
        memset(out, c, sz);
        out += sz;
        /* fallthrough */
    case 5:
        memset(out, c, sz);
        out += sz;
        /* fallthrough */
    case 4:
        memset(out, c, sz);
        out += sz;
        /* fallthrough */
    case 3:
        memset(out, c, sz);
        out += sz;
        /* fallthrough */
    case 2:
        memset(out, c, sz);
        out += sz;
        /* fallthrough */
    case 1:
        memset(out, c, sz);
        out += sz;
    }

    /* Main loop: 8 eight-byte memsets (64 bytes) per iteration. */
    while (len) {
        /* When sz is a constant, the compiler replaces __builtin_memset with an
           inline version that does not incur a function call overhead. */
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        len -= 8;
    }

    return out;
}
631
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */
static inline unsigned char *chunk_memset(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
    /* No replication needed when the pattern is at least as long as the copy. */
    if (dist >= len)
        return chunk_memcpy(out, from, len);

    Assert(len >= sizeof(uint64_t), "chunk_memset should be called on larger chunks");

    /* Double up the size of the memset pattern until reaching the largest pattern of size less than SZ. */
    unsigned sz = sizeof(uint64_t);
    while (dist < len && dist < sz) {
        /* Over-writing 8-byte copy; only the first DIST bytes matter. */
        copy_8_bytes(out, from);

        out += dist;
        len -= dist;
        dist += dist;

        /* Make sure the next memcpy has at least SZ bytes to be copied. */
        if (len < sz)
            /* Finish up byte by byte when there are not enough bytes left. */
            return set_bytes(out, from, dist, len);
    }

    return chunk_memcpy(out, from, len);
}
656
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT.
   Return OUT + LEN.  DIST > 0 presumably means FROM = OUT - DIST (a
   back-reference whose pattern may need replication); DIST <= 0 selects the
   plain non-replicating copy. */
static inline unsigned char *chunk_copy(unsigned char *out, unsigned char *from, int dist, unsigned len) {
    int replicate = dist > 0;

    /* Short copies are finished byte/word-wise. */
    if (len < sizeof(uint64_t))
        return replicate ? set_bytes(out, from, dist, len)
                         : copy_bytes(out, from, len);

    if (!replicate)
        return chunk_memcpy(out, from, len);

    /* dist == 1 is a plain run of one byte: use memset. */
    return dist == 1 ? byte_memset(out, len)
                     : chunk_memset(out, from, dist, len);
}
674 #endif /* INFFAST_CHUNKSIZE */
675#endif /* MEMCOPY_H_ */
676