/* memcopy.h -- inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef MEMCOPY_H_
#define MEMCOPY_H_

#include <stdint.h>  /* uint16_t, uint32_t, uint64_t */
#include <string.h>  /* memcpy, memset */

#include "zendian.h"

/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
    uint64_t chunk;
    memcpy(&chunk, in, sizeof(chunk));

#if BYTE_ORDER == LITTLE_ENDIAN
    return chunk << bits;
#else
    return ZSWAP64(chunk) << bits;
#endif
}
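
/* Example (editor's sketch, not part of the original file): with input bytes
   in[0..7] = 01 02 03 04 05 06 07 08 and bits = 4, the little-endian value
   0x0807060504030201 is shifted left by 4, so in[0] lands at bit offset 4:

       uint64_t v = load_64_bits(in, 4);
       // v == 0x8070605040302010

   This matches how an inflate-style bit accumulator appends fresh input
   above the BITS bits it already holds. */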

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
typedef uint8x16_t inffast_chunk_t;
#define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
#endif

#if defined(X86_SSE2)
#include <immintrin.h>
typedef __m128i inffast_chunk_t;
#define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
#endif

#ifdef INFFAST_CHUNKSIZE
/*
   Ask the compiler to perform a wide, unaligned load with a machine
   instruction appropriate for the inffast_chunk_t type.
 */
static inline inffast_chunk_t loadchunk(unsigned char const* s) {
    inffast_chunk_t c;
    memcpy(&c, s, sizeof(c));
    return c;
}

/*
   Ask the compiler to perform a wide, unaligned store with a machine
   instruction appropriate for the inffast_chunk_t type.
 */
static inline void storechunk(unsigned char* d, inffast_chunk_t c) {
    memcpy(d, &c, sizeof(c));
}
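
/* Editor's note with a small sketch (not in the original): because the
   memcpy size equals sizeof(inffast_chunk_t), optimizing compilers collapse
   loadchunk/storechunk into single unaligned SIMD moves (e.g. movdqu on
   SSE2, ldr q/str q on AArch64), with no alignment requirement on either
   pointer:

       unsigned char src[17], dst[17];
       memset(src, 0xAB, sizeof(src));
       storechunk(dst + 1, loadchunk(src + 1));  // both pointers unaligned
*/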

/*
   Behave like memcpy, but assume that it's OK to overwrite at least
   INFFAST_CHUNKSIZE bytes of output even if the length is shorter than this,
   that the length is non-zero, and that `from` lags `out` by at least
   INFFAST_CHUNKSIZE bytes (or that they don't overlap at all or simply that
   the distance is less than the length of the copy).

   Aside from better memory bus utilisation, this means that short copies
   (INFFAST_CHUNKSIZE bytes or fewer) will fall straight through the loop
   without iteration, which will hopefully make the branch prediction more
   reliable.
 */
static inline unsigned char* chunkcopy(unsigned char *out, unsigned char const *from, unsigned len) {
    --len;
    storechunk(out, loadchunk(from));
    out += (len % INFFAST_CHUNKSIZE) + 1;
    from += (len % INFFAST_CHUNKSIZE) + 1;
    len /= INFFAST_CHUNKSIZE;
    while (len > 0) {
        storechunk(out, loadchunk(from));
        out += INFFAST_CHUNKSIZE;
        from += INFFAST_CHUNKSIZE;
        --len;
    }
    return out;
}
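
/* Worked example (editor's illustration): chunkcopy(out, from, 5) with a
   16-byte chunk decrements len to 4, performs one full 16-byte store, then
   advances both pointers by (4 % 16) + 1 == 5; 4 / 16 == 0 skips the loop,
   so the call returns out + 5 after writing 16 bytes. The 11 bytes of
   overwrite are why callers must guarantee INFFAST_CHUNKSIZE bytes of slack:

       unsigned char *end = chunkcopy(out, from, 5);
       // end == out + 5, but out[5..15] were clobbered as well
*/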

/*
   Behave like chunkcopy, but avoid writing beyond the legal output.
 */
static inline unsigned char* chunkcopysafe(unsigned char *out, unsigned char const *from, unsigned len,
                                           unsigned char *safe) {
    if ((safe - out) < (ptrdiff_t)INFFAST_CHUNKSIZE) {
        if (len & 8) {
            memcpy(out, from, 8);
            out += 8;
            from += 8;
        }
        if (len & 4) {
            memcpy(out, from, 4);
            out += 4;
            from += 4;
        }
        if (len & 2) {
            memcpy(out, from, 2);
            out += 2;
            from += 2;
        }
        if (len & 1) {
            *out++ = *from++;
        }
        return out;
    }
    return chunkcopy(out, from, len);
}
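
/* Usage note with a hypothetical call (editor's addition): `safe` bounds the
   writable region, and the function falls back to narrow copies only when a
   full chunk store could cross it, so callers use it for output-buffer tails:

       // `out_end` is assumed to delimit the writable output region
       out = chunkcopysafe(out, from, len, out_end);
*/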

/*
   Perform short copies until distance can be rewritten as being at least
   INFFAST_CHUNKSIZE.

   This assumes that it's OK to overwrite at least the first
   2*INFFAST_CHUNKSIZE bytes of output even if the copy is shorter than this.
   This assumption holds because inflate_fast() starts every iteration with at
   least 258 bytes of output space available (258 being the maximum length
   output from a single token; see the assumptions documented in
   inflate_fast()).
 */
static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, unsigned *len) {
    unsigned char const *from = out - *dist;
    while (*dist < *len && *dist < INFFAST_CHUNKSIZE) {
        storechunk(out, loadchunk(from));
        out += *dist;
        *len -= *dist;
        *dist += *dist;
    }
    return out;
}
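
/* Worked trace (editor's illustration): dist = 3, len = 20 doubles the
   distance until it is chunk-sized or covers the remaining copy:

       iteration 1: store 16 bytes, out += 3, len = 17, dist = 6
       iteration 2: store 16 bytes, out += 6, len = 11, dist = 12
       loop exits (12 >= 11); caller finishes with chunkcopy(out, out - 12, 11)

   Note that `from` never moves: after each step, out - dist lands back on
   the same source byte, so the replicated pattern stays coherent. */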

static inline inffast_chunk_t chunkmemset_1(unsigned char *from) {
#if defined(X86_SSE2)
    int8_t c;
    memcpy(&c, from, sizeof(c));
    return _mm_set1_epi8(c);
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    return vld1q_dup_u8(from);
#endif
}

static inline inffast_chunk_t chunkmemset_2(unsigned char *from) {
    int16_t c;
    memcpy(&c, from, sizeof(c));
#if defined(X86_SSE2)
    return _mm_set1_epi16(c);
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    return vreinterpretq_u8_s16(vdupq_n_s16(c));
#endif
}

static inline inffast_chunk_t chunkmemset_4(unsigned char *from) {
    int32_t c;
    memcpy(&c, from, sizeof(c));
#if defined(X86_SSE2)
    return _mm_set1_epi32(c);
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    return vreinterpretq_u8_s32(vdupq_n_s32(c));
#endif
}

static inline inffast_chunk_t chunkmemset_8(unsigned char *from) {
#if defined(X86_SSE2)
    int64_t c;
    memcpy(&c, from, sizeof(c));
    return _mm_set1_epi64x(c);
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    return vcombine_u8(vld1_u8(from), vld1_u8(from));
#endif
}
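
/* Pattern sketch (editor's illustration): each chunkmemset_N broadcasts an
   N-byte pattern across the whole chunk. For from[0..1] = {0xAB, 0xCD},
   chunkmemset_2 yields the 16 bytes

       AB CD AB CD AB CD AB CD AB CD AB CD AB CD AB CD

   so storing the chunk repeatedly materializes a distance-2 overlapped copy
   without ever reading back the just-written output. */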

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
static inline unsigned char *chunkmemset_3(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
    uint8x8x3_t chunks;
    unsigned sz = sizeof(chunks);
    if (len < sz) {
        out = chunkunroll(out, &dist, &len);
        return chunkcopy(out, out - dist, len);
    }

    /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes:
       chunks[0] = {a,a,a,a,a,a,a,a}
       chunks[1] = {b,b,b,b,b,b,b,b}
       chunks[2] = {c,c,c,c,c,c,c,c}. */
    chunks = vld3_dup_u8(from);

    unsigned rem = len % sz;
    len -= rem;
    while (len) {
        /* Store "a,b,c, ..., a,b,c". */
        vst3_u8(out, chunks);
        out += sz;
        len -= sz;
    }

    if (!rem)
        return out;

    /* Last, deal with the case when LEN is not a multiple of SZ. */
    out = chunkunroll(out, &dist, &rem);
    return chunkcopy(out, out - dist, rem);
}
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
static inline unsigned char *chunkmemset_6(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
    uint16x8x3_t chunks;
    unsigned sz = sizeof(chunks);
    if (len < sz) {
        out = chunkunroll(out, &dist, &len);
        return chunkcopy(out, out - dist, len);
    }

    /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes:
       chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab}
       chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd}
       chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. */
    chunks = vld3q_dup_u16((unsigned short *)from);

    unsigned rem = len % sz;
    len -= rem;
    while (len) {
        /* Store "ab,cd,ef, ..., ab,cd,ef". */
        vst3q_u16((unsigned short *)out, chunks);
        out += sz;
        len -= sz;
    }

    if (!rem)
        return out;

    /* Last, deal with the case when LEN is not a multiple of SZ. */
    out = chunkunroll(out, &dist, &rem);
    return chunkcopy(out, out - dist, rem);
}
#endif

/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */
static inline unsigned char *chunkmemset(unsigned char *out, unsigned dist, unsigned len) {
    /* Debug performance related issues when len < sizeof(uint64_t):
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
    Assert(dist > 0, "cannot have a distance 0");

    unsigned char *from = out - dist;
    inffast_chunk_t chunk;
    unsigned sz = sizeof(chunk);
    if (len < sz) {
        do {
            *out++ = *from++;
            --len;
        } while (len != 0);
        return out;
    }

    switch (dist) {
    case 1: {
        chunk = chunkmemset_1(from);
        break;
    }
    case 2: {
        chunk = chunkmemset_2(from);
        break;
    }
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    case 3:
        return chunkmemset_3(out, from, dist, len);
#endif
    case 4: {
        chunk = chunkmemset_4(from);
        break;
    }
#if defined(__aarch64__) || defined(_M_ARM64)
    case 6:
        return chunkmemset_6(out, from, dist, len);
#endif
    case 8: {
        chunk = chunkmemset_8(from);
        break;
    }
    case 16:
        memcpy(&chunk, from, sz);
        break;

    default:
        out = chunkunroll(out, &dist, &len);
        return chunkcopy(out, out - dist, len);
    }

    unsigned rem = len % sz;
    len -= rem;
    while (len) {
        memcpy(out, &chunk, sz);
        out += sz;
        len -= sz;
    }

    /* Last, deal with the case when LEN is not a multiple of SZ. */
    if (rem)
        memcpy(out, &chunk, rem);
    out += rem;
    return out;
}
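
/* Worked trace (editor's illustration): chunkmemset(out, 4, 19) broadcasts
   the 4-byte pattern at out - 4 into a 16-byte chunk, writes one full chunk
   (rem = 19 % 16 = 3, len drops to 16), then copies the remaining 3 pattern
   bytes. Every distance handled by the switch divides the chunk size, so
   consecutive chunk stores continue the pattern seamlessly. */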

static inline unsigned char* chunkmemsetsafe(unsigned char *out, unsigned dist, unsigned len, unsigned left) {
    if (left < (unsigned)(3 * INFFAST_CHUNKSIZE)) {
        while (len > 0) {
            *out = *(out - dist);
            out++;
            --len;
        }
        return out;
    }

    return chunkmemset(out, dist, len);
}
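
/* Usage note (editor's addition): `left` is the number of bytes remaining in
   the output buffer. The wide path may scribble a couple of chunks past the
   requested bytes (see the chunkunroll comment), so with less than
   3 * INFFAST_CHUNKSIZE bytes of slack the copy degrades to a byte loop:

       // `end` is assumed to point just past the output buffer
       out = chunkmemsetsafe(out, dist, len, (unsigned)(end - out));
*/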

#else /* INFFAST_CHUNKSIZE */

static inline unsigned char *copy_1_bytes(unsigned char *out, unsigned char *from) {
    *out++ = *from;
    return out;
}

static inline unsigned char *copy_2_bytes(unsigned char *out, unsigned char *from) {
    uint16_t chunk;
    unsigned sz = sizeof(chunk);
    memcpy(&chunk, from, sz);
    memcpy(out, &chunk, sz);
    return out + sz;
}

static inline unsigned char *copy_3_bytes(unsigned char *out, unsigned char *from) {
    out = copy_1_bytes(out, from);
    return copy_2_bytes(out, from + 1);
}

static inline unsigned char *copy_4_bytes(unsigned char *out, unsigned char *from) {
    uint32_t chunk;
    unsigned sz = sizeof(chunk);
    memcpy(&chunk, from, sz);
    memcpy(out, &chunk, sz);
    return out + sz;
}

static inline unsigned char *copy_5_bytes(unsigned char *out, unsigned char *from) {
    out = copy_1_bytes(out, from);
    return copy_4_bytes(out, from + 1);
}

static inline unsigned char *copy_6_bytes(unsigned char *out, unsigned char *from) {
    out = copy_2_bytes(out, from);
    return copy_4_bytes(out, from + 2);
}

static inline unsigned char *copy_7_bytes(unsigned char *out, unsigned char *from) {
    out = copy_3_bytes(out, from);
    return copy_4_bytes(out, from + 3);
}

static inline unsigned char *copy_8_bytes(unsigned char *out, unsigned char *from) {
    uint64_t chunk;
    unsigned sz = sizeof(chunk);
    memcpy(&chunk, from, sz);
    memcpy(out, &chunk, sz);
    return out + sz;
}

/* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
static inline unsigned char *copy_bytes(unsigned char *out, unsigned char *from, unsigned len) {
    Assert(len < 8, "copy_bytes should be called with less than 8 bytes");

#ifndef UNALIGNED_OK
    while (len--) {
        *out++ = *from++;
    }
    return out;
#else
    switch (len) {
    case 7:
        return copy_7_bytes(out, from);
    case 6:
        return copy_6_bytes(out, from);
    case 5:
        return copy_5_bytes(out, from);
    case 4:
        return copy_4_bytes(out, from);
    case 3:
        return copy_3_bytes(out, from);
    case 2:
        return copy_2_bytes(out, from);
    case 1:
        return copy_1_bytes(out, from);
    case 0:
        return out;
    default:
        Assert(0, "should not happen");
    }

    return out;
#endif /* UNALIGNED_OK */
}
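
/* Editor's note (not in the original): the switch turns a variable-length
   tail copy into at most two fixed-size memcpy calls, e.g.

       out = copy_bytes(out, from, len & 7);  // `len & 7` is illustrative

   which compilers lower to one or two unaligned scalar loads and stores
   instead of a byte loop. */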

/* Set LEN bytes (7 or fewer) in OUT to the DIST-byte pattern that starts at FROM. Return OUT + LEN. */
static inline unsigned char *set_bytes(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
    Assert(len < 8, "set_bytes should be called with less than 8 bytes");

#ifndef UNALIGNED_OK
    (void)dist;
    while (len--) {
        *out++ = *from++;
    }
    return out;
#else
    if (dist >= len)
        return copy_bytes(out, from, len);

    switch (dist) {
    case 6:
        Assert(len == 7, "len should be exactly 7");
        out = copy_6_bytes(out, from);
        return copy_1_bytes(out, from);

    case 5:
        Assert(len == 6 || len == 7, "len should be either 6 or 7");
        out = copy_5_bytes(out, from);
        return copy_bytes(out, from, len - 5);

    case 4:
        Assert(len == 5 || len == 6 || len == 7, "len should be either 5, 6, or 7");
        out = copy_4_bytes(out, from);
        return copy_bytes(out, from, len - 4);

    case 3:
        Assert(4 <= len && len <= 7, "len should be between 4 and 7");
        out = copy_3_bytes(out, from);
        switch (len) {
        case 7:
            return copy_4_bytes(out, from);
        case 6:
            return copy_3_bytes(out, from);
        case 5:
            return copy_2_bytes(out, from);
        case 4:
            return copy_1_bytes(out, from);
        default:
            Assert(0, "should not happen");
            break;
        }
        break; /* unreachable unless the Assert above fired */

    case 2:
        Assert(3 <= len && len <= 7, "len should be between 3 and 7");
        out = copy_2_bytes(out, from);
        switch (len) {
        case 7:
            out = copy_4_bytes(out, from);
            out = copy_1_bytes(out, from);
            return out;
        case 6:
            out = copy_4_bytes(out, from);
            return out;
        case 5:
            out = copy_2_bytes(out, from);
            out = copy_1_bytes(out, from);
            return out;
        case 4:
            out = copy_2_bytes(out, from);
            return out;
        case 3:
            out = copy_1_bytes(out, from);
            return out;
        default:
            Assert(0, "should not happen");
            break;
        }
        break; /* unreachable unless the Assert above fired */

    case 1: {
        Assert(2 <= len && len <= 7, "len should be between 2 and 7");
        unsigned char c = *from;
        switch (len) {
        case 7:
            memset(out, c, 7);
            return out + 7;
        case 6:
            memset(out, c, 6);
            return out + 6;
        case 5:
            memset(out, c, 5);
            return out + 5;
        case 4:
            memset(out, c, 4);
            return out + 4;
        case 3:
            memset(out, c, 3);
            return out + 3;
        case 2:
            memset(out, c, 2);
            return out + 2;
        default:
            Assert(0, "should not happen");
            break;
        }
    }
    }
    return out;
#endif /* UNALIGNED_OK */
}
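
/* Worked example (editor's illustration): set_bytes(out, out - 2, 2, 7)
   replicates a 2-byte pattern: copy_2_bytes writes bytes 0-1, then the
   len == 7 branch copies 4 more bytes and 1 more byte from the same source
   pointer. Because from + 2 aliases the bytes just written, the reads always
   see the pattern, never garbage. */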

/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
static inline unsigned char *chunk_memcpy(unsigned char *out, unsigned char *from, unsigned len) {
    unsigned sz = sizeof(uint64_t);
    Assert(len >= sz, "chunk_memcpy should be called on larger chunks");

    /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
    copy_8_bytes(out, from);

    unsigned rem = len % sz;
    len /= sz;
    out += rem;
    from += rem;

    /* LEN now counts 8-byte words; the cases below fall through on purpose,
       peeling off the words left over from the 8-way unrolled loop. */
    unsigned by8 = len % 8;
    len -= by8;
    switch (by8) {
    case 7:
        out = copy_8_bytes(out, from);
        from += sz;
    case 6:
        out = copy_8_bytes(out, from);
        from += sz;
    case 5:
        out = copy_8_bytes(out, from);
        from += sz;
    case 4:
        out = copy_8_bytes(out, from);
        from += sz;
    case 3:
        out = copy_8_bytes(out, from);
        from += sz;
    case 2:
        out = copy_8_bytes(out, from);
        from += sz;
    case 1:
        out = copy_8_bytes(out, from);
        from += sz;
    }

    while (len) {
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;
        out = copy_8_bytes(out, from);
        from += sz;

        len -= 8;
    }

    return out;
}
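
/* Worked trace (editor's illustration): chunk_memcpy(out, from, 21) first
   copies 8 bytes unconditionally, then advances both pointers by
   rem = 21 % 8 = 5, leaving len = 21 / 8 = 2 whole words. by8 = 2 runs the
   case 2 and case 1 fall-through copies (16 bytes), the unrolled loop is
   skipped, and out + 21 is returned; bytes 5..7 were simply written twice. */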

/* Memset LEN bytes in OUT with the value at OUT - 1. Return OUT + LEN. */
static inline unsigned char *byte_memset(unsigned char *out, unsigned len) {
    unsigned sz = sizeof(uint64_t);
    Assert(len >= sz, "byte_memset should be called on larger chunks");

    unsigned char *from = out - 1;
    unsigned char c = *from;

    /* First, deal with the case when LEN is not a multiple of SZ. */
    memset(out, c, sz);
    unsigned rem = len % sz;
    len /= sz;
    out += rem;

    /* As in chunk_memcpy, the cases below fall through on purpose. */
    unsigned by8 = len % 8;
    len -= by8;
    switch (by8) {
    case 7:
        memset(out, c, sz);
        out += sz;
    case 6:
        memset(out, c, sz);
        out += sz;
    case 5:
        memset(out, c, sz);
        out += sz;
    case 4:
        memset(out, c, sz);
        out += sz;
    case 3:
        memset(out, c, sz);
        out += sz;
    case 2:
        memset(out, c, sz);
        out += sz;
    case 1:
        memset(out, c, sz);
        out += sz;
    }

    while (len) {
        /* When sz is a constant, the compiler replaces __builtin_memset with an
           inline version that does not incur a function call overhead. */
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        memset(out, c, sz);
        out += sz;
        len -= 8;
    }

    return out;
}

/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */
static inline unsigned char *chunk_memset(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
    if (dist >= len)
        return chunk_memcpy(out, from, len);

    Assert(len >= sizeof(uint64_t), "chunk_memset should be called on larger chunks");

    /* Double the size of the memset pattern until it reaches at least SZ bytes
       (or covers the remaining length). */
    unsigned sz = sizeof(uint64_t);
    while (dist < len && dist < sz) {
        copy_8_bytes(out, from);

        out += dist;
        len -= dist;
        dist += dist;

        /* Make sure the next memcpy has at least SZ bytes to be copied. */
        if (len < sz)
            /* Finish up byte by byte when there are not enough bytes left. */
            return set_bytes(out, from, dist, len);
    }

    return chunk_memcpy(out, from, len);
}
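
/* Worked trace (editor's illustration): chunk_memset(out, out - 3, 3, 20)
   doubles the pattern like chunkunroll does, but with 8-byte stores:

       step 1: copy 8 bytes, out += 3, len = 17, dist = 6
       step 2: copy 8 bytes, out += 6, len = 11, dist = 12
       12 >= 8 ends the loop; chunk_memcpy(out, from, 11) finishes the copy

   and, as before, `from` stays put because out - dist is invariant. */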

/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
static inline unsigned char *chunk_copy(unsigned char *out, unsigned char *from, int dist, unsigned len) {
    if (len < sizeof(uint64_t)) {
        if (dist > 0)
            return set_bytes(out, from, dist, len);

        return copy_bytes(out, from, len);
    }

    if (dist == 1)
        return byte_memset(out, len);

    if (dist > 0)
        return chunk_memset(out, from, dist, len);

    return chunk_memcpy(out, from, len);
}
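
/* Dispatch sketch (editor's addition): a hypothetical inflate-style caller
   passes dist > 0 when FROM lies DIST bytes behind OUT in the same buffer
   (an overlapped LZ77 copy) and dist == 0 when the ranges cannot overlap:

       out = chunk_copy(out, out - distance, (int)distance, length);  // overlapped
       out = chunk_copy(out, window_ptr, 0, length);                  // plain copy
*/
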
#endif /* INFFAST_CHUNKSIZE */
#endif /* MEMCOPY_H_ */