/* chunkcopy.h -- fast chunk copy and set operations
 * Copyright (C) 2017 ARM, Inc.
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#ifndef CHUNKCOPY_H
#define CHUNKCOPY_H

#include <stdint.h>
#include "zutil.h"

#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1]

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif

#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#endif

#if defined(INFLATE_CHUNK_SIMD_NEON)
#include <arm_neon.h>
typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif

/*
 * Chunk copy type: the z_vec128i_t type must be exactly 128 bits (16 bytes)
 * wide, and CHUNKCOPY_CHUNK_SIZE equals its size.
 */
#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)

Z_STATIC_ASSERT(vector_128_bits_wide,
                CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);

/*
 * Ask the compiler to perform a wide, unaligned load with a machine
 * instruction appropriate for the z_vec128i_t type.
 */
static inline z_vec128i_t loadchunk(
    const unsigned char FAR* s) {
  z_vec128i_t v;
  Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
  return v;
}

/*
 * Ask the compiler to perform a wide, unaligned store with a machine
 * instruction appropriate for the z_vec128i_t type.
 */
static inline void storechunk(
    unsigned char FAR* d,
    const z_vec128i_t v) {
  Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
}

/*
 * Perform a memcpy-like operation, assuming that length is non-zero and that
 * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
 * the length is shorter than this.
 *
 * It also guarantees that it will properly unroll the data if the distance
 * between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
 * in chunkcopy_relaxed().
 *
 * Aside from better memory bus utilisation, this means that short copies
 * (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
 * without iteration, which will hopefully make the branch prediction more
 * reliable.
 */
static inline unsigned char FAR* chunkcopy_core(
    unsigned char FAR* out,
    const unsigned char FAR* from,
    unsigned len) {
  const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
  storechunk(out, loadchunk(from));
  out += bump;
  from += bump;
  len /= CHUNKCOPY_CHUNK_SIZE;
  while (len-- > 0) {
    storechunk(out, loadchunk(from));
    out += CHUNKCOPY_CHUNK_SIZE;
    from += CHUNKCOPY_CHUNK_SIZE;
  }
  return out;
}
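
/*
 * Worked example (illustrative): with CHUNKCOPY_CHUNK_SIZE == 16 and
 * len == 37, bump == ((37 - 1) % 16) + 1 == 5. The first store writes 16
 * bytes but the pointers advance by only 5; two full-chunk stores then
 * cover bytes 5..20 and 21..36, so exactly 37 bytes are consumed. Only when
 * len < CHUNKCOPY_CHUNK_SIZE does the single initial store write past the
 * requested length, which is why callers must tolerate an overwrite of up
 * to CHUNKCOPY_CHUNK_SIZE bytes.
 */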

/*
 * Like chunkcopy_core(), but avoids writing beyond the end of legal output.
 *
 * Accepts an additional pointer to the end of safe output. A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_core_safe(
    unsigned char FAR* out,
    const unsigned char FAR* from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  if ((limit - out) < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
    const unsigned char FAR* Z_RESTRICT rfrom = from;
    if (len & 8) {
      Z_BUILTIN_MEMCPY(out, rfrom, 8);
      out += 8;
      rfrom += 8;
    }
    if (len & 4) {
      Z_BUILTIN_MEMCPY(out, rfrom, 4);
      out += 4;
      rfrom += 4;
    }
    if (len & 2) {
      Z_BUILTIN_MEMCPY(out, rfrom, 2);
      out += 2;
      rfrom += 2;
    }
    if (len & 1) {
      *out++ = *rfrom++;
    }
    return out;
  }
  return chunkcopy_core(out, from, len);
}
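
/*
 * In the short-tail branch above, len is necessarily less than
 * CHUNKCOPY_CHUNK_SIZE, since out + len <= limit and limit - out is less
 * than CHUNKCOPY_CHUNK_SIZE, so the low four bits of len fully describe the
 * copy. Worked example (illustrative): len == 13 decomposes as 8 + 4 + 1,
 * giving one 8-byte, one 4-byte and one 1-byte copy: exactly 13 bytes,
 * never past limit.
 */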

/*
 * Perform short copies until distance can be rewritten as being at least
 * CHUNKCOPY_CHUNK_SIZE.
 *
 * Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
 * bytes of output even if the copy is shorter than this. This assumption
 * holds within zlib inflate_fast(), which starts every iteration with at
 * least 258 bytes of output space available (258 being the maximum length
 * output from a single token; see inffast.c).
 */
static inline unsigned char FAR* chunkunroll_relaxed(
    unsigned char FAR* out,
    unsigned FAR* dist,
    unsigned FAR* len) {
  const unsigned char FAR* from = out - *dist;
  while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
    storechunk(out, loadchunk(from));
    out += *dist;
    *len -= *dist;
    *dist += *dist;
  }
  return out;
}
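
/*
 * Worked example (illustrative): with *dist == 3 and *len == 40, three
 * 16-byte stores advance out by 3, 6 and 12 bytes while the distance
 * doubles 3 -> 6 -> 12 -> 24; the loop then exits with *dist == 24 >=
 * CHUNKCOPY_CHUNK_SIZE and *len == 19 left for the caller. Note that
 * `from` never moves: advancing out by *dist and then doubling *dist
 * leaves (out - *dist) unchanged.
 */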

#if defined(INFLATE_CHUNK_SIMD_NEON)
/*
 * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
 * every 64-bit component of the 128-bit result (64-bit int splat).
 */
static inline z_vec128i_t v_load64_dup(const void* src) {
  return vcombine_u8(vld1_u8(src), vld1_u8(src));
}

/*
 * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
 * every 32-bit component of the 128-bit result (32-bit int splat).
 */
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t i32;
  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
  return vreinterpretq_u8_s32(vdupq_n_s32(i32));
}

/*
 * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
 * every 16-bit component of the 128-bit result (16-bit int splat).
 */
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t i16;
  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
  return vreinterpretq_u8_s16(vdupq_n_s16(i16));
}

/*
 * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
 * component of the 128-bit result (8-bit int splat).
 */
static inline z_vec128i_t v_load8_dup(const void* src) {
  return vld1q_dup_u8((const uint8_t*)src);
}

/*
 * v_store_128(): store the 128-bit vec to the memory destination out, which
 * might not be 16-byte aligned.
 */
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  vst1q_u8(out, vec);
}

#elif defined(INFLATE_CHUNK_SIMD_SSE2)
/*
 * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
 * every 64-bit component of the 128-bit result (64-bit int splat).
 */
static inline z_vec128i_t v_load64_dup(const void* src) {
  int64_t i64;
  Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
  return _mm_set1_epi64x(i64);
}

/*
 * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
 * every 32-bit component of the 128-bit result (32-bit int splat).
 */
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t i32;
  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
  return _mm_set1_epi32(i32);
}

/*
 * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
 * every 16-bit component of the 128-bit result (16-bit int splat).
 */
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t i16;
  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
  return _mm_set1_epi16(i16);
}

/*
 * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
 * component of the 128-bit result (8-bit int splat).
 */
static inline z_vec128i_t v_load8_dup(const void* src) {
  return _mm_set1_epi8(*(const char*)src);
}

/*
 * v_store_128(): store the 128-bit vec to the memory destination out, which
 * might not be 16-byte aligned.
 */
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  _mm_storeu_si128((__m128i*)out, vec);
}
#endif
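
/*
 * For reference, the splat semantics above can be expressed with plain byte
 * stores. A minimal scalar sketch of v_load32_dup() followed by
 * v_store_128() (illustrative only; not used by this header, and the helper
 * name is hypothetical):
 */
#if 0
static void v_splat32_scalar(void* out, const void* src) {
  unsigned char pattern[4];
  unsigned char* d = (unsigned char*)out;
  int i;
  /* Given src -> {0xAA, 0xBB, 0xCC, 0xDD}, out receives that 4-byte pattern
   * repeated 4 times across 16 bytes.
   */
  Z_BUILTIN_MEMCPY(pattern, src, sizeof(pattern));
  for (i = 0; i < 16; i += 4)
    Z_BUILTIN_MEMCPY(d + i, pattern, sizeof(pattern));
}
#endif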

/*
 * Perform an overlapping copy that behaves like a memset() operation, but
 * supports periods other than one. Assumes that length is non-zero and
 * that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
 * even if the length is shorter than this.
 */
static inline unsigned char FAR* chunkset_core(
    unsigned char FAR* out,
    unsigned period,
    unsigned len) {
  z_vec128i_t v;
  const int bump = ((len - 1) % sizeof(v)) + 1;

  switch (period) {
    case 1:
      v = v_load8_dup(out - 1);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      while (len > 0) {
        v_store_128(out, v);
        out += sizeof(v);
        len -= sizeof(v);
      }
      return out;
    case 2:
      v = v_load16_dup(out - 2);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load16_dup(out - 2);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
    case 4:
      v = v_load32_dup(out - 4);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load32_dup(out - 4);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
    case 8:
      v = v_load64_dup(out - 8);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load64_dup(out - 8);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
  }
  out = chunkunroll_relaxed(out, &period, &len);
  return chunkcopy_core(out, out - period, len);
}
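
/*
 * Worked example (illustrative): with period == 2 and len == 11, the two
 * pattern bytes sit at out[-2] and out[-1], v holds that pattern repeated
 * eight times, and bump == ((11 - 1) % 16) + 1 == 11, so a single 16-byte
 * store completes the copy (with 5 bytes of permitted overwrite). For
 * periods 2, 4 and 8 the vector is reloaded after the bump because bump
 * need not be a multiple of the period; reloading from (out - period)
 * restores the pattern phase for the stores that follow.
 */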

/*
 * Perform a memcpy-like operation, but assume that length is non-zero and that
 * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
 * the length is shorter than this.
 *
 * Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
 * of overlapping buffers, regardless of the distance between the pointers.
 * This is reflected in the `restrict`-qualified pointers, allowing the
 * compiler to re-order loads and stores.
 */
static inline unsigned char FAR* chunkcopy_relaxed(
    unsigned char FAR* Z_RESTRICT out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len) {
  return chunkcopy_core(out, from, len);
}

/*
 * Like chunkcopy_relaxed(), but avoids writing beyond the end of legal output.
 *
 * Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
 * behaviour of overlapping buffers, regardless of the distance between the
 * pointers. This is reflected in the `restrict`-qualified pointers, allowing
 * the compiler to re-order loads and stores.
 *
 * Accepts an additional pointer to the end of safe output. A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_safe(
    unsigned char FAR* out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  return chunkcopy_core_safe(out, from, len, limit);
}

/*
 * Perform a chunky copy within the same buffer, where the source and destination
 * may potentially overlap.
 *
 * Assumes that len > 0 on entry, and that it's safe to write at least
 * CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
 */
static inline unsigned char FAR* chunkcopy_lapped_relaxed(
    unsigned char FAR* out,
    unsigned dist,
    unsigned len) {
  if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
    return chunkset_core(out, dist, len);
  }
  return chunkcopy_core(out, out - dist, len);
}

/*
 * Behaves like chunkcopy_lapped_relaxed(), but avoids writing beyond the end
 * of legal output.
 *
 * Accepts an additional pointer to the end of safe output. A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_lapped_safe(
    unsigned char FAR* out,
    unsigned dist,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  if ((limit - out) < (ptrdiff_t)(3 * CHUNKCOPY_CHUNK_SIZE)) {
    /* TODO(cavalcantii): try harder to optimise this */
    while (len-- > 0) {
      *out = *(out - dist);
      out++;
    }
    return out;
  }
  return chunkcopy_lapped_relaxed(out, dist, len);
}

/*
 * The chunk-copy code above deals with writing the decoded DEFLATE data to
 * the output with SIMD methods to increase decode speed. Reading the input
 * to the DEFLATE decoder with a wide, SIMD method can also increase decode
 * speed. This option is supported on little-endian machines, and reads the
 * input data in 64-bit (8 byte) chunks.
 */

#ifdef INFLATE_CHUNK_READ_64LE
/*
 * Buffer the input in a uint64_t (8 bytes) in the wide input reading case.
 */
typedef uint64_t inflate_holder_t;

/*
 * Ask the compiler to perform a wide, unaligned load of a uint64_t using a
 * machine instruction appropriate for the uint64_t type.
 */
static inline inflate_holder_t read64le(const unsigned char FAR *in) {
  inflate_holder_t input;
  Z_BUILTIN_MEMCPY(&input, in, sizeof(input));
  return input;
}
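
/*
 * A minimal refill sketch (illustrative only; the real inflate_fast()
 * refill differs in detail, and the helper name is hypothetical). `hold`
 * and `bits` mirror the conventional zlib accumulator names; at least 8
 * readable bytes are assumed at `in`, and *bits is assumed to be at most
 * 63:
 */
#if 0
static inline const unsigned char FAR* refill64le_sketch(
    const unsigned char FAR* in,
    inflate_holder_t* hold,
    unsigned* bits) {
  *hold |= read64le(in) << *bits;  /* splice fresh bytes above live bits */
  in += (63 - *bits) >> 3;         /* whole bytes actually consumed */
  *bits |= 56;                     /* at least 56 valid bits available */
  return in;
}
#endif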
#else
/*
 * Otherwise, buffer the input bits using zlib's default input buffer type.
 */
typedef unsigned long inflate_holder_t;

#endif /* INFLATE_CHUNK_READ_64LE */

#undef Z_STATIC_ASSERT
#undef Z_RESTRICT
#undef Z_BUILTIN_MEMCPY

#endif /* CHUNKCOPY_H */
