/* chunkcopy.h -- fast chunk copy and set operations
 * Copyright (C) 2017 ARM, Inc.
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#ifndef CHUNKCOPY_H
#define CHUNKCOPY_H

#include <stdint.h>
#include "zutil.h"

#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1]

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif

#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#endif

#if defined(INFLATE_CHUNK_SIMD_NEON)
#include <arm_neon.h>
typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif

/*
 * Chunk copy type: the z_vec128i_t type must be exactly 128 bits (16 bytes)
 * wide, and CHUNKCOPY_CHUNK_SIZE equals its size.
 */
#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)

Z_STATIC_ASSERT(vector_128_bits_wide,
                CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);

/*
 * Ask the compiler to perform a wide, unaligned load with a machine
 * instruction appropriate for the z_vec128i_t type.
 */
static inline z_vec128i_t loadchunk(
    const unsigned char FAR* s) {
  z_vec128i_t v;
  Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
  return v;
}

/*
 * Ask the compiler to perform a wide, unaligned store with a machine
 * instruction appropriate for the z_vec128i_t type.
 */
static inline void storechunk(
    unsigned char FAR* d,
    const z_vec128i_t v) {
  Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
}

/*
 * Perform a memcpy-like operation, assuming that length is non-zero and that
 * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
 * the length is shorter than this.
 *
 * It also guarantees that it will properly unroll the data if the distance
 * between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
 * in chunkcopy_relaxed().
 *
 * Aside from better memory bus utilisation, this means that short copies
 * (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
 * without iteration, which will hopefully make the branch prediction more
 * reliable.
 */
static inline unsigned char FAR* chunkcopy_core(
    unsigned char FAR* out,
    const unsigned char FAR* from,
    unsigned len) {
  const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
  storechunk(out, loadchunk(from));
  out += bump;
  from += bump;
  len /= CHUNKCOPY_CHUNK_SIZE;
  while (len-- > 0) {
    storechunk(out, loadchunk(from));
    out += CHUNKCOPY_CHUNK_SIZE;
    from += CHUNKCOPY_CHUNK_SIZE;
  }
  return out;
}
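
/*
 * Worked example (illustrative): with CHUNKCOPY_CHUNK_SIZE == 16 and
 * len == 37, bump == ((37 - 1) % 16) + 1 == 5. The first store writes 16
 * bytes but the pointers advance by only 5; two full-chunk stores then
 * cover bytes 5..20 and 21..36, so exactly 37 bytes are consumed. Only when
 * len < CHUNKCOPY_CHUNK_SIZE does the single initial store write past the
 * requested length, which is why callers must tolerate an overwrite of up
 * to CHUNKCOPY_CHUNK_SIZE bytes.
 */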

/*
 * Like chunkcopy_core(), but avoids writing beyond the end of legal output.
 *
 * Accepts an additional pointer to the end of safe output. A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_core_safe(
    unsigned char FAR* out,
    const unsigned char FAR* from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  if ((limit - out) < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
    const unsigned char FAR* Z_RESTRICT rfrom = from;
    if (len & 8) {
      Z_BUILTIN_MEMCPY(out, rfrom, 8);
      out += 8;
      rfrom += 8;
    }
    if (len & 4) {
      Z_BUILTIN_MEMCPY(out, rfrom, 4);
      out += 4;
      rfrom += 4;
    }
    if (len & 2) {
      Z_BUILTIN_MEMCPY(out, rfrom, 2);
      out += 2;
      rfrom += 2;
    }
    if (len & 1) {
      *out++ = *rfrom++;
    }
    return out;
  }
  return chunkcopy_core(out, from, len);
}
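
/*
 * In the short-tail branch above, len is necessarily less than
 * CHUNKCOPY_CHUNK_SIZE, since out + len <= limit and limit - out is less
 * than CHUNKCOPY_CHUNK_SIZE, so the low four bits of len fully describe the
 * copy. Worked example (illustrative): len == 13 decomposes as 8 + 4 + 1,
 * giving one 8-byte, one 4-byte and one 1-byte copy: exactly 13 bytes,
 * never past limit.
 */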

/*
 * Perform short copies until distance can be rewritten as being at least
 * CHUNKCOPY_CHUNK_SIZE.
 *
 * Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
 * bytes of output even if the copy is shorter than this. This assumption
 * holds within zlib inflate_fast(), which starts every iteration with at
 * least 258 bytes of output space available (258 being the maximum length
 * output from a single token; see inffast.c).
 */
static inline unsigned char FAR* chunkunroll_relaxed(
    unsigned char FAR* out,
    unsigned FAR* dist,
    unsigned FAR* len) {
  const unsigned char FAR* from = out - *dist;
  while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
    storechunk(out, loadchunk(from));
    out += *dist;
    *len -= *dist;
    *dist += *dist;
  }
  return out;
}
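
/*
 * Worked example (illustrative): with *dist == 3 and *len == 40, three
 * 16-byte stores advance out by 3, 6 and 12 bytes while the distance
 * doubles 3 -> 6 -> 12 -> 24; the loop then exits with *dist == 24 >=
 * CHUNKCOPY_CHUNK_SIZE and *len == 19 left for the caller. Note that
 * `from` never moves: advancing out by *dist and then doubling *dist
 * leaves (out - *dist) unchanged.
 */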

#if defined(INFLATE_CHUNK_SIMD_NEON)
/*
 * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
 * every 64-bit component of the 128-bit result (64-bit int splat).
 */
static inline z_vec128i_t v_load64_dup(const void* src) {
  return vcombine_u8(vld1_u8(src), vld1_u8(src));
}

/*
 * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
 * every 32-bit component of the 128-bit result (32-bit int splat).
 */
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t i32;
  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
  return vreinterpretq_u8_s32(vdupq_n_s32(i32));
}

/*
 * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
 * every 16-bit component of the 128-bit result (16-bit int splat).
 */
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t i16;
  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
  return vreinterpretq_u8_s16(vdupq_n_s16(i16));
}

/*
 * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
 * component of the 128-bit result (8-bit int splat).
 */
static inline z_vec128i_t v_load8_dup(const void* src) {
  return vld1q_dup_u8((const uint8_t*)src);
}

/*
 * v_store_128(): store the 128-bit vec to the memory destination out, which
 * might not be 16-byte aligned.
 */
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  vst1q_u8(out, vec);
}

#elif defined(INFLATE_CHUNK_SIMD_SSE2)
/*
 * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
 * every 64-bit component of the 128-bit result (64-bit int splat).
 */
static inline z_vec128i_t v_load64_dup(const void* src) {
  int64_t i64;
  Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
  return _mm_set1_epi64x(i64);
}

/*
 * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
 * every 32-bit component of the 128-bit result (32-bit int splat).
 */
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t i32;
  Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
  return _mm_set1_epi32(i32);
}

/*
 * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
 * every 16-bit component of the 128-bit result (16-bit int splat).
 */
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t i16;
  Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
  return _mm_set1_epi16(i16);
}

/*
 * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
 * component of the 128-bit result (8-bit int splat).
 */
static inline z_vec128i_t v_load8_dup(const void* src) {
  return _mm_set1_epi8(*(const char*)src);
}

/*
 * v_store_128(): store the 128-bit vec to the memory destination out, which
 * might not be 16-byte aligned.
 */
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  _mm_storeu_si128((__m128i*)out, vec);
}
#endif
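
/*
 * For reference, the splat semantics above can be expressed with plain byte
 * stores. A minimal scalar sketch of v_load32_dup() followed by
 * v_store_128() (illustrative only; not used by this header, and the helper
 * name is hypothetical):
 */
#if 0
static void v_splat32_scalar(void* out, const void* src) {
  unsigned char pattern[4];
  unsigned char* d = (unsigned char*)out;
  int i;
  /* Given src -> {0xAA, 0xBB, 0xCC, 0xDD}, out receives that 4-byte pattern
   * repeated 4 times across 16 bytes.
   */
  Z_BUILTIN_MEMCPY(pattern, src, sizeof(pattern));
  for (i = 0; i < 16; i += 4)
    Z_BUILTIN_MEMCPY(d + i, pattern, sizeof(pattern));
}
#endif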

/*
 * Perform an overlapping copy that behaves like a memset() operation, but
 * supports periods other than one. Assumes that length is non-zero and
 * that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
 * even if the length is shorter than this.
 */
static inline unsigned char FAR* chunkset_core(
    unsigned char FAR* out,
    unsigned period,
    unsigned len) {
  z_vec128i_t v;
  const int bump = ((len - 1) % sizeof(v)) + 1;

  switch (period) {
    case 1:
      v = v_load8_dup(out - 1);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      while (len > 0) {
        v_store_128(out, v);
        out += sizeof(v);
        len -= sizeof(v);
      }
      return out;
    case 2:
      v = v_load16_dup(out - 2);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load16_dup(out - 2);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
    case 4:
      v = v_load32_dup(out - 4);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load32_dup(out - 4);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
    case 8:
      v = v_load64_dup(out - 8);
      v_store_128(out, v);
      out += bump;
      len -= bump;
      if (len > 0) {
        v = v_load64_dup(out - 8);
        do {
          v_store_128(out, v);
          out += sizeof(v);
          len -= sizeof(v);
        } while (len > 0);
      }
      return out;
  }
  out = chunkunroll_relaxed(out, &period, &len);
  return chunkcopy_core(out, out - period, len);
}
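
/*
 * Worked example (illustrative): with period == 2 and len == 11, the two
 * pattern bytes sit at out[-2] and out[-1], v holds that pattern repeated
 * eight times, and bump == ((11 - 1) % 16) + 1 == 11, so a single 16-byte
 * store completes the copy (with 5 bytes of permitted overwrite). For
 * periods 2, 4 and 8 the vector is reloaded after the bump because bump
 * need not be a multiple of the period; reloading from (out - period)
 * restores the pattern phase for the stores that follow.
 */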

/*
 * Perform a memcpy-like operation, but assume that length is non-zero and that
 * it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
 * the length is shorter than this.
 *
 * Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
 * of overlapping buffers, regardless of the distance between the pointers.
 * This is reflected in the `restrict`-qualified pointers, allowing the
 * compiler to re-order loads and stores.
 */
static inline unsigned char FAR* chunkcopy_relaxed(
    unsigned char FAR* Z_RESTRICT out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len) {
  return chunkcopy_core(out, from, len);
}

/*
 * Like chunkcopy_relaxed(), but avoids writing beyond the end of legal output.
 *
 * Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
 * behaviour of overlapping buffers, regardless of the distance between the
 * pointers. This is reflected in the `restrict`-qualified pointers, allowing
 * the compiler to re-order loads and stores.
 *
 * Accepts an additional pointer to the end of safe output. A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_safe(
    unsigned char FAR* out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  return chunkcopy_core_safe(out, from, len, limit);
}

/*
 * Perform a chunky copy within the same buffer, where the source and destination
 * may potentially overlap.
 *
 * Assumes that len > 0 on entry, and that it's safe to write at least
 * CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
 */
static inline unsigned char FAR* chunkcopy_lapped_relaxed(
    unsigned char FAR* out,
    unsigned dist,
    unsigned len) {
  if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
    return chunkset_core(out, dist, len);
  }
  return chunkcopy_core(out, out - dist, len);
}

/*
 * Behaves like chunkcopy_lapped_relaxed(), but avoids writing beyond the end
 * of legal output.
 *
 * Accepts an additional pointer to the end of safe output. A generic safe
 * copy would use (out + len), but it's normally the case that the end of the
 * output buffer is beyond the end of the current copy, and this can still be
 * exploited.
 */
static inline unsigned char FAR* chunkcopy_lapped_safe(
    unsigned char FAR* out,
    unsigned dist,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  if ((limit - out) < (ptrdiff_t)(3 * CHUNKCOPY_CHUNK_SIZE)) {
    /* TODO(cavalcantii): try harder to optimise this */
    while (len-- > 0) {
      *out = *(out - dist);
      out++;
    }
    return out;
  }
  return chunkcopy_lapped_relaxed(out, dist, len);
}

/*
 * The chunk-copy code above deals with writing the decoded DEFLATE data to
 * the output with SIMD methods to increase decode speed. Reading the input
 * to the DEFLATE decoder with a wide, SIMD method can also increase decode
 * speed. This option is supported on little-endian machines, and reads the
 * input data in 64-bit (8 byte) chunks.
 */

#ifdef INFLATE_CHUNK_READ_64LE
/*
 * Buffer the input in a uint64_t (8 bytes) in the wide input reading case.
 */
typedef uint64_t inflate_holder_t;

/*
 * Ask the compiler to perform a wide, unaligned load of a uint64_t using a
 * machine instruction appropriate for the uint64_t type.
 */
static inline inflate_holder_t read64le(const unsigned char FAR *in) {
  inflate_holder_t input;
  Z_BUILTIN_MEMCPY(&input, in, sizeof(input));
  return input;
}
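
/*
 * A minimal refill sketch (illustrative only; the real inflate_fast()
 * refill differs in detail, and the helper name is hypothetical). `hold`
 * and `bits` mirror the conventional zlib accumulator names; at least 8
 * readable bytes are assumed at `in`, and *bits is assumed to be at most
 * 63:
 */
#if 0
static inline const unsigned char FAR* refill64le_sketch(
    const unsigned char FAR* in,
    inflate_holder_t* hold,
    unsigned* bits) {
  *hold |= read64le(in) << *bits;  /* splice fresh bytes above live bits */
  in += (63 - *bits) >> 3;         /* whole bytes actually consumed */
  *bits |= 56;                     /* at least 56 valid bits available */
  return in;
}
#endif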
#else
/*
 * Otherwise, buffer the input bits using zlib's default input buffer type.
 */
typedef unsigned long inflate_holder_t;

#endif /* INFLATE_CHUNK_READ_64LE */

#undef Z_STATIC_ASSERT
#undef Z_RESTRICT
#undef Z_BUILTIN_MEMCPY

#endif /* CHUNKCOPY_H */
