bufferiszero.c source code [qemu/util/bufferiszero.c]

1	/*
2	* Simple C functions to supplement the C library
3	*
4	* Copyright (c) 2006 Fabrice Bellard
5	*
6	* Permission is hereby granted, free of charge, to any person obtaining a copy
7	* of this software and associated documentation files (the "Software"), to deal
8	* in the Software without restriction, including without limitation the rights
9	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10	* copies of the Software, and to permit persons to whom the Software is
11	* furnished to do so, subject to the following conditions:
12	*
13	* The above copyright notice and this permission notice shall be included in
14	* all copies or substantial portions of the Software.
15	*
16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22	* THE SOFTWARE.
23	*/
24	#include "qemu/osdep.h"
25	#include "qemu/cutils.h"
26	#include "qemu/bswap.h"
27
/*
 * Portable integer implementation: return true iff all LEN bytes at BUF
 * are zero.
 *
 * LEN must be non-zero; the short path reads one byte before it tests
 * the end pointer.
 */
static bool
buffer_zero_int(const void *buf, size_t len)
{
    if (unlikely(len < 8)) {
        /* For a very small buffer, simply accumulate all the bytes.  */
        const unsigned char *p = buf;
        const unsigned char *e = buf + len;
        unsigned char t = 0;

        do {
            t |= *p++;
        } while (p < e);

        return t == 0;
    } else {
        /*
         * Otherwise, use the unaligned memory access functions to
         * handle the beginning and end of the buffer, with a couple
         * of loops handling the middle aligned section.
         */
        uint64_t t = ldq_he_p(buf);
        /* P: first 8-byte boundary after BUF.  E: last one not past the end. */
        const uint64_t *p = (const uint64_t *)(((uintptr_t)buf + 8) & -8);
        const uint64_t *e = (const uint64_t *)(((uintptr_t)buf + len) & -8);

        /*
         * 64 bytes per iteration.  The accumulator from the previous
         * step is tested before the next group of words is loaded.
         */
        for (; p + 8 <= e; p += 8) {
            __builtin_prefetch(p + 8);
            if (t) {
                return false;
            }
            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        }
        /* Remaining aligned words, fewer than eight of them.  */
        while (p < e) {
            t |= *p++;
        }
        /* Last 8 bytes; this load may overlap words already accumulated.  */
        t |= ldq_he_p(buf + len - 8);

        return t == 0;
    }
}
65
66	#if defined(CONFIG_AVX2_OPT) \|\| defined(__SSE2__)
67	/ Do not use push_options pragmas unnecessarily, because clang*
68	* does not support them.
69	*/
70	#ifdef CONFIG_AVX2_OPT
71	#pragma GCC push_options
72	#pragma GCC target("sse2")
73	#endif
74	#include <emmintrin.h>
75
76	/ Note that each of these vectorized functions require len >= 64. /
77
/*
 * SSE2 implementation: return true iff all LEN bytes at BUF are zero.
 * LEN must be at least 64.
 */
static bool
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 16 bytes.  */
    __m128i t = _mm_loadu_si128(buf);
    /*
     * P points just past the 64-byte group the loop reads next (the
     * loop indexes P[-4..-1]); E is the last 16-byte boundary that is
     * not past the end of the buffer.
     */
    const __m128i *p = (const __m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    const __m128i *e = (const __m128i *)(((uintptr_t)buf + len) & -16);
    __m128i zero = _mm_setzero_si128();

    /* Loop over 16-byte aligned blocks of 64.  */
    while (likely(p <= e)) {
        __builtin_prefetch(p);
        /* Test what was accumulated by the previous step.  */
        t = _mm_cmpeq_epi8(t, zero);
        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail.  */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail.  */
    t |= _mm_loadu_si128(buf + len - 16);

    /* All sixteen bytes compare equal to zero iff the mask is 0xFFFF.  */
    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
}
107	#ifdef CONFIG_AVX2_OPT
108	#pragma GCC pop_options
109	#endif
110
111	#ifdef CONFIG_AVX2_OPT
112	/ Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,*
113	* the includes have to be within the corresponding push_options region, and
114	* therefore the regions themselves have to be ordered with increasing ISA.
115	*/
116	#pragma GCC push_options
117	#pragma GCC target("sse4")
118	#include <smmintrin.h>
119
/*
 * SSE4 implementation: return true iff all LEN bytes at BUF are zero.
 * LEN must be at least 64.
 */
static bool
buffer_zero_sse4(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 16 bytes.  */
    __m128i t = _mm_loadu_si128(buf);
    /*
     * P points just past the 64-byte group the loop reads next (the
     * loop indexes P[-4..-1]); E is the last 16-byte boundary that is
     * not past the end of the buffer.
     */
    const __m128i *p = (const __m128i *)(((uintptr_t)buf + 5 * 16) & -16);
    const __m128i *e = (const __m128i *)(((uintptr_t)buf + len) & -16);

    /* Loop over 16-byte aligned blocks of 64.  */
    while (likely(p <= e)) {
        __builtin_prefetch(p);
        /* Test what was accumulated by the previous step.  */
        if (unlikely(!_mm_testz_si128(t, t))) {
            return false;
        }
        t = p[-4] | p[-3] | p[-2] | p[-1];
        p += 4;
    }

    /* Finish the aligned tail.  */
    t |= e[-3];
    t |= e[-2];
    t |= e[-1];

    /* Finish the unaligned tail.  */
    t |= _mm_loadu_si128(buf + len - 16);

    return _mm_testz_si128(t, t);
}
147
/* Close the "sse4" target region and open the "avx2" one.  */
#pragma GCC pop_options
#pragma GCC push_options
#pragma GCC target("avx2")
#include <immintrin.h>
152
/*
 * AVX2 implementation: return true iff all LEN bytes at BUF are zero.
 * LEN must be at least 64.
 */
static bool
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Begin with an unaligned head of 32 bytes.  */
    __m256i t = _mm256_loadu_si256(buf);
    /*
     * P points just past the 128-byte group the loop reads next (the
     * loop indexes P[-4..-1]); E is the last 32-byte boundary that is
     * not past the end of the buffer.
     */
    const __m256i *p = (const __m256i *)(((uintptr_t)buf + 5 * 32) & -32);
    const __m256i *e = (const __m256i *)(((uintptr_t)buf + len) & -32);

    if (likely(p <= e)) {
        /* Loop over 32-byte aligned blocks of 128.  */
        do {
            __builtin_prefetch(p);
            /* Test what was accumulated by the previous step.  */
            if (unlikely(!_mm256_testz_si256(t, t))) {
                return false;
            }
            t = p[-4] | p[-3] | p[-2] | p[-1];
            p += 4;
        } while (p <= e);
    } else {
        /*
         * Too short for even one pass of the aligned loop: take the
         * second 32 bytes unaligned.  For len <= 128 the first 64 and
         * the last 64 bytes together cover the whole buffer.
         */
        t |= _mm256_loadu_si256(buf + 32);
        if (len <= 128) {
            goto last2;
        }
    }

    /* Finish the last block of 128 unaligned.  */
    t |= _mm256_loadu_si256(buf + len - 4 * 32);
    t |= _mm256_loadu_si256(buf + len - 3 * 32);
 last2:
    t |= _mm256_loadu_si256(buf + len - 2 * 32);
    t |= _mm256_loadu_si256(buf + len - 1 * 32);

    return _mm256_testz_si256(t, t);
}
187	#pragma GCC pop_options
188	#endif /* CONFIG_AVX2_OPT */
189
190	/ Note that for test_buffer_is_zero_next_accel, the most preferred*
191	* ISA must have the least significant bit.
192	*/
193	#define CACHE_AVX2 1
194	#define CACHE_SSE4 2
195	#define CACHE_SSE2 4
196
197	/ Make sure that these variables are appropriately initialized when*
198	* SSE2 is enabled on the compiler command-line, but the compiler is
199	* too old to support CONFIG_AVX2_OPT.
200	*/
201	#ifdef CONFIG_AVX2_OPT
202	# define INIT_CACHE 0
203	# define INIT_ACCEL buffer_zero_int
204	#else
205	# ifndef __SSE2__
206	# error "ISA selection confusion"
207	# endif
208	# define INIT_CACHE CACHE_SSE2
209	# define INIT_ACCEL buffer_zero_sse2
210	#endif
211
212	static unsigned cpuid_cache = INIT_CACHE;
213	static bool (buffer_accel)(const* void *, size_t) = INIT_ACCEL;
214
215	static void init_accel(unsigned cache)
216	{
217	bool (fn)(const* void *, size_t) = buffer_zero_int;
218	if (cache & CACHE_SSE2) {
219	fn = buffer_zero_sse2;
220	}
221	#ifdef CONFIG_AVX2_OPT
222	if (cache & CACHE_SSE4) {
223	fn = buffer_zero_sse4;
224	}
225	if (cache & CACHE_AVX2) {
226	fn = buffer_zero_avx2;
227	}
228	#endif
229	buffer_accel = fn;
230	}
231
#ifdef CONFIG_AVX2_OPT
#include "qemu/cpuid.h"

/*
 * Constructor: probe the host CPU with CPUID, record the usable
 * accelerators in cpuid_cache and select the preferred one.
 */
static void __attribute__((constructor)) init_cpuid_cache(void)
{
    int max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    unsigned cache = 0;

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
        if (d & bit_SSE2) {
            cache |= CACHE_SSE2;
        }
        if (c & bit_SSE4_1) {
            cache |= CACHE_SSE4;
        }

        /* We must check that AVX is not just available, but usable.  */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            /* Read XCR0; the high half lands in D, which is overwritten below.  */
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            __cpuid_count(7, 0, a, b, c, d);
            /* XCR0 bits 1 and 2: the OS saves both XMM and YMM state.  */
            if ((bv & 6) == 6 && (b & bit_AVX2)) {
                cache |= CACHE_AVX2;
            }
        }
    }
    cpuid_cache = cache;
    init_accel(cache);
}
#endif /* CONFIG_AVX2_OPT */
264
265	bool test_buffer_is_zero_next_accel(void)
266	{
267	/ If no bits set, we just tested buffer_zero_int, and there*
268	are no more acceleration options to test. /*
269	if (cpuid_cache == `0`) {
270	return false;
271	}
272	/ Disable the accelerator we used before and select a new one. /
273	cpuid_cache &= cpuid_cache - `1`;
274	init_accel(cpuid_cache);
275	return true;
276	}
277
/*
 * Dispatch on length: the vectorized implementations require at least
 * 64 bytes, so shorter buffers always take the integer path.
 */
static bool select_accel_fn(const void *buf, size_t len)
{
    if (likely(len >= 64)) {
        return buffer_accel(buf, len);
    }
    return buffer_zero_int(buf, len);
}
285
286	#else
287	#define select_accel_fn buffer_zero_int
288	bool test_buffer_is_zero_next_accel(void)
289	{
290	return false;
291	}
292	#endif
293
/*
 * Checks if a buffer is all zeroes.
 *
 * Returns true if all LEN bytes at BUF are zero, and also when LEN is 0
 * (in which case BUF is not accessed).
 */
bool buffer_is_zero(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }

    /* Fetch the beginning of the buffer while we select the accelerator.  */
    __builtin_prefetch(buf);

    /* Use an optimized zero check if possible.  Note that this also
       includes a check for an unrolled loop over 64-bit integers.  */
    return select_accel_fn(buf, len);
}
310

Browse the source code of qemu/util/bufferiszero.c