astcenc_vecmathlib_common_4.h source code [Godot/thirdparty/astcenc/astcenc_vecmathlib_common_4.h]

1	// SPDX-License-Identifier: Apache-2.0
2	// ----------------------------------------------------------------------------
3	// Copyright 2020-2021 Arm Limited
4	//
5	// Licensed under the Apache License, Version 2.0 (the "License"); you may not
6	// use this file except in compliance with the License. You may obtain a copy
7	// of the License at:
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing, software
12	// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13	// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14	// License for the specific language governing permissions and limitations
15	// under the License.
16	// ----------------------------------------------------------------------------
17
18	/**
19	* @brief Generic 4x32-bit vector functions.
20	*
21	* This module implements generic 4-wide vector functions that are valid for
22	* all instruction sets, typically implemented using lower level 4-wide
23	* operations that are ISA-specific.
24	*/
25
26	#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
27	#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
28
29	#ifndef ASTCENC_SIMD_INLINE
30	#error "Include astcenc_vecmathlib.h, do not include directly"
31	#endif
32
33	#include <cstdio>
34
35	// ============================================================================
36	// vmask4 operators and functions
37	// ============================================================================
38
39	/**
40	* @brief True if any lanes are enabled, false otherwise.
41	*/
42	ASTCENC_SIMD_INLINE bool any(vmask4 a)
43	{
44	return mask(a) != `0`;
45	}
46
47	/**
48	* @brief True if all lanes are enabled, false otherwise.
49	*/
50	ASTCENC_SIMD_INLINE bool all(vmask4 a)
51	{
52	return mask(a) == `0xF`;
53	}
54
55	// ============================================================================
56	// vint4 operators and functions
57	// ============================================================================
58
59	/**
60	* @brief Overload: vector by scalar addition.
61	*/
62	ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
63	{
64	return a + vint4 (b);
65	}
66
67	/**
68	* @brief Overload: vector by vector incremental addition.
69	*/
70	ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
71	{
72	a = a + b;
73	return a;
74	}
75
76	/**
77	* @brief Overload: vector by scalar subtraction.
78	*/
79	ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
80	{
81	return a - vint4 (b);
82	}
83
84	/**
85	* @brief Overload: vector by scalar multiplication.
86	*/
87	ASTCENC_SIMD_INLINE vint4 operator(vint4 a, int* b)
88	{
89	return a * vint4 (b);
90	}
91
92	/**
93	* @brief Overload: vector by scalar bitwise or.
94	*/
95	ASTCENC_SIMD_INLINE vint4 operator\|(vint4 a, int b)
96	{
97	return a \| vint4 (b);
98	}
99
100	/**
101	* @brief Overload: vector by scalar bitwise and.
102	*/
103	ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
104	{
105	return a & vint4 (b);
106	}
107
108	/**
109	* @brief Overload: vector by scalar bitwise xor.
110	*/
111	ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
112	{
113	return a ^ vint4 (b);
114	}
115
116	/**
117	* @brief Return the clamped value between min and max.
118	*/
119	ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
120	{
121	return min(max(a, vint4 (minv)), vint4 (maxv));
122	}
123
124	/**
125	* @brief Return the horizontal sum of RGB vector lanes as a scalar.
126	*/
127	ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
128	{
129	return a.lane<`0`>() + a.lane<`1`>() + a.lane<`2`>();
130	}
131
132	// ============================================================================
133	// vfloat4 operators and functions
134	// ============================================================================
135
136	/**
137	* @brief Overload: vector by vector incremental addition.
138	*/
139	ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
140	{
141	a = a + b;
142	return a;
143	}
144
145	/**
146	* @brief Overload: vector by scalar addition.
147	*/
148	ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
149	{
150	return a + vfloat4 (b);
151	}
152
153	/**
154	* @brief Overload: vector by scalar subtraction.
155	*/
156	ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
157	{
158	return a - vfloat4 (b);
159	}
160
161	/**
162	* @brief Overload: vector by scalar multiplication.
163	*/
164	ASTCENC_SIMD_INLINE vfloat4 operator(vfloat4 a, float* b)
165	{
166	return a * vfloat4 (b);
167	}
168
169	/**
170	* @brief Overload: scalar by vector multiplication.
171	*/
172	ASTCENC_SIMD_INLINE vfloat4 operator(float* a, vfloat4 b)
173	{
174	return vfloat4 (a) * b;
175	}
176
177	/**
178	* @brief Overload: vector by scalar division.
179	*/
180	ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
181	{
182	return a / vfloat4 (b);
183	}
184
185	/**
186	* @brief Overload: scalar by vector division.
187	*/
188	ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
189	{
190	return vfloat4 (a) / b;
191	}
192
193	/**
194	* @brief Return the min vector of a vector and a scalar.
195	*
196	* If either lane value is NaN, @c b will be returned for that lane.
197	*/
198	ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
199	{
200	return min(a, vfloat4 (b));
201	}
202
203	/**
204	* @brief Return the max vector of a vector and a scalar.
205	*
206	* If either lane value is NaN, @c b will be returned for that lane.
207	*/
208	ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
209	{
210	return max(a, vfloat4 (b));
211	}
212
213	/**
214	* @brief Return the clamped value between min and max.
215	*
216	* It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
217	* then @c min will be returned for that lane.
218	*/
219	ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
220	{
221	// Do not reorder - second operand will return if either is NaN
222	return min(max(a, minv), maxv);
223	}
224
225	/**
226	* @brief Return the clamped value between 0.0f and max.
227	*
228	* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
229	* be returned for that lane.
230	*/
231	ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
232	{
233	// Do not reorder - second operand will return if either is NaN
234	return min(max(a, vfloat4::zero()), maxv);
235	}
236
237	/**
238	* @brief Return the clamped value between 0.0f and 1.0f.
239	*
240	* If @c a is NaN then zero will be returned for that lane.
241	*/
242	ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
243	{
244	// Do not reorder - second operand will return if either is NaN
245	return min(max(a, vfloat4::zero()), `1.0f`);
246	}
247
248	/**
249	* @brief Return the horizontal minimum of a vector.
250	*/
251	ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
252	{
253	return hmin(a).lane<`0`>();
254	}
255
256	/**
257	* @brief Return the horizontal min of RGB vector lanes as a scalar.
258	*/
259	ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
260	{
261	a.set_lane<`3`>(a.lane<`0`>());
262	return hmin_s(a);
263	}
264
265	/**
266	* @brief Return the horizontal maximum of a vector.
267	*/
268	ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
269	{
270	return hmax(a).lane<`0`>();
271	}
272
273	/**
274	* @brief Accumulate lane-wise sums for a vector.
275	*/
276	ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
277	{
278	accum = accum + a;
279	}
280
281	/**
282	* @brief Accumulate lane-wise sums for a masked vector.
283	*/
284	ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
285	{
286	a = select(vfloat4::zero(), a, m);
287	haccumulate(accum, a);
288	}
289
290	/**
291	* @brief Return the horizontal sum of RGB vector lanes as a scalar.
292	*/
293	ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
294	{
295	return a.lane<`0`>() + a.lane<`1`>() + a.lane<`2`>();
296	}
297
298	#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
299
300	/**
301	* @brief Return the dot product for the full 4 lanes, returning scalar.
302	*/
303	ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
304	{
305	vfloat4 m = a * b;
306	return hadd_s(m);
307	}
308
309	/**
310	* @brief Return the dot product for the full 4 lanes, returning vector.
311	*/
312	ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
313	{
314	vfloat4 m = a * b;
315	return vfloat4 (hadd_s(m));
316	}
317
318	/**
319	* @brief Return the dot product for the bottom 3 lanes, returning scalar.
320	*/
321	ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
322	{
323	vfloat4 m = a * b;
324	return hadd_rgb_s(m);
325	}
326
327	/**
328	* @brief Return the dot product for the bottom 3 lanes, returning vector.
329	*/
330	ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
331	{
332	vfloat4 m = a * b;
333	float d3 = hadd_rgb_s(m);
334	return vfloat4 (d3, d3, d3, `0.0f`);
335	}
336
337	#endif
338
339	#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
340
/**
 * @brief Population bit count.
 *
 * Portable fallback that counts bits in parallel: adjacent 1-bit fields are
 * summed into 2-bit counts, then 4-bit counts, then per-byte counts, and
 * finally the eight byte counts are added together with a multiply.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits, in the range [0, 64].
 */
static inline int popcount(uint64_t v)
{
	const uint64_t mask1 = 0x5555555555555555ULL;
	const uint64_t mask2 = 0x3333333333333333ULL;
	const uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;

	// Each 2-bit field now holds the count of set bits in that field
	v -= (v >> 1) & mask1;

	// Each 4-bit field now holds the count of set bits in that field
	v = (v & mask2) + ((v >> 2) & mask2);

	// Each byte now holds the count of set bits in that byte
	v += v >> 4;
	v &= mask3;

	// The multiply sums all eight byte counts into the top byte
	v *= 0x0101010101010101ULL;
	v >>= 56;
	return static_cast<int>(v);
}
361
362	#endif
363
364	/**
365	* @brief Apply signed bit transfer.
366	*
367	* @param input0 The first encoded endpoint.
368	* @param input1 The second encoded endpoint.
369	*/
370	static ASTCENC_SIMD_INLINE void bit_transfer_signed(
371	vint4& input0,
372	vint4& input1
373	) {
374	input1 = lsr<`1`>(input1) \| (input0 & `0x80`);
375	input0 = lsr<`1`>(input0) & `0x3F`;
376
377	vmask4 mask = (input0 & `0x20`) != vint4::zero();
378	input0 = select(input0, input0 - `0x40`, mask);
379	}
380
381	/**
382	* @brief Debug function to print a vector of ints.
383	*/
384	ASTCENC_SIMD_INLINE void print(vint4 a)
385	{
386	alignas(`16`) int v[`4`];
387	storea(a, v);
388	printf("v4_i32:\n %8d %8d %8d %8d\n",
389	v[`0`], v[`1`], v[`2`], v[`3`]);
390	}
391
392	/**
393	* @brief Debug function to print a vector of ints.
394	*/
395	ASTCENC_SIMD_INLINE void printx(vint4 a)
396	{
397	alignas(`16`) int v[`4`];
398	storea(a, v);
399	printf("v4_i32:\n %08x %08x %08x %08x\n",
400	v[`0`], v[`1`], v[`2`], v[`3`]);
401	}
402
403	/**
404	* @brief Debug function to print a vector of floats.
405	*/
406	ASTCENC_SIMD_INLINE void print(vfloat4 a)
407	{
408	alignas(`16`) float v[`4`];
409	storea(a, v);
410	printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
411	static_cast<double>(v[`0`]), static_cast<double>(v[`1`]),
412	static_cast<double>(v[`2`]), static_cast<double>(v[`3`]));
413	}
414
415	/**
416	* @brief Debug function to print a vector of masks.
417	*/
418	ASTCENC_SIMD_INLINE void print(vmask4 a)
419	{
420	print(select(vint4 (`0`), vint4 (`1`), a));
421	}
422
423	#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
424

Browse the source code of Godot/thirdparty/astcenc/astcenc_vecmathlib_common_4.h