1 | // SPDX-License-Identifier: Apache-2.0 |
2 | // ---------------------------------------------------------------------------- |
3 | // Copyright 2020-2021 Arm Limited |
4 | // |
5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | // use this file except in compliance with the License. You may obtain a copy |
7 | // of the License at: |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, software |
12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | // License for the specific language governing permissions and limitations |
15 | // under the License. |
16 | // ---------------------------------------------------------------------------- |
17 | |
18 | /** |
19 | * @brief Generic 4x32-bit vector functions. |
20 | * |
21 | * This module implements generic 4-wide vector functions that are valid for |
 * all instruction sets, typically implemented using lower-level 4-wide
23 | * operations that are ISA-specific. |
24 | */ |
25 | |
26 | #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED |
27 | #define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED |
28 | |
29 | #ifndef ASTCENC_SIMD_INLINE |
30 | #error "Include astcenc_vecmathlib.h, do not include directly" |
31 | #endif |
32 | |
#include <cstdint>
#include <cstdio>
34 | |
35 | // ============================================================================ |
36 | // vmask4 operators and functions |
37 | // ============================================================================ |
38 | |
39 | /** |
40 | * @brief True if any lanes are enabled, false otherwise. |
41 | */ |
42 | ASTCENC_SIMD_INLINE bool any(vmask4 a) |
43 | { |
44 | return mask(a) != 0; |
45 | } |
46 | |
47 | /** |
48 | * @brief True if all lanes are enabled, false otherwise. |
49 | */ |
50 | ASTCENC_SIMD_INLINE bool all(vmask4 a) |
51 | { |
52 | return mask(a) == 0xF; |
53 | } |
54 | |
55 | // ============================================================================ |
56 | // vint4 operators and functions |
57 | // ============================================================================ |
58 | |
59 | /** |
60 | * @brief Overload: vector by scalar addition. |
61 | */ |
62 | ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b) |
63 | { |
64 | return a + vint4(b); |
65 | } |
66 | |
67 | /** |
68 | * @brief Overload: vector by vector incremental addition. |
69 | */ |
70 | ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b) |
71 | { |
72 | a = a + b; |
73 | return a; |
74 | } |
75 | |
76 | /** |
77 | * @brief Overload: vector by scalar subtraction. |
78 | */ |
79 | ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b) |
80 | { |
81 | return a - vint4(b); |
82 | } |
83 | |
84 | /** |
85 | * @brief Overload: vector by scalar multiplication. |
86 | */ |
87 | ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b) |
88 | { |
89 | return a * vint4(b); |
90 | } |
91 | |
92 | /** |
93 | * @brief Overload: vector by scalar bitwise or. |
94 | */ |
95 | ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b) |
96 | { |
97 | return a | vint4(b); |
98 | } |
99 | |
100 | /** |
101 | * @brief Overload: vector by scalar bitwise and. |
102 | */ |
103 | ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b) |
104 | { |
105 | return a & vint4(b); |
106 | } |
107 | |
108 | /** |
109 | * @brief Overload: vector by scalar bitwise xor. |
110 | */ |
111 | ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b) |
112 | { |
113 | return a ^ vint4(b); |
114 | } |
115 | |
116 | /** |
117 | * @brief Return the clamped value between min and max. |
118 | */ |
119 | ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a) |
120 | { |
121 | return min(max(a, vint4(minv)), vint4(maxv)); |
122 | } |
123 | |
124 | /** |
125 | * @brief Return the horizontal sum of RGB vector lanes as a scalar. |
126 | */ |
127 | ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a) |
128 | { |
129 | return a.lane<0>() + a.lane<1>() + a.lane<2>(); |
130 | } |
131 | |
132 | // ============================================================================ |
133 | // vfloat4 operators and functions |
134 | // ============================================================================ |
135 | |
136 | /** |
137 | * @brief Overload: vector by vector incremental addition. |
138 | */ |
139 | ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b) |
140 | { |
141 | a = a + b; |
142 | return a; |
143 | } |
144 | |
145 | /** |
146 | * @brief Overload: vector by scalar addition. |
147 | */ |
148 | ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b) |
149 | { |
150 | return a + vfloat4(b); |
151 | } |
152 | |
153 | /** |
154 | * @brief Overload: vector by scalar subtraction. |
155 | */ |
156 | ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b) |
157 | { |
158 | return a - vfloat4(b); |
159 | } |
160 | |
161 | /** |
162 | * @brief Overload: vector by scalar multiplication. |
163 | */ |
164 | ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b) |
165 | { |
166 | return a * vfloat4(b); |
167 | } |
168 | |
169 | /** |
170 | * @brief Overload: scalar by vector multiplication. |
171 | */ |
172 | ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b) |
173 | { |
174 | return vfloat4(a) * b; |
175 | } |
176 | |
177 | /** |
178 | * @brief Overload: vector by scalar division. |
179 | */ |
180 | ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b) |
181 | { |
182 | return a / vfloat4(b); |
183 | } |
184 | |
185 | /** |
186 | * @brief Overload: scalar by vector division. |
187 | */ |
188 | ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b) |
189 | { |
190 | return vfloat4(a) / b; |
191 | } |
192 | |
193 | /** |
194 | * @brief Return the min vector of a vector and a scalar. |
195 | * |
196 | * If either lane value is NaN, @c b will be returned for that lane. |
197 | */ |
198 | ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b) |
199 | { |
200 | return min(a, vfloat4(b)); |
201 | } |
202 | |
203 | /** |
204 | * @brief Return the max vector of a vector and a scalar. |
205 | * |
206 | * If either lane value is NaN, @c b will be returned for that lane. |
207 | */ |
208 | ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b) |
209 | { |
210 | return max(a, vfloat4(b)); |
211 | } |
212 | |
213 | /** |
214 | * @brief Return the clamped value between min and max. |
215 | * |
216 | * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN |
217 | * then @c min will be returned for that lane. |
218 | */ |
219 | ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a) |
220 | { |
	// Do not reorder; min() and max() return the second operand if either input is NaN
222 | return min(max(a, minv), maxv); |
223 | } |
224 | |
225 | /** |
226 | * @brief Return the clamped value between 0.0f and max. |
227 | * |
228 | * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will |
229 | * be returned for that lane. |
230 | */ |
231 | ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a) |
232 | { |
	// Do not reorder; min() and max() return the second operand if either input is NaN
234 | return min(max(a, vfloat4::zero()), maxv); |
235 | } |
236 | |
237 | /** |
238 | * @brief Return the clamped value between 0.0f and 1.0f. |
239 | * |
240 | * If @c a is NaN then zero will be returned for that lane. |
241 | */ |
242 | ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a) |
243 | { |
	// Do not reorder; min() and max() return the second operand if either input is NaN
245 | return min(max(a, vfloat4::zero()), 1.0f); |
246 | } |
247 | |
248 | /** |
 * @brief Return the horizontal minimum of a vector as a scalar.
250 | */ |
251 | ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a) |
252 | { |
253 | return hmin(a).lane<0>(); |
254 | } |
255 | |
256 | /** |
257 | * @brief Return the horizontal min of RGB vector lanes as a scalar. |
258 | */ |
259 | ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a) |
260 | { |
	// Duplicate lane 0 into the alpha lane so it cannot affect the reduction
	a.set_lane<3>(a.lane<0>());
262 | return hmin_s(a); |
263 | } |
264 | |
265 | /** |
 * @brief Return the horizontal maximum of a vector as a scalar.
267 | */ |
268 | ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a) |
269 | { |
270 | return hmax(a).lane<0>(); |
271 | } |
272 | |
273 | /** |
274 | * @brief Accumulate lane-wise sums for a vector. |
275 | */ |
276 | ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a) |
277 | { |
278 | accum = accum + a; |
279 | } |
280 | |
281 | /** |
282 | * @brief Accumulate lane-wise sums for a masked vector. |
283 | */ |
284 | ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m) |
285 | { |
286 | a = select(vfloat4::zero(), a, m); |
287 | haccumulate(accum, a); |
288 | } |
289 | |
290 | /** |
291 | * @brief Return the horizontal sum of RGB vector lanes as a scalar. |
292 | */ |
293 | ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) |
294 | { |
295 | return a.lane<0>() + a.lane<1>() + a.lane<2>(); |
296 | } |
297 | |
298 | #if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT) |
299 | |
300 | /** |
301 | * @brief Return the dot product for the full 4 lanes, returning scalar. |
302 | */ |
303 | ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b) |
304 | { |
305 | vfloat4 m = a * b; |
306 | return hadd_s(m); |
307 | } |
308 | |
309 | /** |
310 | * @brief Return the dot product for the full 4 lanes, returning vector. |
311 | */ |
312 | ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b) |
313 | { |
314 | vfloat4 m = a * b; |
315 | return vfloat4(hadd_s(m)); |
316 | } |
317 | |
318 | /** |
319 | * @brief Return the dot product for the bottom 3 lanes, returning scalar. |
320 | */ |
321 | ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b) |
322 | { |
323 | vfloat4 m = a * b; |
324 | return hadd_rgb_s(m); |
325 | } |
326 | |
327 | /** |
328 | * @brief Return the dot product for the bottom 3 lanes, returning vector. |
329 | */ |
330 | ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b) |
331 | { |
332 | vfloat4 m = a * b; |
333 | float d3 = hadd_rgb_s(m); |
334 | return vfloat4(d3, d3, d3, 0.0f); |
335 | } |
336 | |
337 | #endif |
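
// Illustrative usage (a sketch), assuming the vector-by-vector operators from
// the ISA-specific headers: the three-lane forms are convenient for RGB error
// metrics, e.g. the squared length of a color difference.
//
//     vfloat4 diff = color_a - color_b;
//     float sq_err = dot3_s(diff, diff);   // r^2 + g^2 + b^2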
338 | |
339 | #if !defined(ASTCENC_USE_NATIVE_POPCOUNT) |
340 | |
341 | /** |
342 | * @brief Population bit count. |
343 | * |
 * @param v The value to count set bits in.
345 | * |
346 | * @return The number of 1 bits. |
347 | */ |
348 | static inline int popcount(uint64_t v) |
349 | { |
	// SWAR population count (Hamming weight) without hardware support
	uint64_t mask1 = 0x5555555555555555ULL;
	uint64_t mask2 = 0x3333333333333333ULL;
	uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;
	v -= (v >> 1) & mask1;                  // Per 2-bit field: count of set bits
	v = (v & mask2) + ((v >> 2) & mask2);   // Per 4-bit field: sum adjacent pairs
	v += v >> 4;                            // Per 8-bit field: sum adjacent nibbles
	v &= mask3;                             // Mask out garbage between byte sums
	v *= 0x0101010101010101ULL;             // Sum all byte counts into the top byte
	v >>= 56;                               // Extract the total from the top byte
359 | return static_cast<int>(v); |
360 | } |
361 | |
362 | #endif |
363 | |
364 | /** |
365 | * @brief Apply signed bit transfer. |
366 | * |
367 | * @param input0 The first encoded endpoint. |
368 | * @param input1 The second encoded endpoint. |
369 | */ |
370 | static ASTCENC_SIMD_INLINE void bit_transfer_signed( |
371 | vint4& input0, |
372 | vint4& input1 |
373 | ) { |
	// Shift input1 right and move the top bit of input0 into its top bit
	input1 = lsr<1>(input1) | (input0 & 0x80);
	// Shift input0 right and keep the low 6 bits of the result
	input0 = lsr<1>(input0) & 0x3F;

	// Sign-extend input0 from 6 bits: subtract 0x40 where bit 5 is set
	vmask4 mask = (input0 & 0x20) != vint4::zero();
	input0 = select(input0, input0 - 0x40, mask);
379 | } |
380 | |
381 | /** |
382 | * @brief Debug function to print a vector of ints. |
383 | */ |
384 | ASTCENC_SIMD_INLINE void print(vint4 a) |
385 | { |
386 | alignas(16) int v[4]; |
387 | storea(a, v); |
388 | printf("v4_i32:\n %8d %8d %8d %8d\n" , |
389 | v[0], v[1], v[2], v[3]); |
390 | } |
391 | |
392 | /** |
 * @brief Debug function to print a vector of ints in hexadecimal.
394 | */ |
395 | ASTCENC_SIMD_INLINE void printx(vint4 a) |
396 | { |
397 | alignas(16) int v[4]; |
398 | storea(a, v); |
399 | printf("v4_i32:\n %08x %08x %08x %08x\n" , |
400 | v[0], v[1], v[2], v[3]); |
401 | } |
402 | |
403 | /** |
404 | * @brief Debug function to print a vector of floats. |
405 | */ |
406 | ASTCENC_SIMD_INLINE void print(vfloat4 a) |
407 | { |
408 | alignas(16) float v[4]; |
409 | storea(a, v); |
410 | printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n" , |
411 | static_cast<double>(v[0]), static_cast<double>(v[1]), |
412 | static_cast<double>(v[2]), static_cast<double>(v[3])); |
413 | } |
414 | |
415 | /** |
 * @brief Debug function to print a vector mask.
417 | */ |
418 | ASTCENC_SIMD_INLINE void print(vmask4 a) |
419 | { |
420 | print(select(vint4(0), vint4(1), a)); |
421 | } |
422 | |
423 | #endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED |
424 | |