| 1 | // SPDX-License-Identifier: Apache-2.0 |
| 2 | // ---------------------------------------------------------------------------- |
| 3 | // Copyright 2020-2021 Arm Limited |
| 4 | // |
| 5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| 6 | // use this file except in compliance with the License. You may obtain a copy |
| 7 | // of the License at: |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, software |
| 12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| 13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| 14 | // License for the specific language governing permissions and limitations |
| 15 | // under the License. |
| 16 | // ---------------------------------------------------------------------------- |
| 17 | |
| 18 | /** |
| 19 | * @brief Generic 4x32-bit vector functions. |
| 20 | * |
| 21 | * This module implements generic 4-wide vector functions that are valid for |
| 22 | * all instruction sets, typically implemented using lower level 4-wide |
| 23 | * operations that are ISA-specific. |
| 24 | */ |
| 25 | |
| 26 | #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED |
| 27 | #define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED |
| 28 | |
| 29 | #ifndef ASTCENC_SIMD_INLINE |
| 30 | #error "Include astcenc_vecmathlib.h, do not include directly" |
| 31 | #endif |
| 32 | |
| 33 | #include <cstdio> |
| 34 | |
| 35 | // ============================================================================ |
| 36 | // vmask4 operators and functions |
| 37 | // ============================================================================ |
| 38 | |
| 39 | /** |
| 40 | * @brief True if any lanes are enabled, false otherwise. |
| 41 | */ |
| 42 | ASTCENC_SIMD_INLINE bool any(vmask4 a) |
| 43 | { |
| 44 | return mask(a) != 0; |
| 45 | } |
| 46 | |
| 47 | /** |
| 48 | * @brief True if all lanes are enabled, false otherwise. |
| 49 | */ |
| 50 | ASTCENC_SIMD_INLINE bool all(vmask4 a) |
| 51 | { |
| 52 | return mask(a) == 0xF; |
| 53 | } |
| 54 | |
| 55 | // ============================================================================ |
| 56 | // vint4 operators and functions |
| 57 | // ============================================================================ |
| 58 | |
| 59 | /** |
| 60 | * @brief Overload: vector by scalar addition. |
| 61 | */ |
| 62 | ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b) |
| 63 | { |
| 64 | return a + vint4(b); |
| 65 | } |
| 66 | |
| 67 | /** |
| 68 | * @brief Overload: vector by vector incremental addition. |
| 69 | */ |
| 70 | ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b) |
| 71 | { |
| 72 | a = a + b; |
| 73 | return a; |
| 74 | } |
| 75 | |
| 76 | /** |
| 77 | * @brief Overload: vector by scalar subtraction. |
| 78 | */ |
| 79 | ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b) |
| 80 | { |
| 81 | return a - vint4(b); |
| 82 | } |
| 83 | |
| 84 | /** |
| 85 | * @brief Overload: vector by scalar multiplication. |
| 86 | */ |
| 87 | ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b) |
| 88 | { |
| 89 | return a * vint4(b); |
| 90 | } |
| 91 | |
| 92 | /** |
| 93 | * @brief Overload: vector by scalar bitwise or. |
| 94 | */ |
| 95 | ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b) |
| 96 | { |
| 97 | return a | vint4(b); |
| 98 | } |
| 99 | |
| 100 | /** |
| 101 | * @brief Overload: vector by scalar bitwise and. |
| 102 | */ |
| 103 | ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b) |
| 104 | { |
| 105 | return a & vint4(b); |
| 106 | } |
| 107 | |
| 108 | /** |
| 109 | * @brief Overload: vector by scalar bitwise xor. |
| 110 | */ |
| 111 | ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b) |
| 112 | { |
| 113 | return a ^ vint4(b); |
| 114 | } |
| 115 | |
| 116 | /** |
| 117 | * @brief Return the clamped value between min and max. |
| 118 | */ |
| 119 | ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a) |
| 120 | { |
| 121 | return min(max(a, vint4(minv)), vint4(maxv)); |
| 122 | } |
| 123 | |
| 124 | /** |
| 125 | * @brief Return the horizontal sum of RGB vector lanes as a scalar. |
| 126 | */ |
| 127 | ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a) |
| 128 | { |
| 129 | return a.lane<0>() + a.lane<1>() + a.lane<2>(); |
| 130 | } |
| 131 | |
| 132 | // ============================================================================ |
| 133 | // vfloat4 operators and functions |
| 134 | // ============================================================================ |
| 135 | |
| 136 | /** |
| 137 | * @brief Overload: vector by vector incremental addition. |
| 138 | */ |
| 139 | ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b) |
| 140 | { |
| 141 | a = a + b; |
| 142 | return a; |
| 143 | } |
| 144 | |
| 145 | /** |
| 146 | * @brief Overload: vector by scalar addition. |
| 147 | */ |
| 148 | ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b) |
| 149 | { |
| 150 | return a + vfloat4(b); |
| 151 | } |
| 152 | |
| 153 | /** |
| 154 | * @brief Overload: vector by scalar subtraction. |
| 155 | */ |
| 156 | ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b) |
| 157 | { |
| 158 | return a - vfloat4(b); |
| 159 | } |
| 160 | |
| 161 | /** |
| 162 | * @brief Overload: vector by scalar multiplication. |
| 163 | */ |
| 164 | ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b) |
| 165 | { |
| 166 | return a * vfloat4(b); |
| 167 | } |
| 168 | |
| 169 | /** |
| 170 | * @brief Overload: scalar by vector multiplication. |
| 171 | */ |
| 172 | ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b) |
| 173 | { |
| 174 | return vfloat4(a) * b; |
| 175 | } |
| 176 | |
| 177 | /** |
| 178 | * @brief Overload: vector by scalar division. |
| 179 | */ |
| 180 | ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b) |
| 181 | { |
| 182 | return a / vfloat4(b); |
| 183 | } |
| 184 | |
| 185 | /** |
| 186 | * @brief Overload: scalar by vector division. |
| 187 | */ |
| 188 | ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b) |
| 189 | { |
| 190 | return vfloat4(a) / b; |
| 191 | } |
| 192 | |
| 193 | /** |
| 194 | * @brief Return the min vector of a vector and a scalar. |
| 195 | * |
| 196 | * If either lane value is NaN, @c b will be returned for that lane. |
| 197 | */ |
| 198 | ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b) |
| 199 | { |
| 200 | return min(a, vfloat4(b)); |
| 201 | } |
| 202 | |
| 203 | /** |
| 204 | * @brief Return the max vector of a vector and a scalar. |
| 205 | * |
| 206 | * If either lane value is NaN, @c b will be returned for that lane. |
| 207 | */ |
| 208 | ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b) |
| 209 | { |
| 210 | return max(a, vfloat4(b)); |
| 211 | } |
| 212 | |
| 213 | /** |
| 214 | * @brief Return the clamped value between min and max. |
| 215 | * |
| 216 | * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN |
| 217 | * then @c min will be returned for that lane. |
| 218 | */ |
| 219 | ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a) |
| 220 | { |
| 221 | // Do not reorder - second operand will return if either is NaN |
| 222 | return min(max(a, minv), maxv); |
| 223 | } |
| 224 | |
| 225 | /** |
| 226 | * @brief Return the clamped value between 0.0f and max. |
| 227 | * |
| 228 | * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will |
| 229 | * be returned for that lane. |
| 230 | */ |
| 231 | ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a) |
| 232 | { |
| 233 | // Do not reorder - second operand will return if either is NaN |
| 234 | return min(max(a, vfloat4::zero()), maxv); |
| 235 | } |
| 236 | |
| 237 | /** |
| 238 | * @brief Return the clamped value between 0.0f and 1.0f. |
| 239 | * |
| 240 | * If @c a is NaN then zero will be returned for that lane. |
| 241 | */ |
| 242 | ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a) |
| 243 | { |
| 244 | // Do not reorder - second operand will return if either is NaN |
| 245 | return min(max(a, vfloat4::zero()), 1.0f); |
| 246 | } |
| 247 | |
| 248 | /** |
| 249 | * @brief Return the horizontal minimum of a vector. |
| 250 | */ |
| 251 | ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a) |
| 252 | { |
| 253 | return hmin(a).lane<0>(); |
| 254 | } |
| 255 | |
| 256 | /** |
| 257 | * @brief Return the horizontal min of RGB vector lanes as a scalar. |
| 258 | */ |
| 259 | ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a) |
| 260 | { |
| 261 | a.set_lane<3>(a.lane<0>()); |
| 262 | return hmin_s(a); |
| 263 | } |
| 264 | |
| 265 | /** |
| 266 | * @brief Return the horizontal maximum of a vector. |
| 267 | */ |
| 268 | ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a) |
| 269 | { |
| 270 | return hmax(a).lane<0>(); |
| 271 | } |
| 272 | |
| 273 | /** |
| 274 | * @brief Accumulate lane-wise sums for a vector. |
| 275 | */ |
| 276 | ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a) |
| 277 | { |
| 278 | accum = accum + a; |
| 279 | } |
| 280 | |
| 281 | /** |
| 282 | * @brief Accumulate lane-wise sums for a masked vector. |
| 283 | */ |
| 284 | ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m) |
| 285 | { |
| 286 | a = select(vfloat4::zero(), a, m); |
| 287 | haccumulate(accum, a); |
| 288 | } |
| 289 | |
| 290 | /** |
| 291 | * @brief Return the horizontal sum of RGB vector lanes as a scalar. |
| 292 | */ |
| 293 | ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) |
| 294 | { |
| 295 | return a.lane<0>() + a.lane<1>() + a.lane<2>(); |
| 296 | } |
| 297 | |
| 298 | #if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT) |
| 299 | |
| 300 | /** |
| 301 | * @brief Return the dot product for the full 4 lanes, returning scalar. |
| 302 | */ |
| 303 | ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b) |
| 304 | { |
| 305 | vfloat4 m = a * b; |
| 306 | return hadd_s(m); |
| 307 | } |
| 308 | |
| 309 | /** |
| 310 | * @brief Return the dot product for the full 4 lanes, returning vector. |
| 311 | */ |
| 312 | ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b) |
| 313 | { |
| 314 | vfloat4 m = a * b; |
| 315 | return vfloat4(hadd_s(m)); |
| 316 | } |
| 317 | |
| 318 | /** |
| 319 | * @brief Return the dot product for the bottom 3 lanes, returning scalar. |
| 320 | */ |
| 321 | ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b) |
| 322 | { |
| 323 | vfloat4 m = a * b; |
| 324 | return hadd_rgb_s(m); |
| 325 | } |
| 326 | |
| 327 | /** |
| 328 | * @brief Return the dot product for the bottom 3 lanes, returning vector. |
| 329 | */ |
| 330 | ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b) |
| 331 | { |
| 332 | vfloat4 m = a * b; |
| 333 | float d3 = hadd_rgb_s(m); |
| 334 | return vfloat4(d3, d3, d3, 0.0f); |
| 335 | } |
| 336 | |
| 337 | #endif |
| 338 | |
| 339 | #if !defined(ASTCENC_USE_NATIVE_POPCOUNT) |
| 340 | |
/**
 * @brief Population bit count.
 *
 * Uses the classic SWAR reduction: pair sums, nibble sums, byte sums, then a
 * multiply to gather all byte counts into the top byte.
 *
 * @param v The value to population count.
 *
 * @return The number of 1 bits.
 */
static inline int popcount(uint64_t v)
{
	// Fold to 2-bit pair counts, then 4-bit counts, then byte counts
	v = v - ((v >> 1) & 0x5555555555555555ULL);
	v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
	v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
	// Multiply accumulates every byte count into the most significant byte
	return static_cast<int>((v * 0x0101010101010101ULL) >> 56);
}
| 361 | |
| 362 | #endif |
| 363 | |
/**
 * @brief Apply signed bit transfer.
 *
 * Both vectors are updated in place: the top payload bit (bit 7) of
 * @c input0 is moved into @c input1, and the remaining 6 bits of @c input0
 * are reinterpreted as a signed value.
 *
 * @param input0   The first encoded endpoint; replaced by its sign-extended
 *                 6-bit payload.
 * @param input1   The second encoded endpoint; replaced by its shifted value
 *                 with the transferred bit in the top position.
 */
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
	vint4& input0,
	vint4& input1
) {
	// Shift input1 down and splice in bit 7 taken from input0
	input1 = lsr<1>(input1) | (input0 & 0x80);
	// Shift input0 down and keep only its low 6 payload bits
	input0 = lsr<1>(input0) & 0x3F;

	// Sign-extend the 6-bit field: lanes with bit 5 set become negative
	vmask4 mask = (input0 & 0x20) != vint4::zero();
	input0 = select(input0, input0 - 0x40, mask);
}
| 380 | |
| 381 | /** |
| 382 | * @brief Debug function to print a vector of ints. |
| 383 | */ |
| 384 | ASTCENC_SIMD_INLINE void print(vint4 a) |
| 385 | { |
| 386 | alignas(16) int v[4]; |
| 387 | storea(a, v); |
| 388 | printf("v4_i32:\n %8d %8d %8d %8d\n" , |
| 389 | v[0], v[1], v[2], v[3]); |
| 390 | } |
| 391 | |
| 392 | /** |
| 393 | * @brief Debug function to print a vector of ints. |
| 394 | */ |
| 395 | ASTCENC_SIMD_INLINE void printx(vint4 a) |
| 396 | { |
| 397 | alignas(16) int v[4]; |
| 398 | storea(a, v); |
| 399 | printf("v4_i32:\n %08x %08x %08x %08x\n" , |
| 400 | v[0], v[1], v[2], v[3]); |
| 401 | } |
| 402 | |
| 403 | /** |
| 404 | * @brief Debug function to print a vector of floats. |
| 405 | */ |
| 406 | ASTCENC_SIMD_INLINE void print(vfloat4 a) |
| 407 | { |
| 408 | alignas(16) float v[4]; |
| 409 | storea(a, v); |
| 410 | printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n" , |
| 411 | static_cast<double>(v[0]), static_cast<double>(v[1]), |
| 412 | static_cast<double>(v[2]), static_cast<double>(v[3])); |
| 413 | } |
| 414 | |
| 415 | /** |
| 416 | * @brief Debug function to print a vector of masks. |
| 417 | */ |
| 418 | ASTCENC_SIMD_INLINE void print(vmask4 a) |
| 419 | { |
| 420 | print(select(vint4(0), vint4(1), a)); |
| 421 | } |
| 422 | |
| 423 | #endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED |
| 424 | |