/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// Intentionally NO #pragma once... included multiple times.

// This file is included from skcms.cc in a namespace with some pre-defines:
//    - N:    depth of all vectors, 1,4,8, or 16 (preprocessor define)
//    - V<T>: a template to create a vector of N T's.

using F   = V<Color>;   // Called F for historic reasons... maybe rename C?
using I32 = V<int32_t>;
using U64 = V<uint64_t>;
using U32 = V<uint32_t>;
using U16 = V<uint16_t>;
using U8  = V<uint8_t>;

#if defined(__GNUC__) && !defined(__clang__)
    // Once again, GCC is kind of weird, not allowing vector = scalar directly.
    static constexpr F F0 = F() + 0.0f,
                       F1 = F() + 1.0f;
#else
    static constexpr F F0 = 0.0f,
                       F1 = 1.0f;
#endif

// Instead of checking __AVX__ below, we'll check USING_AVX.
// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.

#if !defined(USING_AVX)      && N == 8 && defined(__AVX__)
    #define  USING_AVX
#endif
#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
    #define  USING_AVX_F16C
#endif
#if !defined(USING_AVX2)     && defined(USING_AVX) && defined(__AVX2__)
    #define  USING_AVX2
#endif
#if !defined(USING_AVX512F)  && N == 16 && defined(__AVX512F__)
    #define  USING_AVX512F
#endif

// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
// This is more for organizational clarity... skcms.cc doesn't force these.
#if N > 1 && defined(__ARM_NEON)
    #define USING_NEON
    #if __ARM_FP & 2
        #define USING_NEON_F16C
    #endif
    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
        #define USING_NEON_FP16
    #endif
#endif

// These -Wvector-conversion warnings seem to trigger in very bogus situations,
// like vst3q_f32() expecting a 16x char rather than a 4x float vector.  :/
#if defined(USING_NEON) && defined(__clang__)
    #pragma clang diagnostic ignored "-Wvector-conversion"
#endif

// GCC warns us about returning U64 on x86 because it's larger than a register.
// You'd see warnings like, "using AVX even though AVX is not enabled".
// We stifle these warnings... our helpers that return U64 are always inlined.
#if defined(__SSE__) && defined(__GNUC__) && !defined(__clang__)
    #pragma GCC diagnostic ignored "-Wpsabi"
#endif

#if defined(__clang__)
    #define FALLTHROUGH [[clang::fallthrough]]
#else
    #define FALLTHROUGH
#endif

// We tag most helper functions as SI, to enforce good code generation
// but also work around what we think is a bug in GCC: when targeting 32-bit
// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
// MMX mm0 register, which seems to mess with unrelated code that later uses
// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
//
// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
#if defined(__clang__) || defined(__GNUC__)
    #define SI static inline __attribute__((always_inline))
#else
    #define SI static inline
#endif

template <typename T, typename P>
SI T load(const P* ptr) {
    T val;
    small_memcpy(&val, ptr, sizeof(val));
    return val;
}
template <typename T, typename P>
SI void store(P* ptr, const T& val) {
    small_memcpy(ptr, &val, sizeof(val));
}

// (T)v is a cast when N == 1 and a bit-pun when N>1,
// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
template <typename D, typename S>
SI D cast(const S& v) {
#if N == 1
    return (D)v;
#elif defined(__clang__)
    return __builtin_convertvector(v, D);
#else
    D d;
    for (int i = 0; i < N; i++) {
        d[i] = v[i];
    }
    return d;
#endif
}

template <typename D, typename S>
SI D bit_pun(const S& v) {
    static_assert(sizeof(D) == sizeof(v), "");
    return load<D>(&v);
}

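// For illustration (editor's sketch, not part of the original file): with N == 4,
//     F   f  = { 1.5f, -2.0f, 0.0f, 3.0f };
//     I32 ci = cast<I32>(f);     // value conversion:      {          1,         -2,          0,          3 }
//     U32 bi = bit_pun<U32>(f);  // bit reinterpretation:  { 0x3fc00000, 0xc0000000, 0x00000000, 0x40400000 }
// cast<> changes the numeric value into the destination type lane by lane, while
// bit_pun<> keeps the raw bits and only changes how they're interpreted.
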
// When we convert from float to fixed point, it's very common to want to round,
// and for some reason compilers generate better code when converting to int32_t.
// To serve both those ends, we use this function to_fixed() instead of direct cast().
#if defined(USING_NEON_FP16)
    // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
    SI U16 to_fixed(F f) {  return cast<U16>(f + 0.5f); }
#else
    SI U32 to_fixed(F f) {  return (U32)cast<I32>(f + 0.5f); }
#endif

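// Quick worked example (editorial note, not in the original): the cast truncates,
// so adding 0.5f first makes this round-to-nearest for the non-negative inputs we
// feed it, e.g. to_fixed(254.6f) -> (U32)cast<I32>(255.1f) == 255, while
// to_fixed(254.4f) -> (U32)cast<I32>(254.9f) == 254.
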
// Sometimes we do something crazy on one branch of a conditional,
// like divide by zero or convert a huge float to an integer,
// but then harmlessly select the other side.  That trips up N==1
// sanitizer builds, so we make if_then_else() a macro to avoid
// evaluating the unused side.

#if N == 1
    #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
#else
    template <typename C, typename T>
    SI T if_then_else(C cond, T t, T e) {
        return bit_pun<T>( ( cond & bit_pun<C>(t)) |
                           (~cond & bit_pun<C>(e)) );
    }
#endif

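// Editor's illustration (assumption: cond is an all-bits-set-per-lane mask, as vector
// comparisons like `x > y` produce): with N > 1,
//     F m = if_then_else(lo < hi, hi, lo);   // per-lane maximum
// Lanes where the comparison is true are all 1s, so `cond & bits(t)` keeps t there,
// and `~cond & bits(e)` keeps e everywhere else; or-ing the two selects per lane.
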
SI F F_from_Half(U16 half) {
#if defined(USING_NEON_FP16)
    return bit_pun<F>(half);
#elif defined(USING_NEON_F16C)
    return vcvt_f32_f16((float16x4_t)half);
#elif defined(USING_AVX512F)
    return (F)_mm512_cvtph_ps((__m256i)half);
#elif defined(USING_AVX_F16C)
    typedef int16_t __attribute__((vector_size(16))) I16;
    return __builtin_ia32_vcvtph2ps256((I16)half);
#else
    U32 wide = cast<U32>(half);
    // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
    U32 s  = wide & 0x8000,
        em = wide ^ s;

    // Constructing the float is easy if the half is not denormalized.
    F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );

    // Simply flush all denorm half floats to zero.
    return if_then_else(em < 0x0400, F0, norm);
#endif
}

#if defined(__clang__)
    // The -((127-15)<<10) underflows that side of the math when
    // we pass a denorm half float.  It's harmless... we'll take the 0 side anyway.
    __attribute__((no_sanitize("unsigned-integer-overflow")))
#endif
SI U16 Half_from_F(F f) {
#if defined(USING_NEON_FP16)
    return bit_pun<U16>(f);
#elif defined(USING_NEON_F16C)
    return (U16)vcvt_f16_f32(f);
#elif defined(USING_AVX512F)
    return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
#elif defined(USING_AVX_F16C)
    return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
#else
    // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
    U32 sem = bit_pun<U32>(f),
        s   = sem & 0x80000000,
        em  = sem ^ s;

    // For simplicity we flush denorm half floats (including all denorm floats) to zero.
    return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
                                                 , (s>>16) + (em>>13) - ((127-15)<<10)));
#endif
}

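// Worked example (editorial, not in the original): the half 0x3C00 is sign 0,
// exponent 15, mantissa 0, i.e. 1.0.  In the portable path above, s = 0,
// em = 0x3C00, and (em<<13) + ((127-15)<<23) = 0x3F800000, exactly the bit
// pattern of 1.0f.  Half_from_F reverses the same shift and re-bias, so values
// representable in half round-trip (denorms flush to zero in both directions).
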
// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
#if defined(USING_NEON_FP16)
    SI U16 swap_endian_16(U16 v) {
        return (U16)vrev16q_u8((uint8x16_t) v);
    }
#elif defined(USING_NEON)
    SI U16 swap_endian_16(U16 v) {
        return (U16)vrev16_u8((uint8x8_t) v);
    }
#endif

SI U64 swap_endian_16x4(const U64& rgba) {
    return (rgba & 0x00ff00ff00ff00ff) << 8
         | (rgba & 0xff00ff00ff00ff00) >> 8;
}

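// For example (editorial note): a big-endian 16-bit value 0x1234 read on a
// little-endian machine shows up as 0x3412; shifting the low bytes up and the
// high bytes down restores 0x1234 in each of the four 16-bit fields packed into
// a U64 lane.
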
#if defined(USING_NEON_FP16)
    SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
    SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
#elif defined(USING_NEON)
    SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
    SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
#else
    SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
    SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
#endif

SI F floor_(F x) {
#if N == 1
    return floorf_(x);
#elif defined(USING_NEON_FP16)
    return vrndmq_f16(x);
#elif defined(__aarch64__)
    return vrndmq_f32(x);
#elif defined(USING_AVX512F)
    // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
    // and the integer sanitizer catches that this implicit cast changes the
    // value from -1 to 65535.  We'll cast manually to work around it.
    // Read this as `return _mm512_floor_ps(x)`.
    return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
#elif defined(USING_AVX)
    return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
#elif defined(__SSE4_1__)
    return _mm_floor_ps(x);
#else
    // Round trip through integers with a truncating cast.
    F roundtrip = cast<F>(cast<I32>(x));
    // If x is negative, truncating gives the ceiling instead of the floor.
    return roundtrip - if_then_else(roundtrip > x, F1, F0);

    // This implementation fails for values of x that are outside
    // the range an integer can represent.  We expect most x to be small.
#endif
}

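// Quick check of the fallback path (editorial note): floor_(-1.5f) truncates to
// -1.0f, which is greater than -1.5f, so we subtract 1 and return -2.0f; for
// +1.5f the truncation already gives the floor (1.0f) and nothing is subtracted.
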
SI F approx_log2(F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    return x;
#else
    // The first approximation of log2(x) is its exponent 'e', minus 127.
    I32 bits = bit_pun<I32>(x);

    F e = cast<F>(bits) * (1.0f / (1<<23));

    // If we use the mantissa too we can refine the error significantly.
    F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );

    return e - 124.225514990f
             -   1.498030302f*m
             -   1.725879990f/(0.3520887068f + m);
#endif
}

SI F approx_log(F x) {
    const float ln2 = 0.69314718f;
    return ln2 * approx_log2(x);
}

SI F approx_exp2(F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    return x;
#else
    F fract = x - floor_(x);

    I32 bits = cast<I32>((1.0f * (1<<23)) * (x + 121.274057500f
                                               -   1.490129070f*fract
                                               +  27.728023300f/(4.84252568f - fract)));
    return bit_pun<F>(bits);
#endif
}

SI F approx_pow(F x, float y) {
    return if_then_else((x == F0) | (x == F1), x
                                             , approx_exp2(approx_log2(x) * y));
}

SI F approx_exp(F x) {
    const float log2_e = 1.4426950408889634074f;
    return approx_exp2(log2_e * x);
}

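// Editorial sketch of why the first term works: for a positive float x, its bit
// pattern read as an integer is roughly (log2(x) + 127) * 2^23, so
// cast<F>(bits) * (1/2^23) is about log2(x) + 127.  For x = 8.0f the bits are
// 0x41000000 = 1090519040, and 1090519040 / 2^23 = 130.0, i.e. 3 + 127.
// The constants above fold the -127 bias in and use the mantissa term m to pull
// the piecewise-linear estimate much closer to the true log2; approx_exp2()
// runs the same idea in reverse, and approx_pow() composes the two.
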
// Return tf(x).
SI F apply_tf(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    // Peel off the sign bit and set x = |x|.
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    // The transfer function has a linear part up to d, exponential at d and after.
    F v = if_then_else(x < tf->d,            tf->c*x + tf->f
                                , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);

    // Tack the sign bit back on.
    return bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}

SI F apply_pq(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
                       / (tf->d + tf->e * approx_pow(x, tf->c)),
                     tf->f);

    return bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}

SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    const float R = tf->a, G = tf->b,
                a = tf->c, b = tf->d, c = tf->e;
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
                               , approx_exp((x-c)*a) + b);

    return bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}

SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    const float R = tf->a, G = tf->b,
                a = tf->c, b = tf->d, c = tf->e;
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    F v = if_then_else(x <= 1, R * approx_pow(x, G)
                             , a * approx_log(x - b) + c);

    return bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}

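// For reference (editorial note; these values are well-known constants, not taken
// from this file): the familiar sRGB curve fits apply_tf()'s parameterization with
// roughly g = 2.4, a = 1/1.055, b = 0.055/1.055, c = 1/12.92, d = 0.04045, e = f = 0,
// i.e. tf(x) = x/12.92 for x < 0.04045 and ((x + 0.055)/1.055)^2.4 above that.
// The sign-bit handling above simply makes each of these curves an odd function,
// tf(-x) = -tf(x), so negative inputs are mirrored rather than clamped.
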
// Strided loads and stores of N values, starting from p.
template <typename T, typename P>
SI T load_3(const P* p) {
#if N == 1
    return (T)p[0];
#elif N == 4
    return T{p[ 0],p[ 3],p[ 6],p[ 9]};
#elif N == 8
    return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
#elif N == 16
    return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
             p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
#endif
}

template <typename T, typename P>
SI T load_4(const P* p) {
#if N == 1
    return (T)p[0];
#elif N == 4
    return T{p[ 0],p[ 4],p[ 8],p[12]};
#elif N == 8
    return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
#elif N == 16
    return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
             p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
#endif
}

template <typename T, typename P>
SI void store_3(P* p, const T& v) {
#if N == 1
    p[0] = v;
#elif N == 4
    p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
#elif N == 8
    p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
    p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
#elif N == 16
    p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
    p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
    p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
    p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
#endif
}

template <typename T, typename P>
SI void store_4(P* p, const T& v) {
#if N == 1
    p[0] = v;
#elif N == 4
    p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
#elif N == 8
    p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
    p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
#elif N == 16
    p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
    p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
    p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
    p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
#endif
}

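// Usage sketch (editorial note): these strides match interleaved pixel formats.
// For packed RGB bytes and N == 4,
//     r = cast<F>(load_3<U32>(rgb+0)) * (1/255.0f);   // bytes 0, 3, 6, 9
//     g = cast<F>(load_3<U32>(rgb+1)) * (1/255.0f);   // bytes 1, 4, 7, 10
//     b = cast<F>(load_3<U32>(rgb+2)) * (1/255.0f);   // bytes 2, 5, 8, 11
// which is exactly how the portable Op_load_888 path below uses them; load_4 and
// store_4 do the same for 4-channel RGBA layouts.
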
SI U8 gather_8(const uint8_t* p, I32 ix) {
#if N == 1
    U8 v = p[ix];
#elif N == 4
    U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
#elif N == 8
    U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
             p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
#elif N == 16
    U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
             p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
             p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
             p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
#endif
    return v;
}

SI U16 gather_16(const uint8_t* p, I32 ix) {
    // Load the i'th 16-bit value from p.
    auto load_16 = [p](int i) {
        return load<uint16_t>(p + 2*i);
    };
#if N == 1
    U16 v = load_16(ix);
#elif N == 4
    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
#elif N == 8
    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
              load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
#elif N == 16
    U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
              load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
              load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
              load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
#endif
    return v;
}

SI U32 gather_32(const uint8_t* p, I32 ix) {
    // Load the i'th 32-bit value from p.
    auto load_32 = [p](int i) {
        return load<uint32_t>(p + 4*i);
    };
#if N == 1
    U32 v = load_32(ix);
#elif N == 4
    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
#elif N == 8
    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
              load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
#elif N == 16
    U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
              load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
              load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
              load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
#endif
    // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
    return v;
}

SI U32 gather_24(const uint8_t* p, I32 ix) {
    // First, back up a byte.  Any place we're gathering from has a safe junk byte to read
    // in front of it, either a previous table value, or some tag metadata.
    p -= 1;

    // Load the i'th 24-bit value from p, and 1 extra byte.
    auto load_24_32 = [p](int i) {
        return load<uint32_t>(p + 3*i);
    };

    // Now load multiples of 4 bytes (a junk byte, then r,g,b).
#if N == 1
    U32 v = load_24_32(ix);
#elif N == 4
    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
#elif N == 8 && !defined(USING_AVX2)
    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
              load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
#elif N == 8
    (void)load_24_32;
    // The gather instruction here doesn't need any particular alignment,
    // but the intrinsic takes a const int*.
    const int* p4 = bit_pun<const int*>(p);
    I32 zero = { 0, 0, 0, 0,  0, 0, 0, 0},
        mask = {-1,-1,-1,-1, -1,-1,-1,-1};
    #if defined(__clang__)
        U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
    #elif defined(__GNUC__)
        U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
    #endif
#elif N == 16
    (void)load_24_32;
    // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
    // And AVX-512 swapped the order of arguments.  :/
    const int* p4 = bit_pun<const int*>(p);
    U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
#endif

    // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
    return v >> 8;
}

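// Worked example of the trick above (editorial note): to fetch the 24-bit entry at
// index i we really want bytes p[3i], p[3i+1], p[3i+2].  After `p -= 1`, a 4-byte
// load at p + 3i reads { junk, r, g, b }; on a little-endian machine that value is
// (b<<24 | g<<16 | r<<8 | junk), so `v >> 8` leaves r,g,b in the low 24 bits,
// matching what sample_clut_8() unpacks below.
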
#if !defined(__arm__)
SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
    // As in gather_24(), with everything doubled.
    p -= 2;

    // Load the i'th 48-bit value from p, and 2 extra bytes.
    auto load_48_64 = [p](int i) {
        return load<uint64_t>(p + 6*i);
    };

#if N == 1
    *v = load_48_64(ix);
#elif N == 4
    *v = U64{
        load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
    };
#elif N == 8 && !defined(USING_AVX2)
    *v = U64{
        load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
        load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
    };
#elif N == 8
    (void)load_48_64;
    typedef int32_t   __attribute__((vector_size(16))) Half_I32;
    typedef long long __attribute__((vector_size(32))) Half_I64;

    // The gather instruction here doesn't need any particular alignment,
    // but the intrinsic takes a const long long*.
    const long long int* p8 = bit_pun<const long long int*>(p);

    Half_I64 zero = { 0, 0, 0, 0},
             mask = {-1,-1,-1,-1};

    ix *= 6;
    Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
             ix_hi = { ix[4], ix[5], ix[6], ix[7] };

    #if defined(__clang__)
        Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
                 hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
    #elif defined(__GNUC__)
        Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
                 hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
    #endif
    store((char*)v +  0, lo);
    store((char*)v + 32, hi);
#elif N == 16
    (void)load_48_64;
    const long long int* p8 = bit_pun<const long long int*>(p);
    __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
            hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
    store((char*)v +  0, lo);
    store((char*)v + 64, hi);
#endif

    *v >>= 16;
}
#endif

SI F F_from_U8(U8 v) {
    return cast<F>(v) * (1/255.0f);
}

SI F F_from_U16_BE(U16 v) {
    // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
    // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
    U16 lo = (v >> 8),
        hi = (v << 8) & 0xffff;
    return cast<F>(lo|hi) * (1/65535.0f);
}

SI U16 U16_from_F(F v) {
    // 65535 == inf in FP16, so promote to FP32 before converting.
    return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
}

SI F minus_1_ulp(F v) {
#if defined(USING_NEON_FP16)
    return bit_pun<F>( bit_pun<U16>(v) - 1 );
#else
    return bit_pun<F>( bit_pun<U32>(v) - 1 );
#endif
}

SI F table(const skcms_Curve* curve, F v) {
    // Clamp the input to [0,1], then scale to a table index.
    F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);

    // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
    I32 lo = cast<I32>(            ix      ),
        hi = cast<I32>(minus_1_ulp(ix+1.0f));
    F t = ix - cast<F>(lo);  // i.e. the fractional part of ix.

    // TODO: can we load l and h simultaneously?  Each entry in 'h' is either
    // the same as in 'l' or adjacent.  We have a rough idea that it'd always be safe
    // to read adjacent entries and perhaps underflow the table by a byte or two
    // (it'd be junk, but always safe to read).  Not sure how to lerp yet.
    F l,h;
    if (curve->table_8) {
        l = F_from_U8(gather_8(curve->table_8, lo));
        h = F_from_U8(gather_8(curve->table_8, hi));
    } else {
        l = F_from_U16_BE(gather_16(curve->table_16, lo));
        h = F_from_U16_BE(gather_16(curve->table_16, hi));
    }
    return l + (h-l)*t;
}

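// Worked example (editorial note): with a 256-entry table and v = 0.5f,
// ix = 0.5 * 255 = 127.5, so lo = 127, hi = 128, and t = 0.5; the result is the
// midpoint of entries 127 and 128.  The minus_1_ulp(ix+1.0f) in hi is what keeps
// hi from reaching 256 when v == 1.0 exactly (ix = 255.0 gives lo = hi = 255).
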
SI void sample_clut_8(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
    U32 rgb = gather_24(a2b->grid_8, ix);

    *r = cast<F>((rgb >>  0) & 0xff) * (1/255.0f);
    *g = cast<F>((rgb >>  8) & 0xff) * (1/255.0f);
    *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
}

SI void sample_clut_16(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
#if defined(__arm__)
    // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
    *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
    *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
    *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
#else
    // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
    U64 rgb;
    gather_48(a2b->grid_16, ix, &rgb);
    rgb = swap_endian_16x4(rgb);

    *r = cast<F>((rgb >>  0) & 0xffff) * (1/65535.0f);
    *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
    *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
#endif
}

// GCC 7.2.0 hits an internal compiler error with -finline-functions (or -O3)
// when targeting MIPS 64, i386, or s390x, I think attempting to inline clut() into exec_ops().
#if 1 && defined(__GNUC__) && !defined(__clang__) \
      && (defined(__mips64) || defined(__i386) || defined(__s390x__))
    #define MAYBE_NOINLINE __attribute__((noinline))
#else
    #define MAYBE_NOINLINE
#endif

MAYBE_NOINLINE
static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
    const int dim = (int)a2b->input_channels;
    assert (0 < dim && dim <= 4);

    // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
    I32 index [8];  // Index contribution by dimension, first low from 0, then high from 4.
    F   weight[8];  // Weight for each contribution, again first low, then high.

    // O(dim) work first: calculate index,weight from r,g,b,a.
    const F inputs[] = { *r,*g,*b,a };
    for (int i = dim-1, stride = 1; i >= 0; i--) {
        // x is where we logically want to sample the grid in the i-th dimension.
        F x = inputs[i] * (float)(a2b->grid_points[i] - 1);

        // But we can't index at floats.  lo and hi are the two integer grid points surrounding x.
        I32 lo = cast<I32>(            x      ),   // i.e. trunc(x) == floor(x) here.
            hi = cast<I32>(minus_1_ulp(x+1.0f));
        // Notice how we fold in the accumulated stride across previous dimensions here.
        index[i+0] = lo * stride;
        index[i+4] = hi * stride;
        stride *= a2b->grid_points[i];

        // We'll interpolate between those two integer grid points by t.
        F t = x - cast<F>(lo);  // i.e. fract(x)
        weight[i+0] = 1-t;
        weight[i+4] = t;
    }

    *r = *g = *b = F0;

    // We'll sample 2^dim == 1<<dim table entries per pixel,
    // in all combinations of low and high in each dimension.
    for (int combo = 0; combo < (1<<dim); combo++) {  // This loop can be done in any order.

        // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
        // where 0 selects the low index contribution and its weight 1-t,
        // or 4 the high index contribution and its weight t.

        // Since 0<dim≤4, we can always just start off with the 0-th channel,
        // then handle the others conditionally.
        I32 ix = index [0 + (combo&1)*4];
        F    w = weight[0 + (combo&1)*4];

        switch ((dim-1)&3) {  // This lets the compiler know there are no other cases to handle.
            case 3: ix += index [3 + (combo&8)/2];
                    w  *= weight[3 + (combo&8)/2];
                    FALLTHROUGH;
                    // fall through

            case 2: ix += index [2 + (combo&4)*1];
                    w  *= weight[2 + (combo&4)*1];
                    FALLTHROUGH;
                    // fall through

            case 1: ix += index [1 + (combo&2)*2];
                    w  *= weight[1 + (combo&2)*2];
        }

        F R,G,B;
        if (a2b->grid_8) {
            sample_clut_8 (a2b,ix, &R,&G,&B);
        } else {
            sample_clut_16(a2b,ix, &R,&G,&B);
        }

        *r += w*R;
        *g += w*G;
        *b += w*B;
    }
}

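// Editorial sketch of what the loop above computes: for dim == 3 (an RGB CLUT)
// there are 1<<3 == 8 surrounding grid corners, and each corner's weight is the
// product of one factor per axis, (1-t) for the low side or t for the high side.
// E.g. the corner (lo_r, hi_g, lo_b) contributes with weight (1-t_r) * t_g * (1-t_b),
// so the sum over all 8 corners is ordinary trilinear interpolation; dim == 4
// (CMYK) does the same with 16 corners.
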
static void exec_ops(const Op* ops, const void** args,
                     const char* src, char* dst, int i) {
    F r = F0, g = F0, b = F0, a = F1;
    while (true) {
        switch (*ops++) {
            case Op_load_a8:{
                a = F_from_U8(load<U8>(src + 1*i));
            } break;

            case Op_load_g8:{
                r = g = b = F_from_U8(load<U8>(src + 1*i));
            } break;

            case Op_load_4444:{
                U16 abgr = load<U16>(src + 2*i);

                r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
                g = cast<F>((abgr >>  8) & 0xf) * (1/15.0f);
                b = cast<F>((abgr >>  4) & 0xf) * (1/15.0f);
                a = cast<F>((abgr >>  0) & 0xf) * (1/15.0f);
            } break;

            case Op_load_565:{
                U16 rgb = load<U16>(src + 2*i);

                r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
                g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
                b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
            } break;

            case Op_load_888:{
                const uint8_t* rgb = (const uint8_t*)(src + 3*i);
            #if defined(USING_NEON_FP16)
                // See the explanation under USING_NEON below.  This is that doubled up.
                uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
                v = vld3q_lane_u8(rgb+ 0, v,  0);
                v = vld3q_lane_u8(rgb+ 3, v,  2);
                v = vld3q_lane_u8(rgb+ 6, v,  4);
                v = vld3q_lane_u8(rgb+ 9, v,  6);

                v = vld3q_lane_u8(rgb+12, v,  8);
                v = vld3q_lane_u8(rgb+15, v, 10);
                v = vld3q_lane_u8(rgb+18, v, 12);
                v = vld3q_lane_u8(rgb+21, v, 14);

                r = cast<F>((U16)v.val[0]) * (1/255.0f);
                g = cast<F>((U16)v.val[1]) * (1/255.0f);
                b = cast<F>((U16)v.val[2]) * (1/255.0f);
            #elif defined(USING_NEON)
                // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
                // a time.  Since we're doing that, we might as well load them into 16-bit lanes.
                // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
                uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
                v = vld3_lane_u8(rgb+0, v, 0);
                v = vld3_lane_u8(rgb+3, v, 2);
                v = vld3_lane_u8(rgb+6, v, 4);
                v = vld3_lane_u8(rgb+9, v, 6);

                // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
                // convert to F.  (Again, U32 would be even better here if we drop ARMv7 or split
                // ARMv7 and ARMv8 impls.)
                r = cast<F>((U16)v.val[0]) * (1/255.0f);
                g = cast<F>((U16)v.val[1]) * (1/255.0f);
                b = cast<F>((U16)v.val[2]) * (1/255.0f);
            #else
                r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
                g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
                b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
            #endif
            } break;

            case Op_load_8888:{
                U32 rgba = load<U32>(src + 4*i);

                r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
                g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
                b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
                a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
            } break;

            case Op_load_8888_palette8:{
                const uint8_t* palette = (const uint8_t*) *args++;
                I32 ix = cast<I32>(load<U8>(src + 1*i));
                U32 rgba = gather_32(palette, ix);

                r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
                g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
                b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
                a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
            } break;

            case Op_load_1010102:{
                U32 rgba = load<U32>(src + 4*i);

                r = cast<F>((rgba >>  0) & 0x3ff) * (1/1023.0f);
                g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
                b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
                a = cast<F>((rgba >> 30) & 0x3  ) * (1/   3.0f);
            } break;

            case Op_load_161616LE:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
            #else
                r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
                g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
                b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
            #endif
            } break;

            case Op_load_16161616LE:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
                a = cast<F>((U16)v.val[3]) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
                a = cast<F>((U16)v.val[3]) * (1/65535.0f);
            #else
                U64 px = load<U64>(rgba);

                r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
                g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
                b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
                a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
            #endif
            } break;

            case Op_load_161616BE:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
            #else
                U32 R = load_3<U32>(rgb+0),
                    G = load_3<U32>(rgb+1),
                    B = load_3<U32>(rgb+2);
                // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
                r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
                g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
                b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
            #endif
            } break;

            case Op_load_16161616BE:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
                a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
                a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
            #else
                U64 px = swap_endian_16x4(load<U64>(rgba));

                r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
                g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
                b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
                a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
            #endif
            } break;

            case Op_load_hhh:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2];
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2];
            #else
                U16 R = load_3<U16>(rgb+0),
                    G = load_3<U16>(rgb+1),
                    B = load_3<U16>(rgb+2);
            #endif
                r = F_from_Half(R);
                g = F_from_Half(G);
                b = F_from_Half(B);
            } break;

            case Op_load_hhhh:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2],
                    A = (U16)v.val[3];
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2],
                    A = (U16)v.val[3];
            #else
                U64 px = load<U64>(rgba);
                U16 R = cast<U16>((px >>  0) & 0xffff),
                    G = cast<U16>((px >> 16) & 0xffff),
                    B = cast<U16>((px >> 32) & 0xffff),
                    A = cast<U16>((px >> 48) & 0xffff);
            #endif
                r = F_from_Half(R);
                g = F_from_Half(G);
                b = F_from_Half(B);
                a = F_from_Half(A);
            } break;

            case Op_load_fff:{
                uintptr_t ptr = (uintptr_t)(src + 12*i);
                assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
                const float* rgb = (const float*)ptr;       // cast to const float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x3_t lo = vld3q_f32(rgb +  0),
                              hi = vld3q_f32(rgb + 12);
                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
            #elif defined(USING_NEON)
                float32x4x3_t v = vld3q_f32(rgb);
                r = (F)v.val[0];
                g = (F)v.val[1];
                b = (F)v.val[2];
            #else
                r = load_3<F>(rgb+0);
                g = load_3<F>(rgb+1);
                b = load_3<F>(rgb+2);
            #endif
            } break;

            case Op_load_ffff:{
                uintptr_t ptr = (uintptr_t)(src + 16*i);
                assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
                const float* rgba = (const float*)ptr;      // cast to const float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x4_t lo = vld4q_f32(rgba +  0),
                              hi = vld4q_f32(rgba + 16);
                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
                a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
            #elif defined(USING_NEON)
                float32x4x4_t v = vld4q_f32(rgba);
                r = (F)v.val[0];
                g = (F)v.val[1];
                b = (F)v.val[2];
                a = (F)v.val[3];
            #else
                r = load_4<F>(rgba+0);
                g = load_4<F>(rgba+1);
                b = load_4<F>(rgba+2);
                a = load_4<F>(rgba+3);
            #endif
            } break;

            case Op_swap_rb:{
                F t = r;
                r = b;
                b = t;
            } break;

            case Op_clamp:{
                r = max_(F0, min_(r, F1));
                g = max_(F0, min_(g, F1));
                b = max_(F0, min_(b, F1));
                a = max_(F0, min_(a, F1));
            } break;

            case Op_invert:{
                r = F1 - r;
                g = F1 - g;
                b = F1 - b;
                a = F1 - a;
            } break;

            case Op_force_opaque:{
                a = F1;
            } break;

            case Op_premul:{
                r *= a;
                g *= a;
                b *= a;
            } break;

            case Op_unpremul:{
                F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
                r *= scale;
                g *= scale;
                b *= scale;
            } break;

            case Op_matrix_3x3:{
                const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
                const float* m = &matrix->vals[0][0];

                F R = m[0]*r + m[1]*g + m[2]*b,
                  G = m[3]*r + m[4]*g + m[5]*b,
                  B = m[6]*r + m[7]*g + m[8]*b;

                r = R;
                g = G;
                b = B;
            } break;

            case Op_matrix_3x4:{
                const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
                const float* m = &matrix->vals[0][0];

                F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
                  G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
                  B = m[8]*r + m[9]*g + m[10]*b + m[11];

                r = R;
                g = G;
                b = B;
            } break;

            case Op_lab_to_xyz:{
                // The L*a*b values are in r,g,b, but normalized to [0,1].  Reconstruct them:
                F L = r * 100.0f,
                  A = g * 255.0f - 128.0f,
                  B = b * 255.0f - 128.0f;

                // Convert to CIE XYZ.
                F Y = (L + 16.0f) * (1/116.0f),
                  X = Y + A*(1/500.0f),
                  Z = Y - B*(1/200.0f);

                X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
                Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
                Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));

                // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
                r = X * 0.9642f;
                g = Y          ;
                b = Z * 0.8249f;
            } break;

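            // Sanity check for the Lab math above (editorial note): encoded white,
            // r = 1 and g = b ≈ 0.502, decodes to L = 100, a = b = 0, so
            // Y = (100+16)/116 = 1 and X = Z = 1; after the D50 scaling, r,g,b hold
            // roughly (0.9642, 1.0, 0.8249), the D50 white point this pipeline works in.
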
| 1148 | case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break; | 
|---|
| 1149 | case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break; | 
|---|
| 1150 | case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break; | 
|---|
| 1151 | case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break; | 
|---|
| 1152 |  | 
|---|
| 1153 | case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break; | 
|---|
| 1154 | case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break; | 
|---|
| 1155 | case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break; | 
|---|
| 1156 | case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break; | 
|---|
| 1157 |  | 
|---|
| 1158 | case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break; | 
|---|
| 1159 | case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break; | 
|---|
| 1160 | case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break; | 
|---|
| 1161 | case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break; | 
|---|
| 1162 |  | 
|---|
| 1163 | case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break; | 
|---|
| 1164 | case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break; | 
|---|
| 1165 | case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break; | 
|---|
| 1166 | case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break; | 
|---|
| 1167 |  | 
|---|
| 1168 | case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break; | 
|---|
| 1169 | case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break; | 
|---|
| 1170 | case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break; | 
|---|
| 1171 | case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break; | 
|---|
| 1172 |  | 
|---|
| 1173 | case Op_clut: { | 
|---|
| 1174 | const skcms_A2B* a2b = (const skcms_A2B*) *args++; | 
|---|
| 1175 | clut(a2b, &r,&g,&b,a); | 
|---|
| 1176 |  | 
|---|
| 1177 | if (a2b->input_channels == 4) { | 
|---|
| 1178 | // CMYK is opaque. | 
|---|
| 1179 | a = F1; | 
|---|
| 1180 | } | 
|---|
| 1181 | } break; | 
|---|
| 1182 |  | 
|---|
| 1183 | // Notice, from here on down the store_ ops all return, ending the loop. | 
|---|
| 1184 |  | 
|---|
| 1185 | case Op_store_a8: { | 
|---|
| 1186 | store(dst + 1*i, cast<U8>(to_fixed(a * 255))); | 
|---|
| 1187 | } return; | 
|---|
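From here on, every integer store goes through to_fixed(). Conceptually it rounds a non-negative float to the nearest integer, which the scalar stand-in below expresses as add-0.5-then-truncate; the real helper operates on the N-wide F/I32 vector types:

```cpp
// Conceptual scalar stand-in for to_fixed(): round-to-nearest for non-negative inputs.
static inline int to_fixed_scalar(float x) { return (int)(x + 0.5f); }
```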
| 1188 |  | 
|---|
| 1189 | case Op_store_g8: { | 
|---|
| 1190 | // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z) | 
|---|
| 1191 | store(dst + 1*i, cast<U8>(to_fixed(g * 255))); | 
|---|
| 1192 | } return; | 
|---|
| 1193 |  | 
|---|
| 1194 | case Op_store_4444: { | 
|---|
| 1195 | store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12) | 
|---|
| 1196 | | cast<U16>(to_fixed(g * 15) <<  8) | 
|---|
| 1197 | | cast<U16>(to_fixed(b * 15) <<  4) | 
|---|
| 1198 | | cast<U16>(to_fixed(a * 15) <<  0)); | 
|---|
| 1199 | } return; | 
|---|
| 1200 |  | 
|---|
| 1201 | case Op_store_565: { | 
|---|
| 1202 | store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) <<  0 ) | 
|---|
| 1203 | | cast<U16>(to_fixed(g * 63) <<  5 ) | 
|---|
| 1204 | | cast<U16>(to_fixed(b * 31) << 11 )); | 
|---|
| 1205 | } return; | 
|---|
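In the 565 store above, red occupies the low five bits and blue the top five, the conventional little-endian RGB565 layout. The same packing for a single pixel, as an illustrative scalar sketch (rounding mirrors the to_fixed idea above):

```cpp
#include <cstdint>

// Scalar sketch of Op_store_565 for one pixel: 5 bits red in the low bits,
// 6 bits green, 5 bits blue on top.
static uint16_t pack_565(float r, float g, float b) {
    uint16_t R = (uint16_t)(r * 31 + 0.5f),
             G = (uint16_t)(g * 63 + 0.5f),
             B = (uint16_t)(b * 31 + 0.5f);
    return (uint16_t)(R << 0 | G << 5 | B << 11);
}
```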
| 1206 |  | 
|---|
| 1207 | case Op_store_888: { | 
|---|
| 1208 | uint8_t* rgb = (uint8_t*)dst + 3*i; | 
|---|
| 1209 | #if defined(USING_NEON_FP16) | 
|---|
| 1210 | // See the explanation under USING_NEON below.  This is the same approach, doubled up. | 
|---|
| 1211 | U16 R = to_fixed(r * 255), | 
|---|
| 1212 | G = to_fixed(g * 255), | 
|---|
| 1213 | B = to_fixed(b * 255); | 
|---|
| 1214 |  | 
|---|
| 1215 | uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }}; | 
|---|
| 1216 | vst3q_lane_u8(rgb+ 0, v,  0); | 
|---|
| 1217 | vst3q_lane_u8(rgb+ 3, v,  2); | 
|---|
| 1218 | vst3q_lane_u8(rgb+ 6, v,  4); | 
|---|
| 1219 | vst3q_lane_u8(rgb+ 9, v,  6); | 
|---|
| 1220 |  | 
|---|
| 1221 | vst3q_lane_u8(rgb+12, v,  8); | 
|---|
| 1222 | vst3q_lane_u8(rgb+15, v, 10); | 
|---|
| 1223 | vst3q_lane_u8(rgb+18, v, 12); | 
|---|
| 1224 | vst3q_lane_u8(rgb+21, v, 14); | 
|---|
| 1225 | #elif defined(USING_NEON) | 
|---|
| 1226 | // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but | 
|---|
| 1227 | // get there via U16 to save some instructions converting from float.  And just | 
|---|
| 1228 | // like load_888, we'd prefer to go via U32 but for ARMv7 support. | 
|---|
| 1229 | U16 R = cast<U16>(to_fixed(r * 255)), | 
|---|
| 1230 | G = cast<U16>(to_fixed(g * 255)), | 
|---|
| 1231 | B = cast<U16>(to_fixed(b * 255)); | 
|---|
| 1232 |  | 
|---|
| 1233 | uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }}; | 
|---|
| 1234 | vst3_lane_u8(rgb+0, v, 0); | 
|---|
| 1235 | vst3_lane_u8(rgb+3, v, 2); | 
|---|
| 1236 | vst3_lane_u8(rgb+6, v, 4); | 
|---|
| 1237 | vst3_lane_u8(rgb+9, v, 6); | 
|---|
| 1238 | #else | 
|---|
| 1239 | store_3(rgb+0, cast<U8>(to_fixed(r * 255)) ); | 
|---|
| 1240 | store_3(rgb+1, cast<U8>(to_fixed(g * 255)) ); | 
|---|
| 1241 | store_3(rgb+2, cast<U8>(to_fixed(b * 255)) ); | 
|---|
| 1242 | #endif | 
|---|
| 1243 | } return; | 
|---|
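The NEON branches above lean on the fact that, on a little-endian ARM target, reinterpreting a U16 vector as bytes leaves each value's low byte at an even byte lane, so storing lanes 0, 2, 4, ... with vst3(q)_lane_u8 writes exactly the low 8 bits of each channel, already interleaved R,G,B per pixel. The portable fallback produces the same layout one channel at a time; a scalar sketch of that destination layout (the helper name is illustrative):

```cpp
#include <cstdint>

// Scalar sketch of the packed-RGB (888) destination layout produced by
// Op_store_888: three bytes per pixel, no alpha, pixels tightly packed.
static void store_888_pixel(uint8_t* rgb, int pixel, float r, float g, float b) {
    uint8_t* p = rgb + 3*pixel;
    p[0] = (uint8_t)(r * 255 + 0.5f);
    p[1] = (uint8_t)(g * 255 + 0.5f);
    p[2] = (uint8_t)(b * 255 + 0.5f);
}
```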
| 1244 |  | 
|---|
| 1245 | case Op_store_8888: { | 
|---|
| 1246 | store(dst + 4*i, cast<U32>(to_fixed(r * 255)) <<  0 | 
|---|
| 1247 | | cast<U32>(to_fixed(g * 255)) <<  8 | 
|---|
| 1248 | | cast<U32>(to_fixed(b * 255)) << 16 | 
|---|
| 1249 | | cast<U32>(to_fixed(a * 255)) << 24); | 
|---|
| 1250 | } return; | 
|---|
| 1251 |  | 
|---|
| 1252 | case Op_store_1010102: { | 
|---|
| 1253 | store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0 | 
|---|
| 1254 | | cast<U32>(to_fixed(g * 1023)) << 10 | 
|---|
| 1255 | | cast<U32>(to_fixed(b * 1023)) << 20 | 
|---|
| 1256 | | cast<U32>(to_fixed(a *    3)) << 30); | 
|---|
| 1257 | } return; | 
|---|
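The 1010102 store packs ten bits per color channel from bit 0 upward and the two-bit alpha at the top. As a single-pixel scalar sketch (illustrative only):

```cpp
#include <cstdint>

// Scalar sketch of Op_store_1010102: 10 bits each for r,g,b, 2 bits for alpha.
static uint32_t pack_1010102(float r, float g, float b, float a) {
    uint32_t R = (uint32_t)(r * 1023 + 0.5f),
             G = (uint32_t)(g * 1023 + 0.5f),
             B = (uint32_t)(b * 1023 + 0.5f),
             A = (uint32_t)(a *    3 + 0.5f);
    return R << 0 | G << 10 | B << 20 | A << 30;
}
```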
| 1258 |  | 
|---|
| 1259 | case Op_store_161616LE: { | 
|---|
| 1260 | uintptr_t ptr = (uintptr_t)(dst + 6*i); | 
|---|
| 1261 | assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned | 
|---|
| 1262 | uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe. | 
|---|
| 1263 | #if defined(USING_NEON_FP16) | 
|---|
| 1264 | uint16x8x3_t v = {{ | 
|---|
| 1265 | (uint16x8_t)U16_from_F(r), | 
|---|
| 1266 | (uint16x8_t)U16_from_F(g), | 
|---|
| 1267 | (uint16x8_t)U16_from_F(b), | 
|---|
| 1268 | }}; | 
|---|
| 1269 | vst3q_u16(rgb, v); | 
|---|
| 1270 | #elif defined(USING_NEON) | 
|---|
| 1271 | uint16x4x3_t v = {{ | 
|---|
| 1272 | (uint16x4_t)U16_from_F(r), | 
|---|
| 1273 | (uint16x4_t)U16_from_F(g), | 
|---|
| 1274 | (uint16x4_t)U16_from_F(b), | 
|---|
| 1275 | }}; | 
|---|
| 1276 | vst3_u16(rgb, v); | 
|---|
| 1277 | #else | 
|---|
| 1278 | store_3(rgb+0, U16_from_F(r)); | 
|---|
| 1279 | store_3(rgb+1, U16_from_F(g)); | 
|---|
| 1280 | store_3(rgb+2, U16_from_F(b)); | 
|---|
| 1281 | #endif | 
|---|
| 1282 |  | 
|---|
| 1283 | } return; | 
|---|
| 1284 |  | 
|---|
| 1285 | case Op_store_16161616LE: { | 
|---|
| 1286 | uintptr_t ptr = (uintptr_t)(dst + 8*i); | 
|---|
| 1287 | assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned | 
|---|
| 1288 | uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe. | 
|---|
| 1289 | #if defined(USING_NEON_FP16) | 
|---|
| 1290 | uint16x8x4_t v = {{ | 
|---|
| 1291 | (uint16x8_t)U16_from_F(r), | 
|---|
| 1292 | (uint16x8_t)U16_from_F(g), | 
|---|
| 1293 | (uint16x8_t)U16_from_F(b), | 
|---|
| 1294 | (uint16x8_t)U16_from_F(a), | 
|---|
| 1295 | }}; | 
|---|
| 1296 | vst4q_u16(rgba, v); | 
|---|
| 1297 | #elif defined(USING_NEON) | 
|---|
| 1298 | uint16x4x4_t v = {{ | 
|---|
| 1299 | (uint16x4_t)U16_from_F(r), | 
|---|
| 1300 | (uint16x4_t)U16_from_F(g), | 
|---|
| 1301 | (uint16x4_t)U16_from_F(b), | 
|---|
| 1302 | (uint16x4_t)U16_from_F(a), | 
|---|
| 1303 | }}; | 
|---|
| 1304 | vst4_u16(rgba, v); | 
|---|
| 1305 | #else | 
|---|
| 1306 | U64 px = cast<U64>(to_fixed(r * 65535)) <<  0 | 
|---|
| 1307 | | cast<U64>(to_fixed(g * 65535)) << 16 | 
|---|
| 1308 | | cast<U64>(to_fixed(b * 65535)) << 32 | 
|---|
| 1309 | | cast<U64>(to_fixed(a * 65535)) << 48; | 
|---|
| 1310 | store(rgba, px); | 
|---|
| 1311 | #endif | 
|---|
| 1312 | } return; | 
|---|
| 1313 |  | 
|---|
| 1314 | case Op_store_161616BE: { | 
|---|
| 1315 | uintptr_t ptr = (uintptr_t)(dst + 6*i); | 
|---|
| 1316 | assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned | 
|---|
| 1317 | uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe. | 
|---|
| 1318 | #if defined(USING_NEON_FP16) | 
|---|
| 1319 | uint16x8x3_t v = {{ | 
|---|
| 1320 | (uint16x8_t)swap_endian_16(U16_from_F(r)), | 
|---|
| 1321 | (uint16x8_t)swap_endian_16(U16_from_F(g)), | 
|---|
| 1322 | (uint16x8_t)swap_endian_16(U16_from_F(b)), | 
|---|
| 1323 | }}; | 
|---|
| 1324 | vst3q_u16(rgb, v); | 
|---|
| 1325 | #elif defined(USING_NEON) | 
|---|
| 1326 | uint16x4x3_t v = {{ | 
|---|
| 1327 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))), | 
|---|
| 1328 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))), | 
|---|
| 1329 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))), | 
|---|
| 1330 | }}; | 
|---|
| 1331 | vst3_u16(rgb, v); | 
|---|
| 1332 | #else | 
|---|
| 1333 | U32 R = to_fixed(r * 65535), | 
|---|
| 1334 | G = to_fixed(g * 65535), | 
|---|
| 1335 | B = to_fixed(b * 65535); | 
|---|
| 1336 | store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) ); | 
|---|
| 1337 | store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) ); | 
|---|
| 1338 | store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) ); | 
|---|
| 1339 | #endif | 
|---|
| 1340 |  | 
|---|
| 1341 | } return; | 
|---|
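The only thing the big-endian stores add over their little-endian counterparts is a byte swap of each 16-bit value before it hits memory. The swap idiom used in the portable branch, in isolation:

```cpp
#include <cstdint>

// The 16-bit byte swap used by the big-endian store paths:
// move the low byte up and the high byte down.
static inline uint16_t swap16(uint16_t v) {
    return (uint16_t)((v & 0x00ff) << 8 | (v & 0xff00) >> 8);
}
```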
| 1342 |  | 
|---|
| 1343 | case Op_store_16161616BE: { | 
|---|
| 1344 | uintptr_t ptr = (uintptr_t)(dst + 8*i); | 
|---|
| 1345 | assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned | 
|---|
| 1346 | uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe. | 
|---|
| 1347 | #if defined(USING_NEON_FP16) | 
|---|
| 1348 | uint16x8x4_t v = {{ | 
|---|
| 1349 | (uint16x8_t)swap_endian_16(U16_from_F(r)), | 
|---|
| 1350 | (uint16x8_t)swap_endian_16(U16_from_F(g)), | 
|---|
| 1351 | (uint16x8_t)swap_endian_16(U16_from_F(b)), | 
|---|
| 1352 | (uint16x8_t)swap_endian_16(U16_from_F(a)), | 
|---|
| 1353 | }}; | 
|---|
| 1354 | vst4q_u16(rgba, v); | 
|---|
| 1355 | #elif defined(USING_NEON) | 
|---|
| 1356 | uint16x4x4_t v = {{ | 
|---|
| 1357 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))), | 
|---|
| 1358 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))), | 
|---|
| 1359 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))), | 
|---|
| 1360 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))), | 
|---|
| 1361 | }}; | 
|---|
| 1362 | vst4_u16(rgba, v); | 
|---|
| 1363 | #else | 
|---|
| 1364 | U64 px = cast<U64>(to_fixed(r * 65535)) <<  0 | 
|---|
| 1365 | | cast<U64>(to_fixed(g * 65535)) << 16 | 
|---|
| 1366 | | cast<U64>(to_fixed(b * 65535)) << 32 | 
|---|
| 1367 | | cast<U64>(to_fixed(a * 65535)) << 48; | 
|---|
| 1368 | store(rgba, swap_endian_16x4(px)); | 
|---|
| 1369 | #endif | 
|---|
| 1370 | } return; | 
|---|
| 1371 |  | 
|---|
| 1372 | case Op_store_hhh: { | 
|---|
| 1373 | uintptr_t ptr = (uintptr_t)(dst + 6*i); | 
|---|
| 1374 | assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned | 
|---|
| 1375 | uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe. | 
|---|
| 1376 |  | 
|---|
| 1377 | U16 R = Half_from_F(r), | 
|---|
| 1378 | G = Half_from_F(g), | 
|---|
| 1379 | B = Half_from_F(b); | 
|---|
| 1380 | #if defined(USING_NEON_FP16) | 
|---|
| 1381 | uint16x8x3_t v = {{ | 
|---|
| 1382 | (uint16x8_t)R, | 
|---|
| 1383 | (uint16x8_t)G, | 
|---|
| 1384 | (uint16x8_t)B, | 
|---|
| 1385 | }}; | 
|---|
| 1386 | vst3q_u16(rgb, v); | 
|---|
| 1387 | #elif defined(USING_NEON) | 
|---|
| 1388 | uint16x4x3_t v = {{ | 
|---|
| 1389 | (uint16x4_t)R, | 
|---|
| 1390 | (uint16x4_t)G, | 
|---|
| 1391 | (uint16x4_t)B, | 
|---|
| 1392 | }}; | 
|---|
| 1393 | vst3_u16(rgb, v); | 
|---|
| 1394 | #else | 
|---|
| 1395 | store_3(rgb+0, R); | 
|---|
| 1396 | store_3(rgb+1, G); | 
|---|
| 1397 | store_3(rgb+2, B); | 
|---|
| 1398 | #endif | 
|---|
| 1399 | } return; | 
|---|
| 1400 |  | 
|---|
| 1401 | case Op_store_hhhh: { | 
|---|
| 1402 | uintptr_t ptr = (uintptr_t)(dst + 8*i); | 
|---|
| 1403 | assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned | 
|---|
| 1404 | uint16_t* rgba = (uint16_t*)ptr;         // for this cast to uint16_t* to be safe. | 
|---|
| 1405 |  | 
|---|
| 1406 | U16 R = Half_from_F(r), | 
|---|
| 1407 | G = Half_from_F(g), | 
|---|
| 1408 | B = Half_from_F(b), | 
|---|
| 1409 | A = Half_from_F(a); | 
|---|
| 1410 | #if defined(USING_NEON_FP16) | 
|---|
| 1411 | uint16x8x4_t v = {{ | 
|---|
| 1412 | (uint16x8_t)R, | 
|---|
| 1413 | (uint16x8_t)G, | 
|---|
| 1414 | (uint16x8_t)B, | 
|---|
| 1415 | (uint16x8_t)A, | 
|---|
| 1416 | }}; | 
|---|
| 1417 | vst4q_u16(rgba, v); | 
|---|
| 1418 | #elif defined(USING_NEON) | 
|---|
| 1419 | uint16x4x4_t v = {{ | 
|---|
| 1420 | (uint16x4_t)R, | 
|---|
| 1421 | (uint16x4_t)G, | 
|---|
| 1422 | (uint16x4_t)B, | 
|---|
| 1423 | (uint16x4_t)A, | 
|---|
| 1424 | }}; | 
|---|
| 1425 | vst4_u16(rgba, v); | 
|---|
| 1426 | #else | 
|---|
| 1427 | store(rgba, cast<U64>(R) <<  0 | 
|---|
| 1428 | | cast<U64>(G) << 16 | 
|---|
| 1429 | | cast<U64>(B) << 32 | 
|---|
| 1430 | | cast<U64>(A) << 48); | 
|---|
| 1431 | #endif | 
|---|
| 1432 |  | 
|---|
| 1433 | } return; | 
|---|
| 1434 |  | 
|---|
| 1435 | case Op_store_fff: { | 
|---|
| 1436 | uintptr_t ptr = (uintptr_t)(dst + 12*i); | 
|---|
| 1437 | assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned | 
|---|
| 1438 | float* rgb = (float*)ptr;                // for this cast to float* to be safe. | 
|---|
| 1439 | #if defined(USING_NEON_FP16) | 
|---|
| 1440 | float32x4x3_t lo = {{ | 
|---|
| 1441 | vcvt_f32_f16(vget_low_f16(r)), | 
|---|
| 1442 | vcvt_f32_f16(vget_low_f16(g)), | 
|---|
| 1443 | vcvt_f32_f16(vget_low_f16(b)), | 
|---|
| 1444 | }}, hi = {{ | 
|---|
| 1445 | vcvt_f32_f16(vget_high_f16(r)), | 
|---|
| 1446 | vcvt_f32_f16(vget_high_f16(g)), | 
|---|
| 1447 | vcvt_f32_f16(vget_high_f16(b)), | 
|---|
| 1448 | }}; | 
|---|
| 1449 | vst3q_f32(rgb +  0, lo); | 
|---|
| 1450 | vst3q_f32(rgb + 12, hi); | 
|---|
| 1451 | #elif defined(USING_NEON) | 
|---|
| 1452 | float32x4x3_t v = {{ | 
|---|
| 1453 | (float32x4_t)r, | 
|---|
| 1454 | (float32x4_t)g, | 
|---|
| 1455 | (float32x4_t)b, | 
|---|
| 1456 | }}; | 
|---|
| 1457 | vst3q_f32(rgb, v); | 
|---|
| 1458 | #else | 
|---|
| 1459 | store_3(rgb+0, r); | 
|---|
| 1460 | store_3(rgb+1, g); | 
|---|
| 1461 | store_3(rgb+2, b); | 
|---|
| 1462 | #endif | 
|---|
| 1463 | } return; | 
|---|
| 1464 |  | 
|---|
| 1465 | case Op_store_ffff: { | 
|---|
| 1466 | uintptr_t ptr = (uintptr_t)(dst + 16*i); | 
|---|
| 1467 | assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned | 
|---|
| 1468 | float* rgba = (float*)ptr;               // for this cast to float* to be safe. | 
|---|
| 1469 | #if defined(USING_NEON_FP16) | 
|---|
| 1470 | float32x4x4_t lo = {{ | 
|---|
| 1471 | vcvt_f32_f16(vget_low_f16(r)), | 
|---|
| 1472 | vcvt_f32_f16(vget_low_f16(g)), | 
|---|
| 1473 | vcvt_f32_f16(vget_low_f16(b)), | 
|---|
| 1474 | vcvt_f32_f16(vget_low_f16(a)), | 
|---|
| 1475 | }}, hi = {{ | 
|---|
| 1476 | vcvt_f32_f16(vget_high_f16(r)), | 
|---|
| 1477 | vcvt_f32_f16(vget_high_f16(g)), | 
|---|
| 1478 | vcvt_f32_f16(vget_high_f16(b)), | 
|---|
| 1479 | vcvt_f32_f16(vget_high_f16(a)), | 
|---|
| 1480 | }}; | 
|---|
| 1481 | vst4q_f32(rgba +  0, lo); | 
|---|
| 1482 | vst4q_f32(rgba + 16, hi); | 
|---|
| 1483 | #elif defined(USING_NEON) | 
|---|
| 1484 | float32x4x4_t v = {{ | 
|---|
| 1485 | (float32x4_t)r, | 
|---|
| 1486 | (float32x4_t)g, | 
|---|
| 1487 | (float32x4_t)b, | 
|---|
| 1488 | (float32x4_t)a, | 
|---|
| 1489 | }}; | 
|---|
| 1490 | vst4q_f32(rgba, v); | 
|---|
| 1491 | #else | 
|---|
| 1492 | store_4(rgba+0, r); | 
|---|
| 1493 | store_4(rgba+1, g); | 
|---|
| 1494 | store_4(rgba+2, b); | 
|---|
| 1495 | store_4(rgba+3, a); | 
|---|
| 1496 | #endif | 
|---|
| 1497 | } return; | 
|---|
| 1498 | } | 
|---|
| 1499 | } | 
|---|
| 1500 | } | 
|---|
| 1501 |  | 
|---|
| 1502 |  | 
|---|
| 1503 | static void run_program(const Op* program, const void** arguments, | 
|---|
| 1504 | const char* src, char* dst, int n, | 
|---|
| 1505 | const size_t src_bpp, const size_t dst_bpp) { | 
|---|
| 1506 | int i = 0; | 
|---|
| 1507 | while (n >= N) { | 
|---|
| 1508 | exec_ops(program, arguments, src, dst, i); | 
|---|
| 1509 | i += N; | 
|---|
| 1510 | n -= N; | 
|---|
| 1511 | } | 
|---|
| 1512 | if (n > 0) { | 
|---|
| 1513 | char tmp[4*4*N] = {0}; | 
|---|
| 1514 |  | 
|---|
| 1515 | memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp); | 
|---|
| 1516 | exec_ops(program, arguments, tmp, tmp, 0); | 
|---|
| 1517 | memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp); | 
|---|
| 1518 | } | 
|---|
| 1519 | } | 
|---|
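run_program walks the span in N-pixel strides, then routes any ragged tail (n < N pixels) through a zero-initialized scratch buffer sized for the widest supported pixel (4 channels * 4 bytes * N pixels), so the N-wide loads and stores never touch memory outside the caller's buffers. The same pattern in a generic, self-contained form; process() and the 16-byte-per-pixel bound are illustrative stand-ins for exec_ops and the real format sizes:

```cpp
#include <cstddef>
#include <cstring>

// Full-strides-then-padded-tail pattern: run an N-wide kernel over whole groups,
// then push the ragged tail through a scratch buffer so the kernel never reads
// or writes past the end of src/dst.  The callable takes (const char* src,
// char* dst, int i) and indexes the buffers itself via i.
template <int N, typename Fn>
void run_in_strides(Fn process, const char* src, char* dst, int n,
                    size_t src_bpp, size_t dst_bpp) {
    int i = 0;
    while (n >= N) {
        process(src, dst, i);
        i += N;
        n -= N;
    }
    if (n > 0) {
        char tmp[16 * N] = {0};   // 16 bytes/pixel covers the widest format (4 x float).
        memcpy(tmp, src + (size_t)i*src_bpp, (size_t)n*src_bpp);
        process(tmp, tmp, 0);
        memcpy(dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
    }
}
```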
| 1520 |  | 
|---|
| 1521 | // Clean up any #defines we may have set so that we can be #included again. | 
|---|
| 1522 | #if defined(USING_AVX) | 
|---|
| 1523 | #undef  USING_AVX | 
|---|
| 1524 | #endif | 
|---|
| 1525 | #if defined(USING_AVX_F16C) | 
|---|
| 1526 | #undef  USING_AVX_F16C | 
|---|
| 1527 | #endif | 
|---|
| 1528 | #if defined(USING_AVX2) | 
|---|
| 1529 | #undef  USING_AVX2 | 
|---|
| 1530 | #endif | 
|---|
| 1531 | #if defined(USING_AVX512F) | 
|---|
| 1532 | #undef  USING_AVX512F | 
|---|
| 1533 | #endif | 
|---|
| 1534 |  | 
|---|
| 1535 | #if defined(USING_NEON) | 
|---|
| 1536 | #undef  USING_NEON | 
|---|
| 1537 | #endif | 
|---|
| 1538 | #if defined(USING_NEON_F16C) | 
|---|
| 1539 | #undef  USING_NEON_F16C | 
|---|
| 1540 | #endif | 
|---|
| 1541 | #if defined(USING_NEON_FP16) | 
|---|
| 1542 | #undef  USING_NEON_FP16 | 
|---|
| 1543 | #endif | 
|---|
| 1544 |  | 
|---|
| 1545 | #undef FALLTHROUGH | 
|---|
| 1546 |  | 
|---|