1/*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8// Intentionally NO #pragma once... included multiple times.
9
10// This file is included from skcms.cc in a namespace with some pre-defines:
11// - N: depth of all vectors, 1, 4, 8, or 16 (preprocessor define)
12// - V<T>: a template to create a vector of N T's.
13
14using F = V<Color>; // Called F for historic reasons... maybe rename C?
15using I32 = V<int32_t>;
16using U64 = V<uint64_t>;
17using U32 = V<uint32_t>;
18using U16 = V<uint16_t>;
19using U8 = V<uint8_t>;
20
21
22#if defined(__GNUC__) && !defined(__clang__)
23 // Once again, GCC is kind of weird, not allowing vector = scalar directly.
24 static constexpr F F0 = F() + 0.0f,
25 F1 = F() + 1.0f;
26#else
27 static constexpr F F0 = 0.0f,
28 F1 = 1.0f;
29#endif
30
31// Instead of checking __AVX__ below, we'll check USING_AVX.
32// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
33// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
34
35#if !defined(USING_AVX) && N == 8 && defined(__AVX__)
36 #define USING_AVX
37#endif
38#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
39    #define USING_AVX_F16C
40#endif
41#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
42 #define USING_AVX2
43#endif
44#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__)
45 #define USING_AVX512F
46#endif
47
48// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
49// This is more for organizational clarity... skcms.cc doesn't force these.
50#if N > 1 && defined(__ARM_NEON)
51 #define USING_NEON
52    #if defined(__ARM_FP) && (__ARM_FP & 2)
53 #define USING_NEON_F16C
54 #endif
55 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
56 #define USING_NEON_FP16
57 #endif
58#endif
59
60// These -Wvector-conversion warnings seem to trigger in very bogus situations,
61// like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
62#if defined(USING_NEON) && defined(__clang__)
63 #pragma clang diagnostic ignored "-Wvector-conversion"
64#endif
65
66// GCC warns us about returning U64 on x86 because it's larger than a register.
67// You'd see warnings like, "using AVX even though AVX is not enabled".
68// We stifle these warnings... our helpers that return U64 are always inlined.
69#if defined(__SSE__) && defined(__GNUC__) && !defined(__clang__)
70 #pragma GCC diagnostic ignored "-Wpsabi"
71#endif
72
73#if defined(__clang__)
74 #define FALLTHROUGH [[clang::fallthrough]]
75#else
76 #define FALLTHROUGH
77#endif
78
79// We tag most helper functions as SI, to enforce good code generation
80// but also work around what we think is a bug in GCC: when targeting 32-bit
81// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
82// MMX mm0 register, which seems to mess with unrelated code that later uses
83// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
84//
85// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
86#if defined(__clang__) || defined(__GNUC__)
87 #define SI static inline __attribute__((always_inline))
88#else
89 #define SI static inline
90#endif
91
92template <typename T, typename P>
93SI T load(const P* ptr) {
94 T val;
95 small_memcpy(&val, ptr, sizeof(val));
96 return val;
97}
98template <typename T, typename P>
99SI void store(P* ptr, const T& val) {
100 small_memcpy(ptr, &val, sizeof(val));
101}
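// Routing these through small_memcpy gives us unaligned, strict-aliasing-safe
// loads and stores; with a compile-time size the compiler typically lowers the
// copy to a single scalar or vector move.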
102
103// (T)v is a cast when N == 1 and a bit-pun when N>1,
104// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
105template <typename D, typename S>
106SI D cast(const S& v) {
107#if N == 1
108 return (D)v;
109#elif defined(__clang__)
110 return __builtin_convertvector(v, D);
111#else
112 D d;
113 for (int i = 0; i < N; i++) {
114 d[i] = v[i];
115 }
116 return d;
117#endif
118}
119
120template <typename D, typename S>
121SI D bit_pun(const S& v) {
122 static_assert(sizeof(D) == sizeof(v), "");
123 return load<D>(&v);
124}
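// So: cast<D>() converts values lane by lane (e.g. int32 -> float),
// while bit_pun<D>() reinterprets the same bits as an equally-sized type.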
125
126// When we convert from float to fixed point, it's very common to want to round,
127// and for some reason compilers generate better code when converting to int32_t.
128// To serve both those ends, we use this function to_fixed() instead of direct cast().
129#if defined(USING_NEON_FP16)
130 // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
131 SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
132#else
133 SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
134#endif
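// e.g. to_fixed(127.5f) == 128 and to_fixed(127.4f) == 127: adding 0.5f before
// the truncating cast rounds to nearest, with ties rounding up.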
135
136
137// Sometimes we do something crazy on one branch of a conditional,
138// like divide by zero or convert a huge float to an integer,
139// but then harmlessly select the other side. That trips up N==1
140// sanitizer builds, so we make if_then_else() a macro to avoid
141// evaluating the unused side.
142
143#if N == 1
144 #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
145#else
146 template <typename C, typename T>
147 SI T if_then_else(C cond, T t, T e) {
148 return bit_pun<T>( ( cond & bit_pun<C>(t)) |
149 (~cond & bit_pun<C>(e)) );
150 }
151#endif
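// For N > 1, vector comparisons like (x < y) yield lanes of all-0s or all-1s,
// so the bitwise select above picks t or e independently in each lane.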
152
153
154SI F F_from_Half(U16 half) {
155#if defined(USING_NEON_FP16)
156 return bit_pun<F>(half);
157#elif defined(USING_NEON_F16C)
158 return vcvt_f32_f16((float16x4_t)half);
159#elif defined(USING_AVX512F)
160 return (F)_mm512_cvtph_ps((__m256i)half);
161#elif defined(USING_AVX_F16C)
162 typedef int16_t __attribute__((vector_size(16))) I16;
163 return __builtin_ia32_vcvtph2ps256((I16)half);
164#else
165 U32 wide = cast<U32>(half);
166 // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
167 U32 s = wide & 0x8000,
168 em = wide ^ s;
169
170 // Constructing the float is easy if the half is not denormalized.
171 F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
172
173 // Simply flush all denorm half floats to zero.
174 return if_then_else(em < 0x0400, F0, norm);
175#endif
176}
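// Quick check of the portable path: half 0x3c00 (1.0) has s=0, em=0x3c00, so
// norm = bit_pun<F>((0x3c00<<13) + (112<<23)) = bit_pun<F>(0x3f800000) = 1.0f.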
177
178#if defined(__clang__)
179 // The -((127-15)<<10) underflows that side of the math when
180 // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
181 __attribute__((no_sanitize("unsigned-integer-overflow")))
182#endif
183SI U16 Half_from_F(F f) {
184#if defined(USING_NEON_FP16)
185 return bit_pun<U16>(f);
186#elif defined(USING_NEON_F16C)
187 return (U16)vcvt_f16_f32(f);
188#elif defined(USING_AVX512F)
189 return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
190#elif defined(USING_AVX_F16C)
191 return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
192#else
193 // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
194 U32 sem = bit_pun<U32>(f),
195 s = sem & 0x80000000,
196 em = sem ^ s;
197
198 // For simplicity we flush denorm half floats (including all denorm floats) to zero.
199 return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
200 , (s>>16) + (em>>13) - ((127-15)<<10)));
201#endif
202}
203
204// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
205#if defined(USING_NEON_FP16)
206 SI U16 swap_endian_16(U16 v) {
207 return (U16)vrev16q_u8((uint8x16_t) v);
208 }
209#elif defined(USING_NEON)
210 SI U16 swap_endian_16(U16 v) {
211 return (U16)vrev16_u8((uint8x8_t) v);
212 }
213#endif
214
215SI U64 swap_endian_16x4(const U64& rgba) {
216 return (rgba & 0x00ff00ff00ff00ff) << 8
217 | (rgba & 0xff00ff00ff00ff00) >> 8;
218}
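// e.g. swap_endian_16x4(0x1122334455667788) == 0x2211443366558877,
// swapping the two bytes within each of the four 16-bit lanes.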
219
220#if defined(USING_NEON_FP16)
221 SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
222 SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
223#elif defined(USING_NEON)
224 SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
225 SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
226#else
227 SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
228 SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
229#endif
230
231SI F floor_(F x) {
232#if N == 1
233 return floorf_(x);
234#elif defined(USING_NEON_FP16)
235 return vrndmq_f16(x);
236#elif defined(__aarch64__)
237 return vrndmq_f32(x);
238#elif defined(USING_AVX512F)
239 // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
240    // and integer sanitizer catches that this implicit cast changes the
241 // value from -1 to 65535. We'll cast manually to work around it.
242 // Read this as `return _mm512_floor_ps(x)`.
243 return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
244#elif defined(USING_AVX)
245 return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
246#elif defined(__SSE4_1__)
247 return _mm_floor_ps(x);
248#else
249 // Round trip through integers with a truncating cast.
250 F roundtrip = cast<F>(cast<I32>(x));
251 // If x is negative, truncating gives the ceiling instead of the floor.
252 return roundtrip - if_then_else(roundtrip > x, F1, F0);
253
254 // This implementation fails for values of x that are outside
255 // the range an integer can represent. We expect most x to be small.
256#endif
257}
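// Portable-path example: floor_(-1.5f) truncates to -1.0f, which is > -1.5f,
// so we subtract 1 to get -2.0f; floor_(+1.5f) truncates to 1.0f and stays put.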
258
259SI F approx_log2(F x) {
260#if defined(USING_NEON_FP16)
261 // TODO(mtklein)
262 return x;
263#else
264 // The first approximation of log2(x) is its exponent 'e', minus 127.
265 I32 bits = bit_pun<I32>(x);
266
267 F e = cast<F>(bits) * (1.0f / (1<<23));
268
269    // If we use the mantissa too we can refine the error significantly.
270 F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
271
272 return e - 124.225514990f
273 - 1.498030302f*m
274 - 1.725879990f/(0.3520887068f + m);
275#endif
276}
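// Sanity check: for x = 1.0f we get e = 127.0 and m = 0.5, and the three terms
// above cancel to roughly 0, as log2(1) should be.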
277
278SI F approx_log(F x) {
279 const float ln2 = 0.69314718f;
280 return ln2 * approx_log2(x);
281}
282
283SI F approx_exp2(F x) {
284#if defined(USING_NEON_FP16)
285 // TODO(mtklein)
286 return x;
287#else
288 F fract = x - floor_(x);
289
290 I32 bits = cast<I32>((1.0f * (1<<23)) * (x + 121.274057500f
291 - 1.490129070f*fract
292 + 27.728023300f/(4.84252568f - fract)));
293 return bit_pun<F>(bits);
294#endif
295}
296
297SI F approx_pow(F x, float y) {
298 return if_then_else((x == F0) | (x == F1), x
299 , approx_exp2(approx_log2(x) * y));
300}
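// Note x == 0 and x == 1 pass through exactly, so the endpoints of transfer
// function curves are not perturbed by the log2/exp2 approximations.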
301
302SI F approx_exp(F x) {
303 const float log2_e = 1.4426950408889634074f;
304 return approx_exp2(log2_e * x);
305}
306
307// Return tf(x).
308SI F apply_tf(const skcms_TransferFunction* tf, F x) {
309#if defined(USING_NEON_FP16)
310 // TODO(mtklein)
311 (void)tf;
312 return x;
313#else
314 // Peel off the sign bit and set x = |x|.
315 U32 bits = bit_pun<U32>(x),
316 sign = bits & 0x80000000;
317 x = bit_pun<F>(bits ^ sign);
318
319 // The transfer function has a linear part up to d, exponential at d and after.
320 F v = if_then_else(x < tf->d, tf->c*x + tf->f
321 , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
322
323 // Tack the sign bit back on.
324 return bit_pun<F>(sign | bit_pun<U32>(v));
325#endif
326}
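// For reference, the sRGB curve fits this form with roughly
// d=0.04045, c=1/12.92, f=0, a=1/1.055, b=0.055/1.055, g=2.4, e=0.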
327
328SI F apply_pq(const skcms_TransferFunction* tf, F x) {
329#if defined(USING_NEON_FP16)
330 // TODO(mtklein)
331 (void)tf;
332 return x;
333#else
334 U32 bits = bit_pun<U32>(x),
335 sign = bits & 0x80000000;
336 x = bit_pun<F>(bits ^ sign);
337
338 F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
339 / (tf->d + tf->e * approx_pow(x, tf->c)),
340 tf->f);
341
342 return bit_pun<F>(sign | bit_pun<U32>(v));
343#endif
344}
345
346SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
347#if defined(USING_NEON_FP16)
348 // TODO(mtklein)
349 (void)tf;
350 return x;
351#else
352 const float R = tf->a, G = tf->b,
353 a = tf->c, b = tf->d, c = tf->e;
354 U32 bits = bit_pun<U32>(x),
355 sign = bits & 0x80000000;
356 x = bit_pun<F>(bits ^ sign);
357
358 F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
359 , approx_exp((x-c)*a) + b);
360
361 return bit_pun<F>(sign | bit_pun<U32>(v));
362#endif
363}
364
365SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
366#if defined(USING_NEON_FP16)
367 // TODO(mtklein)
368 (void)tf;
369 return x;
370#else
371 const float R = tf->a, G = tf->b,
372 a = tf->c, b = tf->d, c = tf->e;
373 U32 bits = bit_pun<U32>(x),
374 sign = bits & 0x80000000;
375 x = bit_pun<F>(bits ^ sign);
376
377 F v = if_then_else(x <= 1, R * approx_pow(x, G)
378 , a * approx_log(x - b) + c);
379
380 return bit_pun<F>(sign | bit_pun<U32>(v));
381#endif
382}
383
384
385// Strided loads and stores of N values, starting from p.
386template <typename T, typename P>
387SI T load_3(const P* p) {
388#if N == 1
389 return (T)p[0];
390#elif N == 4
391 return T{p[ 0],p[ 3],p[ 6],p[ 9]};
392#elif N == 8
393 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
394#elif N == 16
395 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
396 p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
397#endif
398}
399
400template <typename T, typename P>
401SI T load_4(const P* p) {
402#if N == 1
403 return (T)p[0];
404#elif N == 4
405 return T{p[ 0],p[ 4],p[ 8],p[12]};
406#elif N == 8
407 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
408#elif N == 16
409 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
410 p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
411#endif
412}
413
414template <typename T, typename P>
415SI void store_3(P* p, const T& v) {
416#if N == 1
417 p[0] = v;
418#elif N == 4
419 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
420#elif N == 8
421 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
422 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
423#elif N == 16
424 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
425 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
426 p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
427 p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
428#endif
429}
430
431template <typename T, typename P>
432SI void store_4(P* p, const T& v) {
433#if N == 1
434 p[0] = v;
435#elif N == 4
436 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
437#elif N == 8
438 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
439 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
440#elif N == 16
441 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
442 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
443 p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
444 p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
445#endif
446}
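// These strided helpers back the portable paths for interleaved rgb/rgba formats;
// the NEON paths in exec_ops() below use vld3/vst3-style instructions instead.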
447
448
449SI U8 gather_8(const uint8_t* p, I32 ix) {
450#if N == 1
451 U8 v = p[ix];
452#elif N == 4
453 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
454#elif N == 8
455 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
456 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
457#elif N == 16
458 U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
459 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
460 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
461 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
462#endif
463 return v;
464}
465
466SI U16 gather_16(const uint8_t* p, I32 ix) {
467 // Load the i'th 16-bit value from p.
468 auto load_16 = [p](int i) {
469 return load<uint16_t>(p + 2*i);
470 };
471#if N == 1
472 U16 v = load_16(ix);
473#elif N == 4
474 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
475#elif N == 8
476 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
477 load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
478#elif N == 16
479 U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
480 load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
481 load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
482 load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
483#endif
484 return v;
485}
486
487SI U32 gather_32(const uint8_t* p, I32 ix) {
488 // Load the i'th 32-bit value from p.
489 auto load_32 = [p](int i) {
490 return load<uint32_t>(p + 4*i);
491 };
492#if N == 1
493 U32 v = load_32(ix);
494#elif N == 4
495 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
496#elif N == 8
497 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
498 load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
499#elif N == 16
500 U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
501 load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
502 load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
503 load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
504#endif
505    // TODO: AVX2 and AVX-512 gathers (cf. gather_24).
506 return v;
507}
508
509SI U32 gather_24(const uint8_t* p, I32 ix) {
510 // First, back up a byte. Any place we're gathering from has a safe junk byte to read
511 // in front of it, either a previous table value, or some tag metadata.
512 p -= 1;
513
514 // Load the i'th 24-bit value from p, and 1 extra byte.
515 auto load_24_32 = [p](int i) {
516 return load<uint32_t>(p + 3*i);
517 };
518
519 // Now load multiples of 4 bytes (a junk byte, then r,g,b).
520#if N == 1
521 U32 v = load_24_32(ix);
522#elif N == 4
523 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
524#elif N == 8 && !defined(USING_AVX2)
525 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
526 load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
527#elif N == 8
528 (void)load_24_32;
529 // The gather instruction here doesn't need any particular alignment,
530 // but the intrinsic takes a const int*.
531 const int* p4 = bit_pun<const int*>(p);
532 I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
533 mask = {-1,-1,-1,-1, -1,-1,-1,-1};
534 #if defined(__clang__)
535 U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
536 #elif defined(__GNUC__)
537 U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
538 #endif
539#elif N == 16
540 (void)load_24_32;
541 // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
542 // And AVX-512 swapped the order of arguments. :/
543 const int* p4 = bit_pun<const int*>(p);
544 U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
545#endif
546
547 // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
548 return v >> 8;
549}
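// On a little-endian target, e.g. lane ix == 5: load_24_32(5) reads original bytes
// 14..17 (p was backed up by one), and the >>8 drops byte 14, leaving entry 5's
// r,g,b in the low 24 bits.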
550
551#if !defined(__arm__)
552 SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
553 // As in gather_24(), with everything doubled.
554 p -= 2;
555
556 // Load the i'th 48-bit value from p, and 2 extra bytes.
557 auto load_48_64 = [p](int i) {
558 return load<uint64_t>(p + 6*i);
559 };
560
561 #if N == 1
562 *v = load_48_64(ix);
563 #elif N == 4
564 *v = U64{
565 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
566 };
567 #elif N == 8 && !defined(USING_AVX2)
568 *v = U64{
569 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
570 load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
571 };
572 #elif N == 8
573 (void)load_48_64;
574 typedef int32_t __attribute__((vector_size(16))) Half_I32;
575 typedef long long __attribute__((vector_size(32))) Half_I64;
576
577 // The gather instruction here doesn't need any particular alignment,
578 // but the intrinsic takes a const long long*.
579 const long long int* p8 = bit_pun<const long long int*>(p);
580
581 Half_I64 zero = { 0, 0, 0, 0},
582 mask = {-1,-1,-1,-1};
583
584 ix *= 6;
585 Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
586 ix_hi = { ix[4], ix[5], ix[6], ix[7] };
587
588 #if defined(__clang__)
589 Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
590 hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
591 #elif defined(__GNUC__)
592 Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
593 hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
594 #endif
595 store((char*)v + 0, lo);
596 store((char*)v + 32, hi);
597 #elif N == 16
598 (void)load_48_64;
599 const long long int* p8 = bit_pun<const long long int*>(p);
600 __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
601 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
602 store((char*)v + 0, lo);
603 store((char*)v + 64, hi);
604 #endif
605
606 *v >>= 16;
607 }
608#endif
609
610SI F F_from_U8(U8 v) {
611 return cast<F>(v) * (1/255.0f);
612}
613
614SI F F_from_U16_BE(U16 v) {
615 // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
616 // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
617 U16 lo = (v >> 8),
618 hi = (v << 8) & 0xffff;
619 return cast<F>(lo|hi) * (1/65535.0f);
620}
621
622SI U16 U16_from_F(F v) {
623 // 65535 == inf in FP16, so promote to FP32 before converting.
624 return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
625}
626
627SI F minus_1_ulp(F v) {
628#if defined(USING_NEON_FP16)
629 return bit_pun<F>( bit_pun<U16>(v) - 1 );
630#else
631 return bit_pun<F>( bit_pun<U32>(v) - 1 );
632#endif
633}
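// Subtracting 1 from the bit pattern of a positive, finite float gives the next
// value toward zero; table() and clut() below use this so that when ix is exactly
// integral, hi truncates to the same index as lo rather than one past it.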
634
635SI F table(const skcms_Curve* curve, F v) {
636 // Clamp the input to [0,1], then scale to a table index.
637 F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
638
639 // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
640 I32 lo = cast<I32>( ix ),
641 hi = cast<I32>(minus_1_ulp(ix+1.0f));
642 F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
643
644 // TODO: can we load l and h simultaneously? Each entry in 'h' is either
645    // the same as in 'l' or adjacent. We have a rough idea that it'd always be safe
646 // to read adjacent entries and perhaps underflow the table by a byte or two
647 // (it'd be junk, but always safe to read). Not sure how to lerp yet.
648 F l,h;
649 if (curve->table_8) {
650 l = F_from_U8(gather_8(curve->table_8, lo));
651 h = F_from_U8(gather_8(curve->table_8, hi));
652 } else {
653 l = F_from_U16_BE(gather_16(curve->table_16, lo));
654 h = F_from_U16_BE(gather_16(curve->table_16, hi));
655 }
656 return l + (h-l)*t;
657}
658
659SI void sample_clut_8(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
660 U32 rgb = gather_24(a2b->grid_8, ix);
661
662 *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
663 *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
664 *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
665}
666
667SI void sample_clut_16(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
668#if defined(__arm__)
669 // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
670 *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
671 *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
672 *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
673#else
674 // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
675 U64 rgb;
676 gather_48(a2b->grid_16, ix, &rgb);
677 rgb = swap_endian_16x4(rgb);
678
679 *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
680 *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
681 *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
682#endif
683}
684
685// GCC 7.2.0 hits an internal compiler error with -finline-functions (or -O3)
686// when targeting MIPS 64, i386, or s390x, I think attempting to inline clut() into exec_ops().
687#if 1 && defined(__GNUC__) && !defined(__clang__) \
688 && (defined(__mips64) || defined(__i386) || defined(__s390x__))
689 #define MAYBE_NOINLINE __attribute__((noinline))
690#else
691 #define MAYBE_NOINLINE
692#endif
693
694MAYBE_NOINLINE
695static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
696 const int dim = (int)a2b->input_channels;
697 assert (0 < dim && dim <= 4);
698
699 // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
700 I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
701 F weight[8]; // Weight for each contribution, again first low, then high.
702
703 // O(dim) work first: calculate index,weight from r,g,b,a.
704 const F inputs[] = { *r,*g,*b,a };
705 for (int i = dim-1, stride = 1; i >= 0; i--) {
706 // x is where we logically want to sample the grid in the i-th dimension.
707 F x = inputs[i] * (float)(a2b->grid_points[i] - 1);
708
709 // But we can't index at floats. lo and hi are the two integer grid points surrounding x.
710 I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
711 hi = cast<I32>(minus_1_ulp(x+1.0f));
712 // Notice how we fold in the accumulated stride across previous dimensions here.
713 index[i+0] = lo * stride;
714 index[i+4] = hi * stride;
715 stride *= a2b->grid_points[i];
716
717 // We'll interpolate between those two integer grid points by t.
718 F t = x - cast<F>(lo); // i.e. fract(x)
719 weight[i+0] = 1-t;
720 weight[i+4] = t;
721 }
722
723 *r = *g = *b = F0;
724
725 // We'll sample 2^dim == 1<<dim table entries per pixel,
726 // in all combinations of low and high in each dimension.
727 for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
728
729 // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
730 // where 0 selects the low index contribution and its weight 1-t,
731 // or 4 the high index contribution and its weight t.
732
733 // Since 0<dim≤4, we can always just start off with the 0-th channel,
734 // then handle the others conditionally.
735 I32 ix = index [0 + (combo&1)*4];
736 F w = weight[0 + (combo&1)*4];
737
738 switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
739 case 3: ix += index [3 + (combo&8)/2];
740 w *= weight[3 + (combo&8)/2];
741 FALLTHROUGH;
742 // fall through
743
744 case 2: ix += index [2 + (combo&4)*1];
745 w *= weight[2 + (combo&4)*1];
746 FALLTHROUGH;
747 // fall through
748
749 case 1: ix += index [1 + (combo&2)*2];
750 w *= weight[1 + (combo&2)*2];
751 }
752
753 F R,G,B;
754 if (a2b->grid_8) {
755 sample_clut_8 (a2b,ix, &R,&G,&B);
756 } else {
757 sample_clut_16(a2b,ix, &R,&G,&B);
758 }
759
760 *r += w*R;
761 *g += w*G;
762 *b += w*B;
763 }
764}
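// For a 3-channel CLUT this is ordinary trilinear interpolation: 8 corner samples,
// each weighted by the product of (1-t) or t along each axis; dim == 4 extends the
// same scheme to 16 corners.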
765
766static void exec_ops(const Op* ops, const void** args,
767 const char* src, char* dst, int i) {
768 F r = F0, g = F0, b = F0, a = F1;
769 while (true) {
770 switch (*ops++) {
771 case Op_load_a8:{
772 a = F_from_U8(load<U8>(src + 1*i));
773 } break;
774
775 case Op_load_g8:{
776 r = g = b = F_from_U8(load<U8>(src + 1*i));
777 } break;
778
779 case Op_load_4444:{
780 U16 abgr = load<U16>(src + 2*i);
781
782 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
783 g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
784 b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
785 a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
786 } break;
787
788 case Op_load_565:{
789 U16 rgb = load<U16>(src + 2*i);
790
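                // Masking each field in place and scaling by 1/(max << shift)
                // normalizes it to [0,1] without a separate shift per channel.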
791 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
792 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
793 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
794 } break;
795
796 case Op_load_888:{
797 const uint8_t* rgb = (const uint8_t*)(src + 3*i);
798 #if defined(USING_NEON_FP16)
799 // See the explanation under USING_NEON below. This is that doubled up.
800 uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
801 v = vld3q_lane_u8(rgb+ 0, v, 0);
802 v = vld3q_lane_u8(rgb+ 3, v, 2);
803 v = vld3q_lane_u8(rgb+ 6, v, 4);
804 v = vld3q_lane_u8(rgb+ 9, v, 6);
805
806 v = vld3q_lane_u8(rgb+12, v, 8);
807 v = vld3q_lane_u8(rgb+15, v, 10);
808 v = vld3q_lane_u8(rgb+18, v, 12);
809 v = vld3q_lane_u8(rgb+21, v, 14);
810
811 r = cast<F>((U16)v.val[0]) * (1/255.0f);
812 g = cast<F>((U16)v.val[1]) * (1/255.0f);
813 b = cast<F>((U16)v.val[2]) * (1/255.0f);
814 #elif defined(USING_NEON)
815 // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
816 // a time. Since we're doing that, we might as well load them into 16-bit lanes.
817 // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
818 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
819 v = vld3_lane_u8(rgb+0, v, 0);
820 v = vld3_lane_u8(rgb+3, v, 2);
821 v = vld3_lane_u8(rgb+6, v, 4);
822 v = vld3_lane_u8(rgb+9, v, 6);
823
824 // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
825                // convert to F. (Again, U32 would be even better here if we drop ARMv7 or split
826 // ARMv7 and ARMv8 impls.)
827 r = cast<F>((U16)v.val[0]) * (1/255.0f);
828 g = cast<F>((U16)v.val[1]) * (1/255.0f);
829 b = cast<F>((U16)v.val[2]) * (1/255.0f);
830 #else
831 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
832 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
833 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
834 #endif
835 } break;
836
837 case Op_load_8888:{
838 U32 rgba = load<U32>(src + 4*i);
839
840 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
841 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
842 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
843 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
844 } break;
845
846 case Op_load_8888_palette8:{
847 const uint8_t* palette = (const uint8_t*) *args++;
848 I32 ix = cast<I32>(load<U8>(src + 1*i));
849 U32 rgba = gather_32(palette, ix);
850
851 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
852 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
853 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
854 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
855 } break;
856
857 case Op_load_1010102:{
858 U32 rgba = load<U32>(src + 4*i);
859
860 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
861 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
862 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
863 a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
864 } break;
865
866 case Op_load_161616LE:{
867 uintptr_t ptr = (uintptr_t)(src + 6*i);
868 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
869 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
870 #if defined(USING_NEON_FP16)
871 uint16x8x3_t v = vld3q_u16(rgb);
872 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
873 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
874 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
875 #elif defined(USING_NEON)
876 uint16x4x3_t v = vld3_u16(rgb);
877 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
878 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
879 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
880 #else
881 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
882 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
883 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
884 #endif
885 } break;
886
887 case Op_load_16161616LE:{
888 uintptr_t ptr = (uintptr_t)(src + 8*i);
889 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
890 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
891 #if defined(USING_NEON_FP16)
892 uint16x8x4_t v = vld4q_u16(rgba);
893 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
894 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
895 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
896 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
897 #elif defined(USING_NEON)
898 uint16x4x4_t v = vld4_u16(rgba);
899 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
900 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
901 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
902 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
903 #else
904 U64 px = load<U64>(rgba);
905
906 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
907 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
908 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
909 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
910 #endif
911 } break;
912
913 case Op_load_161616BE:{
914 uintptr_t ptr = (uintptr_t)(src + 6*i);
915 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
916 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
917 #if defined(USING_NEON_FP16)
918 uint16x8x3_t v = vld3q_u16(rgb);
919 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
920 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
921 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
922 #elif defined(USING_NEON)
923 uint16x4x3_t v = vld3_u16(rgb);
924 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
925 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
926 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
927 #else
928 U32 R = load_3<U32>(rgb+0),
929 G = load_3<U32>(rgb+1),
930 B = load_3<U32>(rgb+2);
931 // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
932 r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
933 g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
934 b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
935 #endif
936 } break;
937
938 case Op_load_16161616BE:{
939 uintptr_t ptr = (uintptr_t)(src + 8*i);
940 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
941 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
942 #if defined(USING_NEON_FP16)
943 uint16x8x4_t v = vld4q_u16(rgba);
944 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
945 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
946 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
947 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
948 #elif defined(USING_NEON)
949 uint16x4x4_t v = vld4_u16(rgba);
950 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
951 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
952 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
953 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
954 #else
955 U64 px = swap_endian_16x4(load<U64>(rgba));
956
957 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
958 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
959 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
960 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
961 #endif
962 } break;
963
964 case Op_load_hhh:{
965 uintptr_t ptr = (uintptr_t)(src + 6*i);
966 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
967 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
968 #if defined(USING_NEON_FP16)
969 uint16x8x3_t v = vld3q_u16(rgb);
970 U16 R = (U16)v.val[0],
971 G = (U16)v.val[1],
972 B = (U16)v.val[2];
973 #elif defined(USING_NEON)
974 uint16x4x3_t v = vld3_u16(rgb);
975 U16 R = (U16)v.val[0],
976 G = (U16)v.val[1],
977 B = (U16)v.val[2];
978 #else
979 U16 R = load_3<U16>(rgb+0),
980 G = load_3<U16>(rgb+1),
981 B = load_3<U16>(rgb+2);
982 #endif
983 r = F_from_Half(R);
984 g = F_from_Half(G);
985 b = F_from_Half(B);
986 } break;
987
988 case Op_load_hhhh:{
989 uintptr_t ptr = (uintptr_t)(src + 8*i);
990 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
991 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
992 #if defined(USING_NEON_FP16)
993 uint16x8x4_t v = vld4q_u16(rgba);
994 U16 R = (U16)v.val[0],
995 G = (U16)v.val[1],
996 B = (U16)v.val[2],
997 A = (U16)v.val[3];
998 #elif defined(USING_NEON)
999 uint16x4x4_t v = vld4_u16(rgba);
1000 U16 R = (U16)v.val[0],
1001 G = (U16)v.val[1],
1002 B = (U16)v.val[2],
1003 A = (U16)v.val[3];
1004 #else
1005 U64 px = load<U64>(rgba);
1006 U16 R = cast<U16>((px >> 0) & 0xffff),
1007 G = cast<U16>((px >> 16) & 0xffff),
1008 B = cast<U16>((px >> 32) & 0xffff),
1009 A = cast<U16>((px >> 48) & 0xffff);
1010 #endif
1011 r = F_from_Half(R);
1012 g = F_from_Half(G);
1013 b = F_from_Half(B);
1014 a = F_from_Half(A);
1015 } break;
1016
1017 case Op_load_fff:{
1018 uintptr_t ptr = (uintptr_t)(src + 12*i);
1019 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1020 const float* rgb = (const float*)ptr; // cast to const float* to be safe.
1021 #if defined(USING_NEON_FP16)
1022 float32x4x3_t lo = vld3q_f32(rgb + 0),
1023 hi = vld3q_f32(rgb + 12);
1024 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1025 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1026 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1027 #elif defined(USING_NEON)
1028 float32x4x3_t v = vld3q_f32(rgb);
1029 r = (F)v.val[0];
1030 g = (F)v.val[1];
1031 b = (F)v.val[2];
1032 #else
1033 r = load_3<F>(rgb+0);
1034 g = load_3<F>(rgb+1);
1035 b = load_3<F>(rgb+2);
1036 #endif
1037 } break;
1038
1039 case Op_load_ffff:{
1040 uintptr_t ptr = (uintptr_t)(src + 16*i);
1041 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1042 const float* rgba = (const float*)ptr; // cast to const float* to be safe.
1043 #if defined(USING_NEON_FP16)
1044 float32x4x4_t lo = vld4q_f32(rgba + 0),
1045 hi = vld4q_f32(rgba + 16);
1046 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1047 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1048 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1049 a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
1050 #elif defined(USING_NEON)
1051 float32x4x4_t v = vld4q_f32(rgba);
1052 r = (F)v.val[0];
1053 g = (F)v.val[1];
1054 b = (F)v.val[2];
1055 a = (F)v.val[3];
1056 #else
1057 r = load_4<F>(rgba+0);
1058 g = load_4<F>(rgba+1);
1059 b = load_4<F>(rgba+2);
1060 a = load_4<F>(rgba+3);
1061 #endif
1062 } break;
1063
1064 case Op_swap_rb:{
1065 F t = r;
1066 r = b;
1067 b = t;
1068 } break;
1069
1070 case Op_clamp:{
1071 r = max_(F0, min_(r, F1));
1072 g = max_(F0, min_(g, F1));
1073 b = max_(F0, min_(b, F1));
1074 a = max_(F0, min_(a, F1));
1075 } break;
1076
1077 case Op_invert:{
1078 r = F1 - r;
1079 g = F1 - g;
1080 b = F1 - b;
1081 a = F1 - a;
1082 } break;
1083
1084 case Op_force_opaque:{
1085 a = F1;
1086 } break;
1087
1088 case Op_premul:{
1089 r *= a;
1090 g *= a;
1091 b *= a;
1092 } break;
1093
1094 case Op_unpremul:{
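                // If a == 0, F1/a is +inf and fails the test below, so scale becomes 0
                // and the already-premultiplied-by-zero color stays 0 instead of going NaN.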
1095 F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
1096 r *= scale;
1097 g *= scale;
1098 b *= scale;
1099 } break;
1100
1101 case Op_matrix_3x3:{
1102 const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
1103 const float* m = &matrix->vals[0][0];
1104
1105 F R = m[0]*r + m[1]*g + m[2]*b,
1106 G = m[3]*r + m[4]*g + m[5]*b,
1107 B = m[6]*r + m[7]*g + m[8]*b;
1108
1109 r = R;
1110 g = G;
1111 b = B;
1112 } break;
1113
1114 case Op_matrix_3x4:{
1115 const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
1116 const float* m = &matrix->vals[0][0];
1117
1118 F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1119 G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1120 B = m[8]*r + m[9]*g + m[10]*b + m[11];
1121
1122 r = R;
1123 g = G;
1124 b = B;
1125 } break;
1126
1127 case Op_lab_to_xyz:{
1128 // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
1129 F L = r * 100.0f,
1130 A = g * 255.0f - 128.0f,
1131 B = b * 255.0f - 128.0f;
1132
1133 // Convert to CIE XYZ.
1134 F Y = (L + 16.0f) * (1/116.0f),
1135 X = Y + A*(1/500.0f),
1136 Z = Y - B*(1/200.0f);
1137
1138 X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1139 Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1140 Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1141
1142 // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1143 r = X * 0.9642f;
1144 g = Y ;
1145 b = Z * 0.8249f;
1146 } break;
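            // (The 0.008856 ~ (6/29)^3 and 1/7.787 ~ 3*(6/29)^2 constants above are the
            // standard CIELAB f-inverse pieces; 0.9642 and 0.8249 are D50's X and Z.)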
1147
1148 case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
1149 case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
1150 case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
1151 case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
1152
1153 case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
1154 case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
1155 case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
1156 case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
1157
1158 case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
1159 case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
1160 case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
1161 case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
1162
1163 case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
1164 case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
1165 case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
1166 case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
1167
1168 case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
1169 case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
1170 case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
1171 case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
1172
1173 case Op_clut: {
1174 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
1175 clut(a2b, &r,&g,&b,a);
1176
1177 if (a2b->input_channels == 4) {
1178 // CMYK is opaque.
1179 a = F1;
1180 }
1181 } break;
1182
1183 // Notice, from here on down the store_ ops all return, ending the loop.
1184
1185 case Op_store_a8: {
1186 store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1187 } return;
1188
1189 case Op_store_g8: {
1190 // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1191 store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1192 } return;
1193
1194 case Op_store_4444: {
1195 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1196 | cast<U16>(to_fixed(g * 15) << 8)
1197 | cast<U16>(to_fixed(b * 15) << 4)
1198 | cast<U16>(to_fixed(a * 15) << 0));
1199 } return;
1200
1201 case Op_store_565: {
1202 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
1203 | cast<U16>(to_fixed(g * 63) << 5 )
1204 | cast<U16>(to_fixed(b * 31) << 11 ));
1205 } return;
1206
1207 case Op_store_888: {
1208 uint8_t* rgb = (uint8_t*)dst + 3*i;
1209 #if defined(USING_NEON_FP16)
1210 // See the explanation under USING_NEON below. This is that doubled up.
1211 U16 R = to_fixed(r * 255),
1212 G = to_fixed(g * 255),
1213 B = to_fixed(b * 255);
1214
1215 uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
1216 vst3q_lane_u8(rgb+ 0, v, 0);
1217 vst3q_lane_u8(rgb+ 3, v, 2);
1218 vst3q_lane_u8(rgb+ 6, v, 4);
1219 vst3q_lane_u8(rgb+ 9, v, 6);
1220
1221 vst3q_lane_u8(rgb+12, v, 8);
1222 vst3q_lane_u8(rgb+15, v, 10);
1223 vst3q_lane_u8(rgb+18, v, 12);
1224 vst3q_lane_u8(rgb+21, v, 14);
1225 #elif defined(USING_NEON)
1226 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1227 // get there via U16 to save some instructions converting to float. And just
1228 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1229 U16 R = cast<U16>(to_fixed(r * 255)),
1230 G = cast<U16>(to_fixed(g * 255)),
1231 B = cast<U16>(to_fixed(b * 255));
1232
1233 uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1234 vst3_lane_u8(rgb+0, v, 0);
1235 vst3_lane_u8(rgb+3, v, 2);
1236 vst3_lane_u8(rgb+6, v, 4);
1237 vst3_lane_u8(rgb+9, v, 6);
1238 #else
1239 store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1240 store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1241 store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1242 #endif
1243 } return;
1244
1245 case Op_store_8888: {
1246 store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
1247 | cast<U32>(to_fixed(g * 255)) << 8
1248 | cast<U32>(to_fixed(b * 255)) << 16
1249 | cast<U32>(to_fixed(a * 255)) << 24);
1250 } return;
1251
1252 case Op_store_1010102: {
1253 store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
1254 | cast<U32>(to_fixed(g * 1023)) << 10
1255 | cast<U32>(to_fixed(b * 1023)) << 20
1256 | cast<U32>(to_fixed(a * 3)) << 30);
1257 } return;
1258
1259 case Op_store_161616LE: {
1260 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1261 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1262 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1263 #if defined(USING_NEON_FP16)
1264 uint16x8x3_t v = {{
1265 (uint16x8_t)U16_from_F(r),
1266 (uint16x8_t)U16_from_F(g),
1267 (uint16x8_t)U16_from_F(b),
1268 }};
1269 vst3q_u16(rgb, v);
1270 #elif defined(USING_NEON)
1271 uint16x4x3_t v = {{
1272 (uint16x4_t)U16_from_F(r),
1273 (uint16x4_t)U16_from_F(g),
1274 (uint16x4_t)U16_from_F(b),
1275 }};
1276 vst3_u16(rgb, v);
1277 #else
1278 store_3(rgb+0, U16_from_F(r));
1279 store_3(rgb+1, U16_from_F(g));
1280 store_3(rgb+2, U16_from_F(b));
1281 #endif
1282
1283 } return;
1284
1285 case Op_store_16161616LE: {
1286 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1287 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1288 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1289 #if defined(USING_NEON_FP16)
1290 uint16x8x4_t v = {{
1291 (uint16x8_t)U16_from_F(r),
1292 (uint16x8_t)U16_from_F(g),
1293 (uint16x8_t)U16_from_F(b),
1294 (uint16x8_t)U16_from_F(a),
1295 }};
1296 vst4q_u16(rgba, v);
1297 #elif defined(USING_NEON)
1298 uint16x4x4_t v = {{
1299 (uint16x4_t)U16_from_F(r),
1300 (uint16x4_t)U16_from_F(g),
1301 (uint16x4_t)U16_from_F(b),
1302 (uint16x4_t)U16_from_F(a),
1303 }};
1304 vst4_u16(rgba, v);
1305 #else
1306 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1307 | cast<U64>(to_fixed(g * 65535)) << 16
1308 | cast<U64>(to_fixed(b * 65535)) << 32
1309 | cast<U64>(to_fixed(a * 65535)) << 48;
1310 store(rgba, px);
1311 #endif
1312 } return;
1313
1314 case Op_store_161616BE: {
1315 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1316 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1317 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1318 #if defined(USING_NEON_FP16)
1319 uint16x8x3_t v = {{
1320 (uint16x8_t)swap_endian_16(U16_from_F(r)),
1321 (uint16x8_t)swap_endian_16(U16_from_F(g)),
1322 (uint16x8_t)swap_endian_16(U16_from_F(b)),
1323 }};
1324 vst3q_u16(rgb, v);
1325 #elif defined(USING_NEON)
1326 uint16x4x3_t v = {{
1327 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1328 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1329 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1330 }};
1331 vst3_u16(rgb, v);
1332 #else
1333 U32 R = to_fixed(r * 65535),
1334 G = to_fixed(g * 65535),
1335 B = to_fixed(b * 65535);
1336 store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
1337 store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
1338 store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
1339 #endif
1340
1341 } return;
1342
1343 case Op_store_16161616BE: {
1344 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1345 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1346 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1347 #if defined(USING_NEON_FP16)
1348 uint16x8x4_t v = {{
1349 (uint16x8_t)swap_endian_16(U16_from_F(r)),
1350 (uint16x8_t)swap_endian_16(U16_from_F(g)),
1351 (uint16x8_t)swap_endian_16(U16_from_F(b)),
1352 (uint16x8_t)swap_endian_16(U16_from_F(a)),
1353 }};
1354 vst4q_u16(rgba, v);
1355 #elif defined(USING_NEON)
1356 uint16x4x4_t v = {{
1357 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1358 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1359 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1360 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
1361 }};
1362 vst4_u16(rgba, v);
1363 #else
1364 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1365 | cast<U64>(to_fixed(g * 65535)) << 16
1366 | cast<U64>(to_fixed(b * 65535)) << 32
1367 | cast<U64>(to_fixed(a * 65535)) << 48;
1368 store(rgba, swap_endian_16x4(px));
1369 #endif
1370 } return;
1371
1372 case Op_store_hhh: {
1373 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1374 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1375 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1376
1377 U16 R = Half_from_F(r),
1378 G = Half_from_F(g),
1379 B = Half_from_F(b);
1380 #if defined(USING_NEON_FP16)
1381 uint16x8x3_t v = {{
1382 (uint16x8_t)R,
1383 (uint16x8_t)G,
1384 (uint16x8_t)B,
1385 }};
1386 vst3q_u16(rgb, v);
1387 #elif defined(USING_NEON)
1388 uint16x4x3_t v = {{
1389 (uint16x4_t)R,
1390 (uint16x4_t)G,
1391 (uint16x4_t)B,
1392 }};
1393 vst3_u16(rgb, v);
1394 #else
1395 store_3(rgb+0, R);
1396 store_3(rgb+1, G);
1397 store_3(rgb+2, B);
1398 #endif
1399 } return;
1400
1401 case Op_store_hhhh: {
1402 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1403 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1404 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1405
1406 U16 R = Half_from_F(r),
1407 G = Half_from_F(g),
1408 B = Half_from_F(b),
1409 A = Half_from_F(a);
1410 #if defined(USING_NEON_FP16)
1411 uint16x8x4_t v = {{
1412 (uint16x8_t)R,
1413 (uint16x8_t)G,
1414 (uint16x8_t)B,
1415 (uint16x8_t)A,
1416 }};
1417 vst4q_u16(rgba, v);
1418 #elif defined(USING_NEON)
1419 uint16x4x4_t v = {{
1420 (uint16x4_t)R,
1421 (uint16x4_t)G,
1422 (uint16x4_t)B,
1423 (uint16x4_t)A,
1424 }};
1425 vst4_u16(rgba, v);
1426 #else
1427 store(rgba, cast<U64>(R) << 0
1428 | cast<U64>(G) << 16
1429 | cast<U64>(B) << 32
1430 | cast<U64>(A) << 48);
1431 #endif
1432
1433 } return;
1434
1435 case Op_store_fff: {
1436 uintptr_t ptr = (uintptr_t)(dst + 12*i);
1437 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1438 float* rgb = (float*)ptr; // for this cast to float* to be safe.
1439 #if defined(USING_NEON_FP16)
1440 float32x4x3_t lo = {{
1441 vcvt_f32_f16(vget_low_f16(r)),
1442 vcvt_f32_f16(vget_low_f16(g)),
1443 vcvt_f32_f16(vget_low_f16(b)),
1444 }}, hi = {{
1445 vcvt_f32_f16(vget_high_f16(r)),
1446 vcvt_f32_f16(vget_high_f16(g)),
1447 vcvt_f32_f16(vget_high_f16(b)),
1448 }};
1449 vst3q_f32(rgb + 0, lo);
1450 vst3q_f32(rgb + 12, hi);
1451 #elif defined(USING_NEON)
1452 float32x4x3_t v = {{
1453 (float32x4_t)r,
1454 (float32x4_t)g,
1455 (float32x4_t)b,
1456 }};
1457 vst3q_f32(rgb, v);
1458 #else
1459 store_3(rgb+0, r);
1460 store_3(rgb+1, g);
1461 store_3(rgb+2, b);
1462 #endif
1463 } return;
1464
1465 case Op_store_ffff: {
1466 uintptr_t ptr = (uintptr_t)(dst + 16*i);
1467 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1468 float* rgba = (float*)ptr; // for this cast to float* to be safe.
1469 #if defined(USING_NEON_FP16)
1470 float32x4x4_t lo = {{
1471 vcvt_f32_f16(vget_low_f16(r)),
1472 vcvt_f32_f16(vget_low_f16(g)),
1473 vcvt_f32_f16(vget_low_f16(b)),
1474 vcvt_f32_f16(vget_low_f16(a)),
1475 }}, hi = {{
1476 vcvt_f32_f16(vget_high_f16(r)),
1477 vcvt_f32_f16(vget_high_f16(g)),
1478 vcvt_f32_f16(vget_high_f16(b)),
1479 vcvt_f32_f16(vget_high_f16(a)),
1480 }};
1481 vst4q_f32(rgba + 0, lo);
1482 vst4q_f32(rgba + 16, hi);
1483 #elif defined(USING_NEON)
1484 float32x4x4_t v = {{
1485 (float32x4_t)r,
1486 (float32x4_t)g,
1487 (float32x4_t)b,
1488 (float32x4_t)a,
1489 }};
1490 vst4q_f32(rgba, v);
1491 #else
1492 store_4(rgba+0, r);
1493 store_4(rgba+1, g);
1494 store_4(rgba+2, b);
1495 store_4(rgba+3, a);
1496 #endif
1497 } return;
1498 }
1499 }
1500}
1501
1502
1503static void run_program(const Op* program, const void** arguments,
1504 const char* src, char* dst, int n,
1505 const size_t src_bpp, const size_t dst_bpp) {
1506 int i = 0;
1507 while (n >= N) {
1508 exec_ops(program, arguments, src, dst, i);
1509 i += N;
1510 n -= N;
1511 }
1512 if (n > 0) {
1513 char tmp[4*4*N] = {0};
1514
1515 memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
1516 exec_ops(program, arguments, tmp, tmp, 0);
1517 memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
1518 }
1519}
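// Any tail of n % N pixels is bounced through a zeroed stack buffer: tmp is sized
// 4*4*N bytes, enough for the widest supported pixel (4 channels of 4 bytes each).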
1520
1521// Clean up any #defines we may have set so that we can be #included again.
1522#if defined(USING_AVX)
1523 #undef USING_AVX
1524#endif
1525#if defined(USING_AVX_F16C)
1526 #undef USING_AVX_F16C
1527#endif
1528#if defined(USING_AVX2)
1529 #undef USING_AVX2
1530#endif
1531#if defined(USING_AVX512F)
1532 #undef USING_AVX512F
1533#endif
1534
1535#if defined(USING_NEON)
1536 #undef USING_NEON
1537#endif
1538#if defined(USING_NEON_F16C)
1539 #undef USING_NEON_F16C
1540#endif
1541#if defined(USING_NEON_FP16)
1542 #undef USING_NEON_FP16
1543#endif
1544
1545#undef FALLTHROUGH
1546