1 | /* |
2 | * Copyright 2018 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | // Intentionally NO #pragma once... included multiple times. |
9 | |
10 | // This file is included from skcms.cc in a namespace with some pre-defines: |
// - N: depth of all vectors, 1, 4, 8, or 16 (preprocessor define)
12 | // - V<T>: a template to create a vector of N T's. |
13 | |
14 | using F = V<Color>; // Called F for historic reasons... maybe rename C? |
15 | using I32 = V<int32_t>; |
16 | using U64 = V<uint64_t>; |
17 | using U32 = V<uint32_t>; |
18 | using U16 = V<uint16_t>; |
19 | using U8 = V<uint8_t>; |
20 | |
21 | |
22 | #if defined(__GNUC__) && !defined(__clang__) |
23 | // Once again, GCC is kind of weird, not allowing vector = scalar directly. |
24 | static constexpr F F0 = F() + 0.0f, |
25 | F1 = F() + 1.0f; |
26 | #else |
27 | static constexpr F F0 = 0.0f, |
28 | F1 = 1.0f; |
29 | #endif |
30 | |
31 | // Instead of checking __AVX__ below, we'll check USING_AVX. |
32 | // This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way. |
33 | // Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2. |
34 | |
35 | #if !defined(USING_AVX) && N == 8 && defined(__AVX__) |
36 | #define USING_AVX |
37 | #endif |
38 | #if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__) |
    #define USING_AVX_F16C
40 | #endif |
41 | #if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__) |
42 | #define USING_AVX2 |
43 | #endif |
44 | #if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) |
45 | #define USING_AVX512F |
46 | #endif |
47 | |
48 | // Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C. |
49 | // This is more for organizational clarity... skcms.cc doesn't force these. |
50 | #if N > 1 && defined(__ARM_NEON) |
51 | #define USING_NEON |
52 | #if __ARM_FP & 2 |
53 | #define USING_NEON_F16C |
54 | #endif |
55 | #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16) |
56 | #define USING_NEON_FP16 |
57 | #endif |
58 | #endif |
59 | |
60 | // These -Wvector-conversion warnings seem to trigger in very bogus situations, |
61 | // like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/ |
62 | #if defined(USING_NEON) && defined(__clang__) |
63 | #pragma clang diagnostic ignored "-Wvector-conversion" |
64 | #endif |
65 | |
66 | // GCC warns us about returning U64 on x86 because it's larger than a register. |
67 | // You'd see warnings like, "using AVX even though AVX is not enabled". |
68 | // We stifle these warnings... our helpers that return U64 are always inlined. |
69 | #if defined(__SSE__) && defined(__GNUC__) && !defined(__clang__) |
70 | #pragma GCC diagnostic ignored "-Wpsabi" |
71 | #endif |
72 | |
73 | #if defined(__clang__) |
74 | #define FALLTHROUGH [[clang::fallthrough]] |
75 | #else |
76 | #define FALLTHROUGH |
77 | #endif |
78 | |
79 | // We tag most helper functions as SI, to enforce good code generation |
80 | // but also work around what we think is a bug in GCC: when targeting 32-bit |
81 | // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the |
82 | // MMX mm0 register, which seems to mess with unrelated code that later uses |
83 | // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register). |
84 | // |
85 | // It helps codegen to call __builtin_memcpy() when we know the byte count at compile time. |
86 | #if defined(__clang__) || defined(__GNUC__) |
87 | #define SI static inline __attribute__((always_inline)) |
88 | #else |
89 | #define SI static inline |
90 | #endif |
91 | |
92 | template <typename T, typename P> |
93 | SI T load(const P* ptr) { |
94 | T val; |
95 | small_memcpy(&val, ptr, sizeof(val)); |
96 | return val; |
97 | } |
98 | template <typename T, typename P> |
99 | SI void store(P* ptr, const T& val) { |
100 | small_memcpy(ptr, &val, sizeof(val)); |
101 | } |
102 | |
103 | // (T)v is a cast when N == 1 and a bit-pun when N>1, |
104 | // so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun. |
105 | template <typename D, typename S> |
106 | SI D cast(const S& v) { |
107 | #if N == 1 |
108 | return (D)v; |
109 | #elif defined(__clang__) |
110 | return __builtin_convertvector(v, D); |
111 | #else |
112 | D d; |
113 | for (int i = 0; i < N; i++) { |
114 | d[i] = v[i]; |
115 | } |
116 | return d; |
117 | #endif |
118 | } |
119 | |
120 | template <typename D, typename S> |
121 | SI D bit_pun(const S& v) { |
122 | static_assert(sizeof(D) == sizeof(v), "" ); |
123 | return load<D>(&v); |
124 | } |
125 | |
126 | // When we convert from float to fixed point, it's very common to want to round, |
127 | // and for some reason compilers generate better code when converting to int32_t. |
128 | // To serve both those ends, we use this function to_fixed() instead of direct cast(). |
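// e.g. to_fixed(191.25f) == 191 and to_fixed(191.75f) == 192: the +0.5f turns the
// truncating conversion into round-to-nearest.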
129 | #if defined(USING_NEON_FP16) |
    // NEON's got an F16 -> U16 instruction, so this should be fine without going via I16.
131 | SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); } |
132 | #else |
133 | SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); } |
134 | #endif |
135 | |
136 | |
// Sometimes we do something crazy on one branch of a conditional,
138 | // like divide by zero or convert a huge float to an integer, |
139 | // but then harmlessly select the other side. That trips up N==1 |
140 | // sanitizer builds, so we make if_then_else() a macro to avoid |
141 | // evaluating the unused side. |
142 | |
143 | #if N == 1 |
144 | #define if_then_else(cond, t, e) ((cond) ? (t) : (e)) |
145 | #else |
146 | template <typename C, typename T> |
147 | SI T if_then_else(C cond, T t, T e) { |
148 | return bit_pun<T>( ( cond & bit_pun<C>(t)) | |
149 | (~cond & bit_pun<C>(e)) ); |
150 | } |
151 | #endif |
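// In the vector case cond always comes from a comparison (or a bitwise OR of comparisons),
// so each lane is either all zeros or all ones, making the mask-and-or select above exact.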
152 | |
153 | |
154 | SI F F_from_Half(U16 half) { |
155 | #if defined(USING_NEON_FP16) |
156 | return bit_pun<F>(half); |
157 | #elif defined(USING_NEON_F16C) |
158 | return vcvt_f32_f16((float16x4_t)half); |
159 | #elif defined(USING_AVX512F) |
160 | return (F)_mm512_cvtph_ps((__m256i)half); |
161 | #elif defined(USING_AVX_F16C) |
162 | typedef int16_t __attribute__((vector_size(16))) I16; |
163 | return __builtin_ia32_vcvtph2ps256((I16)half); |
164 | #else |
165 | U32 wide = cast<U32>(half); |
166 | // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias. |
167 | U32 s = wide & 0x8000, |
168 | em = wide ^ s; |
169 | |
170 | // Constructing the float is easy if the half is not denormalized. |
171 | F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) ); |
172 | |
173 | // Simply flush all denorm half floats to zero. |
174 | return if_then_else(em < 0x0400, F0, norm); |
175 | #endif |
176 | } |
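// Worked example for the portable path above: half 1.0 is 0x3C00, so s = 0, em = 0x3C00,
// and (em<<13) + ((127-15)<<23) == 0x07800000 + 0x38000000 == 0x3F800000, exactly 1.0f.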
177 | |
178 | #if defined(__clang__) |
179 | // The -((127-15)<<10) underflows that side of the math when |
180 | // we pass a denorm half float. It's harmless... we'll take the 0 side anyway. |
181 | __attribute__((no_sanitize("unsigned-integer-overflow" ))) |
182 | #endif |
183 | SI U16 Half_from_F(F f) { |
184 | #if defined(USING_NEON_FP16) |
185 | return bit_pun<U16>(f); |
186 | #elif defined(USING_NEON_F16C) |
187 | return (U16)vcvt_f16_f32(f); |
188 | #elif defined(USING_AVX512F) |
189 | return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION ); |
190 | #elif defined(USING_AVX_F16C) |
191 | return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/); |
192 | #else |
193 | // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias. |
194 | U32 sem = bit_pun<U32>(f), |
195 | s = sem & 0x80000000, |
196 | em = sem ^ s; |
197 | |
198 | // For simplicity we flush denorm half floats (including all denorm floats) to zero. |
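    // (0x38800000 is the bit pattern of 2^-14f, the smallest normal half, so everything below it flushes.)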
199 | return cast<U16>(if_then_else(em < 0x38800000, (U32)F0 |
200 | , (s>>16) + (em>>13) - ((127-15)<<10))); |
201 | #endif |
202 | } |
203 | |
204 | // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian. |
205 | #if defined(USING_NEON_FP16) |
206 | SI U16 swap_endian_16(U16 v) { |
207 | return (U16)vrev16q_u8((uint8x16_t) v); |
208 | } |
209 | #elif defined(USING_NEON) |
210 | SI U16 swap_endian_16(U16 v) { |
211 | return (U16)vrev16_u8((uint8x8_t) v); |
212 | } |
213 | #endif |
214 | |
215 | SI U64 swap_endian_16x4(const U64& rgba) { |
216 | return (rgba & 0x00ff00ff00ff00ff) << 8 |
217 | | (rgba & 0xff00ff00ff00ff00) >> 8; |
218 | } |
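// e.g. swap_endian_16x4(0x1122334455667788) == 0x2211443366558877.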
219 | |
220 | #if defined(USING_NEON_FP16) |
221 | SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); } |
222 | SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); } |
223 | #elif defined(USING_NEON) |
224 | SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); } |
225 | SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); } |
226 | #else |
227 | SI F min_(F x, F y) { return if_then_else(x > y, y, x); } |
228 | SI F max_(F x, F y) { return if_then_else(x < y, y, x); } |
229 | #endif |
230 | |
231 | SI F floor_(F x) { |
232 | #if N == 1 |
233 | return floorf_(x); |
234 | #elif defined(USING_NEON_FP16) |
235 | return vrndmq_f16(x); |
236 | #elif defined(__aarch64__) |
237 | return vrndmq_f32(x); |
238 | #elif defined(USING_AVX512F) |
239 | // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1, |
    // and integer sanitizer catches that this implicit cast changes the
241 | // value from -1 to 65535. We'll cast manually to work around it. |
242 | // Read this as `return _mm512_floor_ps(x)`. |
243 | return _mm512_mask_floor_ps(x, (__mmask16)-1, x); |
244 | #elif defined(USING_AVX) |
245 | return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/); |
246 | #elif defined(__SSE4_1__) |
247 | return _mm_floor_ps(x); |
248 | #else |
249 | // Round trip through integers with a truncating cast. |
250 | F roundtrip = cast<F>(cast<I32>(x)); |
251 | // If x is negative, truncating gives the ceiling instead of the floor. |
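    // e.g. for x = -1.5f the roundtrip is -1.0f, which is greater than x, so we subtract 1 and get -2.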
252 | return roundtrip - if_then_else(roundtrip > x, F1, F0); |
253 | |
254 | // This implementation fails for values of x that are outside |
255 | // the range an integer can represent. We expect most x to be small. |
256 | #endif |
257 | } |
258 | |
259 | SI F approx_log2(F x) { |
260 | #if defined(USING_NEON_FP16) |
261 | // TODO(mtklein) |
262 | return x; |
263 | #else |
264 | // The first approximation of log2(x) is its exponent 'e', minus 127. |
265 | I32 bits = bit_pun<I32>(x); |
266 | |
267 | F e = cast<F>(bits) * (1.0f / (1<<23)); |
268 | |
    // If we use the mantissa too we can refine the error significantly.
270 | F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 ); |
271 | |
272 | return e - 124.225514990f |
273 | - 1.498030302f*m |
274 | - 1.725879990f/(0.3520887068f + m); |
275 | #endif |
276 | } |
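// Sanity check on the approximation above: for x = 8.0f, e is 130.0 and m is 0.5,
// and the expression works out to very nearly 3.0, i.e. log2(8).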
277 | |
278 | SI F approx_log(F x) { |
279 | const float ln2 = 0.69314718f; |
280 | return ln2 * approx_log2(x); |
281 | } |
282 | |
283 | SI F approx_exp2(F x) { |
284 | #if defined(USING_NEON_FP16) |
285 | // TODO(mtklein) |
286 | return x; |
287 | #else |
288 | F fract = x - floor_(x); |
289 | |
290 | I32 bits = cast<I32>((1.0f * (1<<23)) * (x + 121.274057500f |
291 | - 1.490129070f*fract |
292 | + 27.728023300f/(4.84252568f - fract))); |
293 | return bit_pun<F>(bits); |
294 | #endif |
295 | } |
296 | |
297 | SI F approx_pow(F x, float y) { |
298 | return if_then_else((x == F0) | (x == F1), x |
299 | , approx_exp2(approx_log2(x) * y)); |
300 | } |
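// The x == 0 and x == 1 cases are pinned because the approximations don't round-trip them:
// approx_log2(0) is a large negative finite value rather than -inf, and exp2(log2(1.0f))
// comes back only approximately 1.0f.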
301 | |
302 | SI F approx_exp(F x) { |
303 | const float log2_e = 1.4426950408889634074f; |
304 | return approx_exp2(log2_e * x); |
305 | } |
306 | |
307 | // Return tf(x). |
308 | SI F apply_tf(const skcms_TransferFunction* tf, F x) { |
309 | #if defined(USING_NEON_FP16) |
310 | // TODO(mtklein) |
311 | (void)tf; |
312 | return x; |
313 | #else |
314 | // Peel off the sign bit and set x = |x|. |
315 | U32 bits = bit_pun<U32>(x), |
316 | sign = bits & 0x80000000; |
317 | x = bit_pun<F>(bits ^ sign); |
318 | |
319 | // The transfer function has a linear part up to d, exponential at d and after. |
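    // (e.g. sRGB in this parameterization is roughly g=2.4, a=1/1.055, b=0.055/1.055,
    //  c=1/12.92, d=0.04045, e=f=0.)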
320 | F v = if_then_else(x < tf->d, tf->c*x + tf->f |
321 | , approx_pow(tf->a*x + tf->b, tf->g) + tf->e); |
322 | |
323 | // Tack the sign bit back on. |
324 | return bit_pun<F>(sign | bit_pun<U32>(v)); |
325 | #endif |
326 | } |
327 | |
328 | SI F apply_pq(const skcms_TransferFunction* tf, F x) { |
329 | #if defined(USING_NEON_FP16) |
330 | // TODO(mtklein) |
331 | (void)tf; |
332 | return x; |
333 | #else |
334 | U32 bits = bit_pun<U32>(x), |
335 | sign = bits & 0x80000000; |
336 | x = bit_pun<F>(bits ^ sign); |
337 | |
338 | F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0) |
339 | / (tf->d + tf->e * approx_pow(x, tf->c)), |
340 | tf->f); |
341 | |
342 | return bit_pun<F>(sign | bit_pun<U32>(v)); |
343 | #endif |
344 | } |
345 | |
346 | SI F apply_hlg(const skcms_TransferFunction* tf, F x) { |
347 | #if defined(USING_NEON_FP16) |
348 | // TODO(mtklein) |
349 | (void)tf; |
350 | return x; |
351 | #else |
352 | const float R = tf->a, G = tf->b, |
353 | a = tf->c, b = tf->d, c = tf->e; |
354 | U32 bits = bit_pun<U32>(x), |
355 | sign = bits & 0x80000000; |
356 | x = bit_pun<F>(bits ^ sign); |
357 | |
358 | F v = if_then_else(x*R <= 1, approx_pow(x*R, G) |
359 | , approx_exp((x-c)*a) + b); |
360 | |
361 | return bit_pun<F>(sign | bit_pun<U32>(v)); |
362 | #endif |
363 | } |
364 | |
365 | SI F apply_hlginv(const skcms_TransferFunction* tf, F x) { |
366 | #if defined(USING_NEON_FP16) |
367 | // TODO(mtklein) |
368 | (void)tf; |
369 | return x; |
370 | #else |
371 | const float R = tf->a, G = tf->b, |
372 | a = tf->c, b = tf->d, c = tf->e; |
373 | U32 bits = bit_pun<U32>(x), |
374 | sign = bits & 0x80000000; |
375 | x = bit_pun<F>(bits ^ sign); |
376 | |
377 | F v = if_then_else(x <= 1, R * approx_pow(x, G) |
378 | , a * approx_log(x - b) + c); |
379 | |
380 | return bit_pun<F>(sign | bit_pun<U32>(v)); |
381 | #endif |
382 | } |
383 | |
384 | |
385 | // Strided loads and stores of N values, starting from p. |
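// load_3/store_3 walk p with stride 3 (packed r,g,b) and load_4/store_4 with stride 4
// (packed r,g,b,a), reading or writing one channel's worth of N values.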
386 | template <typename T, typename P> |
387 | SI T load_3(const P* p) { |
388 | #if N == 1 |
389 | return (T)p[0]; |
390 | #elif N == 4 |
391 | return T{p[ 0],p[ 3],p[ 6],p[ 9]}; |
392 | #elif N == 8 |
393 | return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]}; |
394 | #elif N == 16 |
395 | return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21], |
396 | p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]}; |
397 | #endif |
398 | } |
399 | |
400 | template <typename T, typename P> |
401 | SI T load_4(const P* p) { |
402 | #if N == 1 |
403 | return (T)p[0]; |
404 | #elif N == 4 |
405 | return T{p[ 0],p[ 4],p[ 8],p[12]}; |
406 | #elif N == 8 |
407 | return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]}; |
408 | #elif N == 16 |
409 | return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28], |
410 | p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]}; |
411 | #endif |
412 | } |
413 | |
414 | template <typename T, typename P> |
415 | SI void store_3(P* p, const T& v) { |
416 | #if N == 1 |
417 | p[0] = v; |
418 | #elif N == 4 |
419 | p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3]; |
420 | #elif N == 8 |
421 | p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3]; |
422 | p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7]; |
423 | #elif N == 16 |
424 | p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3]; |
425 | p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7]; |
426 | p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11]; |
427 | p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15]; |
428 | #endif |
429 | } |
430 | |
431 | template <typename T, typename P> |
432 | SI void store_4(P* p, const T& v) { |
433 | #if N == 1 |
434 | p[0] = v; |
435 | #elif N == 4 |
436 | p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3]; |
437 | #elif N == 8 |
438 | p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3]; |
439 | p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7]; |
440 | #elif N == 16 |
441 | p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3]; |
442 | p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7]; |
443 | p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11]; |
444 | p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15]; |
445 | #endif |
446 | } |
447 | |
448 | |
449 | SI U8 gather_8(const uint8_t* p, I32 ix) { |
450 | #if N == 1 |
451 | U8 v = p[ix]; |
452 | #elif N == 4 |
453 | U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] }; |
454 | #elif N == 8 |
455 | U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]], |
456 | p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] }; |
457 | #elif N == 16 |
458 | U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]], |
459 | p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]], |
460 | p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]], |
461 | p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] }; |
462 | #endif |
463 | return v; |
464 | } |
465 | |
466 | SI U16 gather_16(const uint8_t* p, I32 ix) { |
467 | // Load the i'th 16-bit value from p. |
468 | auto load_16 = [p](int i) { |
469 | return load<uint16_t>(p + 2*i); |
470 | }; |
471 | #if N == 1 |
472 | U16 v = load_16(ix); |
473 | #elif N == 4 |
474 | U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) }; |
475 | #elif N == 8 |
476 | U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]), |
477 | load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) }; |
478 | #elif N == 16 |
479 | U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]), |
480 | load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]), |
481 | load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]), |
482 | load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) }; |
483 | #endif |
484 | return v; |
485 | } |
486 | |
487 | SI U32 gather_32(const uint8_t* p, I32 ix) { |
488 | // Load the i'th 32-bit value from p. |
489 | auto load_32 = [p](int i) { |
490 | return load<uint32_t>(p + 4*i); |
491 | }; |
492 | #if N == 1 |
493 | U32 v = load_32(ix); |
494 | #elif N == 4 |
495 | U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) }; |
496 | #elif N == 8 |
497 | U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]), |
498 | load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) }; |
499 | #elif N == 16 |
500 | U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]), |
501 | load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]), |
502 | load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]), |
503 | load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) }; |
504 | #endif |
505 | // TODO: AVX2 and AVX-512 gathers (c.f. gather_24). |
506 | return v; |
507 | } |
508 | |
509 | SI U32 gather_24(const uint8_t* p, I32 ix) { |
510 | // First, back up a byte. Any place we're gathering from has a safe junk byte to read |
511 | // in front of it, either a previous table value, or some tag metadata. |
512 | p -= 1; |
513 | |
514 | // Load the i'th 24-bit value from p, and 1 extra byte. |
515 | auto load_24_32 = [p](int i) { |
516 | return load<uint32_t>(p + 3*i); |
517 | }; |
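    // After the p -= 1 above, the i-th value's three payload bytes sit at p + 3*i + 1,2,3,
    // so each 4-byte load picks up exactly one junk byte first; on the little-endian hosts
    // this code assumes, that junk byte lands in the low 8 bits, hence the v >> 8 below.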
518 | |
519 | // Now load multiples of 4 bytes (a junk byte, then r,g,b). |
520 | #if N == 1 |
521 | U32 v = load_24_32(ix); |
522 | #elif N == 4 |
523 | U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) }; |
524 | #elif N == 8 && !defined(USING_AVX2) |
525 | U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]), |
526 | load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) }; |
527 | #elif N == 8 |
528 | (void)load_24_32; |
529 | // The gather instruction here doesn't need any particular alignment, |
530 | // but the intrinsic takes a const int*. |
531 | const int* p4 = bit_pun<const int*>(p); |
532 | I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0}, |
533 | mask = {-1,-1,-1,-1, -1,-1,-1,-1}; |
534 | #if defined(__clang__) |
535 | U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1); |
536 | #elif defined(__GNUC__) |
537 | U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1); |
538 | #endif |
539 | #elif N == 16 |
540 | (void)load_24_32; |
541 | // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2. |
542 | // And AVX-512 swapped the order of arguments. :/ |
543 | const int* p4 = bit_pun<const int*>(p); |
544 | U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1); |
545 | #endif |
546 | |
547 | // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8). |
548 | return v >> 8; |
549 | } |
550 | |
551 | #if !defined(__arm__) |
552 | SI void gather_48(const uint8_t* p, I32 ix, U64* v) { |
553 | // As in gather_24(), with everything doubled. |
554 | p -= 2; |
555 | |
556 | // Load the i'th 48-bit value from p, and 2 extra bytes. |
557 | auto load_48_64 = [p](int i) { |
558 | return load<uint64_t>(p + 6*i); |
559 | }; |
560 | |
561 | #if N == 1 |
562 | *v = load_48_64(ix); |
563 | #elif N == 4 |
564 | *v = U64{ |
565 | load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]), |
566 | }; |
567 | #elif N == 8 && !defined(USING_AVX2) |
568 | *v = U64{ |
569 | load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]), |
570 | load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]), |
571 | }; |
572 | #elif N == 8 |
573 | (void)load_48_64; |
574 | typedef int32_t __attribute__((vector_size(16))) Half_I32; |
575 | typedef long long __attribute__((vector_size(32))) Half_I64; |
576 | |
577 | // The gather instruction here doesn't need any particular alignment, |
578 | // but the intrinsic takes a const long long*. |
579 | const long long int* p8 = bit_pun<const long long int*>(p); |
580 | |
581 | Half_I64 zero = { 0, 0, 0, 0}, |
582 | mask = {-1,-1,-1,-1}; |
583 | |
584 | ix *= 6; |
585 | Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] }, |
586 | ix_hi = { ix[4], ix[5], ix[6], ix[7] }; |
587 | |
588 | #if defined(__clang__) |
589 | Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1), |
590 | hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1); |
591 | #elif defined(__GNUC__) |
592 | Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1), |
593 | hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1); |
594 | #endif |
595 | store((char*)v + 0, lo); |
596 | store((char*)v + 32, hi); |
597 | #elif N == 16 |
598 | (void)load_48_64; |
599 | const long long int* p8 = bit_pun<const long long int*>(p); |
600 | __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1), |
601 | hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1); |
602 | store((char*)v + 0, lo); |
603 | store((char*)v + 64, hi); |
604 | #endif |
605 | |
606 | *v >>= 16; |
607 | } |
608 | #endif |
609 | |
610 | SI F F_from_U8(U8 v) { |
611 | return cast<F>(v) * (1/255.0f); |
612 | } |
613 | |
614 | SI F F_from_U16_BE(U16 v) { |
615 | // All 16-bit ICC values are big-endian, so we byte swap before converting to float. |
616 | // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask. |
617 | U16 lo = (v >> 8), |
618 | hi = (v << 8) & 0xffff; |
619 | return cast<F>(lo|hi) * (1/65535.0f); |
620 | } |
621 | |
622 | SI U16 U16_from_F(F v) { |
623 | // 65535 == inf in FP16, so promote to FP32 before converting. |
624 | return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f); |
625 | } |
626 | |
627 | SI F minus_1_ulp(F v) { |
628 | #if defined(USING_NEON_FP16) |
629 | return bit_pun<F>( bit_pun<U16>(v) - 1 ); |
630 | #else |
631 | return bit_pun<F>( bit_pun<U32>(v) - 1 ); |
632 | #endif |
633 | } |
634 | |
635 | SI F table(const skcms_Curve* curve, F v) { |
636 | // Clamp the input to [0,1], then scale to a table index. |
637 | F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1); |
638 | |
639 | // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two. |
640 | I32 lo = cast<I32>( ix ), |
641 | hi = cast<I32>(minus_1_ulp(ix+1.0f)); |
642 | F t = ix - cast<F>(lo); // i.e. the fractional part of ix. |
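    // minus_1_ulp() keeps hi in range when ix lands exactly on an integer: with ix == 3.0f,
    // ix+1 backs off to just under 4.0f and truncates to 3 (== lo, t == 0), while a
    // fractional ix like 3.25f still gives hi == 4.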
643 | |
644 | // TODO: can we load l and h simultaneously? Each entry in 'h' is either |
    // the same as in 'l' or adjacent. We have a rough idea that it'd always be safe
646 | // to read adjacent entries and perhaps underflow the table by a byte or two |
647 | // (it'd be junk, but always safe to read). Not sure how to lerp yet. |
648 | F l,h; |
649 | if (curve->table_8) { |
650 | l = F_from_U8(gather_8(curve->table_8, lo)); |
651 | h = F_from_U8(gather_8(curve->table_8, hi)); |
652 | } else { |
653 | l = F_from_U16_BE(gather_16(curve->table_16, lo)); |
654 | h = F_from_U16_BE(gather_16(curve->table_16, hi)); |
655 | } |
656 | return l + (h-l)*t; |
657 | } |
658 | |
659 | SI void sample_clut_8(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) { |
660 | U32 rgb = gather_24(a2b->grid_8, ix); |
661 | |
662 | *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f); |
663 | *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f); |
664 | *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f); |
665 | } |
666 | |
667 | SI void sample_clut_16(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) { |
668 | #if defined(__arm__) |
669 | // This is up to 2x faster on 32-bit ARM than the #else-case fast path. |
670 | *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0)); |
671 | *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1)); |
672 | *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2)); |
673 | #else |
674 | // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too. |
675 | U64 rgb; |
676 | gather_48(a2b->grid_16, ix, &rgb); |
677 | rgb = swap_endian_16x4(rgb); |
678 | |
679 | *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f); |
680 | *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f); |
681 | *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f); |
682 | #endif |
683 | } |
684 | |
685 | // GCC 7.2.0 hits an internal compiler error with -finline-functions (or -O3) |
686 | // when targeting MIPS 64, i386, or s390x, I think attempting to inline clut() into exec_ops(). |
687 | #if 1 && defined(__GNUC__) && !defined(__clang__) \ |
688 | && (defined(__mips64) || defined(__i386) || defined(__s390x__)) |
689 | #define MAYBE_NOINLINE __attribute__((noinline)) |
690 | #else |
691 | #define MAYBE_NOINLINE |
692 | #endif |
693 | |
694 | MAYBE_NOINLINE |
695 | static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) { |
696 | const int dim = (int)a2b->input_channels; |
697 | assert (0 < dim && dim <= 4); |
698 | |
699 | // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4. |
700 | I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4. |
701 | F weight[8]; // Weight for each contribution, again first low, then high. |
702 | |
703 | // O(dim) work first: calculate index,weight from r,g,b,a. |
704 | const F inputs[] = { *r,*g,*b,a }; |
705 | for (int i = dim-1, stride = 1; i >= 0; i--) { |
706 | // x is where we logically want to sample the grid in the i-th dimension. |
707 | F x = inputs[i] * (float)(a2b->grid_points[i] - 1); |
708 | |
709 | // But we can't index at floats. lo and hi are the two integer grid points surrounding x. |
710 | I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here. |
711 | hi = cast<I32>(minus_1_ulp(x+1.0f)); |
712 | // Notice how we fold in the accumulated stride across previous dimensions here. |
713 | index[i+0] = lo * stride; |
714 | index[i+4] = hi * stride; |
715 | stride *= a2b->grid_points[i]; |
716 | |
717 | // We'll interpolate between those two integer grid points by t. |
718 | F t = x - cast<F>(lo); // i.e. fract(x) |
719 | weight[i+0] = 1-t; |
720 | weight[i+4] = t; |
721 | } |
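    // e.g. a 3-channel CLUT over a 17x17x17 grid ends up with strides 1 (blue), 17 (green),
    // and 289 (red); each index[] entry is that channel's grid coordinate times its stride,
    // and summing one low-or-high pick per channel below selects one of the 2^dim corners.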
722 | |
723 | *r = *g = *b = F0; |
724 | |
725 | // We'll sample 2^dim == 1<<dim table entries per pixel, |
726 | // in all combinations of low and high in each dimension. |
727 | for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order. |
728 | |
729 | // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4, |
730 | // where 0 selects the low index contribution and its weight 1-t, |
731 | // or 4 the high index contribution and its weight t. |
732 | |
733 | // Since 0<dim≤4, we can always just start off with the 0-th channel, |
734 | // then handle the others conditionally. |
735 | I32 ix = index [0 + (combo&1)*4]; |
736 | F w = weight[0 + (combo&1)*4]; |
737 | |
738 | switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle. |
739 | case 3: ix += index [3 + (combo&8)/2]; |
740 | w *= weight[3 + (combo&8)/2]; |
741 | FALLTHROUGH; |
742 | // fall through |
743 | |
744 | case 2: ix += index [2 + (combo&4)*1]; |
745 | w *= weight[2 + (combo&4)*1]; |
746 | FALLTHROUGH; |
747 | // fall through |
748 | |
749 | case 1: ix += index [1 + (combo&2)*2]; |
750 | w *= weight[1 + (combo&2)*2]; |
751 | } |
752 | |
753 | F R,G,B; |
754 | if (a2b->grid_8) { |
755 | sample_clut_8 (a2b,ix, &R,&G,&B); |
756 | } else { |
757 | sample_clut_16(a2b,ix, &R,&G,&B); |
758 | } |
759 | |
760 | *r += w*R; |
761 | *g += w*G; |
762 | *b += w*B; |
763 | } |
764 | } |
765 | |
766 | static void exec_ops(const Op* ops, const void** args, |
767 | const char* src, char* dst, int i) { |
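    // ops is a stream of Op tags; any op that needs a parameter pops the next pointer off
    // args, so the two streams advance together.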
768 | F r = F0, g = F0, b = F0, a = F1; |
769 | while (true) { |
770 | switch (*ops++) { |
771 | case Op_load_a8:{ |
772 | a = F_from_U8(load<U8>(src + 1*i)); |
773 | } break; |
774 | |
775 | case Op_load_g8:{ |
776 | r = g = b = F_from_U8(load<U8>(src + 1*i)); |
777 | } break; |
778 | |
779 | case Op_load_4444:{ |
780 | U16 abgr = load<U16>(src + 2*i); |
781 | |
782 | r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f); |
783 | g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f); |
784 | b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f); |
785 | a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f); |
786 | } break; |
787 | |
788 | case Op_load_565:{ |
789 | U16 rgb = load<U16>(src + 2*i); |
790 | |
791 | r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0)); |
792 | g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5)); |
793 | b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11)); |
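                // (No need to shift each field down first: masking and then dividing by the
                // field's maximum in place, e.g. 63<<5 for green, normalizes it just the same.)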
794 | } break; |
795 | |
796 | case Op_load_888:{ |
797 | const uint8_t* rgb = (const uint8_t*)(src + 3*i); |
798 | #if defined(USING_NEON_FP16) |
799 | // See the explanation under USING_NEON below. This is that doubled up. |
800 | uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }}; |
801 | v = vld3q_lane_u8(rgb+ 0, v, 0); |
802 | v = vld3q_lane_u8(rgb+ 3, v, 2); |
803 | v = vld3q_lane_u8(rgb+ 6, v, 4); |
804 | v = vld3q_lane_u8(rgb+ 9, v, 6); |
805 | |
806 | v = vld3q_lane_u8(rgb+12, v, 8); |
807 | v = vld3q_lane_u8(rgb+15, v, 10); |
808 | v = vld3q_lane_u8(rgb+18, v, 12); |
809 | v = vld3q_lane_u8(rgb+21, v, 14); |
810 | |
811 | r = cast<F>((U16)v.val[0]) * (1/255.0f); |
812 | g = cast<F>((U16)v.val[1]) * (1/255.0f); |
813 | b = cast<F>((U16)v.val[2]) * (1/255.0f); |
814 | #elif defined(USING_NEON) |
815 | // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at |
816 | // a time. Since we're doing that, we might as well load them into 16-bit lanes. |
817 | // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.) |
818 | uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }}; |
819 | v = vld3_lane_u8(rgb+0, v, 0); |
820 | v = vld3_lane_u8(rgb+3, v, 2); |
821 | v = vld3_lane_u8(rgb+6, v, 4); |
822 | v = vld3_lane_u8(rgb+9, v, 6); |
823 | |
824 | // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to |
825 | // convert to F. (Again, U32 would be even better here if drop ARMv7 or split |
826 | // ARMv7 and ARMv8 impls.) |
827 | r = cast<F>((U16)v.val[0]) * (1/255.0f); |
828 | g = cast<F>((U16)v.val[1]) * (1/255.0f); |
829 | b = cast<F>((U16)v.val[2]) * (1/255.0f); |
830 | #else |
831 | r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f); |
832 | g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f); |
833 | b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f); |
834 | #endif |
835 | } break; |
836 | |
837 | case Op_load_8888:{ |
838 | U32 rgba = load<U32>(src + 4*i); |
839 | |
840 | r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f); |
841 | g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f); |
842 | b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f); |
843 | a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f); |
844 | } break; |
845 | |
846 | case Op_load_8888_palette8:{ |
847 | const uint8_t* palette = (const uint8_t*) *args++; |
848 | I32 ix = cast<I32>(load<U8>(src + 1*i)); |
849 | U32 rgba = gather_32(palette, ix); |
850 | |
851 | r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f); |
852 | g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f); |
853 | b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f); |
854 | a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f); |
855 | } break; |
856 | |
857 | case Op_load_1010102:{ |
858 | U32 rgba = load<U32>(src + 4*i); |
859 | |
860 | r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f); |
861 | g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f); |
862 | b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f); |
863 | a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f); |
864 | } break; |
865 | |
866 | case Op_load_161616LE:{ |
867 | uintptr_t ptr = (uintptr_t)(src + 6*i); |
868 | assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this |
869 | const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe. |
870 | #if defined(USING_NEON_FP16) |
871 | uint16x8x3_t v = vld3q_u16(rgb); |
872 | r = cast<F>((U16)v.val[0]) * (1/65535.0f); |
873 | g = cast<F>((U16)v.val[1]) * (1/65535.0f); |
874 | b = cast<F>((U16)v.val[2]) * (1/65535.0f); |
875 | #elif defined(USING_NEON) |
876 | uint16x4x3_t v = vld3_u16(rgb); |
877 | r = cast<F>((U16)v.val[0]) * (1/65535.0f); |
878 | g = cast<F>((U16)v.val[1]) * (1/65535.0f); |
879 | b = cast<F>((U16)v.val[2]) * (1/65535.0f); |
880 | #else |
881 | r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f); |
882 | g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f); |
883 | b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f); |
884 | #endif |
885 | } break; |
886 | |
887 | case Op_load_16161616LE:{ |
888 | uintptr_t ptr = (uintptr_t)(src + 8*i); |
889 | assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this |
890 | const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe. |
891 | #if defined(USING_NEON_FP16) |
892 | uint16x8x4_t v = vld4q_u16(rgba); |
893 | r = cast<F>((U16)v.val[0]) * (1/65535.0f); |
894 | g = cast<F>((U16)v.val[1]) * (1/65535.0f); |
895 | b = cast<F>((U16)v.val[2]) * (1/65535.0f); |
896 | a = cast<F>((U16)v.val[3]) * (1/65535.0f); |
897 | #elif defined(USING_NEON) |
898 | uint16x4x4_t v = vld4_u16(rgba); |
899 | r = cast<F>((U16)v.val[0]) * (1/65535.0f); |
900 | g = cast<F>((U16)v.val[1]) * (1/65535.0f); |
901 | b = cast<F>((U16)v.val[2]) * (1/65535.0f); |
902 | a = cast<F>((U16)v.val[3]) * (1/65535.0f); |
903 | #else |
904 | U64 px = load<U64>(rgba); |
905 | |
906 | r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f); |
907 | g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f); |
908 | b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f); |
909 | a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f); |
910 | #endif |
911 | } break; |
912 | |
913 | case Op_load_161616BE:{ |
914 | uintptr_t ptr = (uintptr_t)(src + 6*i); |
915 | assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this |
916 | const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe. |
917 | #if defined(USING_NEON_FP16) |
918 | uint16x8x3_t v = vld3q_u16(rgb); |
919 | r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f); |
920 | g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f); |
921 | b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f); |
922 | #elif defined(USING_NEON) |
923 | uint16x4x3_t v = vld3_u16(rgb); |
924 | r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f); |
925 | g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f); |
926 | b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f); |
927 | #else |
928 | U32 R = load_3<U32>(rgb+0), |
929 | G = load_3<U32>(rgb+1), |
930 | B = load_3<U32>(rgb+2); |
931 | // R,G,B are big-endian 16-bit, so byte swap them before converting to float. |
932 | r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f); |
933 | g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f); |
934 | b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f); |
935 | #endif |
936 | } break; |
937 | |
938 | case Op_load_16161616BE:{ |
939 | uintptr_t ptr = (uintptr_t)(src + 8*i); |
940 | assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this |
941 | const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe. |
942 | #if defined(USING_NEON_FP16) |
943 | uint16x8x4_t v = vld4q_u16(rgba); |
944 | r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f); |
945 | g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f); |
946 | b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f); |
947 | a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f); |
948 | #elif defined(USING_NEON) |
949 | uint16x4x4_t v = vld4_u16(rgba); |
950 | r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f); |
951 | g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f); |
952 | b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f); |
953 | a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f); |
954 | #else |
955 | U64 px = swap_endian_16x4(load<U64>(rgba)); |
956 | |
957 | r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f); |
958 | g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f); |
959 | b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f); |
960 | a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f); |
961 | #endif |
962 | } break; |
963 | |
964 | case Op_load_hhh:{ |
965 | uintptr_t ptr = (uintptr_t)(src + 6*i); |
966 | assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this |
967 | const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe. |
968 | #if defined(USING_NEON_FP16) |
969 | uint16x8x3_t v = vld3q_u16(rgb); |
970 | U16 R = (U16)v.val[0], |
971 | G = (U16)v.val[1], |
972 | B = (U16)v.val[2]; |
973 | #elif defined(USING_NEON) |
974 | uint16x4x3_t v = vld3_u16(rgb); |
975 | U16 R = (U16)v.val[0], |
976 | G = (U16)v.val[1], |
977 | B = (U16)v.val[2]; |
978 | #else |
979 | U16 R = load_3<U16>(rgb+0), |
980 | G = load_3<U16>(rgb+1), |
981 | B = load_3<U16>(rgb+2); |
982 | #endif |
983 | r = F_from_Half(R); |
984 | g = F_from_Half(G); |
985 | b = F_from_Half(B); |
986 | } break; |
987 | |
988 | case Op_load_hhhh:{ |
989 | uintptr_t ptr = (uintptr_t)(src + 8*i); |
990 | assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this |
991 | const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe. |
992 | #if defined(USING_NEON_FP16) |
993 | uint16x8x4_t v = vld4q_u16(rgba); |
994 | U16 R = (U16)v.val[0], |
995 | G = (U16)v.val[1], |
996 | B = (U16)v.val[2], |
997 | A = (U16)v.val[3]; |
998 | #elif defined(USING_NEON) |
999 | uint16x4x4_t v = vld4_u16(rgba); |
1000 | U16 R = (U16)v.val[0], |
1001 | G = (U16)v.val[1], |
1002 | B = (U16)v.val[2], |
1003 | A = (U16)v.val[3]; |
1004 | #else |
1005 | U64 px = load<U64>(rgba); |
1006 | U16 R = cast<U16>((px >> 0) & 0xffff), |
1007 | G = cast<U16>((px >> 16) & 0xffff), |
1008 | B = cast<U16>((px >> 32) & 0xffff), |
1009 | A = cast<U16>((px >> 48) & 0xffff); |
1010 | #endif |
1011 | r = F_from_Half(R); |
1012 | g = F_from_Half(G); |
1013 | b = F_from_Half(B); |
1014 | a = F_from_Half(A); |
1015 | } break; |
1016 | |
1017 | case Op_load_fff:{ |
1018 | uintptr_t ptr = (uintptr_t)(src + 12*i); |
1019 | assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this |
1020 | const float* rgb = (const float*)ptr; // cast to const float* to be safe. |
1021 | #if defined(USING_NEON_FP16) |
1022 | float32x4x3_t lo = vld3q_f32(rgb + 0), |
1023 | hi = vld3q_f32(rgb + 12); |
1024 | r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0])); |
1025 | g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1])); |
1026 | b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2])); |
1027 | #elif defined(USING_NEON) |
1028 | float32x4x3_t v = vld3q_f32(rgb); |
1029 | r = (F)v.val[0]; |
1030 | g = (F)v.val[1]; |
1031 | b = (F)v.val[2]; |
1032 | #else |
1033 | r = load_3<F>(rgb+0); |
1034 | g = load_3<F>(rgb+1); |
1035 | b = load_3<F>(rgb+2); |
1036 | #endif |
1037 | } break; |
1038 | |
1039 | case Op_load_ffff:{ |
1040 | uintptr_t ptr = (uintptr_t)(src + 16*i); |
1041 | assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this |
1042 | const float* rgba = (const float*)ptr; // cast to const float* to be safe. |
1043 | #if defined(USING_NEON_FP16) |
1044 | float32x4x4_t lo = vld4q_f32(rgba + 0), |
1045 | hi = vld4q_f32(rgba + 16); |
1046 | r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0])); |
1047 | g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1])); |
1048 | b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2])); |
1049 | a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3])); |
1050 | #elif defined(USING_NEON) |
1051 | float32x4x4_t v = vld4q_f32(rgba); |
1052 | r = (F)v.val[0]; |
1053 | g = (F)v.val[1]; |
1054 | b = (F)v.val[2]; |
1055 | a = (F)v.val[3]; |
1056 | #else |
1057 | r = load_4<F>(rgba+0); |
1058 | g = load_4<F>(rgba+1); |
1059 | b = load_4<F>(rgba+2); |
1060 | a = load_4<F>(rgba+3); |
1061 | #endif |
1062 | } break; |
1063 | |
1064 | case Op_swap_rb:{ |
1065 | F t = r; |
1066 | r = b; |
1067 | b = t; |
1068 | } break; |
1069 | |
1070 | case Op_clamp:{ |
1071 | r = max_(F0, min_(r, F1)); |
1072 | g = max_(F0, min_(g, F1)); |
1073 | b = max_(F0, min_(b, F1)); |
1074 | a = max_(F0, min_(a, F1)); |
1075 | } break; |
1076 | |
1077 | case Op_invert:{ |
1078 | r = F1 - r; |
1079 | g = F1 - g; |
1080 | b = F1 - b; |
1081 | a = F1 - a; |
1082 | } break; |
1083 | |
1084 | case Op_force_opaque:{ |
1085 | a = F1; |
1086 | } break; |
1087 | |
1088 | case Op_premul:{ |
1089 | r *= a; |
1090 | g *= a; |
1091 | b *= a; |
1092 | } break; |
1093 | |
1094 | case Op_unpremul:{ |
1095 | F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0); |
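                // (When a == 0, 1/a is +inf; substituting 0 keeps fully-transparent pixels
                // at 0 instead of producing inf or NaN.)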
1096 | r *= scale; |
1097 | g *= scale; |
1098 | b *= scale; |
1099 | } break; |
1100 | |
1101 | case Op_matrix_3x3:{ |
1102 | const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++; |
1103 | const float* m = &matrix->vals[0][0]; |
1104 | |
1105 | F R = m[0]*r + m[1]*g + m[2]*b, |
1106 | G = m[3]*r + m[4]*g + m[5]*b, |
1107 | B = m[6]*r + m[7]*g + m[8]*b; |
1108 | |
1109 | r = R; |
1110 | g = G; |
1111 | b = B; |
1112 | } break; |
1113 | |
1114 | case Op_matrix_3x4:{ |
1115 | const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++; |
1116 | const float* m = &matrix->vals[0][0]; |
1117 | |
1118 | F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3], |
1119 | G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7], |
1120 | B = m[8]*r + m[9]*g + m[10]*b + m[11]; |
1121 | |
1122 | r = R; |
1123 | g = G; |
1124 | b = B; |
1125 | } break; |
1126 | |
1127 | case Op_lab_to_xyz:{ |
1128 | // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them: |
1129 | F L = r * 100.0f, |
1130 | A = g * 255.0f - 128.0f, |
1131 | B = b * 255.0f - 128.0f; |
1132 | |
1133 | // Convert to CIE XYZ. |
1134 | F Y = (L + 16.0f) * (1/116.0f), |
1135 | X = Y + A*(1/500.0f), |
1136 | Z = Y - B*(1/200.0f); |
1137 | |
1138 | X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f)); |
1139 | Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f)); |
1140 | Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f)); |
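                // (0.008856 ≈ (6/29)^3, 16/116 == 4/29, and 1/7.787 ≈ 3*(6/29)^2, the
                // standard CIE L*a*b* constants.)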
1141 | |
1142 | // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op. |
1143 | r = X * 0.9642f; |
1144 | g = Y ; |
1145 | b = Z * 0.8249f; |
1146 | } break; |
1147 | |
1148 | case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break; |
1149 | case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break; |
1150 | case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break; |
1151 | case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break; |
1152 | |
1153 | case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break; |
1154 | case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break; |
1155 | case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break; |
1156 | case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break; |
1157 | |
1158 | case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break; |
1159 | case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break; |
1160 | case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break; |
1161 | case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break; |
1162 | |
1163 | case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break; |
1164 | case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break; |
1165 | case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break; |
1166 | case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break; |
1167 | |
1168 | case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break; |
1169 | case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break; |
1170 | case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break; |
1171 | case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break; |
1172 | |
1173 | case Op_clut: { |
1174 | const skcms_A2B* a2b = (const skcms_A2B*) *args++; |
1175 | clut(a2b, &r,&g,&b,a); |
1176 | |
1177 | if (a2b->input_channels == 4) { |
1178 | // CMYK is opaque. |
1179 | a = F1; |
1180 | } |
1181 | } break; |
1182 | |
1183 | // Notice, from here on down the store_ ops all return, ending the loop. |
1184 | |
1185 | case Op_store_a8: { |
1186 | store(dst + 1*i, cast<U8>(to_fixed(a * 255))); |
1187 | } return; |
1188 | |
1189 | case Op_store_g8: { |
1190 | // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z) |
1191 | store(dst + 1*i, cast<U8>(to_fixed(g * 255))); |
1192 | } return; |
1193 | |
1194 | case Op_store_4444: { |
1195 | store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12) |
1196 | | cast<U16>(to_fixed(g * 15) << 8) |
1197 | | cast<U16>(to_fixed(b * 15) << 4) |
1198 | | cast<U16>(to_fixed(a * 15) << 0)); |
1199 | } return; |
1200 | |
1201 | case Op_store_565: { |
1202 | store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 ) |
1203 | | cast<U16>(to_fixed(g * 63) << 5 ) |
1204 | | cast<U16>(to_fixed(b * 31) << 11 )); |
1205 | } return; |
1206 | |
1207 | case Op_store_888: { |
1208 | uint8_t* rgb = (uint8_t*)dst + 3*i; |
1209 | #if defined(USING_NEON_FP16) |
1210 | // See the explanation under USING_NEON below. This is that doubled up. |
1211 | U16 R = to_fixed(r * 255), |
1212 | G = to_fixed(g * 255), |
1213 | B = to_fixed(b * 255); |
1214 | |
1215 | uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }}; |
1216 | vst3q_lane_u8(rgb+ 0, v, 0); |
1217 | vst3q_lane_u8(rgb+ 3, v, 2); |
1218 | vst3q_lane_u8(rgb+ 6, v, 4); |
1219 | vst3q_lane_u8(rgb+ 9, v, 6); |
1220 | |
1221 | vst3q_lane_u8(rgb+12, v, 8); |
1222 | vst3q_lane_u8(rgb+15, v, 10); |
1223 | vst3q_lane_u8(rgb+18, v, 12); |
1224 | vst3q_lane_u8(rgb+21, v, 14); |
1225 | #elif defined(USING_NEON) |
1226 | // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but |
1227 | // get there via U16 to save some instructions converting to float. And just |
1228 | // like load_888, we'd prefer to go via U32 but for ARMv7 support. |
1229 | U16 R = cast<U16>(to_fixed(r * 255)), |
1230 | G = cast<U16>(to_fixed(g * 255)), |
1231 | B = cast<U16>(to_fixed(b * 255)); |
1232 | |
1233 | uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }}; |
1234 | vst3_lane_u8(rgb+0, v, 0); |
1235 | vst3_lane_u8(rgb+3, v, 2); |
1236 | vst3_lane_u8(rgb+6, v, 4); |
1237 | vst3_lane_u8(rgb+9, v, 6); |
1238 | #else |
1239 | store_3(rgb+0, cast<U8>(to_fixed(r * 255)) ); |
1240 | store_3(rgb+1, cast<U8>(to_fixed(g * 255)) ); |
1241 | store_3(rgb+2, cast<U8>(to_fixed(b * 255)) ); |
1242 | #endif |
1243 | } return; |
1244 | |
1245 | case Op_store_8888: { |
1246 | store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0 |
1247 | | cast<U32>(to_fixed(g * 255)) << 8 |
1248 | | cast<U32>(to_fixed(b * 255)) << 16 |
1249 | | cast<U32>(to_fixed(a * 255)) << 24); |
1250 | } return; |
1251 | |
1252 | case Op_store_1010102: { |
1253 | store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0 |
1254 | | cast<U32>(to_fixed(g * 1023)) << 10 |
1255 | | cast<U32>(to_fixed(b * 1023)) << 20 |
1256 | | cast<U32>(to_fixed(a * 3)) << 30); |
1257 | } return; |
1258 | |
1259 | case Op_store_161616LE: { |
1260 | uintptr_t ptr = (uintptr_t)(dst + 6*i); |
1261 | assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned |
1262 | uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe. |
1263 | #if defined(USING_NEON_FP16) |
1264 | uint16x8x3_t v = {{ |
1265 | (uint16x8_t)U16_from_F(r), |
1266 | (uint16x8_t)U16_from_F(g), |
1267 | (uint16x8_t)U16_from_F(b), |
1268 | }}; |
1269 | vst3q_u16(rgb, v); |
1270 | #elif defined(USING_NEON) |
1271 | uint16x4x3_t v = {{ |
1272 | (uint16x4_t)U16_from_F(r), |
1273 | (uint16x4_t)U16_from_F(g), |
1274 | (uint16x4_t)U16_from_F(b), |
1275 | }}; |
1276 | vst3_u16(rgb, v); |
1277 | #else |
1278 | store_3(rgb+0, U16_from_F(r)); |
1279 | store_3(rgb+1, U16_from_F(g)); |
1280 | store_3(rgb+2, U16_from_F(b)); |
1281 | #endif |
1282 | |
1283 | } return; |
1284 | |
1285 | case Op_store_16161616LE: { |
1286 | uintptr_t ptr = (uintptr_t)(dst + 8*i); |
1287 | assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned |
1288 | uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe. |
1289 | #if defined(USING_NEON_FP16) |
1290 | uint16x8x4_t v = {{ |
1291 | (uint16x8_t)U16_from_F(r), |
1292 | (uint16x8_t)U16_from_F(g), |
1293 | (uint16x8_t)U16_from_F(b), |
1294 | (uint16x8_t)U16_from_F(a), |
1295 | }}; |
1296 | vst4q_u16(rgba, v); |
1297 | #elif defined(USING_NEON) |
1298 | uint16x4x4_t v = {{ |
1299 | (uint16x4_t)U16_from_F(r), |
1300 | (uint16x4_t)U16_from_F(g), |
1301 | (uint16x4_t)U16_from_F(b), |
1302 | (uint16x4_t)U16_from_F(a), |
1303 | }}; |
1304 | vst4_u16(rgba, v); |
1305 | #else |
1306 | U64 px = cast<U64>(to_fixed(r * 65535)) << 0 |
1307 | | cast<U64>(to_fixed(g * 65535)) << 16 |
1308 | | cast<U64>(to_fixed(b * 65535)) << 32 |
1309 | | cast<U64>(to_fixed(a * 65535)) << 48; |
1310 | store(rgba, px); |
1311 | #endif |
1312 | } return; |
1313 | |
1314 | case Op_store_161616BE: { |
1315 | uintptr_t ptr = (uintptr_t)(dst + 6*i); |
1316 | assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned |
1317 | uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe. |
1318 | #if defined(USING_NEON_FP16) |
1319 | uint16x8x3_t v = {{ |
1320 | (uint16x8_t)swap_endian_16(U16_from_F(r)), |
1321 | (uint16x8_t)swap_endian_16(U16_from_F(g)), |
1322 | (uint16x8_t)swap_endian_16(U16_from_F(b)), |
1323 | }}; |
1324 | vst3q_u16(rgb, v); |
1325 | #elif defined(USING_NEON) |
1326 | uint16x4x3_t v = {{ |
1327 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))), |
1328 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))), |
1329 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))), |
1330 | }}; |
1331 | vst3_u16(rgb, v); |
1332 | #else |
1333 | U32 R = to_fixed(r * 65535), |
1334 | G = to_fixed(g * 65535), |
1335 | B = to_fixed(b * 65535); |
1336 | store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) ); |
1337 | store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) ); |
1338 | store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) ); |
1339 | #endif |
1340 | |
1341 | } return; |
1342 | |
1343 | case Op_store_16161616BE: { |
1344 | uintptr_t ptr = (uintptr_t)(dst + 8*i); |
1345 | assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned |
1346 | uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe. |
1347 | #if defined(USING_NEON_FP16) |
1348 | uint16x8x4_t v = {{ |
1349 | (uint16x8_t)swap_endian_16(U16_from_F(r)), |
1350 | (uint16x8_t)swap_endian_16(U16_from_F(g)), |
1351 | (uint16x8_t)swap_endian_16(U16_from_F(b)), |
1352 | (uint16x8_t)swap_endian_16(U16_from_F(a)), |
1353 | }}; |
1354 | vst4q_u16(rgba, v); |
1355 | #elif defined(USING_NEON) |
1356 | uint16x4x4_t v = {{ |
1357 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))), |
1358 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))), |
1359 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))), |
1360 | (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))), |
1361 | }}; |
1362 | vst4_u16(rgba, v); |
1363 | #else |
1364 | U64 px = cast<U64>(to_fixed(r * 65535)) << 0 |
1365 | | cast<U64>(to_fixed(g * 65535)) << 16 |
1366 | | cast<U64>(to_fixed(b * 65535)) << 32 |
1367 | | cast<U64>(to_fixed(a * 65535)) << 48; |
1368 | store(rgba, swap_endian_16x4(px)); |
1369 | #endif |
1370 | } return; |
1371 | |
1372 | case Op_store_hhh: { |
1373 | uintptr_t ptr = (uintptr_t)(dst + 6*i); |
1374 | assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned |
1375 | uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe. |
1376 | |
1377 | U16 R = Half_from_F(r), |
1378 | G = Half_from_F(g), |
1379 | B = Half_from_F(b); |
1380 | #if defined(USING_NEON_FP16) |
1381 | uint16x8x3_t v = {{ |
1382 | (uint16x8_t)R, |
1383 | (uint16x8_t)G, |
1384 | (uint16x8_t)B, |
1385 | }}; |
1386 | vst3q_u16(rgb, v); |
1387 | #elif defined(USING_NEON) |
1388 | uint16x4x3_t v = {{ |
1389 | (uint16x4_t)R, |
1390 | (uint16x4_t)G, |
1391 | (uint16x4_t)B, |
1392 | }}; |
1393 | vst3_u16(rgb, v); |
1394 | #else |
1395 | store_3(rgb+0, R); |
1396 | store_3(rgb+1, G); |
1397 | store_3(rgb+2, B); |
1398 | #endif |
1399 | } return; |
1400 | |
1401 | case Op_store_hhhh: { |
1402 | uintptr_t ptr = (uintptr_t)(dst + 8*i); |
1403 | assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned |
1404 | uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe. |
1405 | |
1406 | U16 R = Half_from_F(r), |
1407 | G = Half_from_F(g), |
1408 | B = Half_from_F(b), |
1409 | A = Half_from_F(a); |
1410 | #if defined(USING_NEON_FP16) |
1411 | uint16x8x4_t v = {{ |
1412 | (uint16x8_t)R, |
1413 | (uint16x8_t)G, |
1414 | (uint16x8_t)B, |
1415 | (uint16x8_t)A, |
1416 | }}; |
1417 | vst4q_u16(rgba, v); |
1418 | #elif defined(USING_NEON) |
1419 | uint16x4x4_t v = {{ |
1420 | (uint16x4_t)R, |
1421 | (uint16x4_t)G, |
1422 | (uint16x4_t)B, |
1423 | (uint16x4_t)A, |
1424 | }}; |
1425 | vst4_u16(rgba, v); |
1426 | #else |
1427 | store(rgba, cast<U64>(R) << 0 |
1428 | | cast<U64>(G) << 16 |
1429 | | cast<U64>(B) << 32 |
1430 | | cast<U64>(A) << 48); |
1431 | #endif |
1432 | |
1433 | } return; |
1434 | |
1435 | case Op_store_fff: { |
1436 | uintptr_t ptr = (uintptr_t)(dst + 12*i); |
1437 | assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned |
1438 | float* rgb = (float*)ptr; // for this cast to float* to be safe. |
1439 | #if defined(USING_NEON_FP16) |
1440 | float32x4x3_t lo = {{ |
1441 | vcvt_f32_f16(vget_low_f16(r)), |
1442 | vcvt_f32_f16(vget_low_f16(g)), |
1443 | vcvt_f32_f16(vget_low_f16(b)), |
1444 | }}, hi = {{ |
1445 | vcvt_f32_f16(vget_high_f16(r)), |
1446 | vcvt_f32_f16(vget_high_f16(g)), |
1447 | vcvt_f32_f16(vget_high_f16(b)), |
1448 | }}; |
1449 | vst3q_f32(rgb + 0, lo); |
1450 | vst3q_f32(rgb + 12, hi); |
1451 | #elif defined(USING_NEON) |
1452 | float32x4x3_t v = {{ |
1453 | (float32x4_t)r, |
1454 | (float32x4_t)g, |
1455 | (float32x4_t)b, |
1456 | }}; |
1457 | vst3q_f32(rgb, v); |
1458 | #else |
1459 | store_3(rgb+0, r); |
1460 | store_3(rgb+1, g); |
1461 | store_3(rgb+2, b); |
1462 | #endif |
1463 | } return; |
1464 | |
1465 | case Op_store_ffff: { |
1466 | uintptr_t ptr = (uintptr_t)(dst + 16*i); |
1467 | assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned |
1468 | float* rgba = (float*)ptr; // for this cast to float* to be safe. |
1469 | #if defined(USING_NEON_FP16) |
1470 | float32x4x4_t lo = {{ |
1471 | vcvt_f32_f16(vget_low_f16(r)), |
1472 | vcvt_f32_f16(vget_low_f16(g)), |
1473 | vcvt_f32_f16(vget_low_f16(b)), |
1474 | vcvt_f32_f16(vget_low_f16(a)), |
1475 | }}, hi = {{ |
1476 | vcvt_f32_f16(vget_high_f16(r)), |
1477 | vcvt_f32_f16(vget_high_f16(g)), |
1478 | vcvt_f32_f16(vget_high_f16(b)), |
1479 | vcvt_f32_f16(vget_high_f16(a)), |
1480 | }}; |
1481 | vst4q_f32(rgba + 0, lo); |
1482 | vst4q_f32(rgba + 16, hi); |
1483 | #elif defined(USING_NEON) |
1484 | float32x4x4_t v = {{ |
1485 | (float32x4_t)r, |
1486 | (float32x4_t)g, |
1487 | (float32x4_t)b, |
1488 | (float32x4_t)a, |
1489 | }}; |
1490 | vst4q_f32(rgba, v); |
1491 | #else |
1492 | store_4(rgba+0, r); |
1493 | store_4(rgba+1, g); |
1494 | store_4(rgba+2, b); |
1495 | store_4(rgba+3, a); |
1496 | #endif |
1497 | } return; |
1498 | } |
1499 | } |
1500 | } |
1501 | |
1502 | |
1503 | static void run_program(const Op* program, const void** arguments, |
1504 | const char* src, char* dst, int n, |
1505 | const size_t src_bpp, const size_t dst_bpp) { |
1506 | int i = 0; |
1507 | while (n >= N) { |
1508 | exec_ops(program, arguments, src, dst, i); |
1509 | i += N; |
1510 | n -= N; |
1511 | } |
1512 | if (n > 0) { |
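        // tmp holds N pixels of the widest format we handle: 4 channels x 4 bytes each.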
1513 | char tmp[4*4*N] = {0}; |
1514 | |
1515 | memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp); |
1516 | exec_ops(program, arguments, tmp, tmp, 0); |
1517 | memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp); |
1518 | } |
1519 | } |
1520 | |
1521 | // Clean up any #defines we may have set so that we can be #included again. |
1522 | #if defined(USING_AVX) |
1523 | #undef USING_AVX |
1524 | #endif |
1525 | #if defined(USING_AVX_F16C) |
1526 | #undef USING_AVX_F16C |
1527 | #endif |
1528 | #if defined(USING_AVX2) |
1529 | #undef USING_AVX2 |
1530 | #endif |
1531 | #if defined(USING_AVX512F) |
1532 | #undef USING_AVX512F |
1533 | #endif |
1534 | |
1535 | #if defined(USING_NEON) |
1536 | #undef USING_NEON |
1537 | #endif |
1538 | #if defined(USING_NEON_F16C) |
1539 | #undef USING_NEON_F16C |
1540 | #endif |
1541 | #if defined(USING_NEON_FP16) |
1542 | #undef USING_NEON_FP16 |
1543 | #endif |
1544 | |
1545 | #undef FALLTHROUGH |
1546 | |