1/*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8// Intentionally NO #pragma once... included multiple times.
9
10// This file is included from skcms.cc in a namespace with some pre-defines:
11// - N: depth of all vectors, 1, 4, 8, or 16 (preprocessor define)
12// - V<T>: a template to create a vector of N T's.
13
14using F = V<Color>; // Called F for historic reasons... maybe rename C?
15using I32 = V<int32_t>;
16using U64 = V<uint64_t>;
17using U32 = V<uint32_t>;
18using U16 = V<uint16_t>;
19using U8 = V<uint8_t>;
20
21
22#if defined(__GNUC__) && !defined(__clang__)
23 // Once again, GCC is kind of weird, not allowing vector = scalar directly.
24 static constexpr F F0 = F() + 0.0f,
25 F1 = F() + 1.0f;
26#else
27 static constexpr F F0 = 0.0f,
28 F1 = 1.0f;
29#endif
30
31// Instead of checking __AVX__ below, we'll check USING_AVX.
32// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
33// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
34
35#if !defined(USING_AVX) && N == 8 && defined(__AVX__)
36 #define USING_AVX
37#endif
38#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
39    #define USING_AVX_F16C
40#endif
41#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
42 #define USING_AVX2
43#endif
44#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__)
45 #define USING_AVX512F
46#endif
47
48// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
49// This is more for organizational clarity... skcms.cc doesn't force these.
50#if N > 1 && defined(__ARM_NEON)
51 #define USING_NEON
52    #if defined(__ARM_FP) && (__ARM_FP & 2)
53 #define USING_NEON_F16C
54 #endif
55 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
56 #define USING_NEON_FP16
57 #endif
58#endif
59
60// These -Wvector-conversion warnings seem to trigger in very bogus situations,
61// like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
62#if defined(USING_NEON) && defined(__clang__)
63 #pragma clang diagnostic ignored "-Wvector-conversion"
64#endif
65
66// GCC warns us about returning U64 on x86 because it's larger than a register.
67// You'd see warnings like, "using AVX even though AVX is not enabled".
68// We stifle these warnings... our helpers that return U64 are always inlined.
69#if defined(__SSE__) && defined(__GNUC__) && !defined(__clang__)
70 #pragma GCC diagnostic ignored "-Wpsabi"
71#endif
72
73#if defined(__clang__)
74 #define FALLTHROUGH [[clang::fallthrough]]
75#else
76 #define FALLTHROUGH
77#endif
78
79// We tag most helper functions as SI, to enforce good code generation
80// but also work around what we think is a bug in GCC: when targeting 32-bit
81// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
82// MMX mm0 register, which seems to mess with unrelated code that later uses
83// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
84//
85// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
86#if defined(__clang__) || defined(__GNUC__)
87 #define SI static inline __attribute__((always_inline))
88#else
89 #define SI static inline
90#endif
91
92template <typename T, typename P>
93SI T load(const P* ptr) {
94 T val;
95 small_memcpy(&val, ptr, sizeof(val));
96 return val;
97}
98template <typename T, typename P>
99SI void store(P* ptr, const T& val) {
100 small_memcpy(ptr, &val, sizeof(val));
101}
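// Routing these through small_memcpy gives us unaligned, strict-aliasing-safe
// loads and stores; with a compile-time size the compiler typically lowers the
// copy to a single scalar or vector move.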
102
103// (T)v is a cast when N == 1 and a bit-pun when N>1,
104// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
105template <typename D, typename S>
106SI D cast(const S& v) {
107#if N == 1
108 return (D)v;
109#elif defined(__clang__)
110 return __builtin_convertvector(v, D);
111#else
112 D d;
113 for (int i = 0; i < N; i++) {
114 d[i] = v[i];
115 }
116 return d;
117#endif
118}
119
120template <typename D, typename S>
121SI D bit_pun(const S& v) {
122 static_assert(sizeof(D) == sizeof(v), "");
123 return load<D>(&v);
124}
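// So: cast<D>() converts values lane by lane (e.g. int32 -> float),
// while bit_pun<D>() reinterprets the same bits as an equally-sized type.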
125
126// When we convert from float to fixed point, it's very common to want to round,
127// and for some reason compilers generate better code when converting to int32_t.
128// To serve both those ends, we use this function to_fixed() instead of direct cast().
129#if defined(USING_NEON_FP16)
130 // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
131 SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
132#else
133 SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
134#endif
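// e.g. to_fixed(127.5f) == 128 and to_fixed(127.4f) == 127: adding 0.5f before
// the truncating cast rounds to nearest, with ties rounding up.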
135
136
137// Sometimes we do something crazy on one branch of a conditional,
138// like divide by zero or convert a huge float to an integer,
139// but then harmlessly select the other side. That trips up N==1
140// sanitizer builds, so we make if_then_else() a macro to avoid
141// evaluating the unused side.
142
143#if N == 1
144 #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
145#else
146 template <typename C, typename T>
147 SI T if_then_else(C cond, T t, T e) {
148 return bit_pun<T>( ( cond & bit_pun<C>(t)) |
149 (~cond & bit_pun<C>(e)) );
150 }
151#endif
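// For N > 1, vector comparisons like (x < y) yield lanes of all-0s or all-1s,
// so the bitwise select above picks t or e independently in each lane.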
152
153
154SI F F_from_Half(U16 half) {
155#if defined(USING_NEON_FP16)
156 return bit_pun<F>(half);
157#elif defined(USING_NEON_F16C)
158 return vcvt_f32_f16((float16x4_t)half);
159#elif defined(USING_AVX512F)
160 return (F)_mm512_cvtph_ps((__m256i)half);
161#elif defined(USING_AVX_F16C)
162 typedef int16_t __attribute__((vector_size(16))) I16;
163 return __builtin_ia32_vcvtph2ps256((I16)half);
164#else
165 U32 wide = cast<U32>(half);
166 // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
167 U32 s = wide & 0x8000,
168 em = wide ^ s;
169
170 // Constructing the float is easy if the half is not denormalized.
171 F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
172
173 // Simply flush all denorm half floats to zero.
174 return if_then_else(em < 0x0400, F0, norm);
175#endif
176}
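// Quick check of the portable path: half 0x3c00 (1.0) has s=0, em=0x3c00, so
// norm = bit_pun<F>((0x3c00<<13) + (112<<23)) = bit_pun<F>(0x3f800000) = 1.0f.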
177
178#if defined(__clang__)
179 // The -((127-15)<<10) underflows that side of the math when
180 // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
181 __attribute__((no_sanitize("unsigned-integer-overflow")))
182#endif
183SI U16 Half_from_F(F f) {
184#if defined(USING_NEON_FP16)
185 return bit_pun<U16>(f);
186#elif defined(USING_NEON_F16C)
187 return (U16)vcvt_f16_f32(f);
188#elif defined(USING_AVX512F)
189 return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
190#elif defined(USING_AVX_F16C)
191 return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
192#else
193 // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
194 U32 sem = bit_pun<U32>(f),
195 s = sem & 0x80000000,
196 em = sem ^ s;
197
198 // For simplicity we flush denorm half floats (including all denorm floats) to zero.
199 return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
200 , (s>>16) + (em>>13) - ((127-15)<<10)));
201#endif
202}
203
204// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
205#if defined(USING_NEON_FP16)
206 SI U16 swap_endian_16(U16 v) {
207 return (U16)vrev16q_u8((uint8x16_t) v);
208 }
209#elif defined(USING_NEON)
210 SI U16 swap_endian_16(U16 v) {
211 return (U16)vrev16_u8((uint8x8_t) v);
212 }
213#endif
214
215SI U64 swap_endian_16x4(const U64& rgba) {
216 return (rgba & 0x00ff00ff00ff00ff) << 8
217 | (rgba & 0xff00ff00ff00ff00) >> 8;
218}
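// e.g. swap_endian_16x4(0x1122334455667788) == 0x2211443366558877,
// swapping the two bytes within each of the four 16-bit lanes.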
219
220#if defined(USING_NEON_FP16)
221 SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
222 SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
223#elif defined(USING_NEON)
224 SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
225 SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
226#else
227 SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
228 SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
229#endif
230
231SI F floor_(F x) {
232#if N == 1
233 return floorf_(x);
234#elif defined(USING_NEON_FP16)
235 return vrndmq_f16(x);
236#elif defined(__aarch64__)
237 return vrndmq_f32(x);
238#elif defined(USING_AVX512F)
239 // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
240    // and integer sanitizer catches that this implicit cast changes the
241 // value from -1 to 65535. We'll cast manually to work around it.
242 // Read this as `return _mm512_floor_ps(x)`.
243 return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
244#elif defined(USING_AVX)
245 return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
246#elif defined(__SSE4_1__)
247 return _mm_floor_ps(x);
248#else
249 // Round trip through integers with a truncating cast.
250 F roundtrip = cast<F>(cast<I32>(x));
251 // If x is negative, truncating gives the ceiling instead of the floor.
252 return roundtrip - if_then_else(roundtrip > x, F1, F0);
253
254 // This implementation fails for values of x that are outside
255 // the range an integer can represent. We expect most x to be small.
256#endif
257}
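// Portable-path example: floor_(-1.5f) truncates to -1.0f, which is > -1.5f,
// so we subtract 1 to get -2.0f; floor_(+1.5f) truncates to 1.0f and stays put.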
258
259SI F approx_log2(F x) {
260#if defined(USING_NEON_FP16)
261 // TODO(mtklein)
262 return x;
263#else
264 // The first approximation of log2(x) is its exponent 'e', minus 127.
265 I32 bits = bit_pun<I32>(x);
266
267 F e = cast<F>(bits) * (1.0f / (1<<23));
268
269    // If we use the mantissa too we can refine the error significantly.
270 F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
271
272 return e - 124.225514990f
273 - 1.498030302f*m
274 - 1.725879990f/(0.3520887068f + m);
275#endif
276}
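// Sanity check: for x = 1.0f we get e = 127.0 and m = 0.5, and the three terms
// above cancel to roughly 0, as log2(1) should be.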
277
278SI F approx_log(F x) {
279 const float ln2 = 0.69314718f;
280 return ln2 * approx_log2(x);
281}
282
283SI F approx_exp2(F x) {
284#if defined(USING_NEON_FP16)
285 // TODO(mtklein)
286 return x;
287#else
288 F fract = x - floor_(x);
289
290 I32 bits = cast<I32>((1.0f * (1<<23)) * (x + 121.274057500f
291 - 1.490129070f*fract
292 + 27.728023300f/(4.84252568f - fract)));
293 return bit_pun<F>(bits);
294#endif
295}
296
297SI F approx_pow(F x, float y) {
298 return if_then_else((x == F0) | (x == F1), x
299 , approx_exp2(approx_log2(x) * y));
300}
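// Note x == 0 and x == 1 pass through exactly, so the endpoints of transfer
// function curves are not perturbed by the log2/exp2 approximations.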
301
302SI F approx_exp(F x) {
303 const float log2_e = 1.4426950408889634074f;
304 return approx_exp2(log2_e * x);
305}
306
307// Return tf(x).
308SI F apply_tf(const skcms_TransferFunction* tf, F x) {
309#if defined(USING_NEON_FP16)
310 // TODO(mtklein)
311 (void)tf;
312 return x;
313#else
314 // Peel off the sign bit and set x = |x|.
315 U32 bits = bit_pun<U32>(x),
316 sign = bits & 0x80000000;
317 x = bit_pun<F>(bits ^ sign);
318
319 // The transfer function has a linear part up to d, exponential at d and after.
320 F v = if_then_else(x < tf->d, tf->c*x + tf->f
321 , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
322
323 // Tack the sign bit back on.
324 return bit_pun<F>(sign | bit_pun<U32>(v));
325#endif
326}
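// For reference, the sRGB curve fits this form with roughly
// d=0.04045, c=1/12.92, f=0, a=1/1.055, b=0.055/1.055, g=2.4, e=0.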
327
328SI F apply_pq(const skcms_TransferFunction* tf, F x) {
329#if defined(USING_NEON_FP16)
330 // TODO(mtklein)
331 (void)tf;
332 return x;
333#else
334 U32 bits = bit_pun<U32>(x),
335 sign = bits & 0x80000000;
336 x = bit_pun<F>(bits ^ sign);
337
338 F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
339 / (tf->d + tf->e * approx_pow(x, tf->c)),
340 tf->f);
341
342 return bit_pun<F>(sign | bit_pun<U32>(v));
343#endif
344}
345
346SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
347#if defined(USING_NEON_FP16)
348 // TODO(mtklein)
349 (void)tf;
350 return x;
351#else
352 const float R = tf->a, G = tf->b,
353 a = tf->c, b = tf->d, c = tf->e;
354 U32 bits = bit_pun<U32>(x),
355 sign = bits & 0x80000000;
356 x = bit_pun<F>(bits ^ sign);
357
358 F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
359 , approx_exp((x-c)*a) + b);
360
361 return bit_pun<F>(sign | bit_pun<U32>(v));
362#endif
363}
364
365SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
366#if defined(USING_NEON_FP16)
367 // TODO(mtklein)
368 (void)tf;
369 return x;
370#else
371 const float R = tf->a, G = tf->b,
372 a = tf->c, b = tf->d, c = tf->e;
373 U32 bits = bit_pun<U32>(x),
374 sign = bits & 0x80000000;
375 x = bit_pun<F>(bits ^ sign);
376
377 F v = if_then_else(x <= 1, R * approx_pow(x, G)
378 , a * approx_log(x - b) + c);
379
380 return bit_pun<F>(sign | bit_pun<U32>(v));
381#endif
382}
383
384
385// Strided loads and stores of N values, starting from p.
386template <typename T, typename P>
387SI T load_3(const P* p) {
388#if N == 1
389 return (T)p[0];
390#elif N == 4
391 return T{p[ 0],p[ 3],p[ 6],p[ 9]};
392#elif N == 8
393 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
394#elif N == 16
395 return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
396 p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
397#endif
398}
399
400template <typename T, typename P>
401SI T load_4(const P* p) {
402#if N == 1
403 return (T)p[0];
404#elif N == 4
405 return T{p[ 0],p[ 4],p[ 8],p[12]};
406#elif N == 8
407 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
408#elif N == 16
409 return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
410 p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
411#endif
412}
413
414template <typename T, typename P>
415SI void store_3(P* p, const T& v) {
416#if N == 1
417 p[0] = v;
418#elif N == 4
419 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
420#elif N == 8
421 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
422 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
423#elif N == 16
424 p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
425 p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
426 p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
427 p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
428#endif
429}
430
431template <typename T, typename P>
432SI void store_4(P* p, const T& v) {
433#if N == 1
434 p[0] = v;
435#elif N == 4
436 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
437#elif N == 8
438 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
439 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
440#elif N == 16
441 p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
442 p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
443 p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
444 p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
445#endif
446}
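// These strided helpers back the portable paths for interleaved rgb/rgba formats;
// the NEON paths in exec_ops() below use vld3/vst3-style instructions instead.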
447
448
449SI U8 gather_8(const uint8_t* p, I32 ix) {
450#if N == 1
451 U8 v = p[ix];
452#elif N == 4
453 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
454#elif N == 8
455 U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
456 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
457#elif N == 16
458 U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
459 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
460 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
461 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
462#endif
463 return v;
464}
465
466SI U16 gather_16(const uint8_t* p, I32 ix) {
467 // Load the i'th 16-bit value from p.
468 auto load_16 = [p](int i) {
469 return load<uint16_t>(p + 2*i);
470 };
471#if N == 1
472 U16 v = load_16(ix);
473#elif N == 4
474 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
475#elif N == 8
476 U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
477 load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
478#elif N == 16
479 U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
480 load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
481 load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
482 load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
483#endif
484 return v;
485}
486
487SI U32 gather_32(const uint8_t* p, I32 ix) {
488 // Load the i'th 32-bit value from p.
489 auto load_32 = [p](int i) {
490 return load<uint32_t>(p + 4*i);
491 };
492#if N == 1
493 U32 v = load_32(ix);
494#elif N == 4
495 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
496#elif N == 8
497 U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
498 load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
499#elif N == 16
500 U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
501 load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
502 load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
503 load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
504#endif
505    // TODO: AVX2 and AVX-512 gathers (cf. gather_24).
506 return v;
507}
508
509SI U32 gather_24(const uint8_t* p, I32 ix) {
510 // First, back up a byte. Any place we're gathering from has a safe junk byte to read
511 // in front of it, either a previous table value, or some tag metadata.
512 p -= 1;
513
514 // Load the i'th 24-bit value from p, and 1 extra byte.
515 auto load_24_32 = [p](int i) {
516 return load<uint32_t>(p + 3*i);
517 };
518
519 // Now load multiples of 4 bytes (a junk byte, then r,g,b).
520#if N == 1
521 U32 v = load_24_32(ix);
522#elif N == 4
523 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
524#elif N == 8 && !defined(USING_AVX2)
525 U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
526 load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
527#elif N == 8
528 (void)load_24_32;
529 // The gather instruction here doesn't need any particular alignment,
530 // but the intrinsic takes a const int*.
531 const int* p4 = bit_pun<const int*>(p);
532 I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
533 mask = {-1,-1,-1,-1, -1,-1,-1,-1};
534 #if defined(__clang__)
535 U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
536 #elif defined(__GNUC__)
537 U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
538 #endif
539#elif N == 16
540 (void)load_24_32;
541 // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
542 // And AVX-512 swapped the order of arguments. :/
543 const int* p4 = bit_pun<const int*>(p);
544 U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
545#endif
546
547 // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
548 return v >> 8;
549}
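// On a little-endian target, e.g. lane ix == 5: load_24_32(5) reads original bytes
// 14..17 (p was backed up by one), and the >>8 drops byte 14, leaving entry 5's
// r,g,b in the low 24 bits.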
550
551#if !defined(__arm__)
552 SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
553 // As in gather_24(), with everything doubled.
554 p -= 2;
555
556 // Load the i'th 48-bit value from p, and 2 extra bytes.
557 auto load_48_64 = [p](int i) {
558 return load<uint64_t>(p + 6*i);
559 };
560
561 #if N == 1
562 *v = load_48_64(ix);
563 #elif N == 4
564 *v = U64{
565 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
566 };
567 #elif N == 8 && !defined(USING_AVX2)
568 *v = U64{
569 load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
570 load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
571 };
572 #elif N == 8
573 (void)load_48_64;
574 typedef int32_t __attribute__((vector_size(16))) Half_I32;
575 typedef long long __attribute__((vector_size(32))) Half_I64;
576
577 // The gather instruction here doesn't need any particular alignment,
578 // but the intrinsic takes a const long long*.
579 const long long int* p8 = bit_pun<const long long int*>(p);
580
581 Half_I64 zero = { 0, 0, 0, 0},
582 mask = {-1,-1,-1,-1};
583
584 ix *= 6;
585 Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
586 ix_hi = { ix[4], ix[5], ix[6], ix[7] };
587
588 #if defined(__clang__)
589 Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
590 hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
591 #elif defined(__GNUC__)
592 Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
593 hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
594 #endif
595 store((char*)v + 0, lo);
596 store((char*)v + 32, hi);
597 #elif N == 16
598 (void)load_48_64;
599 const long long int* p8 = bit_pun<const long long int*>(p);
600 __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
601 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
602 store((char*)v + 0, lo);
603 store((char*)v + 64, hi);
604 #endif
605
606 *v >>= 16;
607 }
608#endif
609
610SI F F_from_U8(U8 v) {
611 return cast<F>(v) * (1/255.0f);
612}
613
614SI F F_from_U16_BE(U16 v) {
615 // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
616 // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
617 U16 lo = (v >> 8),
618 hi = (v << 8) & 0xffff;
619 return cast<F>(lo|hi) * (1/65535.0f);
620}
621
622SI U16 U16_from_F(F v) {
623 // 65535 == inf in FP16, so promote to FP32 before converting.
624 return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
625}
626
627SI F minus_1_ulp(F v) {
628#if defined(USING_NEON_FP16)
629 return bit_pun<F>( bit_pun<U16>(v) - 1 );
630#else
631 return bit_pun<F>( bit_pun<U32>(v) - 1 );
632#endif
633}
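// Subtracting 1 from the bit pattern of a positive, finite float gives the next
// value toward zero; table() and clut() below use this so that when ix is exactly
// integral, hi truncates to the same index as lo rather than one past it.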
634
635SI F table(const skcms_Curve* curve, F v) {
636 // Clamp the input to [0,1], then scale to a table index.
637 F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
638
639 // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
640 I32 lo = cast<I32>( ix ),
641 hi = cast<I32>(minus_1_ulp(ix+1.0f));
642 F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
643
644 // TODO: can we load l and h simultaneously? Each entry in 'h' is either
645    // the same as in 'l' or adjacent. We have a rough idea that it'd always be safe
646 // to read adjacent entries and perhaps underflow the table by a byte or two
647 // (it'd be junk, but always safe to read). Not sure how to lerp yet.
648 F l,h;
649 if (curve->table_8) {
650 l = F_from_U8(gather_8(curve->table_8, lo));
651 h = F_from_U8(gather_8(curve->table_8, hi));
652 } else {
653 l = F_from_U16_BE(gather_16(curve->table_16, lo));
654 h = F_from_U16_BE(gather_16(curve->table_16, hi));
655 }
656 return l + (h-l)*t;
657}
658
659SI void sample_clut_8(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
660 U32 rgb = gather_24(a2b->grid_8, ix);
661
662 *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
663 *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
664 *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
665}
666
667SI void sample_clut_16(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
668#if defined(__arm__)
669 // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
670 *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
671 *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
672 *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
673#else
674 // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
675 U64 rgb;
676 gather_48(a2b->grid_16, ix, &rgb);
677 rgb = swap_endian_16x4(rgb);
678
679 *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
680 *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
681 *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
682#endif
683}
684
685// GCC 7.2.0 hits an internal compiler error with -finline-functions (or -O3)
686// when targeting MIPS 64, i386, or s390x, I think attempting to inline clut() into exec_ops().
687#if 1 && defined(__GNUC__) && !defined(__clang__) \
688 && (defined(__mips64) || defined(__i386) || defined(__s390x__))
689 #define MAYBE_NOINLINE __attribute__((noinline))
690#else
691 #define MAYBE_NOINLINE
692#endif
693
694MAYBE_NOINLINE
695static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
696 const int dim = (int)a2b->input_channels;
697 assert (0 < dim && dim <= 4);
698
699 // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
700 I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
701 F weight[8]; // Weight for each contribution, again first low, then high.
702
703 // O(dim) work first: calculate index,weight from r,g,b,a.
704 const F inputs[] = { *r,*g,*b,a };
705 for (int i = dim-1, stride = 1; i >= 0; i--) {
706 // x is where we logically want to sample the grid in the i-th dimension.
707 F x = inputs[i] * (float)(a2b->grid_points[i] - 1);
708
709 // But we can't index at floats. lo and hi are the two integer grid points surrounding x.
710 I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
711 hi = cast<I32>(minus_1_ulp(x+1.0f));
712 // Notice how we fold in the accumulated stride across previous dimensions here.
713 index[i+0] = lo * stride;
714 index[i+4] = hi * stride;
715 stride *= a2b->grid_points[i];
716
717 // We'll interpolate between those two integer grid points by t.
718 F t = x - cast<F>(lo); // i.e. fract(x)
719 weight[i+0] = 1-t;
720 weight[i+4] = t;
721 }
722
723 *r = *g = *b = F0;
724
725 // We'll sample 2^dim == 1<<dim table entries per pixel,
726 // in all combinations of low and high in each dimension.
727 for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
728
729 // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
730 // where 0 selects the low index contribution and its weight 1-t,
731 // or 4 the high index contribution and its weight t.
732
733 // Since 0<dim≤4, we can always just start off with the 0-th channel,
734 // then handle the others conditionally.
735 I32 ix = index [0 + (combo&1)*4];
736 F w = weight[0 + (combo&1)*4];
737
738 switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
739 case 3: ix += index [3 + (combo&8)/2];
740 w *= weight[3 + (combo&8)/2];
741 FALLTHROUGH;
742 // fall through
743
744 case 2: ix += index [2 + (combo&4)*1];
745 w *= weight[2 + (combo&4)*1];
746 FALLTHROUGH;
747 // fall through
748
749 case 1: ix += index [1 + (combo&2)*2];
750 w *= weight[1 + (combo&2)*2];
751 }
752
753 F R,G,B;
754 if (a2b->grid_8) {
755 sample_clut_8 (a2b,ix, &R,&G,&B);
756 } else {
757 sample_clut_16(a2b,ix, &R,&G,&B);
758 }
759
760 *r += w*R;
761 *g += w*G;
762 *b += w*B;
763 }
764}
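// For a 3-channel CLUT this is ordinary trilinear interpolation: 8 corner samples,
// each weighted by the product of (1-t) or t along each axis; dim == 4 extends the
// same scheme to 16 corners.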
765
766static void exec_ops(const Op* ops, const void** args,
767 const char* src, char* dst, int i) {
768 F r = F0, g = F0, b = F0, a = F1;
769 while (true) {
770 switch (*ops++) {
771 case Op_load_a8:{
772 a = F_from_U8(load<U8>(src + 1*i));
773 } break;
774
775 case Op_load_g8:{
776 r = g = b = F_from_U8(load<U8>(src + 1*i));
777 } break;
778
779 case Op_load_4444:{
780 U16 abgr = load<U16>(src + 2*i);
781
782 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
783 g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
784 b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
785 a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
786 } break;
787
788 case Op_load_565:{
789 U16 rgb = load<U16>(src + 2*i);
790
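                // Masking each field in place and scaling by 1/(max << shift)
                // normalizes it to [0,1] without a separate shift per channel.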
791 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
792 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
793 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
794 } break;
795
796 case Op_load_888:{
797 const uint8_t* rgb = (const uint8_t*)(src + 3*i);
798 #if defined(USING_NEON_FP16)
799 // See the explanation under USING_NEON below. This is that doubled up.
800 uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
801 v = vld3q_lane_u8(rgb+ 0, v, 0);
802 v = vld3q_lane_u8(rgb+ 3, v, 2);
803 v = vld3q_lane_u8(rgb+ 6, v, 4);
804 v = vld3q_lane_u8(rgb+ 9, v, 6);
805
806 v = vld3q_lane_u8(rgb+12, v, 8);
807 v = vld3q_lane_u8(rgb+15, v, 10);
808 v = vld3q_lane_u8(rgb+18, v, 12);
809 v = vld3q_lane_u8(rgb+21, v, 14);
810
811 r = cast<F>((U16)v.val[0]) * (1/255.0f);
812 g = cast<F>((U16)v.val[1]) * (1/255.0f);
813 b = cast<F>((U16)v.val[2]) * (1/255.0f);
814 #elif defined(USING_NEON)
815 // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
816 // a time. Since we're doing that, we might as well load them into 16-bit lanes.
817 // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
818 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
819 v = vld3_lane_u8(rgb+0, v, 0);
820 v = vld3_lane_u8(rgb+3, v, 2);
821 v = vld3_lane_u8(rgb+6, v, 4);
822 v = vld3_lane_u8(rgb+9, v, 6);
823
824 // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
825                // convert to F. (Again, U32 would be even better here if we drop ARMv7 or split
826 // ARMv7 and ARMv8 impls.)
827 r = cast<F>((U16)v.val[0]) * (1/255.0f);
828 g = cast<F>((U16)v.val[1]) * (1/255.0f);
829 b = cast<F>((U16)v.val[2]) * (1/255.0f);
830 #else
831 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
832 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
833 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
834 #endif
835 } break;
836
837 case Op_load_8888:{
838 U32 rgba = load<U32>(src + 4*i);
839
840 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
841 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
842 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
843 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
844 } break;
845
846 case Op_load_8888_palette8:{
847 const uint8_t* palette = (const uint8_t*) *args++;
848 I32 ix = cast<I32>(load<U8>(src + 1*i));
849 U32 rgba = gather_32(palette, ix);
850
851 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
852 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
853 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
854 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
855 } break;
856
857 case Op_load_1010102:{
858 U32 rgba = load<U32>(src + 4*i);
859
860 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
861 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
862 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
863 a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
864 } break;
865
866 case Op_load_161616LE:{
867 uintptr_t ptr = (uintptr_t)(src + 6*i);
868 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
869 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
870 #if defined(USING_NEON_FP16)
871 uint16x8x3_t v = vld3q_u16(rgb);
872 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
873 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
874 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
875 #elif defined(USING_NEON)
876 uint16x4x3_t v = vld3_u16(rgb);
877 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
878 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
879 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
880 #else
881 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
882 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
883 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
884 #endif
885 } break;
886
887 case Op_load_16161616LE:{
888 uintptr_t ptr = (uintptr_t)(src + 8*i);
889 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
890 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
891 #if defined(USING_NEON_FP16)
892 uint16x8x4_t v = vld4q_u16(rgba);
893 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
894 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
895 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
896 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
897 #elif defined(USING_NEON)
898 uint16x4x4_t v = vld4_u16(rgba);
899 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
900 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
901 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
902 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
903 #else
904 U64 px = load<U64>(rgba);
905
906 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
907 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
908 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
909 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
910 #endif
911 } break;
912
913 case Op_load_161616BE:{
914 uintptr_t ptr = (uintptr_t)(src + 6*i);
915 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
916 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
917 #if defined(USING_NEON_FP16)
918 uint16x8x3_t v = vld3q_u16(rgb);
919 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
920 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
921 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
922 #elif defined(USING_NEON)
923 uint16x4x3_t v = vld3_u16(rgb);
924 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
925 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
926 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
927 #else
928 U32 R = load_3<U32>(rgb+0),
929 G = load_3<U32>(rgb+1),
930 B = load_3<U32>(rgb+2);
931 // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
932 r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
933 g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
934 b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
935 #endif
936 } break;
937
938 case Op_load_16161616BE:{
939 uintptr_t ptr = (uintptr_t)(src + 8*i);
940 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
941 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
942 #if defined(USING_NEON_FP16)
943 uint16x8x4_t v = vld4q_u16(rgba);
944 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
945 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
946 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
947 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
948 #elif defined(USING_NEON)
949 uint16x4x4_t v = vld4_u16(rgba);
950 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
951 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
952 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
953 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
954 #else
955 U64 px = swap_endian_16x4(load<U64>(rgba));
956
957 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
958 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
959 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
960 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
961 #endif
962 } break;
963
964 case Op_load_hhh:{
965 uintptr_t ptr = (uintptr_t)(src + 6*i);
966 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
967 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
968 #if defined(USING_NEON_FP16)
969 uint16x8x3_t v = vld3q_u16(rgb);
970 U16 R = (U16)v.val[0],
971 G = (U16)v.val[1],
972 B = (U16)v.val[2];
973 #elif defined(USING_NEON)
974 uint16x4x3_t v = vld3_u16(rgb);
975 U16 R = (U16)v.val[0],
976 G = (U16)v.val[1],
977 B = (U16)v.val[2];
978 #else
979 U16 R = load_3<U16>(rgb+0),
980 G = load_3<U16>(rgb+1),
981 B = load_3<U16>(rgb+2);
982 #endif
983 r = F_from_Half(R);
984 g = F_from_Half(G);
985 b = F_from_Half(B);
986 } break;
987
988 case Op_load_hhhh:{
989 uintptr_t ptr = (uintptr_t)(src + 8*i);
990 assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
991 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
992 #if defined(USING_NEON_FP16)
993 uint16x8x4_t v = vld4q_u16(rgba);
994 U16 R = (U16)v.val[0],
995 G = (U16)v.val[1],
996 B = (U16)v.val[2],
997 A = (U16)v.val[3];
998 #elif defined(USING_NEON)
999 uint16x4x4_t v = vld4_u16(rgba);
1000 U16 R = (U16)v.val[0],
1001 G = (U16)v.val[1],
1002 B = (U16)v.val[2],
1003 A = (U16)v.val[3];
1004 #else
1005 U64 px = load<U64>(rgba);
1006 U16 R = cast<U16>((px >> 0) & 0xffff),
1007 G = cast<U16>((px >> 16) & 0xffff),
1008 B = cast<U16>((px >> 32) & 0xffff),
1009 A = cast<U16>((px >> 48) & 0xffff);
1010 #endif
1011 r = F_from_Half(R);
1012 g = F_from_Half(G);
1013 b = F_from_Half(B);
1014 a = F_from_Half(A);
1015 } break;
1016
1017 case Op_load_fff:{
1018 uintptr_t ptr = (uintptr_t)(src + 12*i);
1019 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1020 const float* rgb = (const float*)ptr; // cast to const float* to be safe.
1021 #if defined(USING_NEON_FP16)
1022 float32x4x3_t lo = vld3q_f32(rgb + 0),
1023 hi = vld3q_f32(rgb + 12);
1024 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1025 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1026 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1027 #elif defined(USING_NEON)
1028 float32x4x3_t v = vld3q_f32(rgb);
1029 r = (F)v.val[0];
1030 g = (F)v.val[1];
1031 b = (F)v.val[2];
1032 #else
1033 r = load_3<F>(rgb+0);
1034 g = load_3<F>(rgb+1);
1035 b = load_3<F>(rgb+2);
1036 #endif
1037 } break;
1038
1039 case Op_load_ffff:{
1040 uintptr_t ptr = (uintptr_t)(src + 16*i);
1041 assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1042 const float* rgba = (const float*)ptr; // cast to const float* to be safe.
1043 #if defined(USING_NEON_FP16)
1044 float32x4x4_t lo = vld4q_f32(rgba + 0),
1045 hi = vld4q_f32(rgba + 16);
1046 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1047 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1048 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1049 a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
1050 #elif defined(USING_NEON)
1051 float32x4x4_t v = vld4q_f32(rgba);
1052 r = (F)v.val[0];
1053 g = (F)v.val[1];
1054 b = (F)v.val[2];
1055 a = (F)v.val[3];
1056 #else
1057 r = load_4<F>(rgba+0);
1058 g = load_4<F>(rgba+1);
1059 b = load_4<F>(rgba+2);
1060 a = load_4<F>(rgba+3);
1061 #endif
1062 } break;
1063
1064 case Op_swap_rb:{
1065 F t = r;
1066 r = b;
1067 b = t;
1068 } break;
1069
1070 case Op_clamp:{
1071 r = max_(F0, min_(r, F1));
1072 g = max_(F0, min_(g, F1));
1073 b = max_(F0, min_(b, F1));
1074 a = max_(F0, min_(a, F1));
1075 } break;
1076
1077 case Op_invert:{
1078 r = F1 - r;
1079 g = F1 - g;
1080 b = F1 - b;
1081 a = F1 - a;
1082 } break;
1083
1084 case Op_force_opaque:{
1085 a = F1;
1086 } break;
1087
1088 case Op_premul:{
1089 r *= a;
1090 g *= a;
1091 b *= a;
1092 } break;
1093
1094 case Op_unpremul:{
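                // If a == 0, F1/a is +inf and fails the test below, so scale becomes 0
                // and the already-premultiplied-by-zero color stays 0 instead of going NaN.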
1095 F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
1096 r *= scale;
1097 g *= scale;
1098 b *= scale;
1099 } break;
1100
1101 case Op_matrix_3x3:{
1102 const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
1103 const float* m = &matrix->vals[0][0];
1104
1105 F R = m[0]*r + m[1]*g + m[2]*b,
1106 G = m[3]*r + m[4]*g + m[5]*b,
1107 B = m[6]*r + m[7]*g + m[8]*b;
1108
1109 r = R;
1110 g = G;
1111 b = B;
1112 } break;
1113
1114 case Op_matrix_3x4:{
1115 const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
1116 const float* m = &matrix->vals[0][0];
1117
1118 F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1119 G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1120 B = m[8]*r + m[9]*g + m[10]*b + m[11];
1121
1122 r = R;
1123 g = G;
1124 b = B;
1125 } break;
1126
1127 case Op_lab_to_xyz:{
1128 // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
1129 F L = r * 100.0f,
1130 A = g * 255.0f - 128.0f,
1131 B = b * 255.0f - 128.0f;
1132
1133 // Convert to CIE XYZ.
1134 F Y = (L + 16.0f) * (1/116.0f),
1135 X = Y + A*(1/500.0f),
1136 Z = Y - B*(1/200.0f);
1137
1138 X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1139 Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1140 Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1141
1142 // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1143 r = X * 0.9642f;
1144 g = Y ;
1145 b = Z * 0.8249f;
1146 } break;
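            // (The 0.008856 ~ (6/29)^3 and 1/7.787 ~ 3*(6/29)^2 constants above are the
            // standard CIELAB f-inverse pieces; 0.9642 and 0.8249 are D50's X and Z.)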
1147
1148 case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
1149 case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
1150 case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
1151 case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
1152
1153 case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
1154 case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
1155 case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
1156 case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
1157
1158 case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
1159 case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
1160 case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
1161 case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
1162
1163 case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
1164 case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
1165 case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
1166 case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
1167
1168 case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
1169 case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
1170 case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
1171 case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
1172
1173 case Op_clut: {
1174 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
1175 clut(a2b, &r,&g,&b,a);
1176
1177 if (a2b->input_channels == 4) {
1178 // CMYK is opaque.
1179 a = F1;
1180 }
1181 } break;
1182
1183 // Notice, from here on down the store_ ops all return, ending the loop.
1184
1185 case Op_store_a8: {
1186 store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1187 } return;
1188
1189 case Op_store_g8: {
1190 // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1191 store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1192 } return;
1193
1194 case Op_store_4444: {
1195 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1196 | cast<U16>(to_fixed(g * 15) << 8)
1197 | cast<U16>(to_fixed(b * 15) << 4)
1198 | cast<U16>(to_fixed(a * 15) << 0));
1199 } return;
1200
1201 case Op_store_565: {
1202 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
1203 | cast<U16>(to_fixed(g * 63) << 5 )
1204 | cast<U16>(to_fixed(b * 31) << 11 ));
1205 } return;
1206
1207 case Op_store_888: {
1208 uint8_t* rgb = (uint8_t*)dst + 3*i;
1209 #if defined(USING_NEON_FP16)
1210 // See the explanation under USING_NEON below. This is that doubled up.
1211 U16 R = to_fixed(r * 255),
1212 G = to_fixed(g * 255),
1213 B = to_fixed(b * 255);
1214
1215 uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
1216 vst3q_lane_u8(rgb+ 0, v, 0);
1217 vst3q_lane_u8(rgb+ 3, v, 2);
1218 vst3q_lane_u8(rgb+ 6, v, 4);
1219 vst3q_lane_u8(rgb+ 9, v, 6);
1220
1221 vst3q_lane_u8(rgb+12, v, 8);
1222 vst3q_lane_u8(rgb+15, v, 10);
1223 vst3q_lane_u8(rgb+18, v, 12);
1224 vst3q_lane_u8(rgb+21, v, 14);
1225 #elif defined(USING_NEON)
1226 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1227 // get there via U16 to save some instructions converting to float. And just
1228 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1229 U16 R = cast<U16>(to_fixed(r * 255)),
1230 G = cast<U16>(to_fixed(g * 255)),
1231 B = cast<U16>(to_fixed(b * 255));
1232
1233 uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1234 vst3_lane_u8(rgb+0, v, 0);
1235 vst3_lane_u8(rgb+3, v, 2);
1236 vst3_lane_u8(rgb+6, v, 4);
1237 vst3_lane_u8(rgb+9, v, 6);
1238 #else
1239 store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1240 store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1241 store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1242 #endif
1243 } return;
1244
1245 case Op_store_8888: {
1246 store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
1247 | cast<U32>(to_fixed(g * 255)) << 8
1248 | cast<U32>(to_fixed(b * 255)) << 16
1249 | cast<U32>(to_fixed(a * 255)) << 24);
1250 } return;
1251
1252 case Op_store_1010102: {
1253 store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
1254 | cast<U32>(to_fixed(g * 1023)) << 10
1255 | cast<U32>(to_fixed(b * 1023)) << 20
1256 | cast<U32>(to_fixed(a * 3)) << 30);
1257 } return;
1258
1259 case Op_store_161616LE: {
1260 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1261 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1262 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1263 #if defined(USING_NEON_FP16)
1264 uint16x8x3_t v = {{
1265 (uint16x8_t)U16_from_F(r),
1266 (uint16x8_t)U16_from_F(g),
1267 (uint16x8_t)U16_from_F(b),
1268 }};
1269 vst3q_u16(rgb, v);
1270 #elif defined(USING_NEON)
1271 uint16x4x3_t v = {{
1272 (uint16x4_t)U16_from_F(r),
1273 (uint16x4_t)U16_from_F(g),
1274 (uint16x4_t)U16_from_F(b),
1275 }};
1276 vst3_u16(rgb, v);
1277 #else
1278 store_3(rgb+0, U16_from_F(r));
1279 store_3(rgb+1, U16_from_F(g));
1280 store_3(rgb+2, U16_from_F(b));
1281 #endif
1282
1283 } return;
1284
1285 case Op_store_16161616LE: {
1286 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1287 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1288 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1289 #if defined(USING_NEON_FP16)
1290 uint16x8x4_t v = {{
1291 (uint16x8_t)U16_from_F(r),
1292 (uint16x8_t)U16_from_F(g),
1293 (uint16x8_t)U16_from_F(b),
1294 (uint16x8_t)U16_from_F(a),
1295 }};
1296 vst4q_u16(rgba, v);
1297 #elif defined(USING_NEON)
1298 uint16x4x4_t v = {{
1299 (uint16x4_t)U16_from_F(r),
1300 (uint16x4_t)U16_from_F(g),
1301 (uint16x4_t)U16_from_F(b),
1302 (uint16x4_t)U16_from_F(a),
1303 }};
1304 vst4_u16(rgba, v);
1305 #else
1306 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1307 | cast<U64>(to_fixed(g * 65535)) << 16
1308 | cast<U64>(to_fixed(b * 65535)) << 32
1309 | cast<U64>(to_fixed(a * 65535)) << 48;
1310 store(rgba, px);
1311 #endif
1312 } return;
1313
1314 case Op_store_161616BE: {
1315 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1316 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1317 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1318 #if defined(USING_NEON_FP16)
1319 uint16x8x3_t v = {{
1320 (uint16x8_t)swap_endian_16(U16_from_F(r)),
1321 (uint16x8_t)swap_endian_16(U16_from_F(g)),
1322 (uint16x8_t)swap_endian_16(U16_from_F(b)),
1323 }};
1324 vst3q_u16(rgb, v);
1325 #elif defined(USING_NEON)
1326 uint16x4x3_t v = {{
1327 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1328 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1329 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1330 }};
1331 vst3_u16(rgb, v);
1332 #else
1333 U32 R = to_fixed(r * 65535),
1334 G = to_fixed(g * 65535),
1335 B = to_fixed(b * 65535);
1336 store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
1337 store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
1338 store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
1339 #endif
1340
1341 } return;
1342
1343 case Op_store_16161616BE: {
1344 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1345 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1346 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1347 #if defined(USING_NEON_FP16)
1348 uint16x8x4_t v = {{
1349 (uint16x8_t)swap_endian_16(U16_from_F(r)),
1350 (uint16x8_t)swap_endian_16(U16_from_F(g)),
1351 (uint16x8_t)swap_endian_16(U16_from_F(b)),
1352 (uint16x8_t)swap_endian_16(U16_from_F(a)),
1353 }};
1354 vst4q_u16(rgba, v);
1355 #elif defined(USING_NEON)
1356 uint16x4x4_t v = {{
1357 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1358 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1359 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1360 (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
1361 }};
1362 vst4_u16(rgba, v);
1363 #else
1364 U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1365 | cast<U64>(to_fixed(g * 65535)) << 16
1366 | cast<U64>(to_fixed(b * 65535)) << 32
1367 | cast<U64>(to_fixed(a * 65535)) << 48;
1368 store(rgba, swap_endian_16x4(px));
1369 #endif
1370 } return;
1371
1372 case Op_store_hhh: {
1373 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1374 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1375 uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1376
1377 U16 R = Half_from_F(r),
1378 G = Half_from_F(g),
1379 B = Half_from_F(b);
1380 #if defined(USING_NEON_FP16)
1381 uint16x8x3_t v = {{
1382 (uint16x8_t)R,
1383 (uint16x8_t)G,
1384 (uint16x8_t)B,
1385 }};
1386 vst3q_u16(rgb, v);
1387 #elif defined(USING_NEON)
1388 uint16x4x3_t v = {{
1389 (uint16x4_t)R,
1390 (uint16x4_t)G,
1391 (uint16x4_t)B,
1392 }};
1393 vst3_u16(rgb, v);
1394 #else
1395 store_3(rgb+0, R);
1396 store_3(rgb+1, G);
1397 store_3(rgb+2, B);
1398 #endif
1399 } return;
1400
1401 case Op_store_hhhh: {
1402 uintptr_t ptr = (uintptr_t)(dst + 8*i);
1403 assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1404 uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1405
1406 U16 R = Half_from_F(r),
1407 G = Half_from_F(g),
1408 B = Half_from_F(b),
1409 A = Half_from_F(a);
1410 #if defined(USING_NEON_FP16)
1411 uint16x8x4_t v = {{
1412 (uint16x8_t)R,
1413 (uint16x8_t)G,
1414 (uint16x8_t)B,
1415 (uint16x8_t)A,
1416 }};
1417 vst4q_u16(rgba, v);
1418 #elif defined(USING_NEON)
1419 uint16x4x4_t v = {{
1420 (uint16x4_t)R,
1421 (uint16x4_t)G,
1422 (uint16x4_t)B,
1423 (uint16x4_t)A,
1424 }};
1425 vst4_u16(rgba, v);
1426 #else
1427 store(rgba, cast<U64>(R) << 0
1428 | cast<U64>(G) << 16
1429 | cast<U64>(B) << 32
1430 | cast<U64>(A) << 48);
1431 #endif
1432
1433 } return;
1434
1435 case Op_store_fff: {
1436 uintptr_t ptr = (uintptr_t)(dst + 12*i);
1437 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1438 float* rgb = (float*)ptr; // for this cast to float* to be safe.
1439 #if defined(USING_NEON_FP16)
1440 float32x4x3_t lo = {{
1441 vcvt_f32_f16(vget_low_f16(r)),
1442 vcvt_f32_f16(vget_low_f16(g)),
1443 vcvt_f32_f16(vget_low_f16(b)),
1444 }}, hi = {{
1445 vcvt_f32_f16(vget_high_f16(r)),
1446 vcvt_f32_f16(vget_high_f16(g)),
1447 vcvt_f32_f16(vget_high_f16(b)),
1448 }};
1449 vst3q_f32(rgb + 0, lo);
1450 vst3q_f32(rgb + 12, hi);
1451 #elif defined(USING_NEON)
1452 float32x4x3_t v = {{
1453 (float32x4_t)r,
1454 (float32x4_t)g,
1455 (float32x4_t)b,
1456 }};
1457 vst3q_f32(rgb, v);
1458 #else
1459 store_3(rgb+0, r);
1460 store_3(rgb+1, g);
1461 store_3(rgb+2, b);
1462 #endif
1463 } return;
1464
1465 case Op_store_ffff: {
1466 uintptr_t ptr = (uintptr_t)(dst + 16*i);
1467 assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1468 float* rgba = (float*)ptr; // for this cast to float* to be safe.
1469 #if defined(USING_NEON_FP16)
1470 float32x4x4_t lo = {{
1471 vcvt_f32_f16(vget_low_f16(r)),
1472 vcvt_f32_f16(vget_low_f16(g)),
1473 vcvt_f32_f16(vget_low_f16(b)),
1474 vcvt_f32_f16(vget_low_f16(a)),
1475 }}, hi = {{
1476 vcvt_f32_f16(vget_high_f16(r)),
1477 vcvt_f32_f16(vget_high_f16(g)),
1478 vcvt_f32_f16(vget_high_f16(b)),
1479 vcvt_f32_f16(vget_high_f16(a)),
1480 }};
1481 vst4q_f32(rgba + 0, lo);
1482 vst4q_f32(rgba + 16, hi);
1483 #elif defined(USING_NEON)
1484 float32x4x4_t v = {{
1485 (float32x4_t)r,
1486 (float32x4_t)g,
1487 (float32x4_t)b,
1488 (float32x4_t)a,
1489 }};
1490 vst4q_f32(rgba, v);
1491 #else
1492 store_4(rgba+0, r);
1493 store_4(rgba+1, g);
1494 store_4(rgba+2, b);
1495 store_4(rgba+3, a);
1496 #endif
1497 } return;
1498 }
1499 }
1500}
1501
1502
1503static void run_program(const Op* program, const void** arguments,
1504 const char* src, char* dst, int n,
1505 const size_t src_bpp, const size_t dst_bpp) {
1506 int i = 0;
1507 while (n >= N) {
1508 exec_ops(program, arguments, src, dst, i);
1509 i += N;
1510 n -= N;
1511 }
1512 if (n > 0) {
1513 char tmp[4*4*N] = {0};
1514
1515 memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
1516 exec_ops(program, arguments, tmp, tmp, 0);
1517 memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
1518 }
1519}
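// Any tail of n % N pixels is bounced through a zeroed stack buffer: tmp is sized
// 4*4*N bytes, enough for the widest supported pixel (4 channels of 4 bytes each).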
1520
1521// Clean up any #defines we may have set so that we can be #included again.
1522#if defined(USING_AVX)
1523 #undef USING_AVX
1524#endif
1525#if defined(USING_AVX_F16C)
1526 #undef USING_AVX_F16C
1527#endif
1528#if defined(USING_AVX2)
1529 #undef USING_AVX2
1530#endif
1531#if defined(USING_AVX512F)
1532 #undef USING_AVX512F
1533#endif
1534
1535#if defined(USING_NEON)
1536 #undef USING_NEON
1537#endif
1538#if defined(USING_NEON_F16C)
1539 #undef USING_NEON_F16C
1540#endif
1541#if defined(USING_NEON_FP16)
1542 #undef USING_NEON_FP16
1543#endif
1544
1545#undef FALLTHROUGH
1546