1 | /* |
2 | * Copyright 2018 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #ifndef SkRasterPipeline_opts_DEFINED |
9 | #define SkRasterPipeline_opts_DEFINED |
10 | |
11 | #include "include/core/SkData.h" |
12 | #include "include/core/SkTypes.h" |
13 | #include "src/core/SkUtils.h" // unaligned_{load,store} |
14 | |
15 | // Every function in this file should be marked static and inline using SI. |
16 | #if defined(__clang__) |
17 | #define SI __attribute__((always_inline)) static inline |
18 | #else |
19 | #define SI static inline |
20 | #endif |
21 | |
22 | template <typename Dst, typename Src> |
23 | SI Dst widen_cast(const Src& src) { |
24 | static_assert(sizeof(Dst) > sizeof(Src)); |
25 | static_assert(std::is_trivially_copyable<Dst>::value); |
26 | static_assert(std::is_trivially_copyable<Src>::value); |
27 | Dst dst; |
28 | memcpy(&dst, &src, sizeof(Src)); |
29 | return dst; |
30 | } |
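// For example, the SSE2/SSE4.1 paths below use widen_cast<__m128i>(v) to place a 64-bit
// U16 vector into the low 8 bytes of an XMM register (the upper bytes are left unspecified).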
31 | |
32 | // Our program is an array of void*, either |
33 | // - 1 void* per stage with no context pointer, the next stage; |
34 | // - 2 void* per stage with a context pointer, first the context pointer, then the next stage. |
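// For example (an illustrative layout; uc_ctx stands in for a real context pointer), a pipeline of
// seed_shader -> uniform_color -> srcover -> just_return would be laid out as
//     { &seed_shader, &uniform_color, uc_ctx, &srcover, &just_return }.
// seed_shader and srcover take no context, so each contributes only the next-stage pointer;
// uniform_color's context pointer comes first, followed by the pointer to srcover.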
35 | |
36 | // load_and_inc() steps the program forward by 1 void*, returning that pointer. |
37 | SI void* load_and_inc(void**& program) { |
38 | #if defined(__GNUC__) && defined(__x86_64__) |
39 | // If program is in %rsi (we try to make this likely) then this is a single instruction. |
40 | void* rax; |
41 | asm("lodsq" : "=a" (rax), "+S" (program)); // Write-only %rax, read-write %rsi. |
42 | return rax; |
43 | #else |
44 | // On ARM *program++ compiles into pretty ideal code without any handholding. |
45 | return *program++; |
46 | #endif |
47 | } |
48 | |
49 | // Lazily resolved on first cast. Does nothing if cast to Ctx::None. |
50 | struct Ctx { |
51 | struct None {}; |
52 | |
53 | void* ptr; |
54 | void**& program; |
55 | |
56 | explicit Ctx(void**& p) : ptr(nullptr), program(p) {} |
57 | |
58 | template <typename T> |
59 | operator T*() { |
60 | if (!ptr) { ptr = load_and_inc(program); } |
61 | return (T*)ptr; |
62 | } |
63 | operator None() { return None{}; } |
64 | }; |
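// For example, a stage declared later as STAGE(dither, const float* rate) converts its Ctx to
// const float*, consuming one extra void* from the program, while STAGE(seed_shader, Ctx::None)
// consumes nothing extra.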
65 | |
66 | |
67 | #if !defined(__clang__) |
68 | #define JUMPER_IS_SCALAR |
69 | #elif defined(SK_ARM_HAS_NEON) |
70 | #define JUMPER_IS_NEON |
71 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX |
72 | #define JUMPER_IS_SKX |
73 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
74 | #define JUMPER_IS_HSW |
75 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX |
76 | #define JUMPER_IS_AVX |
77 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
78 | #define JUMPER_IS_SSE41 |
79 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
80 | #define JUMPER_IS_SSE2 |
81 | #else |
82 | #define JUMPER_IS_SCALAR |
83 | #endif |
84 | |
85 | // Older Clangs seem to crash when generating non-optimized NEON code for ARMv7. |
86 | #if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32) |
87 | // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative. |
88 | #if defined(__apple_build_version__) && __clang_major__ < 9 |
89 | #define JUMPER_IS_SCALAR |
90 | #elif __clang_major__ < 5 |
91 | #define JUMPER_IS_SCALAR |
92 | #endif |
93 | |
94 | #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR) |
95 | #undef JUMPER_IS_NEON |
96 | #endif |
97 | #endif |
98 | |
99 | #if defined(JUMPER_IS_SCALAR) |
100 | #include <math.h> |
101 | #elif defined(JUMPER_IS_NEON) |
102 | #include <arm_neon.h> |
103 | #else |
104 | #include <immintrin.h> |
105 | #endif |
106 | |
107 | namespace SK_OPTS_NS { |
108 | |
109 | #if defined(JUMPER_IS_SCALAR) |
110 | // This path should lead to portable scalar code. |
111 | using F = float ; |
112 | using I32 = int32_t; |
113 | using U64 = uint64_t; |
114 | using U32 = uint32_t; |
115 | using U16 = uint16_t; |
116 | using U8 = uint8_t ; |
117 | |
118 | SI F mad(F f, F m, F a) { return f*m+a; } |
119 | SI F min(F a, F b) { return fminf(a,b); } |
120 | SI F max(F a, F b) { return fmaxf(a,b); } |
121 | SI F abs_ (F v) { return fabsf(v); } |
122 | SI F floor_(F v) { return floorf(v); } |
123 | SI F rcp (F v) { return 1.0f / v; } |
124 | SI F rsqrt (F v) { return 1.0f / sqrtf(v); } |
125 | SI F sqrt_(F v) { return sqrtf(v); } |
126 | SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); } |
127 | SI U16 pack(U32 v) { return (U16)v; } |
128 | SI U8 pack(U16 v) { return (U8)v; } |
129 | |
130 | SI F if_then_else(I32 c, F t, F e) { return c ? t : e; } |
131 | |
132 | template <typename T> |
133 | SI T gather(const T* p, U32 ix) { return p[ix]; } |
134 | |
135 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
136 | *r = ptr[0]; |
137 | *g = ptr[1]; |
138 | } |
139 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
140 | ptr[0] = r; |
141 | ptr[1] = g; |
142 | } |
143 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
144 | *r = ptr[0]; |
145 | *g = ptr[1]; |
146 | *b = ptr[2]; |
147 | } |
148 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
149 | *r = ptr[0]; |
150 | *g = ptr[1]; |
151 | *b = ptr[2]; |
152 | *a = ptr[3]; |
153 | } |
154 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
155 | ptr[0] = r; |
156 | ptr[1] = g; |
157 | ptr[2] = b; |
158 | ptr[3] = a; |
159 | } |
160 | |
161 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
162 | *r = ptr[0]; |
163 | *g = ptr[1]; |
164 | } |
165 | SI void store2(float* ptr, size_t tail, F r, F g) { |
166 | ptr[0] = r; |
167 | ptr[1] = g; |
168 | } |
169 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
170 | *r = ptr[0]; |
171 | *g = ptr[1]; |
172 | *b = ptr[2]; |
173 | *a = ptr[3]; |
174 | } |
175 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
176 | ptr[0] = r; |
177 | ptr[1] = g; |
178 | ptr[2] = b; |
179 | ptr[3] = a; |
180 | } |
181 | |
182 | #elif defined(JUMPER_IS_NEON) |
183 | // Since we know we're using Clang, we can use its vector extensions. |
184 | template <typename T> using V = T __attribute__((ext_vector_type(4))); |
185 | using F = V<float >; |
186 | using I32 = V< int32_t>; |
187 | using U64 = V<uint64_t>; |
188 | using U32 = V<uint32_t>; |
189 | using U16 = V<uint16_t>; |
190 | using U8 = V<uint8_t >; |
191 | |
192 | // We polyfill a few routines that Clang doesn't build into ext_vector_types. |
193 | SI F min(F a, F b) { return vminq_f32(a,b); } |
194 | SI F max(F a, F b) { return vmaxq_f32(a,b); } |
195 | SI F abs_ (F v) { return vabsq_f32(v); } |
196 | SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; } |
197 | SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; } |
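    // (vrecpeq/vrsqrteq produce low-precision estimates; each vrecpsq/vrsqrtsq multiply above is
    // one Newton-Raphson refinement step.)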
198 | SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); } |
199 | SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); } |
200 | |
201 | SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); } |
202 | |
203 | #if defined(SK_CPU_ARM64) |
204 | SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); } |
205 | SI F floor_(F v) { return vrndmq_f32(v); } |
206 | SI F sqrt_(F v) { return vsqrtq_f32(v); } |
207 | SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); } |
208 | #else |
209 | SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); } |
210 | SI F floor_(F v) { |
211 | F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); |
212 | return roundtrip - if_then_else(roundtrip > v, 1, 0); |
213 | } |
214 | |
215 | SI F sqrt_(F v) { |
216 | auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v). |
217 | e *= vrsqrtsq_f32(v,e*e); |
218 | e *= vrsqrtsq_f32(v,e*e); |
219 | return v*e; // sqrt(v) == v*rsqrt(v). |
220 | } |
221 | |
222 | SI U32 round(F v, F scale) { |
223 | return vcvtq_u32_f32(mad(v,scale,0.5f)); |
224 | } |
225 | #endif |
226 | |
227 | |
228 | template <typename T> |
229 | SI V<T> gather(const T* p, U32 ix) { |
230 | return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; |
231 | } |
232 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
233 | uint16x4x2_t rg; |
234 | if (__builtin_expect(tail,0)) { |
235 | if ( true ) { rg = vld2_lane_u16(ptr + 0, rg, 0); } |
236 | if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); } |
237 | if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); } |
238 | } else { |
239 | rg = vld2_u16(ptr); |
240 | } |
241 | *r = rg.val[0]; |
242 | *g = rg.val[1]; |
243 | } |
244 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
245 | if (__builtin_expect(tail,0)) { |
246 | if ( true ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); } |
247 | if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); } |
248 | if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); } |
249 | } else { |
250 | vst2_u16(ptr, (uint16x4x2_t{{r,g}})); |
251 | } |
252 | } |
253 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
254 | uint16x4x3_t rgb; |
255 | if (__builtin_expect(tail,0)) { |
256 | if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); } |
257 | if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); } |
258 | if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); } |
259 | } else { |
260 | rgb = vld3_u16(ptr); |
261 | } |
262 | *r = rgb.val[0]; |
263 | *g = rgb.val[1]; |
264 | *b = rgb.val[2]; |
265 | } |
266 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
267 | uint16x4x4_t rgba; |
268 | if (__builtin_expect(tail,0)) { |
269 | if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); } |
270 | if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); } |
271 | if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); } |
272 | } else { |
273 | rgba = vld4_u16(ptr); |
274 | } |
275 | *r = rgba.val[0]; |
276 | *g = rgba.val[1]; |
277 | *b = rgba.val[2]; |
278 | *a = rgba.val[3]; |
279 | } |
280 | |
281 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
282 | if (__builtin_expect(tail,0)) { |
283 | if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); } |
284 | if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); } |
285 | if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); } |
286 | } else { |
287 | vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}})); |
288 | } |
289 | } |
290 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
291 | float32x4x2_t rg; |
292 | if (__builtin_expect(tail,0)) { |
293 | if ( true ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); } |
294 | if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); } |
295 | if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); } |
296 | } else { |
297 | rg = vld2q_f32(ptr); |
298 | } |
299 | *r = rg.val[0]; |
300 | *g = rg.val[1]; |
301 | } |
302 | SI void store2(float* ptr, size_t tail, F r, F g) { |
303 | if (__builtin_expect(tail,0)) { |
304 | if ( true ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); } |
305 | if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); } |
306 | if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); } |
307 | } else { |
308 | vst2q_f32(ptr, (float32x4x2_t{{r,g}})); |
309 | } |
310 | } |
311 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
312 | float32x4x4_t rgba; |
313 | if (__builtin_expect(tail,0)) { |
314 | if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); } |
315 | if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); } |
316 | if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); } |
317 | } else { |
318 | rgba = vld4q_f32(ptr); |
319 | } |
320 | *r = rgba.val[0]; |
321 | *g = rgba.val[1]; |
322 | *b = rgba.val[2]; |
323 | *a = rgba.val[3]; |
324 | } |
325 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
326 | if (__builtin_expect(tail,0)) { |
327 | if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); } |
328 | if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); } |
329 | if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); } |
330 | } else { |
331 | vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}})); |
332 | } |
333 | } |
334 | |
335 | #elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
336 | // These are __m256 and __m256i, but friendlier and strongly-typed. |
337 | template <typename T> using V = T __attribute__((ext_vector_type(8))); |
338 | using F = V<float >; |
339 | using I32 = V< int32_t>; |
340 | using U64 = V<uint64_t>; |
341 | using U32 = V<uint32_t>; |
342 | using U16 = V<uint16_t>; |
343 | using U8 = V<uint8_t >; |
344 | |
345 | SI F mad(F f, F m, F a) { |
346 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
347 | return _mm256_fmadd_ps(f,m,a); |
348 | #else |
349 | return f*m+a; |
350 | #endif |
351 | } |
352 | |
353 | SI F min(F a, F b) { return _mm256_min_ps(a,b); } |
354 | SI F max(F a, F b) { return _mm256_max_ps(a,b); } |
355 | SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); } |
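    // (The AND trick above clears the sign bit, since v and 0-v can only disagree in the sign.)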
356 | SI F floor_(F v) { return _mm256_floor_ps(v); } |
357 | SI F rcp (F v) { return _mm256_rcp_ps (v); } |
358 | SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); } |
359 | SI F sqrt_(F v) { return _mm256_sqrt_ps (v); } |
360 | SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); } |
361 | |
362 | SI U16 pack(U32 v) { |
363 | return _mm_packus_epi32(_mm256_extractf128_si256(v, 0), |
364 | _mm256_extractf128_si256(v, 1)); |
365 | } |
366 | SI U8 pack(U16 v) { |
367 | auto r = _mm_packus_epi16(v,v); |
368 | return sk_unaligned_load<U8>(&r); |
369 | } |
370 | |
371 | SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } |
372 | |
373 | template <typename T> |
374 | SI V<T> gather(const T* p, U32 ix) { |
375 | return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]], |
376 | p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], }; |
377 | } |
378 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
379 | SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); } |
380 | SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); } |
381 | SI U64 gather(const uint64_t* p, U32 ix) { |
382 | __m256i parts[] = { |
383 | _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8), |
384 | _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8), |
385 | }; |
386 | return sk_bit_cast<U64>(parts); |
387 | } |
388 | #endif |
389 | |
390 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
391 | U16 _0123, _4567; |
392 | if (__builtin_expect(tail,0)) { |
393 | _0123 = _4567 = _mm_setzero_si128(); |
394 | auto* d = &_0123; |
395 | if (tail > 3) { |
396 | *d = _mm_loadu_si128(((__m128i*)ptr) + 0); |
397 | tail -= 4; |
398 | ptr += 8; |
399 | d = &_4567; |
400 | } |
401 | bool high = false; |
402 | if (tail > 1) { |
403 | *d = _mm_loadu_si64(ptr); |
404 | tail -= 2; |
405 | ptr += 4; |
406 | high = true; |
407 | } |
408 | if (tail > 0) { |
409 | (*d)[high ? 4 : 0] = *(ptr + 0); |
410 | (*d)[high ? 5 : 1] = *(ptr + 1); |
411 | } |
412 | } else { |
413 | _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0); |
414 | _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1); |
415 | } |
416 | *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16), |
417 | _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16)); |
418 | *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16), |
419 | _mm_srai_epi32(_4567, 16)); |
420 | } |
421 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
422 | auto _0123 = _mm_unpacklo_epi16(r, g), |
423 | _4567 = _mm_unpackhi_epi16(r, g); |
424 | if (__builtin_expect(tail,0)) { |
425 | const auto* s = &_0123; |
426 | if (tail > 3) { |
427 | _mm_storeu_si128((__m128i*)ptr, *s); |
428 | s = &_4567; |
429 | tail -= 4; |
430 | ptr += 8; |
431 | } |
432 | bool high = false; |
433 | if (tail > 1) { |
434 | _mm_storel_epi64((__m128i*)ptr, *s); |
435 | ptr += 4; |
436 | tail -= 2; |
437 | high = true; |
438 | } |
439 | if (tail > 0) { |
440 | if (high) { |
441 | *(int32_t*)ptr = _mm_extract_epi32(*s, 2); |
442 | } else { |
443 | *(int32_t*)ptr = _mm_cvtsi128_si32(*s); |
444 | } |
445 | } |
446 | } else { |
447 | _mm_storeu_si128((__m128i*)ptr + 0, _0123); |
448 | _mm_storeu_si128((__m128i*)ptr + 1, _4567); |
449 | } |
450 | } |
451 | |
452 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
453 | __m128i _0,_1,_2,_3,_4,_5,_6,_7; |
454 | if (__builtin_expect(tail,0)) { |
455 | auto load_rgb = [](const uint16_t* src) { |
456 | auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); |
457 | return _mm_insert_epi16(v, src[2], 2); |
458 | }; |
459 | _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128(); |
460 | if ( true ) { _0 = load_rgb(ptr + 0); } |
461 | if (tail > 1) { _1 = load_rgb(ptr + 3); } |
462 | if (tail > 2) { _2 = load_rgb(ptr + 6); } |
463 | if (tail > 3) { _3 = load_rgb(ptr + 9); } |
464 | if (tail > 4) { _4 = load_rgb(ptr + 12); } |
465 | if (tail > 5) { _5 = load_rgb(ptr + 15); } |
466 | if (tail > 6) { _6 = load_rgb(ptr + 18); } |
467 | } else { |
468 | // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over. |
469 | auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ; |
470 | auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ; |
471 | auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ; |
472 | auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4); |
473 | _0 = _01; _1 = _mm_srli_si128(_01, 6); |
474 | _2 = _23; _3 = _mm_srli_si128(_23, 6); |
475 | _4 = _45; _5 = _mm_srli_si128(_45, 6); |
476 | _6 = _67; _7 = _mm_srli_si128(_67, 6); |
477 | } |
478 | |
479 | auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx |
480 | _13 = _mm_unpacklo_epi16(_1, _3), |
481 | _46 = _mm_unpacklo_epi16(_4, _6), |
482 | _57 = _mm_unpacklo_epi16(_5, _7); |
483 | |
484 | auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
485 | bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx |
486 | rg4567 = _mm_unpacklo_epi16(_46, _57), |
487 | bx4567 = _mm_unpackhi_epi16(_46, _57); |
488 | |
489 | *r = _mm_unpacklo_epi64(rg0123, rg4567); |
490 | *g = _mm_unpackhi_epi64(rg0123, rg4567); |
491 | *b = _mm_unpacklo_epi64(bx0123, bx4567); |
492 | } |
493 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
494 | __m128i _01, _23, _45, _67; |
495 | if (__builtin_expect(tail,0)) { |
496 | auto src = (const double*)ptr; |
497 | _01 = _23 = _45 = _67 = _mm_setzero_si128(); |
498 | if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } |
499 | if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } |
500 | if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } |
501 | if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } |
502 | if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } |
503 | if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } |
504 | if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } |
505 | } else { |
506 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); |
507 | _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); |
508 | _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); |
509 | _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); |
510 | } |
511 | |
512 | auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 |
513 | _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 |
514 | _46 = _mm_unpacklo_epi16(_45, _67), |
515 | _57 = _mm_unpackhi_epi16(_45, _67); |
516 | |
517 | auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
518 | ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 |
519 | rg4567 = _mm_unpacklo_epi16(_46, _57), |
520 | ba4567 = _mm_unpackhi_epi16(_46, _57); |
521 | |
522 | *r = _mm_unpacklo_epi64(rg0123, rg4567); |
523 | *g = _mm_unpackhi_epi64(rg0123, rg4567); |
524 | *b = _mm_unpacklo_epi64(ba0123, ba4567); |
525 | *a = _mm_unpackhi_epi64(ba0123, ba4567); |
526 | } |
527 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
528 | auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3 |
529 | rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7 |
530 | ba0123 = _mm_unpacklo_epi16(b, a), |
531 | ba4567 = _mm_unpackhi_epi16(b, a); |
532 | |
533 | auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), |
534 | _23 = _mm_unpackhi_epi32(rg0123, ba0123), |
535 | _45 = _mm_unpacklo_epi32(rg4567, ba4567), |
536 | _67 = _mm_unpackhi_epi32(rg4567, ba4567); |
537 | |
538 | if (__builtin_expect(tail,0)) { |
539 | auto dst = (double*)ptr; |
540 | if (tail > 0) { _mm_storel_pd(dst+0, _01); } |
541 | if (tail > 1) { _mm_storeh_pd(dst+1, _01); } |
542 | if (tail > 2) { _mm_storel_pd(dst+2, _23); } |
543 | if (tail > 3) { _mm_storeh_pd(dst+3, _23); } |
544 | if (tail > 4) { _mm_storel_pd(dst+4, _45); } |
545 | if (tail > 5) { _mm_storeh_pd(dst+5, _45); } |
546 | if (tail > 6) { _mm_storel_pd(dst+6, _67); } |
547 | } else { |
548 | _mm_storeu_si128((__m128i*)ptr + 0, _01); |
549 | _mm_storeu_si128((__m128i*)ptr + 1, _23); |
550 | _mm_storeu_si128((__m128i*)ptr + 2, _45); |
551 | _mm_storeu_si128((__m128i*)ptr + 3, _67); |
552 | } |
553 | } |
554 | |
555 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
556 | F _0123, _4567; |
557 | if (__builtin_expect(tail, 0)) { |
558 | _0123 = _4567 = _mm256_setzero_ps(); |
559 | F* d = &_0123; |
560 | if (tail > 3) { |
561 | *d = _mm256_loadu_ps(ptr); |
562 | ptr += 8; |
563 | tail -= 4; |
564 | d = &_4567; |
565 | } |
566 | bool high = false; |
567 | if (tail > 1) { |
568 | *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr)); |
569 | ptr += 4; |
570 | tail -= 2; |
571 | high = true; |
572 | } |
573 | if (tail > 0) { |
574 | *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1) |
575 | : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0); |
576 | } |
577 | } else { |
578 | _0123 = _mm256_loadu_ps(ptr + 0); |
579 | _4567 = _mm256_loadu_ps(ptr + 8); |
580 | } |
581 | |
582 | F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20), |
583 | _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31); |
584 | |
585 | *r = _mm256_shuffle_ps(_0145, _2367, 0x88); |
586 | *g = _mm256_shuffle_ps(_0145, _2367, 0xDD); |
587 | } |
588 | SI void store2(float* ptr, size_t tail, F r, F g) { |
589 | F _0145 = _mm256_unpacklo_ps(r, g), |
590 | _2367 = _mm256_unpackhi_ps(r, g); |
591 | F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20), |
592 | _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31); |
593 | |
594 | if (__builtin_expect(tail, 0)) { |
595 | const __m256* s = &_0123; |
596 | if (tail > 3) { |
597 | _mm256_storeu_ps(ptr, *s); |
598 | s = &_4567; |
599 | tail -= 4; |
600 | ptr += 8; |
601 | } |
602 | bool high = false; |
603 | if (tail > 1) { |
604 | _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0)); |
605 | ptr += 4; |
606 | tail -= 2; |
607 | high = true; |
608 | } |
609 | if (tail > 0) { |
610 | *(ptr + 0) = (*s)[ high ? 4 : 0]; |
611 | *(ptr + 1) = (*s)[ high ? 5 : 1]; |
612 | } |
613 | } else { |
614 | _mm256_storeu_ps(ptr + 0, _0123); |
615 | _mm256_storeu_ps(ptr + 8, _4567); |
616 | } |
617 | } |
618 | |
619 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
620 | F _04, _15, _26, _37; |
621 | _04 = _15 = _26 = _37 = 0; |
622 | switch (tail) { |
623 | case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1); [[fallthrough]]; |
624 | case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1); [[fallthrough]]; |
625 | case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1); [[fallthrough]]; |
626 | case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1); [[fallthrough]]; |
627 | case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0); [[fallthrough]]; |
628 | case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0); [[fallthrough]]; |
629 | case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0); [[fallthrough]]; |
630 | case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0); |
631 | } |
632 | |
633 | F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5 |
634 | ba0145 = _mm256_unpackhi_ps(_04,_15), |
635 | rg2367 = _mm256_unpacklo_ps(_26,_37), |
636 | ba2367 = _mm256_unpackhi_ps(_26,_37); |
637 | |
638 | *r = _mm256_unpacklo_pd(rg0145, rg2367); |
639 | *g = _mm256_unpackhi_pd(rg0145, rg2367); |
640 | *b = _mm256_unpacklo_pd(ba0145, ba2367); |
641 | *a = _mm256_unpackhi_pd(ba0145, ba2367); |
642 | } |
643 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
644 | F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5 |
645 | rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ... |
646 | ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5 |
647 | ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ... |
648 | |
649 | F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4 |
650 | _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... |
651 | _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... |
652 | _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... |
653 | |
654 | if (__builtin_expect(tail, 0)) { |
655 | if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); } |
656 | if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); } |
657 | if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); } |
658 | if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); } |
659 | if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); } |
660 | if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); } |
661 | if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); } |
662 | } else { |
663 | F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo |
664 | _23 = _mm256_permute2f128_ps(_26, _37, 32), |
665 | _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi |
666 | _67 = _mm256_permute2f128_ps(_26, _37, 49); |
667 | _mm256_storeu_ps(ptr+ 0, _01); |
668 | _mm256_storeu_ps(ptr+ 8, _23); |
669 | _mm256_storeu_ps(ptr+16, _45); |
670 | _mm256_storeu_ps(ptr+24, _67); |
671 | } |
672 | } |
673 | |
674 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) |
675 | template <typename T> using V = T __attribute__((ext_vector_type(4))); |
676 | using F = V<float >; |
677 | using I32 = V< int32_t>; |
678 | using U64 = V<uint64_t>; |
679 | using U32 = V<uint32_t>; |
680 | using U16 = V<uint16_t>; |
681 | using U8 = V<uint8_t >; |
682 | |
683 | SI F mad(F f, F m, F a) { return f*m+a; } |
684 | SI F min(F a, F b) { return _mm_min_ps(a,b); } |
685 | SI F max(F a, F b) { return _mm_max_ps(a,b); } |
686 | SI F abs_(F v) { return _mm_and_ps(v, 0-v); } |
687 | SI F rcp (F v) { return _mm_rcp_ps (v); } |
688 | SI F rsqrt (F v) { return _mm_rsqrt_ps(v); } |
689 | SI F sqrt_(F v) { return _mm_sqrt_ps (v); } |
690 | SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); } |
691 | |
692 | SI U16 pack(U32 v) { |
693 | #if defined(JUMPER_IS_SSE41) |
694 | auto p = _mm_packus_epi32(v,v); |
695 | #else |
696 | // Sign extend so that _mm_packs_epi32() does the pack we want. |
697 | auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16); |
698 | p = _mm_packs_epi32(p,p); |
699 | #endif |
700 | return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one. |
701 | } |
702 | SI U8 pack(U16 v) { |
703 | auto r = widen_cast<__m128i>(v); |
704 | r = _mm_packus_epi16(r,r); |
705 | return sk_unaligned_load<U8>(&r); |
706 | } |
707 | |
708 | SI F if_then_else(I32 c, F t, F e) { |
709 | return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e)); |
710 | } |
711 | |
712 | SI F floor_(F v) { |
713 | #if defined(JUMPER_IS_SSE41) |
714 | return _mm_floor_ps(v); |
715 | #else |
716 | F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); |
717 | return roundtrip - if_then_else(roundtrip > v, 1, 0); |
718 | #endif |
719 | } |
720 | |
721 | template <typename T> |
722 | SI V<T> gather(const T* p, U32 ix) { |
723 | return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; |
724 | } |
725 | |
726 | // TODO: these loads and stores are incredibly difficult to follow. |
727 | |
728 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
729 | __m128i _01; |
730 | if (__builtin_expect(tail,0)) { |
731 | _01 = _mm_setzero_si128(); |
732 | if (tail > 1) { |
733 | _01 = _mm_loadl_pd(_01, (const double*)ptr); // r0 g0 r1 g1 00 00 00 00 |
734 | if (tail > 2) { |
735 | _01 = _mm_insert_epi16(_01, *(ptr+4), 4); // r0 g0 r1 g1 r2 00 00 00 |
736 | _01 = _mm_insert_epi16(_01, *(ptr+5), 5); // r0 g0 r1 g1 r2 g2 00 00 |
737 | } |
738 | } else { |
739 | _01 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); // r0 g0 00 00 00 00 00 00 |
740 | } |
741 | } else { |
742 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3 |
743 | } |
744 | auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3 |
745 | auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3 |
746 | |
747 | auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3 |
748 | auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3 |
749 | *r = sk_unaligned_load<U16>(&R); |
750 | *g = sk_unaligned_load<U16>(&G); |
751 | } |
752 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
753 | U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)); |
754 | if (__builtin_expect(tail, 0)) { |
755 | if (tail > 1) { |
756 | _mm_storel_epi64((__m128i*)ptr, rg); |
757 | if (tail > 2) { |
758 | int32_t rgpair = rg[2]; |
759 | memcpy(ptr + 4, &rgpair, sizeof(rgpair)); |
760 | } |
761 | } else { |
762 | int32_t rgpair = rg[0]; |
763 | memcpy(ptr, &rgpair, sizeof(rgpair)); |
764 | } |
765 | } else { |
766 | _mm_storeu_si128((__m128i*)ptr + 0, rg); |
767 | } |
768 | } |
769 | |
770 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
771 | __m128i _0, _1, _2, _3; |
772 | if (__builtin_expect(tail,0)) { |
773 | _1 = _2 = _3 = _mm_setzero_si128(); |
774 | auto load_rgb = [](const uint16_t* src) { |
775 | auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); |
776 | return _mm_insert_epi16(v, src[2], 2); |
777 | }; |
778 | if ( true ) { _0 = load_rgb(ptr + 0); } |
779 | if (tail > 1) { _1 = load_rgb(ptr + 3); } |
780 | if (tail > 2) { _2 = load_rgb(ptr + 6); } |
781 | } else { |
782 | // Load slightly weirdly to make sure we don't load past the end of 4x48 bits. |
783 | auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) , |
784 | _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4); |
785 | |
786 | // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored). |
787 | _0 = _01; |
788 | _1 = _mm_srli_si128(_01, 6); |
789 | _2 = _23; |
790 | _3 = _mm_srli_si128(_23, 6); |
791 | } |
792 | |
793 | // De-interlace to R,G,B. |
794 | auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx |
795 | _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx |
796 | |
797 | auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
798 | G = _mm_srli_si128(R, 8), |
799 | B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx |
800 | |
801 | *r = sk_unaligned_load<U16>(&R); |
802 | *g = sk_unaligned_load<U16>(&G); |
803 | *b = sk_unaligned_load<U16>(&B); |
804 | } |
805 | |
806 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
807 | __m128i _01, _23; |
808 | if (__builtin_expect(tail,0)) { |
809 | _01 = _23 = _mm_setzero_si128(); |
810 | auto src = (const double*)ptr; |
811 | if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00 |
812 | if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1 |
813 | if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00 |
814 | } else { |
815 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1 |
816 | _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3 |
817 | } |
818 | |
819 | auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 |
820 | _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 |
821 | |
822 | auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
823 | ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 |
824 | |
825 | *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0); |
826 | *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4); |
827 | *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0); |
828 | *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4); |
829 | } |
830 | |
831 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
832 | auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)), |
833 | ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a)); |
834 | |
835 | if (__builtin_expect(tail, 0)) { |
836 | auto dst = (double*)ptr; |
837 | if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); } |
838 | if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); } |
839 | if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); } |
840 | } else { |
841 | _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); |
842 | _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); |
843 | } |
844 | } |
845 | |
846 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
847 | F _01, _23; |
848 | if (__builtin_expect(tail, 0)) { |
849 | _01 = _23 = _mm_setzero_si128(); |
850 | if ( true ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); } |
851 | if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); } |
852 | if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); } |
853 | } else { |
854 | _01 = _mm_loadu_ps(ptr + 0); |
855 | _23 = _mm_loadu_ps(ptr + 4); |
856 | } |
857 | *r = _mm_shuffle_ps(_01, _23, 0x88); |
858 | *g = _mm_shuffle_ps(_01, _23, 0xDD); |
859 | } |
860 | SI void store2(float* ptr, size_t tail, F r, F g) { |
861 | F _01 = _mm_unpacklo_ps(r, g), |
862 | _23 = _mm_unpackhi_ps(r, g); |
863 | if (__builtin_expect(tail, 0)) { |
864 | if ( true ) { _mm_storel_pi((__m64*)(ptr + 0), _01); } |
865 | if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); } |
866 | if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); } |
867 | } else { |
868 | _mm_storeu_ps(ptr + 0, _01); |
869 | _mm_storeu_ps(ptr + 4, _23); |
870 | } |
871 | } |
872 | |
873 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
874 | F _0, _1, _2, _3; |
875 | if (__builtin_expect(tail, 0)) { |
876 | _1 = _2 = _3 = _mm_setzero_si128(); |
877 | if ( true ) { _0 = _mm_loadu_ps(ptr + 0); } |
878 | if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); } |
879 | if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); } |
880 | } else { |
881 | _0 = _mm_loadu_ps(ptr + 0); |
882 | _1 = _mm_loadu_ps(ptr + 4); |
883 | _2 = _mm_loadu_ps(ptr + 8); |
884 | _3 = _mm_loadu_ps(ptr +12); |
885 | } |
886 | _MM_TRANSPOSE4_PS(_0,_1,_2,_3); |
887 | *r = _0; |
888 | *g = _1; |
889 | *b = _2; |
890 | *a = _3; |
891 | } |
892 | |
893 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
894 | _MM_TRANSPOSE4_PS(r,g,b,a); |
895 | if (__builtin_expect(tail, 0)) { |
896 | if ( true ) { _mm_storeu_ps(ptr + 0, r); } |
897 | if (tail > 1) { _mm_storeu_ps(ptr + 4, g); } |
898 | if (tail > 2) { _mm_storeu_ps(ptr + 8, b); } |
899 | } else { |
900 | _mm_storeu_ps(ptr + 0, r); |
901 | _mm_storeu_ps(ptr + 4, g); |
902 | _mm_storeu_ps(ptr + 8, b); |
903 | _mm_storeu_ps(ptr +12, a); |
904 | } |
905 | } |
906 | #endif |
907 | |
908 | // We need to be careful with casts. |
909 | // (F)x means cast x to float in the portable path, but bit_cast x to float in the others. |
910 | // These named casts and bit_cast() are always what they seem to be. |
911 | #if defined(JUMPER_IS_SCALAR) |
912 | SI F cast (U32 v) { return (F)v; } |
913 | SI F cast64(U64 v) { return (F)v; } |
914 | SI U32 trunc_(F v) { return (U32)v; } |
915 | SI U32 expand(U16 v) { return (U32)v; } |
916 | SI U32 expand(U8 v) { return (U32)v; } |
917 | #else |
918 | SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); } |
919 | SI F cast64(U64 v) { return __builtin_convertvector( v, F); } |
920 | SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); } |
921 | SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); } |
922 | SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); } |
923 | #endif |
924 | |
925 | template <typename V> |
926 | SI V if_then_else(I32 c, V t, V e) { |
927 | return sk_bit_cast<V>(if_then_else(c, sk_bit_cast<F>(t), sk_bit_cast<F>(e))); |
928 | } |
929 | |
930 | SI U16 bswap(U16 x) { |
931 | #if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) |
932 | // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes |
933 | // when generating code for SSE2 and SSE4.1. We'll do it manually... |
934 | auto v = widen_cast<__m128i>(x); |
935 | v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8); |
936 | return sk_unaligned_load<U16>(&v); |
937 | #else |
938 | return (x<<8) | (x>>8); |
939 | #endif |
940 | } |
941 | |
942 | SI F fract(F v) { return v - floor_(v); } |
943 | |
944 | // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. |
945 | SI F approx_log2(F x) { |
946 | // e - 127 is a fair approximation of log2(x) in its own right... |
947 | F e = cast(sk_bit_cast<U32>(x)) * (1.0f / (1<<23)); |
948 | |
949 | // ... but using the mantissa to refine its error is _much_ better. |
950 | F m = sk_bit_cast<F>((sk_bit_cast<U32>(x) & 0x007fffff) | 0x3f000000); |
951 | return e |
952 | - 124.225514990f |
953 | - 1.498030302f * m |
954 | - 1.725879990f / (0.3520887068f + m); |
955 | } |
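// A quick sanity check (illustrative): for x == 1.0f, sk_bit_cast<U32>(x) == 0x3f800000, so
// e == 127.0 and m == 0.5, and the expression above works out to roughly 0, as log2(1) should.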
956 | |
957 | SI F approx_log(F x) { |
958 | const float ln2 = 0.69314718f; |
959 | return ln2 * approx_log2(x); |
960 | } |
961 | |
962 | SI F approx_pow2(F x) { |
963 | F f = fract(x); |
964 | return sk_bit_cast<F>(round(1.0f * (1<<23), |
965 | x + 121.274057500f |
966 | - 1.490129070f * f |
967 | + 27.728023300f / (4.84252568f - f))); |
968 | } |
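// Another sanity check (illustrative): for x == 0, f == 0 and the rounded value is about 127<<23,
// the bit pattern of 1.0f, so approx_pow2(0) comes out very close to 1.0f.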
969 | |
970 | SI F approx_exp(F x) { |
971 | const float log2_e = 1.4426950408889634074f; |
972 | return approx_pow2(log2_e * x); |
973 | } |
974 | |
975 | SI F approx_powf(F x, F y) { |
976 | return if_then_else((x == 0)|(x == 1), x |
977 | , approx_pow2(approx_log2(x) * y)); |
978 | } |
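// x == 0 and x == 1 are special-cased, presumably so those inputs map to themselves exactly;
// approx_log2() isn't meaningful at x == 0 anyway.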
979 | |
980 | SI F from_half(U16 h) { |
981 | #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ |
982 | && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. |
983 | return vcvt_f32_f16(h); |
984 | |
985 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
986 | return _mm256_cvtph_ps(h); |
987 | |
988 | #else |
989 | // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias. |
990 | U32 sem = expand(h), |
991 | s = sem & 0x8000, |
992 | em = sem ^ s; |
993 | |
994 | // Convert to 1-8-23 float with 127 bias, flushing denorm halves (including zero) to zero. |
995 | auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here. |
996 | return if_then_else(denorm, F(0) |
997 | , sk_bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) )); |
998 | #endif |
999 | } |
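// Worked example for the bit-twiddling path (illustrative): the half 1.0 is 0x3c00, so s == 0 and
// em == 0x3c00 (not denormal), and (em<<13) + ((127-15)<<23) == 0x07800000 + 0x38000000
// == 0x3f800000, which is exactly 1.0f.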
1000 | |
1001 | SI U16 to_half(F f) { |
1002 | #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ |
1003 | && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. |
1004 | return vcvt_f16_f32(f); |
1005 | |
1006 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
1007 | return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION); |
1008 | |
1009 | #else |
1010 | // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias. |
1011 | U32 sem = sk_bit_cast<U32>(f), |
1012 | s = sem & 0x80000000, |
1013 | em = sem ^ s; |
1014 | |
1015 | // Convert to 1-5-10 half with 15 bias, flushing denorm halves (including zero) to zero. |
1016 | auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here. |
1017 | return pack(if_then_else(denorm, U32(0) |
1018 | , (s>>16) + (em>>13) - ((127-15)<<10))); |
1019 | #endif |
1020 | } |
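// And the other direction (illustrative): 1.0f is 0x3f800000, so (em>>13) - ((127-15)<<10)
// == 0x0001fc00 - 0x0001c000 == 0x3c00, the half encoding of 1.0.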
1021 | |
1022 | // Our fundamental vector depth is our pixel stride. |
1023 | static const size_t N = sizeof(F) / sizeof(float); |
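// (That's 1 on the scalar path, 4 for NEON and SSE2/SSE4.1, and 8 for the AVX, HSW, and SKX paths.)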
1024 | |
1025 | // We're finally going to get to what a Stage function looks like! |
1026 | // tail == 0 ~~> work on a full N pixels |
1027 | // tail != 0 ~~> work on only the first tail pixels |
1028 | // tail is always < N. |
1029 | |
1030 | // Any custom ABI to use for all (non-externally-facing) stage functions? |
1031 | // Also decide here whether to use narrow (compromise) or wide (ideal) stages. |
1032 | #if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON) |
1033 | // This lets us pass vectors more efficiently on 32-bit ARM. |
1034 | // We can still only pass 16 floats, so best as 4x {r,g,b,a}. |
1035 | #define ABI __attribute__((pcs("aapcs-vfp"))) |
1036 | #define JUMPER_NARROW_STAGES 1 |
1037 | #elif 0 && defined(_MSC_VER) && defined(__clang__) && defined(__x86_64__) |
1038 | // SysV ABI makes it very sensible to use wide stages with clang-cl. |
1039 | // TODO: crashes during compilation :( |
1040 | #define ABI __attribute__((sysv_abi)) |
1041 | #define JUMPER_NARROW_STAGES 0 |
1042 | #elif defined(_MSC_VER) |
1043 | // Even if not vectorized, this lets us pass {r,g,b,a} as registers, |
1044 | // instead of {b,a} on the stack. Narrow stages work best for __vectorcall. |
1045 | #define ABI __vectorcall |
1046 | #define JUMPER_NARROW_STAGES 1 |
1047 | #elif defined(__x86_64__) || defined(SK_CPU_ARM64) |
1048 | // These platforms are ideal for wider stages, and their default ABI is ideal. |
1049 | #define ABI |
1050 | #define JUMPER_NARROW_STAGES 0 |
1051 | #else |
1052 | // 32-bit or unknown... shunt them down the narrow path. |
1053 | // Odds are these have few registers and are better off there. |
1054 | #define ABI |
1055 | #define JUMPER_NARROW_STAGES 1 |
1056 | #endif |
1057 | |
1058 | #if JUMPER_NARROW_STAGES |
1059 | struct Params { |
1060 | size_t dx, dy, tail; |
1061 | F dr,dg,db,da; |
1062 | }; |
1063 | using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a); |
1064 | #else |
1065 | // We keep program the second argument, so that it's passed in rsi for load_and_inc(). |
1066 | using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F); |
1067 | #endif |
1068 | |
1069 | |
1070 | static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) { |
1071 | auto start = (Stage)load_and_inc(program); |
1072 | const size_t x0 = dx; |
1073 | for (; dy < ylimit; dy++) { |
1074 | #if JUMPER_NARROW_STAGES |
1075 | Params params = { x0,dy,0, 0,0,0,0 }; |
1076 | while (params.dx + N <= xlimit) { |
1077 | start(¶ms,program, 0,0,0,0); |
1078 | params.dx += N; |
1079 | } |
1080 | if (size_t tail = xlimit - params.dx) { |
1081 | params.tail = tail; |
1082 | start(¶ms,program, 0,0,0,0); |
1083 | } |
1084 | #else |
1085 | dx = x0; |
1086 | while (dx + N <= xlimit) { |
1087 | start(0,program,dx,dy, 0,0,0,0, 0,0,0,0); |
1088 | dx += N; |
1089 | } |
1090 | if (size_t tail = xlimit - dx) { |
1091 | start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); |
1092 | } |
1093 | #endif |
1094 | } |
1095 | } |
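// For example (illustrative numbers), with N == 8, dx == 0, and xlimit == 20, each row runs two
// full 8-pixel batches and then one final batch with tail == 4.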
1096 | |
1097 | #if JUMPER_NARROW_STAGES |
1098 | #define STAGE(name, ...) \ |
1099 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1100 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ |
1101 | static void ABI name(Params* params, void** program, \ |
1102 | F r, F g, F b, F a) { \ |
1103 | name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \ |
1104 | params->dr, params->dg, params->db, params->da); \ |
1105 | auto next = (Stage)load_and_inc(program); \ |
1106 | next(params,program, r,g,b,a); \ |
1107 | } \ |
1108 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1109 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) |
1110 | #else |
1111 | #define STAGE(name, ...) \ |
1112 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1113 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ |
1114 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
1115 | F r, F g, F b, F a, F dr, F dg, F db, F da) { \ |
1116 | name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \ |
1117 | auto next = (Stage)load_and_inc(program); \ |
1118 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
1119 | } \ |
1120 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1121 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) |
1122 | #endif |
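// In other words, STAGE(name, ...) splits each stage in two: a thin ABI wrapper named `name` that
// unpacks its arguments, calls the body, and chains to the next stage, and the body itself,
// name##_k, whose definition follows the macro invocation.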
1123 | |
1124 | |
1125 | // just_return() is a simple no-op stage that only exists to end the chain, |
1126 | // returning back up to start_pipeline(), and from there to the caller. |
1127 | #if JUMPER_NARROW_STAGES |
1128 | static void ABI just_return(Params*, void**, F,F,F,F) {} |
1129 | #else |
1130 | static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {} |
1131 | #endif |
1132 | |
1133 | |
1134 | // We could start defining normal Stages now. But first, some helper functions. |
1135 | |
1136 | // These load() and store() methods are tail-aware, |
1137 | // but focus mainly on keeping the at-stride tail==0 case fast. |
1138 | |
1139 | template <typename V, typename T> |
1140 | SI V load(const T* src, size_t tail) { |
1141 | #if !defined(JUMPER_IS_SCALAR) |
1142 | __builtin_assume(tail < N); |
1143 | if (__builtin_expect(tail, 0)) { |
1144 | V v{}; // Any inactive lanes are zeroed. |
1145 | switch (tail) { |
1146 | case 7: v[6] = src[6]; [[fallthrough]]; |
1147 | case 6: v[5] = src[5]; [[fallthrough]]; |
1148 | case 5: v[4] = src[4]; [[fallthrough]]; |
1149 | case 4: memcpy(&v, src, 4*sizeof(T)); break; |
1150 | case 3: v[2] = src[2]; [[fallthrough]]; |
1151 | case 2: memcpy(&v, src, 2*sizeof(T)); break; |
1152 | case 1: memcpy(&v, src, 1*sizeof(T)); break; |
1153 | } |
1154 | return v; |
1155 | } |
1156 | #endif |
1157 | return sk_unaligned_load<V>(src); |
1158 | } |
1159 | |
1160 | template <typename V, typename T> |
1161 | SI void store(T* dst, V v, size_t tail) { |
1162 | #if !defined(JUMPER_IS_SCALAR) |
1163 | __builtin_assume(tail < N); |
1164 | if (__builtin_expect(tail, 0)) { |
1165 | switch (tail) { |
1166 | case 7: dst[6] = v[6]; [[fallthrough]]; |
1167 | case 6: dst[5] = v[5]; [[fallthrough]]; |
1168 | case 5: dst[4] = v[4]; [[fallthrough]]; |
1169 | case 4: memcpy(dst, &v, 4*sizeof(T)); break; |
1170 | case 3: dst[2] = v[2]; [[fallthrough]]; |
1171 | case 2: memcpy(dst, &v, 2*sizeof(T)); break; |
1172 | case 1: memcpy(dst, &v, 1*sizeof(T)); break; |
1173 | } |
1174 | return; |
1175 | } |
1176 | #endif |
1177 | sk_unaligned_store(dst, v); |
1178 | } |
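// Note the switch statements above serve both vector widths; cases 4 through 7 are only reachable
// when N == 8.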
1179 | |
1180 | SI F from_byte(U8 b) { |
1181 | return cast(expand(b)) * (1/255.0f); |
1182 | } |
1183 | SI F from_short(U16 s) { |
1184 | return cast(expand(s)) * (1/65535.0f); |
1185 | } |
1186 | SI void from_565(U16 _565, F* r, F* g, F* b) { |
1187 | U32 wide = expand(_565); |
1188 | *r = cast(wide & (31<<11)) * (1.0f / (31<<11)); |
1189 | *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5)); |
1190 | *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0)); |
1191 | } |
1192 | SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) { |
1193 | U32 wide = expand(_4444); |
1194 | *r = cast(wide & (15<<12)) * (1.0f / (15<<12)); |
1195 | *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8)); |
1196 | *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4)); |
1197 | *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0)); |
1198 | } |
1199 | SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) { |
1200 | *r = cast((_8888 ) & 0xff) * (1/255.0f); |
1201 | *g = cast((_8888 >> 8) & 0xff) * (1/255.0f); |
1202 | *b = cast((_8888 >> 16) & 0xff) * (1/255.0f); |
1203 | *a = cast((_8888 >> 24) ) * (1/255.0f); |
1204 | } |
1205 | SI void from_88(U16 _88, F* r, F* g) { |
1206 | U32 wide = expand(_88); |
1207 | *r = cast((wide ) & 0xff) * (1/255.0f); |
1208 | *g = cast((wide >> 8) & 0xff) * (1/255.0f); |
1209 | } |
1210 | SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) { |
1211 | *r = cast((rgba ) & 0x3ff) * (1/1023.0f); |
1212 | *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f); |
1213 | *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f); |
1214 | *a = cast((rgba >> 30) ) * (1/ 3.0f); |
1215 | } |
1216 | SI void from_1616(U32 _1616, F* r, F* g) { |
1217 | *r = cast((_1616 ) & 0xffff) * (1/65535.0f); |
1218 | *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f); |
1219 | } |
1220 | SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) { |
1221 | *r = cast64((_16161616 ) & 0xffff) * (1/65535.0f); |
1222 | *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f); |
1223 | *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f); |
1224 | *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f); |
1225 | } |
1226 | |
1227 | // Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory. |
1228 | template <typename T> |
1229 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { |
1230 | return (T*)ctx->pixels + dy*ctx->stride + dx; |
1231 | } |
1232 | |
1233 | // clamp v to [0,limit). |
1234 | SI F clamp(F v, F limit) { |
1235 | F inclusive = sk_bit_cast<F>( sk_bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive. |
1236 | return min(max(0, v), inclusive); |
1237 | } |
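// For example (illustrative), limit == 8.0f (0x41000000) becomes 7.9999995f (0x40ffffff), the
// largest float strictly less than 8.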
1238 | |
1239 | // Used by gather_ stages to calculate the base pointer and a vector of indices to load. |
1240 | template <typename T> |
1241 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { |
1242 | x = clamp(x, ctx->width); |
1243 | y = clamp(y, ctx->height); |
1244 | |
1245 | *ptr = (const T*)ctx->pixels; |
1246 | return trunc_(y)*ctx->stride + trunc_(x); |
1247 | } |
1248 | |
1249 | // We often have a nominally [0,1] float value we need to scale and convert to an integer, |
1250 | // whether for a table lookup or to pack back down into bytes for storage. |
1251 | // |
1252 | // In practice, especially when dealing with interesting color spaces, that notionally |
1253 | // [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp. |
1254 | // |
1255 | // You can adjust the expected input to [0,bias] by tweaking that parameter. |
1256 | SI U32 to_unorm(F v, F scale, F bias = 1.0f) { |
1257 | // TODO: platform-specific implementations of to_unorm(), removing round() entirely? |
1258 | // Any time we use round() we probably want to use to_unorm(). |
1259 | return round(min(max(0, v), bias), scale); |
1260 | } |
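// For example, a store stage packing 8-bit channels would call to_unorm(r, 255.0f) to get a
// clamped, rounded value in [0,255].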
1261 | |
1262 | SI I32 cond_to_mask(I32 cond) { return if_then_else(cond, I32(~0), I32(0)); } |
1263 | |
1264 | // Now finally, normal Stages! |
1265 | |
1266 | STAGE(seed_shader, Ctx::None) { |
1267 | static const float iota[] = { |
1268 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, |
1269 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, |
1270 | }; |
1271 | // It's important for speed to explicitly cast(dx) and cast(dy), |
1272 | // which has the effect of splatting them to vectors before converting to floats. |
1273 | // On Intel this breaks a data dependency on previous loop iterations' registers. |
1274 | r = cast(dx) + sk_unaligned_load<F>(iota); |
1275 | g = cast(dy) + 0.5f; |
1276 | b = 1.0f; |
1277 | a = 0; |
1278 | dr = dg = db = da = 0; |
1279 | } |
1280 | |
1281 | STAGE(dither, const float* rate) { |
1282 | // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors. |
1283 | uint32_t iota[] = {0,1,2,3,4,5,6,7}; |
1284 | U32 X = dx + sk_unaligned_load<U32>(iota), |
1285 | Y = dy; |
1286 | |
1287 | // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering. |
1288 | // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ]. |
1289 | |
1290 | // We only need X and X^Y from here on, so it's easier to just think of that as "Y". |
1291 | Y ^= X; |
1292 | |
1293 | // We'll mix the bottom 3 bits of each of X and Y to make 6 bits, |
1294 | // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda. |
1295 | U32 M = (Y & 1) << 5 | (X & 1) << 4 |
1296 | | (Y & 2) << 2 | (X & 2) << 1 |
1297 | | (Y & 4) >> 1 | (X & 4) >> 2; |
1298 | |
1299 | // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon. |
1300 | // We want to make sure our dither is less than 0.5 in either direction to keep exact values |
1301 | // like 0 and 1 unchanged after rounding. |
1302 | F dither = cast(M) * (2/128.0f) - (63/128.0f); |
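    // (For the top-left pixel of a tile, X == Y == 0 gives M == 0 and dither == -63/128, the most
    // negative value in the pattern.)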
1303 | |
1304 | r += *rate*dither; |
1305 | g += *rate*dither; |
1306 | b += *rate*dither; |
1307 | |
1308 | r = max(0, min(r, a)); |
1309 | g = max(0, min(g, a)); |
1310 | b = max(0, min(b, a)); |
1311 | } |
1312 | |
1313 | // load 4 floats from memory, and splat them into r,g,b,a |
1314 | STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
1315 | r = c->r; |
1316 | g = c->g; |
1317 | b = c->b; |
1318 | a = c->a; |
1319 | } |
1320 | STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
1321 | r = c->r; |
1322 | g = c->g; |
1323 | b = c->b; |
1324 | a = c->a; |
1325 | } |
1326 | // load 4 floats from memory, and splat them into dr,dg,db,da |
1327 | STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { |
1328 | dr = c->r; |
1329 | dg = c->g; |
1330 | db = c->b; |
1331 | da = c->a; |
1332 | } |
1333 | |
1334 | // splats opaque-black into r,g,b,a |
1335 | STAGE(black_color, Ctx::None) { |
1336 | r = g = b = 0.0f; |
1337 | a = 1.0f; |
1338 | } |
1339 | |
1340 | STAGE(white_color, Ctx::None) { |
1341 | r = g = b = a = 1.0f; |
1342 | } |
1343 | |
1344 | // load registers r,g,b,a from context (mirrors store_rgba) |
1345 | STAGE(load_src, const float* ptr) { |
1346 | r = sk_unaligned_load<F>(ptr + 0*N); |
1347 | g = sk_unaligned_load<F>(ptr + 1*N); |
1348 | b = sk_unaligned_load<F>(ptr + 2*N); |
1349 | a = sk_unaligned_load<F>(ptr + 3*N); |
1350 | } |
1351 | |
1352 | // store registers r,g,b,a into context (mirrors load_rgba) |
1353 | STAGE(store_src, float* ptr) { |
1354 | sk_unaligned_store(ptr + 0*N, r); |
1355 | sk_unaligned_store(ptr + 1*N, g); |
1356 | sk_unaligned_store(ptr + 2*N, b); |
1357 | sk_unaligned_store(ptr + 3*N, a); |
1358 | } |
1359 | STAGE(store_src_a, float* ptr) { |
1360 | sk_unaligned_store(ptr, a); |
1361 | } |
1362 | |
1363 | // load registers dr,dg,db,da from context (mirrors store_dst) |
1364 | STAGE(load_dst, const float* ptr) { |
1365 | dr = sk_unaligned_load<F>(ptr + 0*N); |
1366 | dg = sk_unaligned_load<F>(ptr + 1*N); |
1367 | db = sk_unaligned_load<F>(ptr + 2*N); |
1368 | da = sk_unaligned_load<F>(ptr + 3*N); |
1369 | } |
1370 | |
1371 | // store registers dr,dg,db,da into context (mirrors load_dst) |
1372 | STAGE(store_dst, float* ptr) { |
1373 | sk_unaligned_store(ptr + 0*N, dr); |
1374 | sk_unaligned_store(ptr + 1*N, dg); |
1375 | sk_unaligned_store(ptr + 2*N, db); |
1376 | sk_unaligned_store(ptr + 3*N, da); |
1377 | } |
1378 | |
1379 | // Most blend modes apply the same logic to each channel. |
1380 | #define BLEND_MODE(name) \ |
1381 | SI F name##_channel(F s, F d, F sa, F da); \ |
1382 | STAGE(name, Ctx::None) { \ |
1383 | r = name##_channel(r,dr,a,da); \ |
1384 | g = name##_channel(g,dg,a,da); \ |
1385 | b = name##_channel(b,db,a,da); \ |
1386 | a = name##_channel(a,da,a,da); \ |
1387 | } \ |
1388 | SI F name##_channel(F s, F d, F sa, F da) |
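// So, for example, BLEND_MODE(srcover) below defines both srcover_channel() and an srcover stage
// that applies it to r, g, b, and a.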
1389 | |
1390 | SI F inv(F x) { return 1.0f - x; } |
1391 | SI F two(F x) { return x + x; } |
1392 | |
1393 | |
1394 | BLEND_MODE(clear) { return 0; } |
1395 | BLEND_MODE(srcatop) { return s*da + d*inv(sa); } |
1396 | BLEND_MODE(dstatop) { return d*sa + s*inv(da); } |
1397 | BLEND_MODE(srcin) { return s * da; } |
1398 | BLEND_MODE(dstin) { return d * sa; } |
1399 | BLEND_MODE(srcout) { return s * inv(da); } |
1400 | BLEND_MODE(dstout) { return d * inv(sa); } |
1401 | BLEND_MODE(srcover) { return mad(d, inv(sa), s); } |
1402 | BLEND_MODE(dstover) { return mad(s, inv(da), d); } |
1403 | |
1404 | BLEND_MODE(modulate) { return s*d; } |
1405 | BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; } |
1406 | BLEND_MODE(plus_) { return min(s + d, 1.0f); } // We can clamp to either 1 or sa. |
1407 | BLEND_MODE(screen) { return s + d - s*d; } |
1408 | BLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); } |
1409 | #undef BLEND_MODE |
1410 | |
1411 | // Most other blend modes apply the same logic to colors, and srcover to alpha. |
1412 | #define BLEND_MODE(name) \ |
1413 | SI F name##_channel(F s, F d, F sa, F da); \ |
1414 | STAGE(name, Ctx::None) { \ |
1415 | r = name##_channel(r,dr,a,da); \ |
1416 | g = name##_channel(g,dg,a,da); \ |
1417 | b = name##_channel(b,db,a,da); \ |
1418 | a = mad(da, inv(a), a); \ |
1419 | } \ |
1420 | SI F name##_channel(F s, F d, F sa, F da) |
1421 | |
1422 | BLEND_MODE(darken) { return s + d - max(s*da, d*sa) ; } |
1423 | BLEND_MODE(lighten) { return s + d - min(s*da, d*sa) ; } |
1424 | BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); } |
1425 | BLEND_MODE(exclusion) { return s + d - two(s*d); } |
1426 | |
1427 | BLEND_MODE(colorburn) { |
1428 | return if_then_else(d == da, d + s*inv(da), |
1429 | if_then_else(s == 0, /* s + */ d*inv(sa), |
1430 | sa*(da - min(da, (da-d)*sa*rcp(s))) + s*inv(da) + d*inv(sa))); |
1431 | } |
1432 | BLEND_MODE(colordodge) { |
1433 | return if_then_else(d == 0, /* d + */ s*inv(da), |
1434 | if_then_else(s == sa, s + d*inv(sa), |
1435 | sa*min(da, (d*sa)*rcp(sa - s)) + s*inv(da) + d*inv(sa))); |
1436 | } |
1437 | BLEND_MODE(hardlight) { |
1438 | return s*inv(da) + d*inv(sa) |
1439 | + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s))); |
1440 | } |
1441 | BLEND_MODE(overlay) { |
1442 | return s*inv(da) + d*inv(sa) |
1443 | + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s))); |
1444 | } |
1445 | |
1446 | BLEND_MODE(softlight) { |
1447 | F m = if_then_else(da > 0, d / da, 0), |
1448 | s2 = two(s), |
1449 | m4 = two(two(m)); |
1450 | |
1451 | // The logic forks three ways: |
1452 | // 1. dark src? |
1453 | // 2. light src, dark dst? |
1454 | // 3. light src, light dst? |
1455 | F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)), // Used in case 1. |
1456 | darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m, // Used in case 2. |
1457 | liteDst = rcp(rsqrt(m)) - m, // Used in case 3. |
1458 | liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3? |
1459 | return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)? |
1460 | } |
1461 | #undef BLEND_MODE |
1462 | |
// We're basing our implementation of non-separable blend modes on
// https://www.w3.org/TR/compositing-1/#blendingnonseparable
// and
// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
// They're equivalent, but the ES spec's math has been simplified further.
1468 | // |
1469 | // Anything extra we add beyond that is to make the math work with premul inputs. |
1470 | |
1471 | SI F sat(F r, F g, F b) { return max(r, max(g,b)) - min(r, min(g,b)); } |
1472 | SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; } |
1473 | |
1474 | SI void set_sat(F* r, F* g, F* b, F s) { |
1475 | F mn = min(*r, min(*g,*b)), |
1476 | mx = max(*r, max(*g,*b)), |
1477 | sat = mx - mn; |
1478 | |
1479 | // Map min channel to 0, max channel to s, and scale the middle proportionally. |
1480 | auto scale = [=](F c) { |
1481 | return if_then_else(sat == 0, 0, (c - mn) * s / sat); |
1482 | }; |
1483 | *r = scale(*r); |
1484 | *g = scale(*g); |
1485 | *b = scale(*b); |
1486 | } |
1487 | SI void set_lum(F* r, F* g, F* b, F l) { |
1488 | F diff = l - lum(*r, *g, *b); |
1489 | *r += diff; |
1490 | *g += diff; |
1491 | *b += diff; |
1492 | } |
1493 | SI void clip_color(F* r, F* g, F* b, F a) { |
1494 | F mn = min(*r, min(*g, *b)), |
1495 | mx = max(*r, max(*g, *b)), |
1496 | l = lum(*r, *g, *b); |
1497 | |
1498 | auto clip = [=](F c) { |
1499 | c = if_then_else(mn >= 0, c, l + (c - l) * ( l) / (l - mn) ); |
1500 | c = if_then_else(mx > a, l + (c - l) * (a - l) / (mx - l), c); |
1501 | c = max(c, 0); // Sometimes without this we may dip just a little negative. |
1502 | return c; |
1503 | }; |
1504 | *r = clip(*r); |
1505 | *g = clip(*g); |
1506 | *b = clip(*b); |
1507 | } |
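
// The four non-separable stages below share one pattern: scale the premul src/dst channels into a
// common range, apply set_sat() and/or set_lum(), clip_color() against a*da, then composite the
// result as src*inv(da) + dst*inv(a) + blended term, finishing with srcover alpha (a + da - a*da).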
1508 | |
1509 | STAGE(hue, Ctx::None) { |
1510 | F R = r*a, |
1511 | G = g*a, |
1512 | B = b*a; |
1513 | |
1514 | set_sat(&R, &G, &B, sat(dr,dg,db)*a); |
1515 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); |
1516 | clip_color(&R,&G,&B, a*da); |
1517 | |
1518 | r = r*inv(da) + dr*inv(a) + R; |
1519 | g = g*inv(da) + dg*inv(a) + G; |
1520 | b = b*inv(da) + db*inv(a) + B; |
1521 | a = a + da - a*da; |
1522 | } |
1523 | STAGE(saturation, Ctx::None) { |
1524 | F R = dr*a, |
1525 | G = dg*a, |
1526 | B = db*a; |
1527 | |
1528 | set_sat(&R, &G, &B, sat( r, g, b)*da); |
1529 | set_lum(&R, &G, &B, lum(dr,dg,db)* a); // (This is not redundant.) |
1530 | clip_color(&R,&G,&B, a*da); |
1531 | |
1532 | r = r*inv(da) + dr*inv(a) + R; |
1533 | g = g*inv(da) + dg*inv(a) + G; |
1534 | b = b*inv(da) + db*inv(a) + B; |
1535 | a = a + da - a*da; |
1536 | } |
1537 | STAGE(color, Ctx::None) { |
1538 | F R = r*da, |
1539 | G = g*da, |
1540 | B = b*da; |
1541 | |
1542 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); |
1543 | clip_color(&R,&G,&B, a*da); |
1544 | |
1545 | r = r*inv(da) + dr*inv(a) + R; |
1546 | g = g*inv(da) + dg*inv(a) + G; |
1547 | b = b*inv(da) + db*inv(a) + B; |
1548 | a = a + da - a*da; |
1549 | } |
1550 | STAGE(luminosity, Ctx::None) { |
1551 | F R = dr*a, |
1552 | G = dg*a, |
1553 | B = db*a; |
1554 | |
1555 | set_lum(&R, &G, &B, lum(r,g,b)*da); |
1556 | clip_color(&R,&G,&B, a*da); |
1557 | |
1558 | r = r*inv(da) + dr*inv(a) + R; |
1559 | g = g*inv(da) + dg*inv(a) + G; |
1560 | b = b*inv(da) + db*inv(a) + B; |
1561 | a = a + da - a*da; |
1562 | } |
1563 | |
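// srcover_rgba_8888 fuses a dst load, srcover, and store into one stage, keeping the loaded dst
// values in [0,255] so they never need to be normalized and re-scaled.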
1564 | STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
1565 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
1566 | |
1567 | U32 dst = load<U32>(ptr, tail); |
1568 | dr = cast((dst ) & 0xff); |
1569 | dg = cast((dst >> 8) & 0xff); |
1570 | db = cast((dst >> 16) & 0xff); |
1571 | da = cast((dst >> 24) ); |
1572 | // {dr,dg,db,da} are in [0,255] |
1573 | // { r, g, b, a} are in [0, 1] (but may be out of gamut) |
1574 | |
1575 | r = mad(dr, inv(a), r*255.0f); |
1576 | g = mad(dg, inv(a), g*255.0f); |
1577 | b = mad(db, inv(a), b*255.0f); |
1578 | a = mad(da, inv(a), a*255.0f); |
1579 | // { r, g, b, a} are now in [0,255] (but may be out of gamut) |
1580 | |
1581 | // to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased. |
1582 | dst = to_unorm(r, 1, 255) |
1583 | | to_unorm(g, 1, 255) << 8 |
1584 | | to_unorm(b, 1, 255) << 16 |
1585 | | to_unorm(a, 1, 255) << 24; |
1586 | store(ptr, dst, tail); |
1587 | } |
1588 | |
1589 | STAGE(clamp_0, Ctx::None) { |
1590 | r = max(r, 0); |
1591 | g = max(g, 0); |
1592 | b = max(b, 0); |
1593 | a = max(a, 0); |
1594 | } |
1595 | |
1596 | STAGE(clamp_1, Ctx::None) { |
1597 | r = min(r, 1.0f); |
1598 | g = min(g, 1.0f); |
1599 | b = min(b, 1.0f); |
1600 | a = min(a, 1.0f); |
1601 | } |
1602 | |
1603 | STAGE(clamp_a, Ctx::None) { |
1604 | a = min(a, 1.0f); |
1605 | r = min(r, a); |
1606 | g = min(g, a); |
1607 | b = min(b, a); |
1608 | } |
1609 | |
1610 | STAGE(clamp_gamut, Ctx::None) { |
1611 | a = min(max(a, 0), 1.0f); |
1612 | r = min(max(r, 0), a); |
1613 | g = min(max(g, 0), a); |
1614 | b = min(max(b, 0), a); |
1615 | } |
1616 | |
1617 | STAGE(set_rgb, const float* rgb) { |
1618 | r = rgb[0]; |
1619 | g = rgb[1]; |
1620 | b = rgb[2]; |
1621 | } |
1622 | STAGE(unbounded_set_rgb, const float* rgb) { |
1623 | r = rgb[0]; |
1624 | g = rgb[1]; |
1625 | b = rgb[2]; |
1626 | } |
1627 | |
1628 | STAGE(swap_rb, Ctx::None) { |
1629 | auto tmp = r; |
1630 | r = b; |
1631 | b = tmp; |
1632 | } |
1633 | STAGE(swap_rb_dst, Ctx::None) { |
1634 | auto tmp = dr; |
1635 | dr = db; |
1636 | db = tmp; |
1637 | } |
1638 | |
1639 | STAGE(move_src_dst, Ctx::None) { |
1640 | dr = r; |
1641 | dg = g; |
1642 | db = b; |
1643 | da = a; |
1644 | } |
1645 | STAGE(move_dst_src, Ctx::None) { |
1646 | r = dr; |
1647 | g = dg; |
1648 | b = db; |
1649 | a = da; |
1650 | } |
1651 | |
1652 | STAGE(premul, Ctx::None) { |
1653 | r = r * a; |
1654 | g = g * a; |
1655 | b = b * a; |
1656 | } |
1657 | STAGE(premul_dst, Ctx::None) { |
1658 | dr = dr * da; |
1659 | dg = dg * da; |
1660 | db = db * da; |
1661 | } |
1662 | STAGE(unpremul, Ctx::None) { |
1663 | float inf = sk_bit_cast<float>(0x7f800000); |
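    // When a == 0, 1.0f/a is +inf, so the test below fails and we scale by 0 instead of producing garbage.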
1664 | auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0); |
1665 | r *= scale; |
1666 | g *= scale; |
1667 | b *= scale; |
1668 | } |
1669 | |
1670 | STAGE(force_opaque , Ctx::None) { a = 1; } |
1671 | STAGE(force_opaque_dst, Ctx::None) { da = 1; } |
1672 | |
1673 | // Clamp x to [0,1], both sides inclusive (think, gradients). |
1674 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. |
1675 | SI F clamp_01(F v) { return min(max(0, v), 1); } |
1676 | |
1677 | STAGE(rgb_to_hsl, Ctx::None) { |
1678 | F mx = max(r, max(g,b)), |
1679 | mn = min(r, min(g,b)), |
1680 | d = mx - mn, |
1681 | d_rcp = 1.0f / d; |
1682 | |
1683 | F h = (1/6.0f) * |
1684 | if_then_else(mx == mn, 0, |
1685 | if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0), |
1686 | if_then_else(mx == g, (b-r)*d_rcp + 2.0f, |
1687 | (r-g)*d_rcp + 4.0f))); |
1688 | |
1689 | F l = (mx + mn) * 0.5f; |
1690 | F s = if_then_else(mx == mn, 0, |
1691 | d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn)); |
1692 | |
1693 | r = h; |
1694 | g = s; |
1695 | b = l; |
1696 | } |
1697 | STAGE(hsl_to_rgb, Ctx::None) { |
1698 | // See GrRGBToHSLFilterEffect.fp |
1699 | |
1700 | F h = r, |
1701 | s = g, |
1702 | l = b, |
1703 | c = (1.0f - abs_(2.0f * l - 1)) * s; |
1704 | |
1705 | auto hue_to_rgb = [&](F hue) { |
1706 | F q = clamp_01(abs_(fract(hue) * 6.0f - 3.0f) - 1.0f); |
1707 | return (q - 0.5f) * c + l; |
1708 | }; |
1709 | |
1710 | r = hue_to_rgb(h + 0.0f/3.0f); |
1711 | g = hue_to_rgb(h + 2.0f/3.0f); |
1712 | b = hue_to_rgb(h + 1.0f/3.0f); |
1713 | } |
1714 | |
1715 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. |
1716 | SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) { |
1717 | return if_then_else(a < da, min(cr, min(cg,cb)) |
1718 | , max(cr, max(cg,cb))); |
1719 | } |
1720 | |
1721 | STAGE(scale_1_float, const float* c) { |
1722 | r = r * *c; |
1723 | g = g * *c; |
1724 | b = b * *c; |
1725 | a = a * *c; |
1726 | } |
1727 | STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
1728 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1729 | |
1730 | auto scales = load<U8>(ptr, tail); |
1731 | auto c = from_byte(scales); |
1732 | |
1733 | r = r * c; |
1734 | g = g * c; |
1735 | b = b * c; |
1736 | a = a * c; |
1737 | } |
1738 | STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1739 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1740 | |
1741 | F cr,cg,cb; |
1742 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); |
1743 | |
1744 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
1745 | |
1746 | r = r * cr; |
1747 | g = g * cg; |
1748 | b = b * cb; |
1749 | a = a * ca; |
1750 | } |
1751 | |
1752 | SI F lerp(F from, F to, F t) { |
1753 | return mad(to-from, t, from); |
1754 | } |
1755 | |
1756 | STAGE(lerp_1_float, const float* c) { |
1757 | r = lerp(dr, r, *c); |
1758 | g = lerp(dg, g, *c); |
1759 | b = lerp(db, b, *c); |
1760 | a = lerp(da, a, *c); |
1761 | } |
1762 | STAGE(scale_native, const float scales[]) { |
1763 | auto c = sk_unaligned_load<F>(scales); |
1764 | r = r * c; |
1765 | g = g * c; |
1766 | b = b * c; |
1767 | a = a * c; |
1768 | } |
1769 | STAGE(lerp_native, const float scales[]) { |
1770 | auto c = sk_unaligned_load<F>(scales); |
1771 | r = lerp(dr, r, c); |
1772 | g = lerp(dg, g, c); |
1773 | b = lerp(db, b, c); |
1774 | a = lerp(da, a, c); |
1775 | } |
1776 | STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
1777 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1778 | |
1779 | auto scales = load<U8>(ptr, tail); |
1780 | auto c = from_byte(scales); |
1781 | |
1782 | r = lerp(dr, r, c); |
1783 | g = lerp(dg, g, c); |
1784 | b = lerp(db, b, c); |
1785 | a = lerp(da, a, c); |
1786 | } |
1787 | STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1788 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1789 | |
1790 | F cr,cg,cb; |
1791 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); |
1792 | |
1793 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
1794 | |
1795 | r = lerp(dr, r, cr); |
1796 | g = lerp(dg, g, cg); |
1797 | b = lerp(db, b, cb); |
1798 | a = lerp(da, a, ca); |
1799 | } |
1800 | |
1801 | STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) { |
1802 | auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), |
1803 | aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy); |
1804 | |
1805 | F mul = from_byte(load<U8>(mptr, tail)), |
1806 | add = from_byte(load<U8>(aptr, tail)); |
1807 | |
1808 | r = mad(r, mul, add); |
1809 | g = mad(g, mul, add); |
1810 | b = mad(b, mul, add); |
1811 | } |
1812 | |
1813 | STAGE(byte_tables, const void* ctx) { // TODO: rename Tables SkRasterPipeline_ByteTablesCtx |
1814 | struct Tables { const uint8_t *r, *g, *b, *a; }; |
1815 | auto tables = (const Tables*)ctx; |
1816 | |
1817 | r = from_byte(gather(tables->r, to_unorm(r, 255))); |
1818 | g = from_byte(gather(tables->g, to_unorm(g, 255))); |
1819 | b = from_byte(gather(tables->b, to_unorm(b, 255))); |
1820 | a = from_byte(gather(tables->a, to_unorm(a, 255))); |
1821 | } |
1822 | |
1823 | SI F strip_sign(F x, U32* sign) { |
1824 | U32 bits = sk_bit_cast<U32>(x); |
1825 | *sign = bits & 0x80000000; |
1826 | return sk_bit_cast<F>(bits ^ *sign); |
1827 | } |
1828 | |
1829 | SI F apply_sign(F x, U32 sign) { |
1830 | return sk_bit_cast<F>(sign | sk_bit_cast<U32>(x)); |
1831 | } |
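
// Note: the transfer-function stages below operate on |v| and re-apply the sign afterwards, so
// negative inputs (e.g. extended-range colors) are mapped symmetrically about zero.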
1832 | |
1833 | STAGE(parametric, const skcms_TransferFunction* ctx) { |
1834 | auto fn = [&](F v) { |
1835 | U32 sign; |
1836 | v = strip_sign(v, &sign); |
1837 | |
1838 | F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f) |
1839 | , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e); |
1840 | return apply_sign(r, sign); |
1841 | }; |
1842 | r = fn(r); |
1843 | g = fn(g); |
1844 | b = fn(b); |
1845 | } |
1846 | |
1847 | STAGE(gamma_, const float* G) { |
1848 | auto fn = [&](F v) { |
1849 | U32 sign; |
1850 | v = strip_sign(v, &sign); |
1851 | return apply_sign(approx_powf(v, *G), sign); |
1852 | }; |
1853 | r = fn(r); |
1854 | g = fn(g); |
1855 | b = fn(b); |
1856 | } |
1857 | |
1858 | STAGE(PQish, const skcms_TransferFunction* ctx) { |
1859 | auto fn = [&](F v) { |
1860 | U32 sign; |
1861 | v = strip_sign(v, &sign); |
1862 | |
1863 | F r = approx_powf(max(mad(ctx->b, approx_powf(v, ctx->c), ctx->a), 0) |
1864 | / (mad(ctx->e, approx_powf(v, ctx->c), ctx->d)), |
1865 | ctx->f); |
1866 | |
1867 | return apply_sign(r, sign); |
1868 | }; |
1869 | r = fn(r); |
1870 | g = fn(g); |
1871 | b = fn(b); |
1872 | } |
1873 | |
1874 | STAGE(HLGish, const skcms_TransferFunction* ctx) { |
1875 | auto fn = [&](F v) { |
1876 | U32 sign; |
1877 | v = strip_sign(v, &sign); |
1878 | |
1879 | const float R = ctx->a, G = ctx->b, |
1880 | a = ctx->c, b = ctx->d, c = ctx->e; |
1881 | |
1882 | F r = if_then_else(v*R <= 1, approx_powf(v*R, G) |
1883 | , approx_exp((v-c)*a) + b); |
1884 | |
1885 | return apply_sign(r, sign); |
1886 | }; |
1887 | r = fn(r); |
1888 | g = fn(g); |
1889 | b = fn(b); |
1890 | } |
1891 | |
1892 | STAGE(HLGinvish, const skcms_TransferFunction* ctx) { |
1893 | auto fn = [&](F v) { |
1894 | U32 sign; |
1895 | v = strip_sign(v, &sign); |
1896 | |
1897 | const float R = ctx->a, G = ctx->b, |
1898 | a = ctx->c, b = ctx->d, c = ctx->e; |
1899 | |
1900 | F r = if_then_else(v <= 1, R * approx_powf(v, G) |
1901 | , a * approx_log(v - b) + c); |
1902 | |
1903 | return apply_sign(r, sign); |
1904 | }; |
1905 | r = fn(r); |
1906 | g = fn(g); |
1907 | b = fn(b); |
1908 | } |
1909 | |
1910 | STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
1911 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1912 | |
1913 | r = g = b = 0.0f; |
1914 | a = from_byte(load<U8>(ptr, tail)); |
1915 | } |
1916 | STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1917 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1918 | |
1919 | dr = dg = db = 0.0f; |
1920 | da = from_byte(load<U8>(ptr, tail)); |
1921 | } |
1922 | STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { |
1923 | const uint8_t* ptr; |
1924 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
1925 | r = g = b = 0.0f; |
1926 | a = from_byte(gather(ptr, ix)); |
1927 | } |
1928 | STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
1929 | auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy); |
1930 | |
1931 | U8 packed = pack(pack(to_unorm(a, 255))); |
1932 | store(ptr, packed, tail); |
1933 | } |
1934 | |
1935 | STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1936 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1937 | |
1938 | from_565(load<U16>(ptr, tail), &r,&g,&b); |
1939 | a = 1.0f; |
1940 | } |
1941 | STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1942 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1943 | |
1944 | from_565(load<U16>(ptr, tail), &dr,&dg,&db); |
1945 | da = 1.0f; |
1946 | } |
1947 | STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) { |
1948 | const uint16_t* ptr; |
1949 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
1950 | from_565(gather(ptr, ix), &r,&g,&b); |
1951 | a = 1.0f; |
1952 | } |
1953 | STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1954 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
1955 | |
1956 | U16 px = pack( to_unorm(r, 31) << 11 |
1957 | | to_unorm(g, 63) << 5 |
1958 | | to_unorm(b, 31) ); |
1959 | store(ptr, px, tail); |
1960 | } |
1961 | |
1962 | STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
1963 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1964 | from_4444(load<U16>(ptr, tail), &r,&g,&b,&a); |
1965 | } |
1966 | STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1967 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1968 | from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da); |
1969 | } |
1970 | STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { |
1971 | const uint16_t* ptr; |
1972 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
1973 | from_4444(gather(ptr, ix), &r,&g,&b,&a); |
1974 | } |
1975 | STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
1976 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
1977 | U16 px = pack( to_unorm(r, 15) << 12 |
1978 | | to_unorm(g, 15) << 8 |
1979 | | to_unorm(b, 15) << 4 |
1980 | | to_unorm(a, 15) ); |
1981 | store(ptr, px, tail); |
1982 | } |
1983 | |
1984 | STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
1985 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
1986 | from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); |
1987 | } |
1988 | STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1989 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
1990 | from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da); |
1991 | } |
1992 | STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { |
1993 | const uint32_t* ptr; |
1994 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
1995 | from_8888(gather(ptr, ix), &r,&g,&b,&a); |
1996 | } |
1997 | STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
1998 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
1999 | |
2000 | U32 px = to_unorm(r, 255) |
2001 | | to_unorm(g, 255) << 8 |
2002 | | to_unorm(b, 255) << 16 |
2003 | | to_unorm(a, 255) << 24; |
2004 | store(ptr, px, tail); |
2005 | } |
2006 | |
2007 | STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
2008 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2009 | from_88(load<U16>(ptr, tail), &r, &g); |
2010 | b = 0; |
2011 | a = 1; |
2012 | } |
2013 | STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2014 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2015 | from_88(load<U16>(ptr, tail), &dr, &dg); |
2016 | db = 0; |
2017 | da = 1; |
2018 | } |
2019 | STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { |
2020 | const uint16_t* ptr; |
2021 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2022 | from_88(gather(ptr, ix), &r, &g); |
2023 | b = 0; |
2024 | a = 1; |
2025 | } |
2026 | STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
2027 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy); |
2028 | U16 px = pack( to_unorm(r, 255) | to_unorm(g, 255) << 8 ); |
2029 | store(ptr, px, tail); |
2030 | } |
2031 | |
2032 | STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) { |
2033 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
2034 | r = g = b = 0; |
2035 | a = from_short(load<U16>(ptr, tail)); |
2036 | } |
2037 | STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2038 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2039 | dr = dg = db = 0.0f; |
2040 | da = from_short(load<U16>(ptr, tail)); |
2041 | } |
2042 | STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) { |
2043 | const uint16_t* ptr; |
2044 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2045 | r = g = b = 0.0f; |
2046 | a = from_short(gather(ptr, ix)); |
2047 | } |
2048 | STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) { |
2049 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
2050 | |
2051 | U16 px = pack(to_unorm(a, 65535)); |
2052 | store(ptr, px, tail); |
2053 | } |
2054 | |
2055 | STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { |
2056 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2057 | b = 0; a = 1; |
2058 | from_1616(load<U32>(ptr, tail), &r,&g); |
2059 | } |
2060 | STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2061 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2062 | from_1616(load<U32>(ptr, tail), &dr, &dg); |
2063 | db = 0; |
2064 | da = 1; |
2065 | } |
2066 | STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) { |
2067 | const uint32_t* ptr; |
2068 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2069 | from_1616(gather(ptr, ix), &r, &g); |
2070 | b = 0; |
2071 | a = 1; |
2072 | } |
2073 | STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { |
2074 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
2075 | |
2076 | U32 px = to_unorm(r, 65535) |
2077 | | to_unorm(g, 65535) << 16; |
2078 | store(ptr, px, tail); |
2079 | } |
2080 | |
2081 | STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) { |
2082 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); |
2083 | from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a); |
2084 | } |
2085 | STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2086 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); |
2087 | from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da); |
2088 | } |
2089 | STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) { |
2090 | const uint64_t* ptr; |
2091 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2092 | from_16161616(gather(ptr, ix), &r, &g, &b, &a); |
2093 | } |
2094 | STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) { |
2095 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy); |
2096 | |
2097 | U16 R = pack(to_unorm(r, 65535)), |
2098 | G = pack(to_unorm(g, 65535)), |
2099 | B = pack(to_unorm(b, 65535)), |
2100 | A = pack(to_unorm(a, 65535)); |
2101 | |
2102 | store4(ptr,tail, R,G,B,A); |
2103 | } |
2104 | |
2105 | |
2106 | STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) { |
2107 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
2108 | from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a); |
2109 | } |
2110 | STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2111 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
2112 | from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da); |
2113 | } |
2114 | STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) { |
2115 | const uint32_t* ptr; |
2116 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2117 | from_1010102(gather(ptr, ix), &r,&g,&b,&a); |
2118 | } |
2119 | STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) { |
2120 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
2121 | |
2122 | U32 px = to_unorm(r, 1023) |
2123 | | to_unorm(g, 1023) << 10 |
2124 | | to_unorm(b, 1023) << 20 |
2125 | | to_unorm(a, 3) << 30; |
2126 | store(ptr, px, tail); |
2127 | } |
2128 | |
2129 | STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) { |
2130 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); |
2131 | |
2132 | U16 R,G,B,A; |
2133 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); |
2134 | r = from_half(R); |
2135 | g = from_half(G); |
2136 | b = from_half(B); |
2137 | a = from_half(A); |
2138 | } |
2139 | STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2140 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); |
2141 | |
2142 | U16 R,G,B,A; |
2143 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); |
2144 | dr = from_half(R); |
2145 | dg = from_half(G); |
2146 | db = from_half(B); |
2147 | da = from_half(A); |
2148 | } |
2149 | STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) { |
2150 | const uint64_t* ptr; |
2151 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2152 | auto px = gather(ptr, ix); |
2153 | |
2154 | U16 R,G,B,A; |
2155 | load4((const uint16_t*)&px,0, &R,&G,&B,&A); |
2156 | r = from_half(R); |
2157 | g = from_half(G); |
2158 | b = from_half(B); |
2159 | a = from_half(A); |
2160 | } |
2161 | STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) { |
2162 | auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy); |
2163 | store4((uint16_t*)ptr,tail, to_half(r) |
2164 | , to_half(g) |
2165 | , to_half(b) |
2166 | , to_half(a)); |
2167 | } |
2168 | |
2169 | STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) { |
2170 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy); |
2171 | |
2172 | U16 R = bswap(pack(to_unorm(r, 65535))), |
2173 | G = bswap(pack(to_unorm(g, 65535))), |
2174 | B = bswap(pack(to_unorm(b, 65535))), |
2175 | A = bswap(pack(to_unorm(a, 65535))); |
2176 | |
2177 | store4(ptr,tail, R,G,B,A); |
2178 | } |
2179 | |
2180 | STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) { |
2181 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
2182 | |
2183 | U16 A = load<U16>((const uint16_t*)ptr, tail); |
2184 | r = 0; |
2185 | g = 0; |
2186 | b = 0; |
2187 | a = from_half(A); |
2188 | } |
2189 | STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2190 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2191 | |
2192 | U16 A = load<U16>((const uint16_t*)ptr, tail); |
2193 | dr = dg = db = 0.0f; |
2194 | da = from_half(A); |
2195 | } |
2196 | STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) { |
2197 | const uint16_t* ptr; |
2198 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2199 | r = g = b = 0.0f; |
2200 | a = from_half(gather(ptr, ix)); |
2201 | } |
2202 | STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) { |
2203 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
2204 | store(ptr, to_half(a), tail); |
2205 | } |
2206 | |
2207 | STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { |
2208 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2209 | |
2210 | U16 R,G; |
2211 | load2((const uint16_t*)ptr, tail, &R, &G); |
2212 | r = from_half(R); |
2213 | g = from_half(G); |
2214 | b = 0; |
2215 | a = 1; |
2216 | } |
2217 | STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2218 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2219 | |
2220 | U16 R,G; |
2221 | load2((const uint16_t*)ptr, tail, &R, &G); |
2222 | dr = from_half(R); |
2223 | dg = from_half(G); |
2224 | db = 0; |
2225 | da = 1; |
2226 | } |
2227 | STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) { |
2228 | const uint32_t* ptr; |
2229 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2230 | auto px = gather(ptr, ix); |
2231 | |
2232 | U16 R,G; |
2233 | load2((const uint16_t*)&px, 0, &R, &G); |
2234 | r = from_half(R); |
2235 | g = from_half(G); |
2236 | b = 0; |
2237 | a = 1; |
2238 | } |
2239 | STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { |
2240 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy); |
2241 | store2((uint16_t*)ptr, tail, to_half(r) |
2242 | , to_half(g)); |
2243 | } |
2244 | |
2245 | STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) { |
2246 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); |
2247 | load4(ptr,tail, &r,&g,&b,&a); |
2248 | } |
2249 | STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2250 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); |
2251 | load4(ptr,tail, &dr,&dg,&db,&da); |
2252 | } |
2253 | STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) { |
2254 | const float* ptr; |
2255 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2256 | r = gather(ptr, 4*ix + 0); |
2257 | g = gather(ptr, 4*ix + 1); |
2258 | b = gather(ptr, 4*ix + 2); |
2259 | a = gather(ptr, 4*ix + 3); |
2260 | } |
2261 | STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) { |
2262 | auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy); |
2263 | store4(ptr,tail, r,g,b,a); |
2264 | } |
2265 | |
2266 | STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { |
2267 | auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy); |
2268 | load2(ptr, tail, &r, &g); |
2269 | b = 0; |
2270 | a = 1; |
2271 | } |
2272 | STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { |
2273 | auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy); |
2274 | store2(ptr, tail, r, g); |
2275 | } |
2276 | |
2277 | SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) { |
2278 | return v - floor_(v*ctx->invScale)*ctx->scale; |
2279 | } |
2280 | SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) { |
2281 | auto limit = ctx->scale; |
2282 | auto invLimit = ctx->invScale; |
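    // Shift by limit, fold modulo 2*limit, then abs_(): a triangle wave that ping-pongs over [0,limit].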
2283 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
2284 | } |
2285 | // Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images). |
2286 | // The gather stages will hard clamp the output of these stages to [0,limit)... |
2287 | // we just need to do the basic repeat or mirroring. |
2288 | STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); } |
2289 | STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); } |
2290 | STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); } |
2291 | STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); } |
2292 | |
2293 | STAGE( clamp_x_1, Ctx::None) { r = clamp_01(r); } |
2294 | STAGE(repeat_x_1, Ctx::None) { r = clamp_01(r - floor_(r)); } |
2295 | STAGE(mirror_x_1, Ctx::None) { r = clamp_01(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); } |
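// mirror_x_1 is the same triangle-wave fold with limit == 1: e.g. r = 1.25 maps to 0.75 and
// r = 2.25 maps to 0.25, with clamp_01() catching non-finite inputs.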
2296 | |
2297 | // Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain: |
2298 | // mask == 0x00000000 if the coordinate(s) are out of bounds |
2299 | // mask == 0xFFFFFFFF if the coordinate(s) are in bounds |
2300 | // After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0 |
2301 | // if either of the coordinates were out of bounds. |
2302 | |
2303 | STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { |
2304 | auto w = ctx->limit_x; |
2305 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w))); |
2306 | } |
2307 | STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { |
2308 | auto h = ctx->limit_y; |
2309 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h))); |
2310 | } |
2311 | STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { |
2312 | auto w = ctx->limit_x; |
2313 | auto h = ctx->limit_y; |
2314 | sk_unaligned_store(ctx->mask, |
2315 | cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h))); |
2316 | } |
2317 | STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { |
2318 | auto mask = sk_unaligned_load<U32>(ctx->mask); |
2319 | r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask); |
2320 | g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask); |
2321 | b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask); |
2322 | a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask); |
2323 | } |
2324 | |
2325 | STAGE(alpha_to_gray, Ctx::None) { |
2326 | r = g = b = a; |
2327 | a = 1; |
2328 | } |
2329 | STAGE(alpha_to_gray_dst, Ctx::None) { |
2330 | dr = dg = db = da; |
2331 | da = 1; |
2332 | } |
2333 | STAGE(bt709_luminance_or_luma_to_alpha, Ctx::None) { |
2334 | a = r*0.2126f + g*0.7152f + b*0.0722f; |
2335 | r = g = b = 0; |
2336 | } |
2337 | |
2338 | STAGE(matrix_translate, const float* m) { |
2339 | r += m[0]; |
2340 | g += m[1]; |
2341 | } |
2342 | STAGE(matrix_scale_translate, const float* m) { |
2343 | r = mad(r,m[0], m[2]); |
2344 | g = mad(g,m[1], m[3]); |
2345 | } |
2346 | STAGE(matrix_2x3, const float* m) { |
2347 | auto R = mad(r,m[0], mad(g,m[2], m[4])), |
2348 | G = mad(r,m[1], mad(g,m[3], m[5])); |
2349 | r = R; |
2350 | g = G; |
2351 | } |
2352 | STAGE(matrix_3x3, const float* m) { |
2353 | auto R = mad(r,m[0], mad(g,m[3], b*m[6])), |
2354 | G = mad(r,m[1], mad(g,m[4], b*m[7])), |
2355 | B = mad(r,m[2], mad(g,m[5], b*m[8])); |
2356 | r = R; |
2357 | g = G; |
2358 | b = B; |
2359 | } |
2360 | STAGE(matrix_3x4, const float* m) { |
2361 | auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))), |
2362 | G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))), |
2363 | B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11]))); |
2364 | r = R; |
2365 | g = G; |
2366 | b = B; |
2367 | } |
2368 | STAGE(matrix_4x5, const float* m) { |
2369 | auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))), |
2370 | G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))), |
2371 | B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))), |
2372 | A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19])))); |
2373 | r = R; |
2374 | g = G; |
2375 | b = B; |
2376 | a = A; |
2377 | } |
2378 | STAGE(matrix_4x3, const float* m) { |
2379 | auto X = r, |
2380 | Y = g; |
2381 | |
2382 | r = mad(X, m[0], mad(Y, m[4], m[ 8])); |
2383 | g = mad(X, m[1], mad(Y, m[5], m[ 9])); |
2384 | b = mad(X, m[2], mad(Y, m[6], m[10])); |
2385 | a = mad(X, m[3], mad(Y, m[7], m[11])); |
2386 | } |
2387 | STAGE(matrix_perspective, const float* m) { |
2388 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. |
2389 | auto R = mad(r,m[0], mad(g,m[1], m[2])), |
2390 | G = mad(r,m[3], mad(g,m[4], m[5])), |
2391 | Z = mad(r,m[6], mad(g,m[7], m[8])); |
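    // i.e. [R G Z]^T = M [r g 1]^T, followed by the homogeneous divide (r,g) = (R/Z, G/Z),
    // with rcp() standing in for the divide.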
2392 | r = R * rcp(Z); |
2393 | g = G * rcp(Z); |
2394 | } |
2395 | |
2396 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, |
2397 | F* r, F* g, F* b, F* a) { |
2398 | F fr, br, fg, bg, fb, bb, fa, ba; |
2399 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
2400 | if (c->stopCount <=8) { |
2401 | fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx); |
2402 | br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx); |
2403 | fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx); |
2404 | bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx); |
2405 | fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx); |
2406 | bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx); |
2407 | fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx); |
2408 | ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx); |
2409 | } else |
2410 | #endif |
2411 | { |
2412 | fr = gather(c->fs[0], idx); |
2413 | br = gather(c->bs[0], idx); |
2414 | fg = gather(c->fs[1], idx); |
2415 | bg = gather(c->bs[1], idx); |
2416 | fb = gather(c->fs[2], idx); |
2417 | bb = gather(c->bs[2], idx); |
2418 | fa = gather(c->fs[3], idx); |
2419 | ba = gather(c->bs[3], idx); |
2420 | } |
2421 | |
2422 | *r = mad(t, fr, br); |
2423 | *g = mad(t, fg, bg); |
2424 | *b = mad(t, fb, bb); |
2425 | *a = mad(t, fa, ba); |
2426 | } |
2427 | |
2428 | STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { |
2429 | auto t = r; |
2430 | auto idx = trunc_(t * (c->stopCount-1)); |
2431 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
2432 | } |
2433 | |
2434 | STAGE(gradient, const SkRasterPipeline_GradientCtx* c) { |
2435 | auto t = r; |
2436 | U32 idx = 0; |
2437 | |
2438 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
2439 | for (size_t i = 1; i < c->stopCount; i++) { |
2440 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); |
2441 | } |
2442 | |
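    // idx now counts how many of ts[1..stopCount-1] t has passed, i.e. which piecewise-linear
    // segment t falls in; gradient_lookup() evaluates that segment as mad(t, f, b).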
2443 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
2444 | } |
2445 | |
2446 | STAGE(evenly_spaced_2_stop_gradient, const void* ctx) { |
2447 | // TODO: Rename Ctx SkRasterPipeline_EvenlySpaced2StopGradientCtx. |
2448 | struct Ctx { float f[4], b[4]; }; |
2449 | auto c = (const Ctx*)ctx; |
2450 | |
2451 | auto t = r; |
2452 | r = mad(t, c->f[0], c->b[0]); |
2453 | g = mad(t, c->f[1], c->b[1]); |
2454 | b = mad(t, c->f[2], c->b[2]); |
2455 | a = mad(t, c->f[3], c->b[3]); |
2456 | } |
2457 | |
2458 | STAGE(xy_to_unit_angle, Ctx::None) { |
2459 | F X = r, |
2460 | Y = g; |
2461 | F xabs = abs_(X), |
2462 | yabs = abs_(Y); |
2463 | |
2464 | F slope = min(xabs, yabs)/max(xabs, yabs); |
2465 | F s = slope * slope; |
2466 | |
2467 | // Use a 7th degree polynomial to approximate atan. |
2468 | // This was generated using sollya.gforge.inria.fr. |
2469 | // A float optimized polynomial was generated using the following command. |
2470 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); |
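    // The leading coefficient is ~1/(2*pi), so phi approximates atan(slope)/(2*pi) for slope in
    // [0,1]; the if_then_else chain below reflects it into the correct octant and quadrant.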
2471 | F phi = slope |
2472 | * (0.15912117063999176025390625f + s |
2473 | * (-5.185396969318389892578125e-2f + s |
2474 | * (2.476101927459239959716796875e-2f + s |
2475 | * (-7.0547382347285747528076171875e-3f)))); |
2476 | |
2477 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); |
2478 | phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi); |
2479 | phi = if_then_else(Y < 0.0f , 1.0f - phi , phi); |
2480 | phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. |
2481 | r = phi; |
2482 | } |
2483 | |
2484 | STAGE(xy_to_radius, Ctx::None) { |
2485 | F X2 = r * r, |
2486 | Y2 = g * g; |
2487 | r = sqrt_(X2 + Y2); |
2488 | } |
2489 | |
2490 | // Please see https://skia.org/dev/design/conical for how our 2pt conical shader works. |
2491 | |
2492 | STAGE(negate_x, Ctx::None) { r = -r; } |
2493 | |
2494 | STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2495 | F x = r, y = g, &t = r; |
2496 | t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0 |
2497 | } |
2498 | |
2499 | STAGE(xy_to_2pt_conical_focal_on_circle, Ctx::None) { |
2500 | F x = r, y = g, &t = r; |
2501 | t = x + y*y / x; // (x^2 + y^2) / x |
2502 | } |
2503 | |
2504 | STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2505 | F x = r, y = g, &t = r; |
2506 | t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
2507 | } |
2508 | |
2509 | STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2510 | F x = r, y = g, &t = r; |
2511 | t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
2512 | } |
2513 | |
2514 | STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2515 | F x = r, y = g, &t = r; |
2516 | t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
2517 | } |
2518 | |
2519 | STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2520 | F& t = r; |
2521 | t = t + ctx->fP1; // ctx->fP1 = f |
2522 | } |
2523 | |
2524 | STAGE(alter_2pt_conical_unswap, Ctx::None) { |
2525 | F& t = r; |
2526 | t = 1 - t; |
2527 | } |
2528 | |
2529 | STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) { |
2530 | F& t = r; |
2531 | auto is_degenerate = (t != t); // NaN |
2532 | t = if_then_else(is_degenerate, F(0), t); |
2533 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); |
2534 | } |
2535 | |
2536 | STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) { |
2537 | F& t = r; |
2538 | auto is_degenerate = (t <= 0) | (t != t); |
2539 | t = if_then_else(is_degenerate, F(0), t); |
2540 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); |
2541 | } |
2542 | |
2543 | STAGE(apply_vector_mask, const uint32_t* ctx) { |
2544 | const U32 mask = sk_unaligned_load<U32>(ctx); |
2545 | r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask); |
2546 | g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask); |
2547 | b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask); |
2548 | a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask); |
2549 | } |
2550 | |
2551 | STAGE(save_xy, SkRasterPipeline_SamplerCtx* c) { |
2552 | // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy). |
2553 | // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid |
2554 | // surrounding (x,y) at (0.5,0.5) off-center. |
2555 | F fx = fract(r + 0.5f), |
2556 | fy = fract(g + 0.5f); |
2557 | |
2558 | // Samplers will need to load x and fx, or y and fy. |
2559 | sk_unaligned_store(c->x, r); |
2560 | sk_unaligned_store(c->y, g); |
2561 | sk_unaligned_store(c->fx, fx); |
2562 | sk_unaligned_store(c->fy, fy); |
2563 | } |
2564 | |
2565 | STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) { |
2566 | // Bilinear and bicubic filters are both separable, so we produce independent contributions |
2567 | // from x and y, multiplying them together here to get each pixel's total scale factor. |
2568 | auto scale = sk_unaligned_load<F>(c->scalex) |
2569 | * sk_unaligned_load<F>(c->scaley); |
2570 | dr = mad(scale, r, dr); |
2571 | dg = mad(scale, g, dg); |
2572 | db = mad(scale, b, db); |
2573 | da = mad(scale, a, da); |
2574 | } |
2575 | |
2576 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
2577 | // are combined in direct proportion to their area overlapping that logical query pixel. |
2578 | // At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x. |
2579 | // The y-axis is symmetric. |
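// E.g. with fx = 0.25, the +x samples get weight 0.25 and the -x samples get 0.75 (likewise for y),
// so the four bilinear weights always sum to 1.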
2580 | |
2581 | template <int kScale> |
2582 | SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { |
2583 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
2584 | F fx = sk_unaligned_load<F>(ctx->fx); |
2585 | |
2586 | F scalex; |
2587 | if (kScale == -1) { scalex = 1.0f - fx; } |
2588 | if (kScale == +1) { scalex = fx; } |
2589 | sk_unaligned_store(ctx->scalex, scalex); |
2590 | } |
2591 | template <int kScale> |
2592 | SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { |
2593 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
2594 | F fy = sk_unaligned_load<F>(ctx->fy); |
2595 | |
2596 | F scaley; |
2597 | if (kScale == -1) { scaley = 1.0f - fy; } |
2598 | if (kScale == +1) { scaley = fy; } |
2599 | sk_unaligned_store(ctx->scaley, scaley); |
2600 | } |
2601 | |
2602 | STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); } |
2603 | STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); } |
2604 | STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); } |
2605 | STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); } |
2606 | |
2607 | |
// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
2609 | // pixel center are combined with a non-uniform cubic filter, with higher values near the center. |
2610 | // |
2611 | // We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets. |
2612 | // See GrCubicEffect for details of this particular filter. |
2613 | |
2614 | SI F bicubic_near(F t) { |
2615 | // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18 |
2616 | return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f)); |
2617 | } |
2618 | SI F bicubic_far(F t) { |
2619 | // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18) |
2620 | return (t*t)*mad((7/18.0f), t, (-6/18.0f)); |
2621 | } |
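
// Sanity check: for any fractional offset t, far(1-t) + near(1-t) + near(t) + far(t) == 1
// (e.g. at t == 0 the weights are 1/18, 16/18, 1/18, and 0).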
2622 | |
2623 | template <int kScale> |
2624 | SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { |
2625 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
2626 | F fx = sk_unaligned_load<F>(ctx->fx); |
2627 | |
2628 | F scalex; |
2629 | if (kScale == -3) { scalex = bicubic_far (1.0f - fx); } |
2630 | if (kScale == -1) { scalex = bicubic_near(1.0f - fx); } |
2631 | if (kScale == +1) { scalex = bicubic_near( fx); } |
2632 | if (kScale == +3) { scalex = bicubic_far ( fx); } |
2633 | sk_unaligned_store(ctx->scalex, scalex); |
2634 | } |
2635 | template <int kScale> |
2636 | SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { |
2637 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
2638 | F fy = sk_unaligned_load<F>(ctx->fy); |
2639 | |
2640 | F scaley; |
2641 | if (kScale == -3) { scaley = bicubic_far (1.0f - fy); } |
2642 | if (kScale == -1) { scaley = bicubic_near(1.0f - fy); } |
2643 | if (kScale == +1) { scaley = bicubic_near( fy); } |
2644 | if (kScale == +3) { scaley = bicubic_far ( fy); } |
2645 | sk_unaligned_store(ctx->scaley, scaley); |
2646 | } |
2647 | |
2648 | STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); } |
2649 | STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); } |
2650 | STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); } |
2651 | STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); } |
2652 | |
2653 | STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); } |
2654 | STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); } |
2655 | STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); } |
2656 | STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); } |
2657 | |
2658 | STAGE(callback, SkRasterPipeline_CallbackCtx* c) { |
2659 | store4(c->rgba,0, r,g,b,a); |
2660 | c->fn(c, tail ? tail : N); |
2661 | load4(c->read_from,0, &r,&g,&b,&a); |
2662 | } |
2663 | |
2664 | STAGE(gauss_a_to_rgba, Ctx::None) { |
    // Approximates exp(-(1-a)^2 * 4) - 0.018f with a quartic polynomial in a:
2669 | const float c4 = -2.26661229133605957031f; |
2670 | const float c3 = 2.89795351028442382812f; |
2671 | const float c2 = 0.21345567703247070312f; |
2672 | const float c1 = 0.15489584207534790039f; |
2673 | const float c0 = 0.00030726194381713867f; |
2674 | a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0); |
2675 | r = a; |
2676 | g = a; |
2677 | b = a; |
2678 | } |
2679 | |
2680 | SI F tile(F v, SkTileMode mode, float limit, float invLimit) { |
2681 | // The ix_and_ptr() calls in sample() will clamp tile()'s output, so no need to clamp here. |
2682 | switch (mode) { |
2683 | case SkTileMode::kDecal: // TODO, for now fallthrough to clamp |
2684 | case SkTileMode::kClamp: return v; |
2685 | case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; |
2686 | case SkTileMode::kMirror: |
2687 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
2688 | } |
2689 | SkUNREACHABLE; |
2690 | } |
2691 | |
2692 | SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, |
2693 | F* r, F* g, F* b, F* a) { |
2694 | x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); |
2695 | y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); |
2696 | |
2697 | switch (ctx->ct) { |
2698 | default: *r = *g = *b = *a = 0; // TODO |
2699 | break; |
2700 | |
2701 | case kRGBA_8888_SkColorType: |
2702 | case kBGRA_8888_SkColorType: { |
2703 | const uint32_t* ptr; |
2704 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
2705 | from_8888(gather(ptr, ix), r,g,b,a); |
2706 | if (ctx->ct == kBGRA_8888_SkColorType) { |
2707 | std::swap(*r,*b); |
2708 | } |
2709 | } break; |
2710 | } |
2711 | } |
2712 | |
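// sampler() evaluates a separable DxD filter: for each of the D*D taps around (cx,cy) it fetches a
// pixel with sample() and accumulates it weighted by wx[i]*wy[j].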
2713 | template <int D> |
2714 | SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, |
2715 | F cx, F cy, const F (&wx)[D], const F (&wy)[D], |
2716 | F* r, F* g, F* b, F* a) { |
2717 | |
2718 | float start = -0.5f*(D-1); |
2719 | |
2720 | *r = *g = *b = *a = 0; |
2721 | F y = cy + start; |
2722 | for (int j = 0; j < D; j++, y += 1.0f) { |
2723 | F x = cx + start; |
2724 | for (int i = 0; i < D; i++, x += 1.0f) { |
2725 | F R,G,B,A; |
2726 | sample(ctx, x,y, &R,&G,&B,&A); |
2727 | |
2728 | F w = wx[i] * wy[j]; |
2729 | *r = mad(w,R,*r); |
2730 | *g = mad(w,G,*g); |
2731 | *b = mad(w,B,*b); |
2732 | *a = mad(w,A,*a); |
2733 | } |
2734 | } |
2735 | } |
2736 | |
2737 | STAGE(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { |
2738 | F x = r, fx = fract(x + 0.5f), |
2739 | y = g, fy = fract(y + 0.5f); |
2740 | const F wx[] = {1.0f - fx, fx}; |
2741 | const F wy[] = {1.0f - fy, fy}; |
2742 | |
2743 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
2744 | } |
2745 | STAGE(bicubic, SkRasterPipeline_SamplerCtx2* ctx) { |
2746 | F x = r, fx = fract(x + 0.5f), |
2747 | y = g, fy = fract(y + 0.5f); |
2748 | const F wx[] = { bicubic_far(1-fx), bicubic_near(1-fx), bicubic_near(fx), bicubic_far(fx) }; |
2749 | const F wy[] = { bicubic_far(1-fy), bicubic_near(1-fy), bicubic_near(fy), bicubic_far(fy) }; |
2750 | |
2751 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
2752 | } |
2753 | |
2754 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. |
2755 | STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
2756 | // (cx,cy) are the center of our sample. |
2757 | F cx = r, |
2758 | cy = g; |
2759 | |
2760 | // All sample points are at the same fractional offset (fx,fy). |
2761 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. |
2762 | F fx = fract(cx + 0.5f), |
2763 | fy = fract(cy + 0.5f); |
2764 | |
2765 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. |
2766 | r = g = b = a = 0; |
2767 | |
2768 | for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f) |
2769 | for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) { |
2770 | // (x,y) are the coordinates of this sample point. |
2771 | F x = cx + dx, |
2772 | y = cy + dy; |
2773 | |
2774 | // ix_and_ptr() will clamp to the image's bounds for us. |
2775 | const uint32_t* ptr; |
2776 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
2777 | |
2778 | F sr,sg,sb,sa; |
2779 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); |
2780 | |
2781 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
2782 | // are combined in direct proportion to their area overlapping that logical query pixel. |
2783 | // At positive offsets, the x-axis contribution to that rectangle is fx, |
2784 | // or (1-fx) at negative x. Same deal for y. |
2785 | F sx = (dx > 0) ? fx : 1.0f - fx, |
2786 | sy = (dy > 0) ? fy : 1.0f - fy, |
2787 | area = sx * sy; |
2788 | |
2789 | r += sr * area; |
2790 | g += sg * area; |
2791 | b += sb * area; |
2792 | a += sa * area; |
2793 | } |
2794 | } |
2795 | |
2796 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. |
2797 | STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
2798 | // (cx,cy) are the center of our sample. |
2799 | F cx = r, |
2800 | cy = g; |
2801 | |
    // All sample points are at the same fractional offset (fx,fy).
    // They're the 16 points of a 4x4 grid surrounding (x,y), at +/- 0.5 and +/- 1.5 offsets.
2804 | F fx = fract(cx + 0.5f), |
2805 | fy = fract(cy + 0.5f); |
2806 | |
    // We'll accumulate the color of all sixteen samples into {r,g,b,a} directly.
2808 | r = g = b = a = 0; |
2809 | |
2810 | const F scaley[4] = { |
2811 | bicubic_far (1.0f - fy), bicubic_near(1.0f - fy), |
2812 | bicubic_near( fy), bicubic_far ( fy), |
2813 | }; |
2814 | const F scalex[4] = { |
2815 | bicubic_far (1.0f - fx), bicubic_near(1.0f - fx), |
2816 | bicubic_near( fx), bicubic_far ( fx), |
2817 | }; |
2818 | |
2819 | F sample_y = cy - 1.5f; |
2820 | for (int yy = 0; yy <= 3; ++yy) { |
2821 | F sample_x = cx - 1.5f; |
2822 | for (int xx = 0; xx <= 3; ++xx) { |
2823 | F scale = scalex[xx] * scaley[yy]; |
2824 | |
2825 | // ix_and_ptr() will clamp to the image's bounds for us. |
2826 | const uint32_t* ptr; |
2827 | U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y); |
2828 | |
2829 | F sr,sg,sb,sa; |
2830 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); |
2831 | |
2832 | r = mad(scale, sr, r); |
2833 | g = mad(scale, sg, g); |
2834 | b = mad(scale, sb, b); |
2835 | a = mad(scale, sa, a); |
2836 | |
2837 | sample_x += 1; |
2838 | } |
2839 | sample_y += 1; |
2840 | } |
2841 | } |
2842 | |
2843 | // ~~~~~~ GrSwizzle stage ~~~~~~ // |
2844 | |
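// Note: the four swizzle characters are packed into the bits of the context pointer itself rather
// than into memory it points to, which is why we memcpy from &ctx below instead of dereferencing it.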
2845 | STAGE(swizzle, void* ctx) { |
2846 | auto ir = r, ig = g, ib = b, ia = a; |
2847 | F* o[] = {&r, &g, &b, &a}; |
2848 | char swiz[4]; |
2849 | memcpy(swiz, &ctx, sizeof(swiz)); |
2850 | |
2851 | for (int i = 0; i < 4; ++i) { |
2852 | switch (swiz[i]) { |
2853 | case 'r': *o[i] = ir; break; |
2854 | case 'g': *o[i] = ig; break; |
2855 | case 'b': *o[i] = ib; break; |
2856 | case 'a': *o[i] = ia; break; |
2857 | case '0': *o[i] = F(0); break; |
2858 | case '1': *o[i] = F(1); break; |
2859 | default: break; |
2860 | } |
2861 | } |
2862 | } |
2863 | |
2864 | namespace lowp { |
2865 | #if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE) |
2866 | // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually), |
2867 | // we don't generate lowp stages. All these nullptrs will tell SkJumper.cpp to always use the |
2868 | // highp float pipeline. |
2869 | #define M(st) static void (*st)(void) = nullptr; |
2870 | SK_RASTER_PIPELINE_STAGES(M) |
2871 | #undef M |
2872 | static void (*just_return)(void) = nullptr; |
2873 | |
2874 | static void start_pipeline(size_t,size_t,size_t,size_t, void**) {} |
2875 | |
2876 | #else // We are compiling vector code with Clang... let's make some lowp stages! |
2877 | |
2878 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
2879 | using U8 = uint8_t __attribute__((ext_vector_type(16))); |
2880 | using U16 = uint16_t __attribute__((ext_vector_type(16))); |
2881 | using I16 = int16_t __attribute__((ext_vector_type(16))); |
2882 | using I32 = int32_t __attribute__((ext_vector_type(16))); |
2883 | using U32 = uint32_t __attribute__((ext_vector_type(16))); |
2884 | using F = float __attribute__((ext_vector_type(16))); |
2885 | #else |
2886 | using U8 = uint8_t __attribute__((ext_vector_type(8))); |
2887 | using U16 = uint16_t __attribute__((ext_vector_type(8))); |
2888 | using I16 = int16_t __attribute__((ext_vector_type(8))); |
2889 | using I32 = int32_t __attribute__((ext_vector_type(8))); |
2890 | using U32 = uint32_t __attribute__((ext_vector_type(8))); |
2891 | using F = float __attribute__((ext_vector_type(8))); |
2892 | #endif |
2893 | |
2894 | static const size_t N = sizeof(U16) / sizeof(uint16_t); |
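// N is the number of pixels each lowp stage handles per call: 16 under HSW/SKX, 8 otherwise.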
2895 | |
2896 | // Once again, some platforms benefit from a restricted Stage calling convention, |
2897 | // but others can pass tons and tons of registers and we're happy to exploit that. |
2898 | // It's exactly the same decision and implementation strategy as the F stages above. |
2899 | #if JUMPER_NARROW_STAGES |
2900 | struct Params { |
2901 | size_t dx, dy, tail; |
2902 | U16 dr,dg,db,da; |
2903 | }; |
2904 | using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a); |
2905 | #else |
2906 | // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64. |
2907 | using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy, |
2908 | U16 r, U16 g, U16 b, U16 a, |
2909 | U16 dr, U16 dg, U16 db, U16 da); |
2910 | #endif |
2911 | |
2912 | static void start_pipeline(const size_t x0, const size_t y0, |
2913 | const size_t xlimit, const size_t ylimit, void** program) { |
2914 | auto start = (Stage)load_and_inc(program); |
2915 | for (size_t dy = y0; dy < ylimit; dy++) { |
2916 | #if JUMPER_NARROW_STAGES |
2917 | Params params = { x0,dy,0, 0,0,0,0 }; |
2918 | for (; params.dx + N <= xlimit; params.dx += N) { |
2919 | start(¶ms,program, 0,0,0,0); |
2920 | } |
2921 | if (size_t tail = xlimit - params.dx) { |
2922 | params.tail = tail; |
2923 | start(¶ms,program, 0,0,0,0); |
2924 | } |
2925 | #else |
2926 | size_t dx = x0; |
2927 | for (; dx + N <= xlimit; dx += N) { |
2928 | start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0); |
2929 | } |
2930 | if (size_t tail = xlimit - dx) { |
2931 | start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); |
2932 | } |
2933 | #endif |
2934 | } |
2935 | } |
2936 | |
2937 | #if JUMPER_NARROW_STAGES |
2938 | static void ABI just_return(Params*, void**, U16,U16,U16,U16) {} |
2939 | #else |
2940 | static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {} |
2941 | #endif |
2942 | |
2943 | // All stages use the same function call ABI to chain into each other, but there are three types: |
2944 | // GG: geometry in, geometry out -- think, a matrix |
//   GP: geometry in, pixels out   -- think, a memory gather
//   PP: pixels in, pixels out     -- think, a blend mode
2947 | // |
2948 | // (Some stages ignore their inputs or produce no logical output. That's perfectly fine.) |
2949 | // |
2950 | // These three STAGE_ macros let you define each type of stage, |
2951 | // and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate. |
2952 | |
2953 | #if JUMPER_NARROW_STAGES |
2954 | #define STAGE_GG(name, ...) \ |
2955 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ |
2956 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
2957 | auto x = join<F>(r,g), \ |
2958 | y = join<F>(b,a); \ |
2959 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \ |
2960 | split(x, &r,&g); \ |
2961 | split(y, &b,&a); \ |
2962 | auto next = (Stage)load_and_inc(program); \ |
2963 | next(params,program, r,g,b,a); \ |
2964 | } \ |
2965 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) |
2966 | |
2967 | #define STAGE_GP(name, ...) \ |
2968 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
2969 | U16& r, U16& g, U16& b, U16& a, \ |
2970 | U16& dr, U16& dg, U16& db, U16& da); \ |
2971 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
2972 | auto x = join<F>(r,g), \ |
2973 | y = join<F>(b,a); \ |
2974 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \ |
2975 | params->dr,params->dg,params->db,params->da); \ |
2976 | auto next = (Stage)load_and_inc(program); \ |
2977 | next(params,program, r,g,b,a); \ |
2978 | } \ |
2979 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
2980 | U16& r, U16& g, U16& b, U16& a, \ |
2981 | U16& dr, U16& dg, U16& db, U16& da) |
2982 | |
2983 | #define STAGE_PP(name, ...) \ |
2984 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
2985 | U16& r, U16& g, U16& b, U16& a, \ |
2986 | U16& dr, U16& dg, U16& db, U16& da); \ |
2987 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
2988 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \ |
2989 | params->dr,params->dg,params->db,params->da); \ |
2990 | auto next = (Stage)load_and_inc(program); \ |
2991 | next(params,program, r,g,b,a); \ |
2992 | } \ |
2993 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
2994 | U16& r, U16& g, U16& b, U16& a, \ |
2995 | U16& dr, U16& dg, U16& db, U16& da) |
2996 | #else |
2997 | #define STAGE_GG(name, ...) \ |
2998 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ |
2999 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
3000 | U16 r, U16 g, U16 b, U16 a, \ |
3001 | U16 dr, U16 dg, U16 db, U16 da) { \ |
3002 | auto x = join<F>(r,g), \ |
3003 | y = join<F>(b,a); \ |
3004 | name##_k(Ctx{program}, dx,dy,tail, x,y); \ |
3005 | split(x, &r,&g); \ |
3006 | split(y, &b,&a); \ |
3007 | auto next = (Stage)load_and_inc(program); \ |
3008 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
3009 | } \ |
3010 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) |
3011 | |
3012 | #define STAGE_GP(name, ...) \ |
3013 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
3014 | U16& r, U16& g, U16& b, U16& a, \ |
3015 | U16& dr, U16& dg, U16& db, U16& da); \ |
3016 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
3017 | U16 r, U16 g, U16 b, U16 a, \ |
3018 | U16 dr, U16 dg, U16 db, U16 da) { \ |
3019 | auto x = join<F>(r,g), \ |
3020 | y = join<F>(b,a); \ |
3021 | name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \ |
3022 | auto next = (Stage)load_and_inc(program); \ |
3023 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
3024 | } \ |
3025 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
3026 | U16& r, U16& g, U16& b, U16& a, \ |
3027 | U16& dr, U16& dg, U16& db, U16& da) |
3028 | |
3029 | #define STAGE_PP(name, ...) \ |
3030 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
3031 | U16& r, U16& g, U16& b, U16& a, \ |
3032 | U16& dr, U16& dg, U16& db, U16& da); \ |
3033 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
3034 | U16 r, U16 g, U16 b, U16 a, \ |
3035 | U16 dr, U16 dg, U16 db, U16 da) { \ |
3036 | name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \ |
3037 | auto next = (Stage)load_and_inc(program); \ |
3038 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
3039 | } \ |
3040 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
3041 | U16& r, U16& g, U16& b, U16& a, \ |
3042 | U16& dr, U16& dg, U16& db, U16& da) |
3043 | #endif |
3044 | |
3045 | // ~~~~~~ Commonly used helper functions ~~~~~~ // |
3046 | |
3047 | SI U16 div255(U16 v) { |
3048 | #if 0 |
3049 | return (v+127)/255; // The ideal rounding divide by 255. |
3050 | #elif 1 && defined(JUMPER_IS_NEON) |
3051 | // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8 |
3052 | // just as fast as we can do the approximation below, so might as well be correct! |
3053 | // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up. |
3054 | return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8); |
3055 | #else |
3056 | return (v+255)/256; // A good approximation of (v+127)/255. |
3057 | #endif |
3058 | } |
3059 | |
3060 | SI U16 inv(U16 v) { return 255-v; } |
3061 | |
3062 | SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); } |
3063 | SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); } |
3064 | |
3065 | SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); } |
3066 | SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); } |
3067 | |
3068 | SI U16 from_float(float f) { return f * 255.0f + 0.5f; } |
3069 | |
3070 | SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); } |
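// e.g. lerp(0, 255, 128) == div255(0*127 + 255*128) == 128, so t acts like an 8-bit alpha.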
3071 | |
3072 | template <typename D, typename S> |
3073 | SI D cast(S src) { |
3074 | return __builtin_convertvector(src, D); |
3075 | } |
3076 | |
3077 | template <typename D, typename S> |
3078 | SI void split(S v, D* lo, D* hi) { |
3079 | static_assert(2*sizeof(D) == sizeof(S), "" ); |
3080 | memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D)); |
3081 | memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D)); |
3082 | } |
3083 | template <typename D, typename S> |
3084 | SI D join(S lo, S hi) { |
3085 | static_assert(sizeof(D) == 2*sizeof(S), "" ); |
3086 | D v; |
3087 | memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S)); |
3088 | memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S)); |
3089 | return v; |
3090 | } |
3091 | |
3092 | SI F if_then_else(I32 c, F t, F e) { |
3093 | return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) | (sk_bit_cast<I32>(e) & ~c) ); |
3094 | } |
3095 | SI F max(F x, F y) { return if_then_else(x < y, y, x); } |
3096 | SI F min(F x, F y) { return if_then_else(x < y, x, y); } |
3097 | |
3098 | SI F mad(F f, F m, F a) { return f*m+a; } |
3099 | SI U32 trunc_(F x) { return (U32)cast<I32>(x); } |
3100 | |
3101 | SI F rcp(F x) { |
3102 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3103 | __m256 lo,hi; |
3104 | split(x, &lo,&hi); |
3105 | return join<F>(_mm256_rcp_ps(lo), _mm256_rcp_ps(hi)); |
3106 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
3107 | __m128 lo,hi; |
3108 | split(x, &lo,&hi); |
3109 | return join<F>(_mm_rcp_ps(lo), _mm_rcp_ps(hi)); |
3110 | #elif defined(JUMPER_IS_NEON) |
3111 | auto rcp = [](float32x4_t v) { |
3112 | auto est = vrecpeq_f32(v); |
3113 | return vrecpsq_f32(v,est)*est; |
3114 | }; |
3115 | float32x4_t lo,hi; |
3116 | split(x, &lo,&hi); |
3117 | return join<F>(rcp(lo), rcp(hi)); |
3118 | #else |
3119 | return 1.0f / x; |
3120 | #endif |
3121 | } |
3122 | SI F sqrt_(F x) { |
3123 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3124 | __m256 lo,hi; |
3125 | split(x, &lo,&hi); |
3126 | return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi)); |
3127 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
3128 | __m128 lo,hi; |
3129 | split(x, &lo,&hi); |
3130 | return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi)); |
3131 | #elif defined(SK_CPU_ARM64) |
3132 | float32x4_t lo,hi; |
3133 | split(x, &lo,&hi); |
3134 | return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi)); |
3135 | #elif defined(JUMPER_IS_NEON) |
3136 | auto sqrt = [](float32x4_t v) { |
3137 | auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v). |
3138 | est *= vrsqrtsq_f32(v,est*est); |
3139 | est *= vrsqrtsq_f32(v,est*est); |
3140 | return v*est; // sqrt(v) == v*rsqrt(v). |
3141 | }; |
3142 | float32x4_t lo,hi; |
3143 | split(x, &lo,&hi); |
3144 | return join<F>(sqrt(lo), sqrt(hi)); |
3145 | #else |
3146 | return F{ |
3147 | sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]), |
3148 | sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]), |
3149 | }; |
3150 | #endif |
3151 | } |
3152 | |
3153 | SI F floor_(F x) { |
3154 | #if defined(SK_CPU_ARM64) |
3155 | float32x4_t lo,hi; |
3156 | split(x, &lo,&hi); |
3157 | return join<F>(vrndmq_f32(lo), vrndmq_f32(hi)); |
3158 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3159 | __m256 lo,hi; |
3160 | split(x, &lo,&hi); |
3161 | return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi)); |
3162 | #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
3163 | __m128 lo,hi; |
3164 | split(x, &lo,&hi); |
3165 | return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi)); |
3166 | #else |
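    // Truncate toward zero, then subtract 1 wherever that rounded up instead of down:
    // e.g. x == -1.5 truncates to -1.0, and since -1.0 > -1.5 we return -2.0.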
3167 | F roundtrip = cast<F>(cast<I32>(x)); |
3168 | return roundtrip - if_then_else(roundtrip > x, F(1), F(0)); |
3169 | #endif |
3170 | } |
3171 | SI F fract(F x) { return x - floor_(x); } |
3172 | SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); } |
3173 | |
3174 | // ~~~~~~ Basic / misc. stages ~~~~~~ // |
3175 | |
3176 | STAGE_GG(seed_shader, Ctx::None) { |
3177 | static const float iota[] = { |
3178 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, |
3179 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, |
3180 | }; |
3181 | x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota); |
3182 | y = cast<F>(I32(dy)) + 0.5f; |
3183 | } |
3184 | |
3185 | STAGE_GG(matrix_translate, const float* m) { |
3186 | x += m[0]; |
3187 | y += m[1]; |
3188 | } |
3189 | STAGE_GG(matrix_scale_translate, const float* m) { |
3190 | x = mad(x,m[0], m[2]); |
3191 | y = mad(y,m[1], m[3]); |
3192 | } |
3193 | STAGE_GG(matrix_2x3, const float* m) { |
3194 | auto X = mad(x,m[0], mad(y,m[2], m[4])), |
3195 | Y = mad(x,m[1], mad(y,m[3], m[5])); |
3196 | x = X; |
3197 | y = Y; |
3198 | } |
3199 | STAGE_GG(matrix_perspective, const float* m) { |
3200 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. |
3201 | auto X = mad(x,m[0], mad(y,m[1], m[2])), |
3202 | Y = mad(x,m[3], mad(y,m[4], m[5])), |
3203 | Z = mad(x,m[6], mad(y,m[7], m[8])); |
3204 | x = X * rcp(Z); |
3205 | y = Y * rcp(Z); |
3206 | } |
3207 | |
3208 | STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
3209 | r = c->rgba[0]; |
3210 | g = c->rgba[1]; |
3211 | b = c->rgba[2]; |
3212 | a = c->rgba[3]; |
3213 | } |
3214 | STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { |
3215 | dr = c->rgba[0]; |
3216 | dg = c->rgba[1]; |
3217 | db = c->rgba[2]; |
3218 | da = c->rgba[3]; |
3219 | } |
3220 | STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; } |
3221 | STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; } |
3222 | |
3223 | STAGE_PP(set_rgb, const float rgb[3]) { |
3224 | r = from_float(rgb[0]); |
3225 | g = from_float(rgb[1]); |
3226 | b = from_float(rgb[2]); |
3227 | } |
3228 | |
3229 | STAGE_PP(clamp_0, Ctx::None) { /*definitely a noop*/ } |
3230 | STAGE_PP(clamp_1, Ctx::None) { /*_should_ be a noop*/ } |
3231 | |
3232 | STAGE_PP(clamp_a, Ctx::None) { |
3233 | r = min(r, a); |
3234 | g = min(g, a); |
3235 | b = min(b, a); |
3236 | } |
3237 | |
3238 | STAGE_PP(clamp_gamut, Ctx::None) { |
3239 | // It shouldn't be possible to get out-of-gamut |
3240 | // colors when working in lowp. |
3241 | } |
3242 | |
3243 | STAGE_PP(premul, Ctx::None) { |
3244 | r = div255(r * a); |
3245 | g = div255(g * a); |
3246 | b = div255(b * a); |
3247 | } |
3248 | STAGE_PP(premul_dst, Ctx::None) { |
3249 | dr = div255(dr * da); |
3250 | dg = div255(dg * da); |
3251 | db = div255(db * da); |
3252 | } |
3253 | |
3254 | STAGE_PP(force_opaque , Ctx::None) { a = 255; } |
3255 | STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; } |
3256 | |
3257 | STAGE_PP(swap_rb, Ctx::None) { |
3258 | auto tmp = r; |
3259 | r = b; |
3260 | b = tmp; |
3261 | } |
3262 | STAGE_PP(swap_rb_dst, Ctx::None) { |
3263 | auto tmp = dr; |
3264 | dr = db; |
3265 | db = tmp; |
3266 | } |
3267 | |
3268 | STAGE_PP(move_src_dst, Ctx::None) { |
3269 | dr = r; |
3270 | dg = g; |
3271 | db = b; |
3272 | da = a; |
3273 | } |
3274 | |
3275 | STAGE_PP(move_dst_src, Ctx::None) { |
3276 | r = dr; |
3277 | g = dg; |
3278 | b = db; |
3279 | a = da; |
3280 | } |
3281 | |
3282 | // ~~~~~~ Blend modes ~~~~~~ // |
3283 | |
3284 | // The same logic applied to all 4 channels. |
3285 | #define BLEND_MODE(name) \ |
3286 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
3287 | STAGE_PP(name, Ctx::None) { \ |
3288 | r = name##_channel(r,dr,a,da); \ |
3289 | g = name##_channel(g,dg,a,da); \ |
3290 | b = name##_channel(b,db,a,da); \ |
3291 | a = name##_channel(a,da,a,da); \ |
3292 | } \ |
3293 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
3294 | |
3295 | BLEND_MODE(clear) { return 0; } |
3296 | BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); } |
3297 | BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); } |
3298 | BLEND_MODE(srcin) { return div255( s*da ); } |
3299 | BLEND_MODE(dstin) { return div255( d*sa ); } |
3300 | BLEND_MODE(srcout) { return div255( s*inv(da) ); } |
3301 | BLEND_MODE(dstout) { return div255( d*inv(sa) ); } |
3302 | BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); } |
3303 | BLEND_MODE(dstover) { return d + div255( s*inv(da) ); } |
3304 | BLEND_MODE(modulate) { return div255( s*d ); } |
3305 | BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); } |
3306 | BLEND_MODE(plus_) { return min(s+d, 255); } |
3307 | BLEND_MODE(screen) { return s + d - div255( s*d ); } |
3308 | BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); } |
3309 | #undef BLEND_MODE |
3310 | |
3311 | // The same logic applied to color, and srcover for alpha. |
3312 | #define BLEND_MODE(name) \ |
3313 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
3314 | STAGE_PP(name, Ctx::None) { \ |
3315 | r = name##_channel(r,dr,a,da); \ |
3316 | g = name##_channel(g,dg,a,da); \ |
3317 | b = name##_channel(b,db,a,da); \ |
3318 | a = a + div255( da*inv(a) ); \ |
3319 | } \ |
3320 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
3321 | |
3322 | BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); } |
3323 | BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); } |
3324 | BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); } |
3325 | BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); } |
3326 | |
3327 | BLEND_MODE(hardlight) { |
3328 | return div255( s*inv(da) + d*inv(sa) + |
3329 | if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
3330 | } |
3331 | BLEND_MODE(overlay) { |
3332 | return div255( s*inv(da) + d*inv(sa) + |
3333 | if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
3334 | } |
3335 | #undef BLEND_MODE |
3336 | |
3337 | // ~~~~~~ Helpers for interacting with memory ~~~~~~ // |
3338 | |
3339 | template <typename T> |
3340 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { |
3341 | return (T*)ctx->pixels + dy*ctx->stride + dx; |
3342 | } |
3343 | |
3344 | template <typename T> |
3345 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { |
3346 | // Exclusive -> inclusive. |
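    // Decrementing the bit pattern of a positive float gives the largest float strictly less
    // than it, so clamping x and y to w and h keeps trunc_() strictly below width and height.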
3347 | const F w = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - 1), |
3348 | h = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - 1); |
3349 | |
3350 | x = min(max(0, x), w); |
3351 | y = min(max(0, y), h); |
3352 | |
3353 | *ptr = (const T*)ctx->pixels; |
3354 | return trunc_(y)*ctx->stride + trunc_(x); |
3355 | } |
3356 | |
3357 | template <typename V, typename T> |
3358 | SI V load(const T* ptr, size_t tail) { |
3359 | V v = 0; |
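    // tail == 0 means a full N-wide load; otherwise only the first `tail` lanes are read.
    // e.g. with N == 8 and tail == 3, case 3 copies lane 2 and falls through to case 2's
    // memcpy of lanes 0 and 1, leaving lanes 3..7 zero.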
3360 | switch (tail & (N-1)) { |
3361 | case 0: memcpy(&v, ptr, sizeof(v)); break; |
3362 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3363 | case 15: v[14] = ptr[14]; [[fallthrough]]; |
3364 | case 14: v[13] = ptr[13]; [[fallthrough]]; |
3365 | case 13: v[12] = ptr[12]; [[fallthrough]]; |
3366 | case 12: memcpy(&v, ptr, 12*sizeof(T)); break; |
3367 | case 11: v[10] = ptr[10]; [[fallthrough]]; |
3368 | case 10: v[ 9] = ptr[ 9]; [[fallthrough]]; |
3369 | case 9: v[ 8] = ptr[ 8]; [[fallthrough]]; |
3370 | case 8: memcpy(&v, ptr, 8*sizeof(T)); break; |
3371 | #endif |
3372 | case 7: v[ 6] = ptr[ 6]; [[fallthrough]]; |
3373 | case 6: v[ 5] = ptr[ 5]; [[fallthrough]]; |
3374 | case 5: v[ 4] = ptr[ 4]; [[fallthrough]]; |
3375 | case 4: memcpy(&v, ptr, 4*sizeof(T)); break; |
3376 | case 3: v[ 2] = ptr[ 2]; [[fallthrough]]; |
3377 | case 2: memcpy(&v, ptr, 2*sizeof(T)); break; |
3378 | case 1: v[ 0] = ptr[ 0]; |
3379 | } |
3380 | return v; |
3381 | } |
3382 | template <typename V, typename T> |
3383 | SI void store(T* ptr, size_t tail, V v) { |
3384 | switch (tail & (N-1)) { |
3385 | case 0: memcpy(ptr, &v, sizeof(v)); break; |
3386 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3387 | case 15: ptr[14] = v[14]; [[fallthrough]]; |
3388 | case 14: ptr[13] = v[13]; [[fallthrough]]; |
3389 | case 13: ptr[12] = v[12]; [[fallthrough]]; |
3390 | case 12: memcpy(ptr, &v, 12*sizeof(T)); break; |
3391 | case 11: ptr[10] = v[10]; [[fallthrough]]; |
3392 | case 10: ptr[ 9] = v[ 9]; [[fallthrough]]; |
3393 | case 9: ptr[ 8] = v[ 8]; [[fallthrough]]; |
3394 | case 8: memcpy(ptr, &v, 8*sizeof(T)); break; |
3395 | #endif |
3396 | case 7: ptr[ 6] = v[ 6]; [[fallthrough]]; |
3397 | case 6: ptr[ 5] = v[ 5]; [[fallthrough]]; |
3398 | case 5: ptr[ 4] = v[ 4]; [[fallthrough]]; |
3399 | case 4: memcpy(ptr, &v, 4*sizeof(T)); break; |
3400 | case 3: ptr[ 2] = v[ 2]; [[fallthrough]]; |
3401 | case 2: memcpy(ptr, &v, 2*sizeof(T)); break; |
3402 | case 1: ptr[ 0] = v[ 0]; |
3403 | } |
3404 | } |
3405 | |
3406 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3407 | template <typename V, typename T> |
3408 | SI V gather(const T* ptr, U32 ix) { |
3409 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], |
3410 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], |
3411 | ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]], |
3412 | ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], }; |
3413 | } |
3414 | |
3415 | template<> |
3416 | F gather(const float* ptr, U32 ix) { |
3417 | __m256i lo, hi; |
3418 | split(ix, &lo, &hi); |
3419 | |
3420 | return join<F>(_mm256_i32gather_ps(ptr, lo, 4), |
3421 | _mm256_i32gather_ps(ptr, hi, 4)); |
3422 | } |
3423 | |
3424 | template<> |
3425 | U32 gather(const uint32_t* ptr, U32 ix) { |
3426 | __m256i lo, hi; |
3427 | split(ix, &lo, &hi); |
3428 | |
3429 | return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4), |
3430 | _mm256_i32gather_epi32(ptr, hi, 4)); |
3431 | } |
3432 | #else |
3433 | template <typename V, typename T> |
3434 | SI V gather(const T* ptr, U32 ix) { |
3435 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], |
3436 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], }; |
3437 | } |
3438 | #endif |
3439 | |
3440 | |
3441 | // ~~~~~~ 32-bit memory loads and stores ~~~~~~ // |
3442 | |
3443 | SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) { |
#if 1 && (defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX))
3445 | // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely. |
3446 | __m256i _01,_23; |
3447 | split(rgba, &_01, &_23); |
3448 | __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20), |
3449 | _13 = _mm256_permute2x128_si256(_01,_23, 0x31); |
3450 | rgba = join<U32>(_02, _13); |
3451 | |
3452 | auto cast_U16 = [](U32 v) -> U16 { |
3453 | __m256i _02,_13; |
3454 | split(v, &_02,&_13); |
3455 | return _mm256_packus_epi32(_02,_13); |
3456 | }; |
3457 | #else |
3458 | auto cast_U16 = [](U32 v) -> U16 { |
3459 | return cast<U16>(v); |
3460 | }; |
3461 | #endif |
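    // Each 32-bit lane is packed as r | g<<8 | b<<16 | a<<24, so the low 16 bits hold r,g
    // and the high 16 bits hold b,a.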
3462 | *r = cast_U16(rgba & 65535) & 255; |
3463 | *g = cast_U16(rgba & 65535) >> 8; |
3464 | *b = cast_U16(rgba >> 16) & 255; |
3465 | *a = cast_U16(rgba >> 16) >> 8; |
3466 | } |
3467 | |
3468 | SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
3469 | #if 1 && defined(JUMPER_IS_NEON) |
3470 | uint8x8x4_t rgba; |
3471 | switch (tail & (N-1)) { |
3472 | case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break; |
3473 | case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6); [[fallthrough]]; |
3474 | case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5); [[fallthrough]]; |
3475 | case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4); [[fallthrough]]; |
3476 | case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3); [[fallthrough]]; |
3477 | case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2); [[fallthrough]]; |
3478 | case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1); [[fallthrough]]; |
3479 | case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0); |
3480 | } |
3481 | *r = cast<U16>(rgba.val[0]); |
3482 | *g = cast<U16>(rgba.val[1]); |
3483 | *b = cast<U16>(rgba.val[2]); |
3484 | *a = cast<U16>(rgba.val[3]); |
3485 | #else |
3486 | from_8888(load<U32>(ptr, tail), r,g,b,a); |
3487 | #endif |
3488 | } |
3489 | SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
3490 | #if 1 && defined(JUMPER_IS_NEON) |
3491 | uint8x8x4_t rgba = {{ |
3492 | cast<U8>(r), |
3493 | cast<U8>(g), |
3494 | cast<U8>(b), |
3495 | cast<U8>(a), |
3496 | }}; |
3497 | switch (tail & (N-1)) { |
3498 | case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break; |
3499 | case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6); [[fallthrough]]; |
3500 | case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5); [[fallthrough]]; |
3501 | case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4); [[fallthrough]]; |
3502 | case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3); [[fallthrough]]; |
3503 | case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2); [[fallthrough]]; |
3504 | case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1); [[fallthrough]]; |
3505 | case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0); |
3506 | } |
3507 | #else |
3508 | store(ptr, tail, cast<U32>(r | (g<<8)) << 0 |
3509 | | cast<U32>(b | (a<<8)) << 16); |
3510 | #endif |
3511 | } |
3512 | |
3513 | STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
3514 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a); |
3515 | } |
3516 | STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3517 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); |
3518 | } |
3519 | STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
3520 | store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a); |
3521 | } |
3522 | STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { |
3523 | const uint32_t* ptr; |
3524 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3525 | from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a); |
3526 | } |
3527 | |
3528 | // ~~~~~~ 16-bit memory loads and stores ~~~~~~ // |
3529 | |
3530 | SI void from_565(U16 rgb, U16* r, U16* g, U16* b) { |
3531 | // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0 |
3532 | U16 R = (rgb >> 11) & 31, |
3533 | G = (rgb >> 5) & 63, |
3534 | B = (rgb >> 0) & 31; |
3535 | |
3536 | // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit. |
3537 | *r = (R << 3) | (R >> 2); |
3538 | *g = (G << 2) | (G >> 4); |
3539 | *b = (B << 3) | (B >> 2); |
3540 | } |
3541 | SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
3542 | from_565(load<U16>(ptr, tail), r,g,b); |
3543 | } |
3544 | SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) { |
    // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f (or 63/255.0f for g).
3546 | // (Don't feel like you need to find some fundamental truth in these... |
3547 | // they were brute-force searched.) |
3548 | U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half. |
3549 | G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly. |
3550 | B = (b * 9 + 36) / 74; |
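    // Sanity check: r == 255 gives (255*9 + 36)/74 == 31 and g == 255 gives (255*21 + 42)/85 == 63,
    // so full 8-bit channels land exactly on full 565 channels.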
3551 | // Pack them back into 15|rrrrr gggggg bbbbb|0. |
3552 | store(ptr, tail, R << 11 |
3553 | | G << 5 |
3554 | | B << 0); |
3555 | } |
3556 | |
3557 | STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3558 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b); |
3559 | a = 255; |
3560 | } |
3561 | STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3562 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db); |
3563 | da = 255; |
3564 | } |
3565 | STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3566 | store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b); |
3567 | } |
3568 | STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) { |
3569 | const uint16_t* ptr; |
3570 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3571 | from_565(gather<U16>(ptr, ix), &r, &g, &b); |
3572 | a = 255; |
3573 | } |
3574 | |
3575 | SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) { |
3576 | // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0. |
3577 | U16 R = (rgba >> 12) & 15, |
3578 | G = (rgba >> 8) & 15, |
3579 | B = (rgba >> 4) & 15, |
3580 | A = (rgba >> 0) & 15; |
3581 | |
3582 | // Scale [0,15] to [0,255]. |
3583 | *r = (R << 4) | R; |
3584 | *g = (G << 4) | G; |
3585 | *b = (B << 4) | B; |
3586 | *a = (A << 4) | A; |
3587 | } |
3588 | SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
3589 | from_4444(load<U16>(ptr, tail), r,g,b,a); |
3590 | } |
3591 | SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
3592 | // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f). |
3593 | U16 R = (r + 8) / 17, |
3594 | G = (g + 8) / 17, |
3595 | B = (b + 8) / 17, |
3596 | A = (a + 8) / 17; |
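    // Sanity check: 17 == 255/15, so (255 + 8)/17 == 15 and (0 + 8)/17 == 0.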
3597 | // Pack them back into 15|rrrr gggg bbbb aaaa|0. |
3598 | store(ptr, tail, R << 12 |
3599 | | G << 8 |
3600 | | B << 4 |
3601 | | A << 0); |
3602 | } |
3603 | |
3604 | STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
3605 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a); |
3606 | } |
3607 | STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3608 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); |
3609 | } |
3610 | STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
3611 | store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a); |
3612 | } |
3613 | STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { |
3614 | const uint16_t* ptr; |
3615 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3616 | from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a); |
3617 | } |
3618 | |
3619 | SI void from_88(U16 rg, U16* r, U16* g) { |
3620 | *r = (rg & 0xFF); |
3621 | *g = (rg >> 8); |
3622 | } |
3623 | |
3624 | SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
3625 | #if 1 && defined(JUMPER_IS_NEON) |
3626 | uint8x8x2_t rg; |
3627 | switch (tail & (N-1)) { |
3628 | case 0: rg = vld2_u8 ((const uint8_t*)(ptr+0) ); break; |
3629 | case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6); [[fallthrough]]; |
3630 | case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5); [[fallthrough]]; |
3631 | case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4); [[fallthrough]]; |
3632 | case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3); [[fallthrough]]; |
3633 | case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2); [[fallthrough]]; |
3634 | case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1); [[fallthrough]]; |
3635 | case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0); |
3636 | } |
3637 | *r = cast<U16>(rg.val[0]); |
3638 | *g = cast<U16>(rg.val[1]); |
3639 | #else |
3640 | from_88(load<U16>(ptr, tail), r,g); |
3641 | #endif |
3642 | } |
3643 | |
3644 | SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
3645 | #if 1 && defined(JUMPER_IS_NEON) |
3646 | uint8x8x2_t rg = {{ |
3647 | cast<U8>(r), |
3648 | cast<U8>(g), |
3649 | }}; |
3650 | switch (tail & (N-1)) { |
3651 | case 0: vst2_u8 ((uint8_t*)(ptr+0), rg ); break; |
3652 | case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6); [[fallthrough]]; |
3653 | case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5); [[fallthrough]]; |
3654 | case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4); [[fallthrough]]; |
3655 | case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3); [[fallthrough]]; |
3656 | case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2); [[fallthrough]]; |
3657 | case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1); [[fallthrough]]; |
3658 | case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0); |
3659 | } |
3660 | #else |
3661 | store(ptr, tail, cast<U16>(r | (g<<8)) << 0); |
3662 | #endif |
3663 | } |
3664 | |
3665 | STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
3666 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g); |
3667 | b = 0; |
3668 | a = 255; |
3669 | } |
3670 | STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3671 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg); |
3672 | db = 0; |
3673 | da = 255; |
3674 | } |
3675 | STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
3676 | store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g); |
3677 | } |
3678 | STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { |
3679 | const uint16_t* ptr; |
3680 | U32 ix = ix_and_ptr(&ptr, ctx, x, y); |
3681 | from_88(gather<U16>(ptr, ix), &r, &g); |
3682 | b = 0; |
3683 | a = 255; |
3684 | } |
3685 | |
3686 | // ~~~~~~ 8-bit memory loads and stores ~~~~~~ // |
3687 | |
3688 | SI U16 load_8(const uint8_t* ptr, size_t tail) { |
3689 | return cast<U16>(load<U8>(ptr, tail)); |
3690 | } |
3691 | SI void store_8(uint8_t* ptr, size_t tail, U16 v) { |
3692 | store(ptr, tail, cast<U8>(v)); |
3693 | } |
3694 | |
3695 | STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
3696 | r = g = b = 0; |
3697 | a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3698 | } |
3699 | STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3700 | dr = dg = db = 0; |
3701 | da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3702 | } |
3703 | STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
3704 | store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a); |
3705 | } |
3706 | STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { |
3707 | const uint8_t* ptr; |
3708 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3709 | r = g = b = 0; |
3710 | a = cast<U16>(gather<U8>(ptr, ix)); |
3711 | } |
3712 | |
3713 | STAGE_PP(alpha_to_gray, Ctx::None) { |
3714 | r = g = b = a; |
3715 | a = 255; |
3716 | } |
3717 | STAGE_PP(alpha_to_gray_dst, Ctx::None) { |
3718 | dr = dg = db = da; |
3719 | da = 255; |
3720 | } |
3721 | STAGE_PP(bt709_luminance_or_luma_to_alpha, Ctx::None) { |
3722 | a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator. |
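    // (The weights sum to 256, so r == g == b == 255 maps to a == 255 exactly.)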
3723 | r = g = b = 0; |
3724 | } |
3725 | |
// ~~~~~~ Stashing / restoring src and dst registers ~~~~~~ //
3727 | |
3728 | STAGE_PP(load_src, const uint16_t* ptr) { |
3729 | r = sk_unaligned_load<U16>(ptr + 0*N); |
3730 | g = sk_unaligned_load<U16>(ptr + 1*N); |
3731 | b = sk_unaligned_load<U16>(ptr + 2*N); |
3732 | a = sk_unaligned_load<U16>(ptr + 3*N); |
3733 | } |
3734 | STAGE_PP(store_src, uint16_t* ptr) { |
3735 | sk_unaligned_store(ptr + 0*N, r); |
3736 | sk_unaligned_store(ptr + 1*N, g); |
3737 | sk_unaligned_store(ptr + 2*N, b); |
3738 | sk_unaligned_store(ptr + 3*N, a); |
3739 | } |
3740 | STAGE_PP(store_src_a, uint16_t* ptr) { |
3741 | sk_unaligned_store(ptr, a); |
3742 | } |
3743 | STAGE_PP(load_dst, const uint16_t* ptr) { |
3744 | dr = sk_unaligned_load<U16>(ptr + 0*N); |
3745 | dg = sk_unaligned_load<U16>(ptr + 1*N); |
3746 | db = sk_unaligned_load<U16>(ptr + 2*N); |
3747 | da = sk_unaligned_load<U16>(ptr + 3*N); |
3748 | } |
3749 | STAGE_PP(store_dst, uint16_t* ptr) { |
3750 | sk_unaligned_store(ptr + 0*N, dr); |
3751 | sk_unaligned_store(ptr + 1*N, dg); |
3752 | sk_unaligned_store(ptr + 2*N, db); |
3753 | sk_unaligned_store(ptr + 3*N, da); |
3754 | } |
3755 | |
3756 | // ~~~~~~ Coverage scales / lerps ~~~~~~ // |
3757 | |
3758 | STAGE_PP(scale_1_float, const float* f) { |
3759 | U16 c = from_float(*f); |
3760 | r = div255( r * c ); |
3761 | g = div255( g * c ); |
3762 | b = div255( b * c ); |
3763 | a = div255( a * c ); |
3764 | } |
3765 | STAGE_PP(lerp_1_float, const float* f) { |
3766 | U16 c = from_float(*f); |
3767 | r = lerp(dr, r, c); |
3768 | g = lerp(dg, g, c); |
3769 | b = lerp(db, b, c); |
3770 | a = lerp(da, a, c); |
3771 | } |
3772 | STAGE_PP(scale_native, const uint16_t scales[]) { |
3773 | auto c = sk_unaligned_load<U16>(scales); |
3774 | r = div255( r * c ); |
3775 | g = div255( g * c ); |
3776 | b = div255( b * c ); |
3777 | a = div255( a * c ); |
3778 | } |
3779 | |
3780 | STAGE_PP(lerp_native, const uint16_t scales[]) { |
3781 | auto c = sk_unaligned_load<U16>(scales); |
3782 | r = lerp(dr, r, c); |
3783 | g = lerp(dg, g, c); |
3784 | b = lerp(db, b, c); |
3785 | a = lerp(da, a, c); |
3786 | } |
3787 | |
3788 | STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
3789 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3790 | r = div255( r * c ); |
3791 | g = div255( g * c ); |
3792 | b = div255( b * c ); |
3793 | a = div255( a * c ); |
3794 | } |
3795 | STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
3796 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3797 | r = lerp(dr, r, c); |
3798 | g = lerp(dg, g, c); |
3799 | b = lerp(db, b, c); |
3800 | a = lerp(da, a, c); |
3801 | } |
3802 | |
3803 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. |
3804 | SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) { |
3805 | return if_then_else(a < da, min(cr, min(cg,cb)) |
3806 | , max(cr, max(cg,cb))); |
3807 | } |
3808 | STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3809 | U16 cr,cg,cb; |
3810 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); |
3811 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
3812 | |
3813 | r = div255( r * cr ); |
3814 | g = div255( g * cg ); |
3815 | b = div255( b * cb ); |
3816 | a = div255( a * ca ); |
3817 | } |
3818 | STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3819 | U16 cr,cg,cb; |
3820 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); |
3821 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
3822 | |
3823 | r = lerp(dr, r, cr); |
3824 | g = lerp(dg, g, cg); |
3825 | b = lerp(db, b, cb); |
3826 | a = lerp(da, a, ca); |
3827 | } |
3828 | |
3829 | STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) { |
3830 | U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail), |
3831 | add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail); |
3832 | |
3833 | r = min(div255(r*mul) + add, a); |
3834 | g = min(div255(g*mul) + add, a); |
3835 | b = min(div255(b*mul) + add, a); |
3836 | } |
3837 | |
3838 | |
3839 | // ~~~~~~ Gradient stages ~~~~~~ // |
3840 | |
3841 | // Clamp x to [0,1], both sides inclusive (think, gradients). |
3842 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. |
3843 | SI F clamp_01(F v) { return min(max(0, v), 1); } |
3844 | |
3845 | STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); } |
3846 | STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); } |
3847 | STAGE_GG(mirror_x_1, Ctx::None) { |
3848 | auto two = [](F x){ return x+x; }; |
3849 | x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f )); |
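    // e.g. x == 1.25 -> 0.75 and x == 2.5 -> 0.5: mirror tiling with period 2, clamped to [0,1].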
3850 | } |
3851 | |
3852 | SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); } |
3853 | |
3854 | STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { |
3855 | auto w = ctx->limit_x; |
3856 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w))); |
3857 | } |
3858 | STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { |
3859 | auto h = ctx->limit_y; |
3860 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h))); |
3861 | } |
3862 | STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { |
3863 | auto w = ctx->limit_x; |
3864 | auto h = ctx->limit_y; |
3865 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h))); |
3866 | } |
3867 | STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { |
3868 | auto mask = sk_unaligned_load<U16>(ctx->mask); |
3869 | r = r & mask; |
3870 | g = g & mask; |
3871 | b = b & mask; |
3872 | a = a & mask; |
3873 | } |
3874 | |
3875 | SI void round_F_to_U16(F R, F G, F B, F A, bool interpolatedInPremul, |
3876 | U16* r, U16* g, U16* b, U16* a) { |
3877 | auto round = [](F x) { return cast<U16>(x * 255.0f + 0.5f); }; |
3878 | |
3879 | F limit = interpolatedInPremul ? A |
3880 | : 1; |
3881 | *r = round(min(max(0,R), limit)); |
3882 | *g = round(min(max(0,G), limit)); |
3883 | *b = round(min(max(0,B), limit)); |
3884 | *a = round(A); // we assume alpha is already in [0,1]. |
3885 | } |
3886 | |
3887 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, |
3888 | U16* r, U16* g, U16* b, U16* a) { |
3889 | |
3890 | F fr, fg, fb, fa, br, bg, bb, ba; |
3891 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
    if (c->stopCount <= 8) {
3893 | __m256i lo, hi; |
3894 | split(idx, &lo, &hi); |
3895 | |
3896 | fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo), |
3897 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi)); |
3898 | br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo), |
3899 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi)); |
3900 | fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo), |
3901 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi)); |
3902 | bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo), |
3903 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi)); |
3904 | fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo), |
3905 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi)); |
3906 | bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo), |
3907 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi)); |
3908 | fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo), |
3909 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi)); |
3910 | ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo), |
3911 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi)); |
3912 | } else |
3913 | #endif |
3914 | { |
3915 | fr = gather<F>(c->fs[0], idx); |
3916 | fg = gather<F>(c->fs[1], idx); |
3917 | fb = gather<F>(c->fs[2], idx); |
3918 | fa = gather<F>(c->fs[3], idx); |
3919 | br = gather<F>(c->bs[0], idx); |
3920 | bg = gather<F>(c->bs[1], idx); |
3921 | bb = gather<F>(c->bs[2], idx); |
3922 | ba = gather<F>(c->bs[3], idx); |
3923 | } |
3924 | round_F_to_U16(mad(t, fr, br), |
3925 | mad(t, fg, bg), |
3926 | mad(t, fb, bb), |
3927 | mad(t, fa, ba), |
3928 | c->interpolatedInPremul, |
3929 | r,g,b,a); |
3930 | } |
3931 | |
3932 | STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) { |
3933 | auto t = x; |
3934 | U32 idx = 0; |
3935 | |
3936 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
3937 | for (size_t i = 1; i < c->stopCount; i++) { |
3938 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); |
3939 | } |
3940 | |
3941 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
3942 | } |
3943 | |
3944 | STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { |
3945 | auto t = x; |
3946 | auto idx = trunc_(t * (c->stopCount-1)); |
3947 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
3948 | } |
3949 | |
3950 | STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) { |
3951 | auto t = x; |
3952 | round_F_to_U16(mad(t, c->f[0], c->b[0]), |
3953 | mad(t, c->f[1], c->b[1]), |
3954 | mad(t, c->f[2], c->b[2]), |
3955 | mad(t, c->f[3], c->b[3]), |
3956 | c->interpolatedInPremul, |
3957 | &r,&g,&b,&a); |
3958 | } |
3959 | |
3960 | STAGE_GG(xy_to_unit_angle, Ctx::None) { |
3961 | F xabs = abs_(x), |
3962 | yabs = abs_(y); |
3963 | |
3964 | F slope = min(xabs, yabs)/max(xabs, yabs); |
3965 | F s = slope * slope; |
3966 | |
3967 | // Use a 7th degree polynomial to approximate atan. |
3968 | // This was generated using sollya.gforge.inria.fr. |
3969 | // A float optimized polynomial was generated using the following command. |
3970 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); |
3971 | F phi = slope |
3972 | * (0.15912117063999176025390625f + s |
3973 | * (-5.185396969318389892578125e-2f + s |
3974 | * (2.476101927459239959716796875e-2f + s |
3975 | * (-7.0547382347285747528076171875e-3f)))); |
3976 | |
3977 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); |
3978 | phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi); |
3979 | phi = if_then_else(y < 0.0f , 1.0f - phi , phi); |
3980 | phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. |
3981 | x = phi; |
3982 | } |
3983 | STAGE_GG(xy_to_radius, Ctx::None) { |
3984 | x = sqrt_(x*x + y*y); |
3985 | } |
3986 | |
3987 | // ~~~~~~ Compound stages ~~~~~~ // |
3988 | |
3989 | STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
3990 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
3991 | |
3992 | load_8888_(ptr, tail, &dr,&dg,&db,&da); |
3993 | r = r + div255( dr*inv(a) ); |
3994 | g = g + div255( dg*inv(a) ); |
3995 | b = b + div255( db*inv(a) ); |
3996 | a = a + div255( da*inv(a) ); |
3997 | store_8888_(ptr, tail, r,g,b,a); |
3998 | } |
3999 | |
4000 | #if defined(SK_DISABLE_LOWP_BILERP_CLAMP_CLAMP_STAGE) |
4001 | static void(*bilerp_clamp_8888)(void) = nullptr; |
4002 | static void(*bilinear)(void) = nullptr; |
4003 | #else |
4004 | STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
4005 | // (cx,cy) are the center of our sample. |
4006 | F cx = x, |
4007 | cy = y; |
4008 | |
4009 | // All sample points are at the same fractional offset (fx,fy). |
4010 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. |
4011 | F fx = fract(cx + 0.5f), |
4012 | fy = fract(cy + 0.5f); |
4013 | |
4014 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. |
4015 | r = g = b = a = 0; |
4016 | |
4017 | // The first three sample points will calculate their area using math |
4018 | // just like in the float code above, but the fourth will take up all the rest. |
4019 | // |
4020 | // Logically this is the same as doing the math for the fourth pixel too, |
4021 | // but rounding error makes this a better strategy, keeping opaque opaque, etc. |
4022 | // |
4023 | // We can keep up to 8 bits of fractional precision without overflowing 16-bit, |
4024 | // so our "1.0" area is 256. |
4025 | const uint16_t bias = 256; |
4026 | U16 remaining = bias; |
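    // e.g. fx == fy == 0.25 gives the first three corners areas 144, 48, 48,
    // and the last corner picks up remaining == 16 (0.0625 * 256).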
4027 | |
4028 | for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f) |
4029 | for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) { |
4030 | // (x,y) are the coordinates of this sample point. |
4031 | F x = cx + dx, |
4032 | y = cy + dy; |
4033 | |
4034 | // ix_and_ptr() will clamp to the image's bounds for us. |
4035 | const uint32_t* ptr; |
4036 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
4037 | |
4038 | U16 sr,sg,sb,sa; |
4039 | from_8888(gather<U32>(ptr, ix), &sr,&sg,&sb,&sa); |
4040 | |
4041 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
4042 | // are combined in direct proportion to their area overlapping that logical query pixel. |
4043 | // At positive offsets, the x-axis contribution to that rectangle is fx, |
4044 | // or (1-fx) at negative x. Same deal for y. |
4045 | F sx = (dx > 0) ? fx : 1.0f - fx, |
4046 | sy = (dy > 0) ? fy : 1.0f - fy; |
4047 | |
4048 | U16 area = (dy == 0.5f && dx == 0.5f) ? remaining |
4049 | : cast<U16>(sx * sy * bias); |
4050 | for (size_t i = 0; i < N; i++) { |
4051 | SkASSERT(remaining[i] >= area[i]); |
4052 | } |
4053 | remaining -= area; |
4054 | |
4055 | r += sr * area; |
4056 | g += sg * area; |
4057 | b += sb * area; |
4058 | a += sa * area; |
4059 | } |
4060 | |
4061 | r = (r + bias/2) / bias; |
4062 | g = (g + bias/2) / bias; |
4063 | b = (b + bias/2) / bias; |
4064 | a = (a + bias/2) / bias; |
4065 | } |
4066 | |
4067 | // TODO: lowp::tile() is identical to the highp tile()... share? |
4068 | SI F tile(F v, SkTileMode mode, float limit, float invLimit) { |
    // ix_and_ptr() will clamp the output of tile() afterwards, so we need not clamp here.
4070 | switch (mode) { |
4071 | case SkTileMode::kDecal: // TODO, for now fallthrough to clamp |
4072 | case SkTileMode::kClamp: return v; |
4073 | case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; |
4074 | case SkTileMode::kMirror: |
4075 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
4076 | } |
4077 | SkUNREACHABLE; |
4078 | } |
4079 | |
4080 | SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, |
4081 | U16* r, U16* g, U16* b, U16* a) { |
4082 | x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); |
4083 | y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); |
4084 | |
4085 | switch (ctx->ct) { |
4086 | default: *r = *g = *b = *a = 0; // TODO |
4087 | break; |
4088 | |
4089 | case kRGBA_8888_SkColorType: |
4090 | case kBGRA_8888_SkColorType: { |
4091 | const uint32_t* ptr; |
4092 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
4093 | from_8888(gather<U32>(ptr, ix), r,g,b,a); |
4094 | if (ctx->ct == kBGRA_8888_SkColorType) { |
4095 | std::swap(*r,*b); |
4096 | } |
4097 | } break; |
4098 | } |
4099 | } |
4100 | |
4101 | template <int D> |
4102 | SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, |
4103 | F cx, F cy, const F (&wx)[D], const F (&wy)[D], |
4104 | U16* r, U16* g, U16* b, U16* a) { |
4105 | |
4106 | float start = -0.5f*(D-1); |
4107 | |
4108 | const uint16_t bias = 256; |
4109 | U16 remaining = bias; |
4110 | |
4111 | *r = *g = *b = *a = 0; |
4112 | F y = cy + start; |
4113 | for (int j = 0; j < D; j++, y += 1.0f) { |
4114 | F x = cx + start; |
4115 | for (int i = 0; i < D; i++, x += 1.0f) { |
4116 | U16 R,G,B,A; |
4117 | sample(ctx, x,y, &R,&G,&B,&A); |
4118 | |
4119 | U16 w = (i == D-1 && j == D-1) ? remaining |
4120 | : cast<U16>(wx[i]*wy[j]*bias); |
4121 | remaining -= w; |
4122 | *r += w*R; |
4123 | *g += w*G; |
4124 | *b += w*B; |
4125 | *a += w*A; |
4126 | } |
4127 | } |
4128 | *r = (*r + bias/2) / bias; |
4129 | *g = (*g + bias/2) / bias; |
4130 | *b = (*b + bias/2) / bias; |
4131 | *a = (*a + bias/2) / bias; |
4132 | } |
4133 | |
4134 | STAGE_GP(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { |
4135 | F fx = fract(x + 0.5f), |
4136 | fy = fract(y + 0.5f); |
4137 | const F wx[] = {1.0f - fx, fx}; |
4138 | const F wy[] = {1.0f - fy, fy}; |
4139 | |
4140 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
4141 | } |
4142 | #endif |
4143 | |
4144 | // ~~~~~~ GrSwizzle stage ~~~~~~ // |
4145 | |
4146 | STAGE_PP(swizzle, void* ctx) { |
4147 | auto ir = r, ig = g, ib = b, ia = a; |
4148 | U16* o[] = {&r, &g, &b, &a}; |
4149 | char swiz[4]; |
4150 | memcpy(swiz, &ctx, sizeof(swiz)); |
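    // Note the four swizzle characters live in the bytes of the context pointer value itself,
    // which is why we memcpy from &ctx rather than dereference it.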
4151 | |
4152 | for (int i = 0; i < 4; ++i) { |
4153 | switch (swiz[i]) { |
4154 | case 'r': *o[i] = ir; break; |
4155 | case 'g': *o[i] = ig; break; |
4156 | case 'b': *o[i] = ib; break; |
4157 | case 'a': *o[i] = ia; break; |
4158 | case '0': *o[i] = U16(0); break; |
4159 | case '1': *o[i] = U16(255); break; |
4160 | default: break; |
4161 | } |
4162 | } |
4163 | } |
4164 | |
4165 | // Now we'll add null stand-ins for stages we haven't implemented in lowp. |
// If a pipeline uses these stages, it'll be booted out of lowp into highp.
4167 | #define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr; |
4168 | NOT_IMPLEMENTED(callback) |
4169 | NOT_IMPLEMENTED(interpreter) |
4170 | NOT_IMPLEMENTED(unbounded_set_rgb) |
4171 | NOT_IMPLEMENTED(unbounded_uniform_color) |
4172 | NOT_IMPLEMENTED(unpremul) |
4173 | NOT_IMPLEMENTED(dither) // TODO |
4174 | NOT_IMPLEMENTED(load_16161616) |
4175 | NOT_IMPLEMENTED(load_16161616_dst) |
4176 | NOT_IMPLEMENTED(store_16161616) |
4177 | NOT_IMPLEMENTED(gather_16161616) |
4178 | NOT_IMPLEMENTED(load_a16) |
4179 | NOT_IMPLEMENTED(load_a16_dst) |
4180 | NOT_IMPLEMENTED(store_a16) |
4181 | NOT_IMPLEMENTED(gather_a16) |
4182 | NOT_IMPLEMENTED(load_rg1616) |
4183 | NOT_IMPLEMENTED(load_rg1616_dst) |
4184 | NOT_IMPLEMENTED(store_rg1616) |
4185 | NOT_IMPLEMENTED(gather_rg1616) |
4186 | NOT_IMPLEMENTED(load_f16) |
4187 | NOT_IMPLEMENTED(load_f16_dst) |
4188 | NOT_IMPLEMENTED(store_f16) |
4189 | NOT_IMPLEMENTED(gather_f16) |
4190 | NOT_IMPLEMENTED(load_af16) |
4191 | NOT_IMPLEMENTED(load_af16_dst) |
4192 | NOT_IMPLEMENTED(store_af16) |
4193 | NOT_IMPLEMENTED(gather_af16) |
4194 | NOT_IMPLEMENTED(load_rgf16) |
4195 | NOT_IMPLEMENTED(load_rgf16_dst) |
4196 | NOT_IMPLEMENTED(store_rgf16) |
4197 | NOT_IMPLEMENTED(gather_rgf16) |
4198 | NOT_IMPLEMENTED(load_f32) |
4199 | NOT_IMPLEMENTED(load_f32_dst) |
4200 | NOT_IMPLEMENTED(store_f32) |
4201 | NOT_IMPLEMENTED(gather_f32) |
4202 | NOT_IMPLEMENTED(load_rgf32) |
4203 | NOT_IMPLEMENTED(store_rgf32) |
4204 | NOT_IMPLEMENTED(load_1010102) |
4205 | NOT_IMPLEMENTED(load_1010102_dst) |
4206 | NOT_IMPLEMENTED(store_1010102) |
4207 | NOT_IMPLEMENTED(gather_1010102) |
4208 | NOT_IMPLEMENTED(store_u16_be) |
4209 | NOT_IMPLEMENTED(byte_tables) // TODO |
4210 | NOT_IMPLEMENTED(colorburn) |
4211 | NOT_IMPLEMENTED(colordodge) |
4212 | NOT_IMPLEMENTED(softlight) |
4213 | NOT_IMPLEMENTED(hue) |
4214 | NOT_IMPLEMENTED(saturation) |
4215 | NOT_IMPLEMENTED(color) |
4216 | NOT_IMPLEMENTED(luminosity) |
4217 | NOT_IMPLEMENTED(matrix_3x3) |
4218 | NOT_IMPLEMENTED(matrix_3x4) |
4219 | NOT_IMPLEMENTED(matrix_4x5) // TODO |
4220 | NOT_IMPLEMENTED(matrix_4x3) // TODO |
4221 | NOT_IMPLEMENTED(parametric) |
4222 | NOT_IMPLEMENTED(gamma_) |
4223 | NOT_IMPLEMENTED(PQish) |
4224 | NOT_IMPLEMENTED(HLGish) |
4225 | NOT_IMPLEMENTED(HLGinvish) |
4226 | NOT_IMPLEMENTED(rgb_to_hsl) |
4227 | NOT_IMPLEMENTED(hsl_to_rgb) |
4228 | NOT_IMPLEMENTED(gauss_a_to_rgba) // TODO |
4229 | NOT_IMPLEMENTED(mirror_x) // TODO |
4230 | NOT_IMPLEMENTED(repeat_x) // TODO |
4231 | NOT_IMPLEMENTED(mirror_y) // TODO |
4232 | NOT_IMPLEMENTED(repeat_y) // TODO |
4233 | NOT_IMPLEMENTED(negate_x) |
4234 | NOT_IMPLEMENTED(bicubic) // TODO if I can figure out negative weights |
4235 | NOT_IMPLEMENTED(bicubic_clamp_8888) |
4236 | NOT_IMPLEMENTED(bilinear_nx) // TODO |
4237 | NOT_IMPLEMENTED(bilinear_ny) // TODO |
4238 | NOT_IMPLEMENTED(bilinear_px) // TODO |
4239 | NOT_IMPLEMENTED(bilinear_py) // TODO |
4240 | NOT_IMPLEMENTED(bicubic_n3x) // TODO |
4241 | NOT_IMPLEMENTED(bicubic_n1x) // TODO |
4242 | NOT_IMPLEMENTED(bicubic_p1x) // TODO |
4243 | NOT_IMPLEMENTED(bicubic_p3x) // TODO |
4244 | NOT_IMPLEMENTED(bicubic_n3y) // TODO |
4245 | NOT_IMPLEMENTED(bicubic_n1y) // TODO |
4246 | NOT_IMPLEMENTED(bicubic_p1y) // TODO |
4247 | NOT_IMPLEMENTED(bicubic_p3y) // TODO |
4248 | NOT_IMPLEMENTED(save_xy) // TODO |
4249 | NOT_IMPLEMENTED(accumulate) // TODO |
4250 | NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved) |
4251 | NOT_IMPLEMENTED(xy_to_2pt_conical_strip) |
4252 | NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle) |
4253 | NOT_IMPLEMENTED(xy_to_2pt_conical_smaller) |
4254 | NOT_IMPLEMENTED(xy_to_2pt_conical_greater) |
4255 | NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal) |
4256 | NOT_IMPLEMENTED(alter_2pt_conical_unswap) |
4257 | NOT_IMPLEMENTED(mask_2pt_conical_nan) |
4258 | NOT_IMPLEMENTED(mask_2pt_conical_degenerates) |
4259 | NOT_IMPLEMENTED(apply_vector_mask) |
4260 | #undef NOT_IMPLEMENTED |
4261 | |
4262 | #endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages |
4263 | } // namespace lowp |
4264 | |
4265 | } // namespace SK_OPTS_NS |
4266 | |
4267 | #endif//SkRasterPipeline_opts_DEFINED |
4268 | |