1/*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#ifndef SkRasterPipeline_opts_DEFINED
9#define SkRasterPipeline_opts_DEFINED
10
11#include "include/core/SkTypes.h"
12#include "src/core/SkUtils.h" // unaligned_{load,store}
13#include "src/sksl/SkSLByteCode.h"
14
15// Every function in this file should be marked static and inline using SI.
16#if defined(__clang__)
17 #define SI __attribute__((always_inline)) static inline
18#else
19 #define SI static inline
20#endif
21
22template <typename Dst, typename Src>
23SI Dst bit_cast(const Src& src) {
24 static_assert(sizeof(Dst) == sizeof(Src), "");
25 return sk_unaligned_load<Dst>(&src);
26}
27
28template <typename Dst, typename Src>
29SI Dst widen_cast(const Src& src) {
30 static_assert(sizeof(Dst) > sizeof(Src), "");
31 Dst dst;
32 memcpy(&dst, &src, sizeof(Src));
33 return dst;
34}
35
36// Our program is an array of void*, either
37// - 1 void* per stage with no context pointer, the next stage;
38// - 2 void* per stage with a context pointer, first the context pointer, then the next stage.
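// For example, a hypothetical one-stage program with a context, terminated by just_return
// (defined near the end of this file), could be laid out as:
//   void* program[] = { (void*)some_stage, (void*)&some_ctx, (void*)just_return };
// where some_stage and some_ctx are placeholders, not real stages in this file.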
39
40// load_and_inc() steps the program forward by 1 void*, returning that pointer.
41SI void* load_and_inc(void**& program) {
42#if defined(__GNUC__) && defined(__x86_64__)
43 // If program is in %rsi (we try to make this likely) then this is a single instruction.
44 void* rax;
45 asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi.
46 return rax;
47#else
48 // On ARM *program++ compiles into pretty ideal code without any handholding.
49 return *program++;
50#endif
51}
52
53// Lazily resolved on first cast. Does nothing if cast to Ctx::None.
54struct Ctx {
55 struct None {};
56
57 void* ptr;
58 void**& program;
59
60 explicit Ctx(void**& p) : ptr(nullptr), program(p) {}
61
62 template <typename T>
63 operator T*() {
64 if (!ptr) { ptr = load_and_inc(program); }
65 return (T*)ptr;
66 }
67 operator None() { return None{}; }
68};
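// A rough sketch of how this is consumed (SomeCtx and foo are hypothetical, not real stages):
// a stage declared as STAGE(foo, const SomeCtx* ctx) receives Ctx{program}, and the conversion
// to SomeCtx* lazily pulls one more void* off program; a Ctx::None stage consumes nothing.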
69
70
71#if !defined(__clang__)
72 #define JUMPER_IS_SCALAR
73#elif defined(SK_ARM_HAS_NEON)
74 #define JUMPER_IS_NEON
75#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX512
76 #define JUMPER_IS_AVX512
77#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
78 #define JUMPER_IS_HSW
79#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
80 #define JUMPER_IS_AVX
81#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
82 #define JUMPER_IS_SSE41
83#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
84 #define JUMPER_IS_SSE2
85#else
86 #define JUMPER_IS_SCALAR
87#endif
88
89// Older Clangs seem to crash when generating non-optimized NEON code for ARMv7.
90#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32)
91 // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative.
92 #if defined(__apple_build_version__) && __clang_major__ < 9
93 #define JUMPER_IS_SCALAR
94 #elif __clang_major__ < 5
95 #define JUMPER_IS_SCALAR
96 #endif
97
98 #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR)
99 #undef JUMPER_IS_NEON
100 #endif
101#endif
102
103#if defined(JUMPER_IS_SCALAR)
104 #include <math.h>
105#elif defined(JUMPER_IS_NEON)
106 #include <arm_neon.h>
107#else
108 #include <immintrin.h>
109#endif
110
111namespace SK_OPTS_NS {
112
113#if defined(JUMPER_IS_SCALAR)
114 // This path should lead to portable scalar code.
115 using F = float ;
116 using I32 = int32_t;
117 using U64 = uint64_t;
118 using U32 = uint32_t;
119 using U16 = uint16_t;
120 using U8 = uint8_t ;
121
122 SI F mad(F f, F m, F a) { return f*m+a; }
123 SI F min(F a, F b) { return fminf(a,b); }
124 SI F max(F a, F b) { return fmaxf(a,b); }
125 SI F abs_ (F v) { return fabsf(v); }
126 SI F floor_(F v) { return floorf(v); }
127 SI F rcp (F v) { return 1.0f / v; }
128 SI F rsqrt (F v) { return 1.0f / sqrtf(v); }
129 SI F sqrt_(F v) { return sqrtf(v); }
130 SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); }
131 SI U16 pack(U32 v) { return (U16)v; }
132 SI U8 pack(U16 v) { return (U8)v; }
133
134 SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
135
136 template <typename T>
137 SI T gather(const T* p, U32 ix) { return p[ix]; }
138
139 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
140 *r = ptr[0];
141 *g = ptr[1];
142 }
143 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
144 ptr[0] = r;
145 ptr[1] = g;
146 }
147 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
148 *r = ptr[0];
149 *g = ptr[1];
150 *b = ptr[2];
151 }
152 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
153 *r = ptr[0];
154 *g = ptr[1];
155 *b = ptr[2];
156 *a = ptr[3];
157 }
158 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
159 ptr[0] = r;
160 ptr[1] = g;
161 ptr[2] = b;
162 ptr[3] = a;
163 }
164
165 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
166 *r = ptr[0];
167 *g = ptr[1];
168 }
169 SI void store2(float* ptr, size_t tail, F r, F g) {
170 ptr[0] = r;
171 ptr[1] = g;
172 }
173 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
174 *r = ptr[0];
175 *g = ptr[1];
176 *b = ptr[2];
177 *a = ptr[3];
178 }
179 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
180 ptr[0] = r;
181 ptr[1] = g;
182 ptr[2] = b;
183 ptr[3] = a;
184 }
185
186#elif defined(JUMPER_IS_NEON)
187 // Since we know we're using Clang, we can use its vector extensions.
188 template <typename T> using V = T __attribute__((ext_vector_type(4)));
189 using F = V<float >;
190 using I32 = V< int32_t>;
191 using U64 = V<uint64_t>;
192 using U32 = V<uint32_t>;
193 using U16 = V<uint16_t>;
194 using U8 = V<uint8_t >;
195
196 // We polyfill a few routines that Clang doesn't build into ext_vector_types.
197 SI F min(F a, F b) { return vminq_f32(a,b); }
198 SI F max(F a, F b) { return vmaxq_f32(a,b); }
199 SI F abs_ (F v) { return vabsq_f32(v); }
200 SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
201 SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
202 SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
203 SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
204
205 SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
206
207 #if defined(SK_CPU_ARM64)
208 SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
209 SI F floor_(F v) { return vrndmq_f32(v); }
210 SI F sqrt_(F v) { return vsqrtq_f32(v); }
211 SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
212 #else
213 SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
214 SI F floor_(F v) {
215 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
216 return roundtrip - if_then_else(roundtrip > v, 1, 0);
217 }
218
219 SI F sqrt_(F v) {
220 auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v).
221 e *= vrsqrtsq_f32(v,e*e);
222 e *= vrsqrtsq_f32(v,e*e);
223 return v*e; // sqrt(v) == v*rsqrt(v).
224 }
225
226 SI U32 round(F v, F scale) {
227 return vcvtq_u32_f32(mad(v,scale,0.5f));
228 }
229 #endif
230
231
232 template <typename T>
233 SI V<T> gather(const T* p, U32 ix) {
234 return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
235 }
236 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
237 uint16x4x2_t rg;
238 if (__builtin_expect(tail,0)) {
239 if ( true ) { rg = vld2_lane_u16(ptr + 0, rg, 0); }
240 if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
241 if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
242 } else {
243 rg = vld2_u16(ptr);
244 }
245 *r = rg.val[0];
246 *g = rg.val[1];
247 }
248 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
249 if (__builtin_expect(tail,0)) {
250 if ( true ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); }
251 if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); }
252 if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); }
253 } else {
254 vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
255 }
256 }
257 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
258 uint16x4x3_t rgb;
259 if (__builtin_expect(tail,0)) {
260 if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); }
261 if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
262 if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
263 } else {
264 rgb = vld3_u16(ptr);
265 }
266 *r = rgb.val[0];
267 *g = rgb.val[1];
268 *b = rgb.val[2];
269 }
270 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
271 uint16x4x4_t rgba;
272 if (__builtin_expect(tail,0)) {
273 if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); }
274 if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
275 if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
276 } else {
277 rgba = vld4_u16(ptr);
278 }
279 *r = rgba.val[0];
280 *g = rgba.val[1];
281 *b = rgba.val[2];
282 *a = rgba.val[3];
283 }
284
285 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
286 if (__builtin_expect(tail,0)) {
287 if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); }
288 if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
289 if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
290 } else {
291 vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
292 }
293 }
294 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
295 float32x4x2_t rg;
296 if (__builtin_expect(tail,0)) {
297 if ( true ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); }
298 if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); }
299 if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); }
300 } else {
301 rg = vld2q_f32(ptr);
302 }
303 *r = rg.val[0];
304 *g = rg.val[1];
305 }
306 SI void store2(float* ptr, size_t tail, F r, F g) {
307 if (__builtin_expect(tail,0)) {
308 if ( true ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); }
309 if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); }
310 if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); }
311 } else {
312 vst2q_f32(ptr, (float32x4x2_t{{r,g}}));
313 }
314 }
315 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
316 float32x4x4_t rgba;
317 if (__builtin_expect(tail,0)) {
318 if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); }
319 if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
320 if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
321 } else {
322 rgba = vld4q_f32(ptr);
323 }
324 *r = rgba.val[0];
325 *g = rgba.val[1];
326 *b = rgba.val[2];
327 *a = rgba.val[3];
328 }
329 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
330 if (__builtin_expect(tail,0)) {
331 if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); }
332 if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
333 if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
334 } else {
335 vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
336 }
337 }
338
339#elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
340 // These are __m256 and __m256i, but friendlier and strongly-typed.
341 template <typename T> using V = T __attribute__((ext_vector_type(8)));
342 using F = V<float >;
343 using I32 = V< int32_t>;
344 using U64 = V<uint64_t>;
345 using U32 = V<uint32_t>;
346 using U16 = V<uint16_t>;
347 using U8 = V<uint8_t >;
348
349 SI F mad(F f, F m, F a) {
350 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
351 return _mm256_fmadd_ps(f,m,a);
352 #else
353 return f*m+a;
354 #endif
355 }
356
357 SI F min(F a, F b) { return _mm256_min_ps(a,b); }
358 SI F max(F a, F b) { return _mm256_max_ps(a,b); }
359 SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
360 SI F floor_(F v) { return _mm256_floor_ps(v); }
361 SI F rcp (F v) { return _mm256_rcp_ps (v); }
362 SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); }
363 SI F sqrt_(F v) { return _mm256_sqrt_ps (v); }
364 SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
365
366 SI U16 pack(U32 v) {
367 return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
368 _mm256_extractf128_si256(v, 1));
369 }
370 SI U8 pack(U16 v) {
371 auto r = _mm_packus_epi16(v,v);
372 return sk_unaligned_load<U8>(&r);
373 }
374
375 SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
376
377 template <typename T>
378 SI V<T> gather(const T* p, U32 ix) {
379 return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
380 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
381 }
382 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
383 SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); }
384 SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
385 SI U64 gather(const uint64_t* p, U32 ix) {
386 __m256i parts[] = {
387 _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8),
388 _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8),
389 };
390 return bit_cast<U64>(parts);
391 }
392 #endif
393
394 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
395 U16 _0123, _4567;
396 if (__builtin_expect(tail,0)) {
397 _0123 = _4567 = _mm_setzero_si128();
398 auto* d = &_0123;
399 if (tail > 3) {
400 *d = _mm_loadu_si128(((__m128i*)ptr) + 0);
401 tail -= 4;
402 ptr += 8;
403 d = &_4567;
404 }
405 bool high = false;
406 if (tail > 1) {
407 *d = _mm_loadu_si64(ptr);
408 tail -= 2;
409 ptr += 4;
410 high = true;
411 }
412 if (tail > 0) {
413 (*d)[high ? 4 : 0] = *(ptr + 0);
414 (*d)[high ? 5 : 1] = *(ptr + 1);
415 }
416 } else {
417 _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0);
418 _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1);
419 }
420 *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16),
421 _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16));
422 *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16),
423 _mm_srai_epi32(_4567, 16));
424 }
425 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
426 auto _0123 = _mm_unpacklo_epi16(r, g),
427 _4567 = _mm_unpackhi_epi16(r, g);
428 if (__builtin_expect(tail,0)) {
429 const auto* s = &_0123;
430 if (tail > 3) {
431 _mm_storeu_si128((__m128i*)ptr, *s);
432 s = &_4567;
433 tail -= 4;
434 ptr += 8;
435 }
436 bool high = false;
437 if (tail > 1) {
438 _mm_storel_epi64((__m128i*)ptr, *s);
439 ptr += 4;
440 tail -= 2;
441 high = true;
442 }
443 if (tail > 0) {
444 if (high) {
445 *(int32_t*)ptr = _mm_extract_epi32(*s, 2);
446 } else {
447 *(int32_t*)ptr = _mm_cvtsi128_si32(*s);
448 }
449 }
450 } else {
451 _mm_storeu_si128((__m128i*)ptr + 0, _0123);
452 _mm_storeu_si128((__m128i*)ptr + 1, _4567);
453 }
454 }
455
456 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
457 __m128i _0,_1,_2,_3,_4,_5,_6,_7;
458 if (__builtin_expect(tail,0)) {
459 auto load_rgb = [](const uint16_t* src) {
460 auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
461 return _mm_insert_epi16(v, src[2], 2);
462 };
463 _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128();
464 if ( true ) { _0 = load_rgb(ptr + 0); }
465 if (tail > 1) { _1 = load_rgb(ptr + 3); }
466 if (tail > 2) { _2 = load_rgb(ptr + 6); }
467 if (tail > 3) { _3 = load_rgb(ptr + 9); }
468 if (tail > 4) { _4 = load_rgb(ptr + 12); }
469 if (tail > 5) { _5 = load_rgb(ptr + 15); }
470 if (tail > 6) { _6 = load_rgb(ptr + 18); }
471 } else {
472 // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over.
473 auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ;
474 auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ;
475 auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ;
476 auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4);
477 _0 = _01; _1 = _mm_srli_si128(_01, 6);
478 _2 = _23; _3 = _mm_srli_si128(_23, 6);
479 _4 = _45; _5 = _mm_srli_si128(_45, 6);
480 _6 = _67; _7 = _mm_srli_si128(_67, 6);
481 }
482
483 auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
484 _13 = _mm_unpacklo_epi16(_1, _3),
485 _46 = _mm_unpacklo_epi16(_4, _6),
486 _57 = _mm_unpacklo_epi16(_5, _7);
487
488 auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
489 bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx
490 rg4567 = _mm_unpacklo_epi16(_46, _57),
491 bx4567 = _mm_unpackhi_epi16(_46, _57);
492
493 *r = _mm_unpacklo_epi64(rg0123, rg4567);
494 *g = _mm_unpackhi_epi64(rg0123, rg4567);
495 *b = _mm_unpacklo_epi64(bx0123, bx4567);
496 }
497 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
498 __m128i _01, _23, _45, _67;
499 if (__builtin_expect(tail,0)) {
500 auto src = (const double*)ptr;
501 _01 = _23 = _45 = _67 = _mm_setzero_si128();
502 if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
503 if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
504 if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
505 if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
506 if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
507 if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
508 if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
509 } else {
510 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
511 _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
512 _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
513 _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
514 }
515
516 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
517 _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
518 _46 = _mm_unpacklo_epi16(_45, _67),
519 _57 = _mm_unpackhi_epi16(_45, _67);
520
521 auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
522 ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3
523 rg4567 = _mm_unpacklo_epi16(_46, _57),
524 ba4567 = _mm_unpackhi_epi16(_46, _57);
525
526 *r = _mm_unpacklo_epi64(rg0123, rg4567);
527 *g = _mm_unpackhi_epi64(rg0123, rg4567);
528 *b = _mm_unpacklo_epi64(ba0123, ba4567);
529 *a = _mm_unpackhi_epi64(ba0123, ba4567);
530 }
531 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
532 auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3
533 rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7
534 ba0123 = _mm_unpacklo_epi16(b, a),
535 ba4567 = _mm_unpackhi_epi16(b, a);
536
537 auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
538 _23 = _mm_unpackhi_epi32(rg0123, ba0123),
539 _45 = _mm_unpacklo_epi32(rg4567, ba4567),
540 _67 = _mm_unpackhi_epi32(rg4567, ba4567);
541
542 if (__builtin_expect(tail,0)) {
543 auto dst = (double*)ptr;
544 if (tail > 0) { _mm_storel_pd(dst+0, _01); }
545 if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
546 if (tail > 2) { _mm_storel_pd(dst+2, _23); }
547 if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
548 if (tail > 4) { _mm_storel_pd(dst+4, _45); }
549 if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
550 if (tail > 6) { _mm_storel_pd(dst+6, _67); }
551 } else {
552 _mm_storeu_si128((__m128i*)ptr + 0, _01);
553 _mm_storeu_si128((__m128i*)ptr + 1, _23);
554 _mm_storeu_si128((__m128i*)ptr + 2, _45);
555 _mm_storeu_si128((__m128i*)ptr + 3, _67);
556 }
557 }
558
559 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
560 F _0123, _4567;
561 if (__builtin_expect(tail, 0)) {
562 _0123 = _4567 = _mm256_setzero_ps();
563 F* d = &_0123;
564 if (tail > 3) {
565 *d = _mm256_loadu_ps(ptr);
566 ptr += 8;
567 tail -= 4;
568 d = &_4567;
569 }
570 bool high = false;
571 if (tail > 1) {
572 *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr));
573 ptr += 4;
574 tail -= 2;
575 high = true;
576 }
577 if (tail > 0) {
578 *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1)
579 : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0);
580 }
581 } else {
582 _0123 = _mm256_loadu_ps(ptr + 0);
583 _4567 = _mm256_loadu_ps(ptr + 8);
584 }
585
586 F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20),
587 _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31);
588
589 *r = _mm256_shuffle_ps(_0145, _2367, 0x88);
590 *g = _mm256_shuffle_ps(_0145, _2367, 0xDD);
591 }
592 SI void store2(float* ptr, size_t tail, F r, F g) {
593 F _0145 = _mm256_unpacklo_ps(r, g),
594 _2367 = _mm256_unpackhi_ps(r, g);
595 F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20),
596 _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31);
597
598 if (__builtin_expect(tail, 0)) {
599 const __m256* s = &_0123;
600 if (tail > 3) {
601 _mm256_storeu_ps(ptr, *s);
602 s = &_4567;
603 tail -= 4;
604 ptr += 8;
605 }
606 bool high = false;
607 if (tail > 1) {
608 _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0));
609 ptr += 4;
610 tail -= 2;
611 high = true;
612 }
613 if (tail > 0) {
614 *(ptr + 0) = (*s)[ high ? 4 : 0];
615 *(ptr + 1) = (*s)[ high ? 5 : 1];
616 }
617 } else {
618 _mm256_storeu_ps(ptr + 0, _0123);
619 _mm256_storeu_ps(ptr + 8, _4567);
620 }
621 }
622
623 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
624 F _04, _15, _26, _37;
625 _04 = _15 = _26 = _37 = 0;
626 switch (tail) {
627 case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
628 case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
629 case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
630 case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
631 case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0);
632 case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0);
633 case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0);
634 case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
635 }
636
637 F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5
638 ba0145 = _mm256_unpackhi_ps(_04,_15),
639 rg2367 = _mm256_unpacklo_ps(_26,_37),
640 ba2367 = _mm256_unpackhi_ps(_26,_37);
641
642 *r = _mm256_unpacklo_pd(rg0145, rg2367);
643 *g = _mm256_unpackhi_pd(rg0145, rg2367);
644 *b = _mm256_unpacklo_pd(ba0145, ba2367);
645 *a = _mm256_unpackhi_pd(ba0145, ba2367);
646 }
647 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
648 F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
649 rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
650 ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
651 ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
652
653 F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4
654 _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ...
655 _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ...
656 _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ...
657
658 if (__builtin_expect(tail, 0)) {
659 if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
660 if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
661 if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
662 if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
663 if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
664 if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
665 if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
666 } else {
667 F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
668 _23 = _mm256_permute2f128_ps(_26, _37, 32),
669 _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
670 _67 = _mm256_permute2f128_ps(_26, _37, 49);
671 _mm256_storeu_ps(ptr+ 0, _01);
672 _mm256_storeu_ps(ptr+ 8, _23);
673 _mm256_storeu_ps(ptr+16, _45);
674 _mm256_storeu_ps(ptr+24, _67);
675 }
676 }
677
678#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
679 template <typename T> using V = T __attribute__((ext_vector_type(4)));
680 using F = V<float >;
681 using I32 = V< int32_t>;
682 using U64 = V<uint64_t>;
683 using U32 = V<uint32_t>;
684 using U16 = V<uint16_t>;
685 using U8 = V<uint8_t >;
686
687 SI F mad(F f, F m, F a) { return f*m+a; }
688 SI F min(F a, F b) { return _mm_min_ps(a,b); }
689 SI F max(F a, F b) { return _mm_max_ps(a,b); }
690 SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
691 SI F rcp (F v) { return _mm_rcp_ps (v); }
692 SI F rsqrt (F v) { return _mm_rsqrt_ps(v); }
693 SI F sqrt_(F v) { return _mm_sqrt_ps (v); }
694 SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
695
696 SI U16 pack(U32 v) {
697 #if defined(JUMPER_IS_SSE41)
698 auto p = _mm_packus_epi32(v,v);
699 #else
700 // Sign extend so that _mm_packs_epi32() does the pack we want.
701 auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
702 p = _mm_packs_epi32(p,p);
703 #endif
704 return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
705 }
706 SI U8 pack(U16 v) {
707 auto r = widen_cast<__m128i>(v);
708 r = _mm_packus_epi16(r,r);
709 return sk_unaligned_load<U8>(&r);
710 }
711
712 SI F if_then_else(I32 c, F t, F e) {
713 return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
714 }
715
716 SI F floor_(F v) {
717 #if defined(JUMPER_IS_SSE41)
718 return _mm_floor_ps(v);
719 #else
720 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
721 return roundtrip - if_then_else(roundtrip > v, 1, 0);
722 #endif
723 }
724
725 template <typename T>
726 SI V<T> gather(const T* p, U32 ix) {
727 return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
728 }
729
730 // TODO: these loads and stores are incredibly difficult to follow.
731
732 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
733 __m128i _01;
734 if (__builtin_expect(tail,0)) {
735 _01 = _mm_setzero_si128();
736 if (tail > 1) {
737 _01 = _mm_loadl_pd(_01, (const double*)ptr); // r0 g0 r1 g1 00 00 00 00
738 if (tail > 2) {
739 _01 = _mm_insert_epi16(_01, *(ptr+4), 4); // r0 g0 r1 g1 r2 00 00 00
740 _01 = _mm_insert_epi16(_01, *(ptr+5), 5); // r0 g0 r1 g1 r2 g2 00 00
741 }
742 } else {
743 _01 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); // r0 g0 00 00 00 00 00 00
744 }
745 } else {
746 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3
747 }
748 auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3
749 auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3
750
751 auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3
752 auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3
753 *r = sk_unaligned_load<U16>(&R);
754 *g = sk_unaligned_load<U16>(&G);
755 }
756 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
757 U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
758 if (__builtin_expect(tail, 0)) {
759 if (tail > 1) {
760 _mm_storel_epi64((__m128i*)ptr, rg);
761 if (tail > 2) {
762 int32_t rgpair = rg[2];
763 memcpy(ptr + 4, &rgpair, sizeof(rgpair));
764 }
765 } else {
766 int32_t rgpair = rg[0];
767 memcpy(ptr, &rgpair, sizeof(rgpair));
768 }
769 } else {
770 _mm_storeu_si128((__m128i*)ptr + 0, rg);
771 }
772 }
773
774 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
775 __m128i _0, _1, _2, _3;
776 if (__builtin_expect(tail,0)) {
777 _1 = _2 = _3 = _mm_setzero_si128();
778 auto load_rgb = [](const uint16_t* src) {
779 auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
780 return _mm_insert_epi16(v, src[2], 2);
781 };
782 if ( true ) { _0 = load_rgb(ptr + 0); }
783 if (tail > 1) { _1 = load_rgb(ptr + 3); }
784 if (tail > 2) { _2 = load_rgb(ptr + 6); }
785 } else {
786 // Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
787 auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ,
788 _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
789
790 // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
791 _0 = _01;
792 _1 = _mm_srli_si128(_01, 6);
793 _2 = _23;
794 _3 = _mm_srli_si128(_23, 6);
795 }
796
797 // De-interlace to R,G,B.
798 auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx
799 _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx
800
801 auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
802 G = _mm_srli_si128(R, 8),
803 B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx
804
805 *r = sk_unaligned_load<U16>(&R);
806 *g = sk_unaligned_load<U16>(&G);
807 *b = sk_unaligned_load<U16>(&B);
808 }
809
810 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
811 __m128i _01, _23;
812 if (__builtin_expect(tail,0)) {
813 _01 = _23 = _mm_setzero_si128();
814 auto src = (const double*)ptr;
815 if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00
816 if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
817 if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
818 } else {
819 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
820 _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
821 }
822
823 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
824 _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
825
826 auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
827 ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
828
829 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
830 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
831 *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
832 *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
833 }
834
835 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
836 auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
837 ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
838
839 if (__builtin_expect(tail, 0)) {
840 auto dst = (double*)ptr;
841 if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); }
842 if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
843 if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
844 } else {
845 _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
846 _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
847 }
848 }
849
850 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
851 F _01, _23;
852 if (__builtin_expect(tail, 0)) {
853 _01 = _23 = _mm_setzero_si128();
854 if ( true ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); }
855 if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); }
856 if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); }
857 } else {
858 _01 = _mm_loadu_ps(ptr + 0);
859 _23 = _mm_loadu_ps(ptr + 4);
860 }
861 *r = _mm_shuffle_ps(_01, _23, 0x88);
862 *g = _mm_shuffle_ps(_01, _23, 0xDD);
863 }
864 SI void store2(float* ptr, size_t tail, F r, F g) {
865 F _01 = _mm_unpacklo_ps(r, g),
866 _23 = _mm_unpackhi_ps(r, g);
867 if (__builtin_expect(tail, 0)) {
868 if ( true ) { _mm_storel_pi((__m64*)(ptr + 0), _01); }
869 if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); }
870 if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); }
871 } else {
872 _mm_storeu_ps(ptr + 0, _01);
873 _mm_storeu_ps(ptr + 4, _23);
874 }
875 }
876
877 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
878 F _0, _1, _2, _3;
879 if (__builtin_expect(tail, 0)) {
880 _1 = _2 = _3 = _mm_setzero_si128();
881 if ( true ) { _0 = _mm_loadu_ps(ptr + 0); }
882 if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
883 if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
884 } else {
885 _0 = _mm_loadu_ps(ptr + 0);
886 _1 = _mm_loadu_ps(ptr + 4);
887 _2 = _mm_loadu_ps(ptr + 8);
888 _3 = _mm_loadu_ps(ptr +12);
889 }
890 _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
891 *r = _0;
892 *g = _1;
893 *b = _2;
894 *a = _3;
895 }
896
897 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
898 _MM_TRANSPOSE4_PS(r,g,b,a);
899 if (__builtin_expect(tail, 0)) {
900 if ( true ) { _mm_storeu_ps(ptr + 0, r); }
901 if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
902 if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
903 } else {
904 _mm_storeu_ps(ptr + 0, r);
905 _mm_storeu_ps(ptr + 4, g);
906 _mm_storeu_ps(ptr + 8, b);
907 _mm_storeu_ps(ptr +12, a);
908 }
909 }
910#endif
911
// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
915#if defined(JUMPER_IS_SCALAR)
916 SI F cast (U32 v) { return (F)v; }
917 SI F cast64(U64 v) { return (F)v; }
918 SI U32 trunc_(F v) { return (U32)v; }
919 SI U32 expand(U16 v) { return (U32)v; }
920 SI U32 expand(U8 v) { return (U32)v; }
921#else
922 SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
923 SI F cast64(U64 v) { return __builtin_convertvector( v, F); }
924 SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }
925 SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
926 SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
927#endif
928
929template <typename V>
930SI V if_then_else(I32 c, V t, V e) {
931 return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
932}
933
934SI U16 bswap(U16 x) {
935#if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
936 // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
937 // when generating code for SSE2 and SSE4.1. We'll do it manually...
938 auto v = widen_cast<__m128i>(x);
939 v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8);
940 return sk_unaligned_load<U16>(&v);
941#else
942 return (x<<8) | (x>>8);
943#endif
944}
945
946SI F fract(F v) { return v - floor_(v); }
947
948// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
949SI F approx_log2(F x) {
950 // e - 127 is a fair approximation of log2(x) in its own right...
951 F e = cast(bit_cast<U32>(x)) * (1.0f / (1<<23));
952
953 // ... but using the mantissa to refine its error is _much_ better.
954 F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
955 return e
956 - 124.225514990f
957 - 1.498030302f * m
958 - 1.725879990f / (0.3520887068f + m);
959}
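// Rough worked example (values approximate): for x = 8.0f the exponent byte is 130, so e = 130.0
// and m = 0.5, giving 130 - 124.2255 - 0.7490 - 2.0255 ~= 3.000, vs. the exact log2(8) = 3.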
960
961SI F approx_log(F x) {
962 const float ln2 = 0.69314718f;
963 return ln2 * approx_log2(x);
964}
965
966SI F approx_pow2(F x) {
967 F f = fract(x);
968 return bit_cast<F>(round(1.0f * (1<<23),
969 x + 121.274057500f
970 - 1.490129070f * f
971 + 27.728023300f / (4.84252568f - f)));
972}
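// Rough worked example (values approximate): for x = 3.0f, f = 0 and the rounded value is
// 2^23 * (3 + 121.2741 + 27.7280/4.8425) ~= 2^23 * 130, the bit pattern of 8.0f,
// so approx_pow2() roughly inverts approx_log2() above.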
973
974SI F approx_exp(F x) {
975 const float log2_e = 1.4426950408889634074f;
976 return approx_pow2(log2_e * x);
977}
978
979SI F approx_powf(F x, F y) {
980#if defined(SK_LEGACY_APPROX_POWF_SPECIALCASE)
981 return if_then_else((x == 0) , 0
982#else
983 return if_then_else((x == 0)|(x == 1), x
984#endif
985 , approx_pow2(approx_log2(x) * y));
986}
987
988SI F from_half(U16 h) {
989#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
990 && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
991 return vcvt_f32_f16(h);
992
993#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
994 return _mm256_cvtph_ps(h);
995
996#else
997 // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
998 U32 sem = expand(h),
999 s = sem & 0x8000,
1000 em = sem ^ s;
1001
1002 // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
1003 auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here.
1004 return if_then_else(denorm, F(0)
1005 , bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) ));
1006#endif
1007}
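// Rough worked example for the bit-twiddling path: h = 0x3C00 (half 1.0) gives s = 0 and
// em = 0x3C00, which is not denorm, so the result bits are (0x3C00<<13) + ((127-15)<<23)
// = 0x3F800000, i.e. 1.0f.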
1008
1009SI U16 to_half(F f) {
1010#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
1011 && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
1012 return vcvt_f16_f32(f);
1013
1014#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
1015 return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
1016
1017#else
1018 // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
1019 U32 sem = bit_cast<U32>(f),
1020 s = sem & 0x80000000,
1021 em = sem ^ s;
1022
1023 // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
1024 auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here.
1025 return pack(if_then_else(denorm, U32(0)
1026 , (s>>16) + (em>>13) - ((127-15)<<10)));
1027#endif
1028}
1029
1030// Our fundamental vector depth is our pixel stride.
1031static const size_t N = sizeof(F) / sizeof(float);
1032
1033// We're finally going to get to what a Stage function looks like!
1034// tail == 0 ~~> work on a full N pixels
1035// tail != 0 ~~> work on only the first tail pixels
1036// tail is always < N.
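// E.g. with 8-wide vectors (AVX), a row 13 pixels wide runs once at full stride (tail == 0)
// and once more with tail == 5.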
1037
1038// Any custom ABI to use for all (non-externally-facing) stage functions?
1039// Also decide here whether to use narrow (compromise) or wide (ideal) stages.
1040#if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON)
1041 // This lets us pass vectors more efficiently on 32-bit ARM.
1042 // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
1043 #define ABI __attribute__((pcs("aapcs-vfp")))
1044 #define JUMPER_NARROW_STAGES 1
1045#elif 0 && defined(_MSC_VER) && defined(__clang__) && defined(__x86_64__)
1046 // SysV ABI makes it very sensible to use wide stages with clang-cl.
1047 // TODO: crashes during compilation :(
1048 #define ABI __attribute__((sysv_abi))
1049 #define JUMPER_NARROW_STAGES 0
1050#elif defined(_MSC_VER)
1051 // Even if not vectorized, this lets us pass {r,g,b,a} as registers,
1052 // instead of {b,a} on the stack. Narrow stages work best for __vectorcall.
1053 #define ABI __vectorcall
1054 #define JUMPER_NARROW_STAGES 1
1055#elif defined(__x86_64__) || defined(SK_CPU_ARM64)
1056 // These platforms are ideal for wider stages, and their default ABI is ideal.
1057 #define ABI
1058 #define JUMPER_NARROW_STAGES 0
1059#else
1060 // 32-bit or unknown... shunt them down the narrow path.
1061 // Odds are these have few registers and are better off there.
1062 #define ABI
1063 #define JUMPER_NARROW_STAGES 1
1064#endif
1065
1066#if JUMPER_NARROW_STAGES
1067 struct Params {
1068 size_t dx, dy, tail;
1069 F dr,dg,db,da;
1070 };
1071 using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a);
1072#else
1073 // We keep program the second argument, so that it's passed in rsi for load_and_inc().
1074 using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
1075#endif
1076
1077
1078static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) {
1079 auto start = (Stage)load_and_inc(program);
1080 const size_t x0 = dx;
1081 for (; dy < ylimit; dy++) {
1082 #if JUMPER_NARROW_STAGES
1083 Params params = { x0,dy,0, 0,0,0,0 };
1084 while (params.dx + N <= xlimit) {
1085 start(&params,program, 0,0,0,0);
1086 params.dx += N;
1087 }
1088 if (size_t tail = xlimit - params.dx) {
1089 params.tail = tail;
1090 start(&params,program, 0,0,0,0);
1091 }
1092 #else
1093 dx = x0;
1094 while (dx + N <= xlimit) {
1095 start(0,program,dx,dy, 0,0,0,0, 0,0,0,0);
1096 dx += N;
1097 }
1098 if (size_t tail = xlimit - dx) {
1099 start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
1100 }
1101 #endif
1102 }
1103}
1104
1105#if JUMPER_NARROW_STAGES
1106 #define STAGE(name, ...) \
1107 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1108 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1109 static void ABI name(Params* params, void** program, \
1110 F r, F g, F b, F a) { \
1111 name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \
1112 params->dr, params->dg, params->db, params->da); \
1113 auto next = (Stage)load_and_inc(program); \
1114 next(params,program, r,g,b,a); \
1115 } \
1116 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1117 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1118#else
1119 #define STAGE(name, ...) \
1120 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1121 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
1122 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
1123 F r, F g, F b, F a, F dr, F dg, F db, F da) { \
1124 name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \
1125 auto next = (Stage)load_and_inc(program); \
1126 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
1127 } \
1128 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1129 F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
1130#endif
1131
1132
1133// just_return() is a simple no-op stage that only exists to end the chain,
1134// returning back up to start_pipeline(), and from there to the caller.
1135#if JUMPER_NARROW_STAGES
1136 static void ABI just_return(Params*, void**, F,F,F,F) {}
1137#else
1138 static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
1139#endif
1140
1141
1142// We could start defining normal Stages now. But first, some helper functions.
1143
1144// These load() and store() methods are tail-aware,
1145// but focus mainly on keeping the at-stride tail==0 case fast.
1146
1147template <typename V, typename T>
1148SI V load(const T* src, size_t tail) {
1149#if !defined(JUMPER_IS_SCALAR)
1150 __builtin_assume(tail < N);
1151 if (__builtin_expect(tail, 0)) {
1152 V v{}; // Any inactive lanes are zeroed.
1153 switch (tail) {
1154 case 7: v[6] = src[6];
1155 case 6: v[5] = src[5];
1156 case 5: v[4] = src[4];
1157 case 4: memcpy(&v, src, 4*sizeof(T)); break;
1158 case 3: v[2] = src[2];
1159 case 2: memcpy(&v, src, 2*sizeof(T)); break;
1160 case 1: memcpy(&v, src, 1*sizeof(T)); break;
1161 }
1162 return v;
1163 }
1164#endif
1165 return sk_unaligned_load<V>(src);
1166}
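// E.g. load<U32>(src, /*tail=*/3) takes the tail branch: case 3 copies lane 2, then falls
// through to case 2's memcpy of the first two lanes, leaving lanes 3..N-1 zeroed.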
1167
1168template <typename V, typename T>
1169SI void store(T* dst, V v, size_t tail) {
1170#if !defined(JUMPER_IS_SCALAR)
1171 __builtin_assume(tail < N);
1172 if (__builtin_expect(tail, 0)) {
1173 switch (tail) {
1174 case 7: dst[6] = v[6];
1175 case 6: dst[5] = v[5];
1176 case 5: dst[4] = v[4];
1177 case 4: memcpy(dst, &v, 4*sizeof(T)); break;
1178 case 3: dst[2] = v[2];
1179 case 2: memcpy(dst, &v, 2*sizeof(T)); break;
1180 case 1: memcpy(dst, &v, 1*sizeof(T)); break;
1181 }
1182 return;
1183 }
1184#endif
1185 sk_unaligned_store(dst, v);
1186}
1187
1188SI F from_byte(U8 b) {
1189 return cast(expand(b)) * (1/255.0f);
1190}
1191SI F from_short(U16 s) {
1192 return cast(expand(s)) * (1/65535.0f);
1193}
1194SI void from_565(U16 _565, F* r, F* g, F* b) {
1195 U32 wide = expand(_565);
1196 *r = cast(wide & (31<<11)) * (1.0f / (31<<11));
1197 *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5));
1198 *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0));
1199}
1200SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
1201 U32 wide = expand(_4444);
1202 *r = cast(wide & (15<<12)) * (1.0f / (15<<12));
1203 *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8));
1204 *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4));
1205 *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0));
1206}
1207SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
1208 *r = cast((_8888 ) & 0xff) * (1/255.0f);
1209 *g = cast((_8888 >> 8) & 0xff) * (1/255.0f);
1210 *b = cast((_8888 >> 16) & 0xff) * (1/255.0f);
1211 *a = cast((_8888 >> 24) ) * (1/255.0f);
1212}
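// E.g. from_8888(0x80FF0000, ...) yields r = 0, g = 0, b = 1.0f, a = 128/255 ~= 0.502;
// R sits in the low byte and A in the high byte.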
1213SI void from_88(U16 _88, F* r, F* g) {
1214 U32 wide = expand(_88);
1215 *r = cast((wide ) & 0xff) * (1/255.0f);
1216 *g = cast((wide >> 8) & 0xff) * (1/255.0f);
1217}
1218SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) {
1219 *r = cast((rgba ) & 0x3ff) * (1/1023.0f);
1220 *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f);
1221 *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f);
1222 *a = cast((rgba >> 30) ) * (1/ 3.0f);
1223}
1224SI void from_1616(U32 _1616, F* r, F* g) {
1225 *r = cast((_1616 ) & 0xffff) * (1/65535.0f);
1226 *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f);
1227}
1228SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) {
1229 *r = cast64((_16161616 ) & 0xffff) * (1/65535.0f);
1230 *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f);
1231 *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f);
1232 *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f);
1233}
1234
1235// Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory.
1236template <typename T>
1237SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
1238 return (T*)ctx->pixels + dy*ctx->stride + dx;
1239}
1240
1241// clamp v to [0,limit).
1242SI F clamp(F v, F limit) {
1243 F inclusive = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive.
1244 return min(max(0, v), inclusive);
1245}
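// E.g. clamp(v, 256.0f) clamps to the largest float below 256.0f (~255.99998f), so a later
// trunc_() yields an index of at most 255.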
1246
1247// Used by gather_ stages to calculate the base pointer and a vector of indices to load.
1248template <typename T>
1249SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
1250 x = clamp(x, ctx->width);
1251 y = clamp(y, ctx->height);
1252
1253 *ptr = (const T*)ctx->pixels;
1254 return trunc_(y)*ctx->stride + trunc_(x);
1255}
1256
1257// We often have a nominally [0,1] float value we need to scale and convert to an integer,
1258// whether for a table lookup or to pack back down into bytes for storage.
1259//
1260// In practice, especially when dealing with interesting color spaces, that notionally
1261// [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp.
1262//
1263// You can adjust the expected input to [0,bias] by tweaking that parameter.
1264SI U32 to_unorm(F v, F scale, F bias = 1.0f) {
1265 // TODO: platform-specific implementations to to_unorm(), removing round() entirely?
1266 // Any time we use round() we probably want to use to_unorm().
1267 return round(min(max(0, v), bias), scale);
1268}
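// E.g. to_unorm(v, 255.0f) maps v = 1.0f to 255 and clamps out-of-range inputs:
// v = 1.5f also gives 255, and v = -0.25f gives 0.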
1269
1270SI I32 cond_to_mask(I32 cond) { return if_then_else(cond, I32(~0), I32(0)); }
1271
1272// Now finally, normal Stages!
1273
1274STAGE(seed_shader, Ctx::None) {
1275 static const float iota[] = {
1276 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
1277 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
1278 };
1279 // It's important for speed to explicitly cast(dx) and cast(dy),
1280 // which has the effect of splatting them to vectors before converting to floats.
1281 // On Intel this breaks a data dependency on previous loop iterations' registers.
1282 r = cast(dx) + sk_unaligned_load<F>(iota);
1283 g = cast(dy) + 0.5f;
1284 b = 1.0f;
1285 a = 0;
1286 dr = dg = db = da = 0;
1287}
1288
1289STAGE(dither, const float* rate) {
1290 // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
1291 uint32_t iota[] = {0,1,2,3,4,5,6,7};
1292 U32 X = dx + sk_unaligned_load<U32>(iota),
1293 Y = dy;
1294
1295 // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
1296 // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].
1297
1298 // We only need X and X^Y from here on, so it's easier to just think of that as "Y".
1299 Y ^= X;
1300
1301 // We'll mix the bottom 3 bits of each of X and Y to make 6 bits,
1302 // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda.
1303 U32 M = (Y & 1) << 5 | (X & 1) << 4
1304 | (Y & 2) << 2 | (X & 2) << 1
1305 | (Y & 4) >> 1 | (X & 4) >> 2;
1306
1307 // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon.
1308 // We want to make sure our dither is less than 0.5 in either direction to keep exact values
1309 // like 0 and 1 unchanged after rounding.
1310 F dither = cast(M) * (2/128.0f) - (63/128.0f);
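    // E.g. for the pixel at x = 1, y = 0: X = 1 and Y^X = 1, so M = (1&1)<<5 | (1&1)<<4 = 48,
    // matching the 48 that follows 0 in the matrix's first row, and
    // dither = 48*(2/128.0f) - 63/128.0f ~= +0.258 before scaling by *rate.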
1311
1312 r += *rate*dither;
1313 g += *rate*dither;
1314 b += *rate*dither;
1315
1316 r = max(0, min(r, a));
1317 g = max(0, min(g, a));
1318 b = max(0, min(b, a));
1319}
1320
1321// load 4 floats from memory, and splat them into r,g,b,a
1322STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
1323 r = c->r;
1324 g = c->g;
1325 b = c->b;
1326 a = c->a;
1327}
1328STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
1329 r = c->r;
1330 g = c->g;
1331 b = c->b;
1332 a = c->a;
1333}
1334// load 4 floats from memory, and splat them into dr,dg,db,da
1335STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
1336 dr = c->r;
1337 dg = c->g;
1338 db = c->b;
1339 da = c->a;
1340}
1341
1342// splats opaque-black into r,g,b,a
1343STAGE(black_color, Ctx::None) {
1344 r = g = b = 0.0f;
1345 a = 1.0f;
1346}
1347
1348STAGE(white_color, Ctx::None) {
1349 r = g = b = a = 1.0f;
1350}
1351
// load registers r,g,b,a from context (mirrors store_src)
1353STAGE(load_src, const float* ptr) {
1354 r = sk_unaligned_load<F>(ptr + 0*N);
1355 g = sk_unaligned_load<F>(ptr + 1*N);
1356 b = sk_unaligned_load<F>(ptr + 2*N);
1357 a = sk_unaligned_load<F>(ptr + 3*N);
1358}
1359
// store registers r,g,b,a into context (mirrors load_src)
1361STAGE(store_src, float* ptr) {
1362 sk_unaligned_store(ptr + 0*N, r);
1363 sk_unaligned_store(ptr + 1*N, g);
1364 sk_unaligned_store(ptr + 2*N, b);
1365 sk_unaligned_store(ptr + 3*N, a);
1366}
1367STAGE(store_src_a, float* ptr) {
1368 sk_unaligned_store(ptr, a);
1369}
1370
1371// load registers dr,dg,db,da from context (mirrors store_dst)
1372STAGE(load_dst, const float* ptr) {
1373 dr = sk_unaligned_load<F>(ptr + 0*N);
1374 dg = sk_unaligned_load<F>(ptr + 1*N);
1375 db = sk_unaligned_load<F>(ptr + 2*N);
1376 da = sk_unaligned_load<F>(ptr + 3*N);
1377}
1378
1379// store registers dr,dg,db,da into context (mirrors load_dst)
1380STAGE(store_dst, float* ptr) {
1381 sk_unaligned_store(ptr + 0*N, dr);
1382 sk_unaligned_store(ptr + 1*N, dg);
1383 sk_unaligned_store(ptr + 2*N, db);
1384 sk_unaligned_store(ptr + 3*N, da);
1385}
1386
1387// Most blend modes apply the same logic to each channel.
1388#define BLEND_MODE(name) \
1389 SI F name##_channel(F s, F d, F sa, F da); \
1390 STAGE(name, Ctx::None) { \
1391 r = name##_channel(r,dr,a,da); \
1392 g = name##_channel(g,dg,a,da); \
1393 b = name##_channel(b,db,a,da); \
1394 a = name##_channel(a,da,a,da); \
1395 } \
1396 SI F name##_channel(F s, F d, F sa, F da)
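// For example, BLEND_MODE(srcover) below expands to a srcover stage whose body applies
//   srcover_channel(s,d,sa,da) == mad(d, inv(sa), s), i.e. s + d*(1-sa),
// to each of r,g,b,a against dr,dg,db,da.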
1397
1398SI F inv(F x) { return 1.0f - x; }
1399SI F two(F x) { return x + x; }
1400
1401
1402BLEND_MODE(clear) { return 0; }
1403BLEND_MODE(srcatop) { return s*da + d*inv(sa); }
1404BLEND_MODE(dstatop) { return d*sa + s*inv(da); }
1405BLEND_MODE(srcin) { return s * da; }
1406BLEND_MODE(dstin) { return d * sa; }
1407BLEND_MODE(srcout) { return s * inv(da); }
1408BLEND_MODE(dstout) { return d * inv(sa); }
1409BLEND_MODE(srcover) { return mad(d, inv(sa), s); }
1410BLEND_MODE(dstover) { return mad(s, inv(da), d); }
1411
1412BLEND_MODE(modulate) { return s*d; }
1413BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
1414BLEND_MODE(plus_) { return min(s + d, 1.0f); } // We can clamp to either 1 or sa.
1415BLEND_MODE(screen) { return s + d - s*d; }
1416BLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); }
1417#undef BLEND_MODE
1418
1419// Most other blend modes apply the same logic to colors, and srcover to alpha.
1420#define BLEND_MODE(name) \
1421 SI F name##_channel(F s, F d, F sa, F da); \
1422 STAGE(name, Ctx::None) { \
1423 r = name##_channel(r,dr,a,da); \
1424 g = name##_channel(g,dg,a,da); \
1425 b = name##_channel(b,db,a,da); \
1426 a = mad(da, inv(a), a); \
1427 } \
1428 SI F name##_channel(F s, F d, F sa, F da)
1429
1430BLEND_MODE(darken) { return s + d - max(s*da, d*sa) ; }
1431BLEND_MODE(lighten) { return s + d - min(s*da, d*sa) ; }
1432BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); }
1433BLEND_MODE(exclusion) { return s + d - two(s*d); }
1434
1435BLEND_MODE(colorburn) {
1436 return if_then_else(d == da, d + s*inv(da),
1437 if_then_else(s == 0, /* s + */ d*inv(sa),
1438 sa*(da - min(da, (da-d)*sa*rcp(s))) + s*inv(da) + d*inv(sa)));
1439}
1440BLEND_MODE(colordodge) {
1441 return if_then_else(d == 0, /* d + */ s*inv(da),
1442 if_then_else(s == sa, s + d*inv(sa),
1443 sa*min(da, (d*sa)*rcp(sa - s)) + s*inv(da) + d*inv(sa)));
1444}
1445BLEND_MODE(hardlight) {
1446 return s*inv(da) + d*inv(sa)
1447 + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s)));
1448}
1449BLEND_MODE(overlay) {
1450 return s*inv(da) + d*inv(sa)
1451 + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s)));
1452}
1453
1454BLEND_MODE(softlight) {
1455 F m = if_then_else(da > 0, d / da, 0),
1456 s2 = two(s),
1457 m4 = two(two(m));
1458
1459 // The logic forks three ways:
1460 // 1. dark src?
1461 // 2. light src, dark dst?
1462 // 3. light src, light dst?
1463 F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)), // Used in case 1.
1464 darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m, // Used in case 2.
1465 liteDst = rcp(rsqrt(m)) - m, // Used in case 3.
1466 liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3?
1467 return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)?
1468}
1469#undef BLEND_MODE
1470
// We're basing our implementation of non-separable blend modes on
1472// https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1473// and
1474// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1475// They're equivalent, but ES' math has been better simplified.
1476//
1477// Anything extra we add beyond that is to make the math work with premul inputs.
1478
1479SI F sat(F r, F g, F b) { return max(r, max(g,b)) - min(r, min(g,b)); }
1480SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; }
1481
1482SI void set_sat(F* r, F* g, F* b, F s) {
1483 F mn = min(*r, min(*g,*b)),
1484 mx = max(*r, max(*g,*b)),
1485 sat = mx - mn;
1486
1487 // Map min channel to 0, max channel to s, and scale the middle proportionally.
1488 auto scale = [=](F c) {
1489 return if_then_else(sat == 0, 0, (c - mn) * s / sat);
1490 };
1491 *r = scale(*r);
1492 *g = scale(*g);
1493 *b = scale(*b);
1494}
1495SI void set_lum(F* r, F* g, F* b, F l) {
1496 F diff = l - lum(*r, *g, *b);
1497 *r += diff;
1498 *g += diff;
1499 *b += diff;
1500}
1501SI void clip_color(F* r, F* g, F* b, F a) {
1502 F mn = min(*r, min(*g, *b)),
1503 mx = max(*r, max(*g, *b)),
1504 l = lum(*r, *g, *b);
1505
1506 auto clip = [=](F c) {
1507 c = if_then_else(mn >= 0, c, l + (c - l) * ( l) / (l - mn) );
1508 c = if_then_else(mx > a, l + (c - l) * (a - l) / (mx - l), c);
1509 c = max(c, 0); // Sometimes without this we may dip just a little negative.
1510 return c;
1511 };
1512 *r = clip(*r);
1513 *g = clip(*g);
1514 *b = clip(*b);
1515}
1516
1517STAGE(hue, Ctx::None) {
1518 F R = r*a,
1519 G = g*a,
1520 B = b*a;
1521
1522 set_sat(&R, &G, &B, sat(dr,dg,db)*a);
1523 set_lum(&R, &G, &B, lum(dr,dg,db)*a);
1524 clip_color(&R,&G,&B, a*da);
1525
1526 r = r*inv(da) + dr*inv(a) + R;
1527 g = g*inv(da) + dg*inv(a) + G;
1528 b = b*inv(da) + db*inv(a) + B;
1529 a = a + da - a*da;
1530}
1531STAGE(saturation, Ctx::None) {
1532 F R = dr*a,
1533 G = dg*a,
1534 B = db*a;
1535
1536 set_sat(&R, &G, &B, sat( r, g, b)*da);
1537 set_lum(&R, &G, &B, lum(dr,dg,db)* a); // (This is not redundant.)
1538 clip_color(&R,&G,&B, a*da);
1539
1540 r = r*inv(da) + dr*inv(a) + R;
1541 g = g*inv(da) + dg*inv(a) + G;
1542 b = b*inv(da) + db*inv(a) + B;
1543 a = a + da - a*da;
1544}
1545STAGE(color, Ctx::None) {
1546 F R = r*da,
1547 G = g*da,
1548 B = b*da;
1549
1550 set_lum(&R, &G, &B, lum(dr,dg,db)*a);
1551 clip_color(&R,&G,&B, a*da);
1552
1553 r = r*inv(da) + dr*inv(a) + R;
1554 g = g*inv(da) + dg*inv(a) + G;
1555 b = b*inv(da) + db*inv(a) + B;
1556 a = a + da - a*da;
1557}
1558STAGE(luminosity, Ctx::None) {
1559 F R = dr*a,
1560 G = dg*a,
1561 B = db*a;
1562
1563 set_lum(&R, &G, &B, lum(r,g,b)*da);
1564 clip_color(&R,&G,&B, a*da);
1565
1566 r = r*inv(da) + dr*inv(a) + R;
1567 g = g*inv(da) + dg*inv(a) + G;
1568 b = b*inv(da) + db*inv(a) + B;
1569 a = a + da - a*da;
1570}
1571
1572STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
1573 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
1574
1575 U32 dst = load<U32>(ptr, tail);
1576 dr = cast((dst ) & 0xff);
1577 dg = cast((dst >> 8) & 0xff);
1578 db = cast((dst >> 16) & 0xff);
1579 da = cast((dst >> 24) );
1580 // {dr,dg,db,da} are in [0,255]
1581 // { r, g, b, a} are in [0, 1] (but may be out of gamut)
1582
1583 r = mad(dr, inv(a), r*255.0f);
1584 g = mad(dg, inv(a), g*255.0f);
1585 b = mad(db, inv(a), b*255.0f);
1586 a = mad(da, inv(a), a*255.0f);
1587 // { r, g, b, a} are now in [0,255] (but may be out of gamut)
1588
1589 // to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased.
1590 dst = to_unorm(r, 1, 255)
1591 | to_unorm(g, 1, 255) << 8
1592 | to_unorm(b, 1, 255) << 16
1593 | to_unorm(a, 1, 255) << 24;
1594 store(ptr, dst, tail);
1595}
1596
1597STAGE(clamp_0, Ctx::None) {
1598 r = max(r, 0);
1599 g = max(g, 0);
1600 b = max(b, 0);
1601 a = max(a, 0);
1602}
1603
1604STAGE(clamp_1, Ctx::None) {
1605 r = min(r, 1.0f);
1606 g = min(g, 1.0f);
1607 b = min(b, 1.0f);
1608 a = min(a, 1.0f);
1609}
1610
1611STAGE(clamp_a, Ctx::None) {
1612 a = min(a, 1.0f);
1613 r = min(r, a);
1614 g = min(g, a);
1615 b = min(b, a);
1616}
1617
1618STAGE(clamp_gamut, Ctx::None) {
1619 a = min(max(a, 0), 1.0f);
1620 r = min(max(r, 0), a);
1621 g = min(max(g, 0), a);
1622 b = min(max(b, 0), a);
1623}
1624
1625STAGE(set_rgb, const float* rgb) {
1626 r = rgb[0];
1627 g = rgb[1];
1628 b = rgb[2];
1629}
1630STAGE(unbounded_set_rgb, const float* rgb) {
1631 r = rgb[0];
1632 g = rgb[1];
1633 b = rgb[2];
1634}
1635
1636STAGE(swap_rb, Ctx::None) {
1637 auto tmp = r;
1638 r = b;
1639 b = tmp;
1640}
1641STAGE(swap_rb_dst, Ctx::None) {
1642 auto tmp = dr;
1643 dr = db;
1644 db = tmp;
1645}
1646
1647STAGE(move_src_dst, Ctx::None) {
1648 dr = r;
1649 dg = g;
1650 db = b;
1651 da = a;
1652}
1653STAGE(move_dst_src, Ctx::None) {
1654 r = dr;
1655 g = dg;
1656 b = db;
1657 a = da;
1658}
1659
1660STAGE(premul, Ctx::None) {
1661 r = r * a;
1662 g = g * a;
1663 b = b * a;
1664}
1665STAGE(premul_dst, Ctx::None) {
1666 dr = dr * da;
1667 dg = dg * da;
1668 db = db * da;
1669}
1670STAGE(unpremul, Ctx::None) {
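    // 1.0f/a is +inf when a is zero; if_then_else() turns that (or any overflow) into a scale
    // of 0, so fully transparent pixels unpremultiply to 0 rather than inf or NaN.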
1671 float inf = bit_cast<float>(0x7f800000);
1672 auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0);
1673 r *= scale;
1674 g *= scale;
1675 b *= scale;
1676}
1677
1678STAGE(force_opaque , Ctx::None) { a = 1; }
1679STAGE(force_opaque_dst, Ctx::None) { da = 1; }
1680
1681// Clamp x to [0,1], both sides inclusive (think, gradients).
1682// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
1683SI F clamp_01(F v) { return min(max(0, v), 1); }
1684
1685STAGE(rgb_to_hsl, Ctx::None) {
1686 F mx = max(r, max(g,b)),
1687 mn = min(r, min(g,b)),
1688 d = mx - mn,
1689 d_rcp = 1.0f / d;
1690
1691 F h = (1/6.0f) *
1692 if_then_else(mx == mn, 0,
1693 if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0),
1694 if_then_else(mx == g, (b-r)*d_rcp + 2.0f,
1695 (r-g)*d_rcp + 4.0f)));
1696
1697 F l = (mx + mn) * 0.5f;
1698 F s = if_then_else(mx == mn, 0,
1699 d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn));
1700
1701 r = h;
1702 g = s;
1703 b = l;
1704}
1705STAGE(hsl_to_rgb, Ctx::None) {
1706 // See GrRGBToHSLFilterEffect.fp
1707
1708 F h = r,
1709 s = g,
1710 l = b,
1711 c = (1.0f - abs_(2.0f * l - 1)) * s;
1712
1713 auto hue_to_rgb = [&](F hue) {
1714 F q = clamp_01(abs_(fract(hue) * 6.0f - 3.0f) - 1.0f);
1715 return (q - 0.5f) * c + l;
1716 };
1717
1718 r = hue_to_rgb(h + 0.0f/3.0f);
1719 g = hue_to_rgb(h + 2.0f/3.0f);
1720 b = hue_to_rgb(h + 1.0f/3.0f);
1721}
1722
1723// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
1724SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) {
1725 return if_then_else(a < da, min(cr, min(cg,cb))
1726 , max(cr, max(cg,cb)));
1727}
1728
1729STAGE(scale_1_float, const float* c) {
1730 r = r * *c;
1731 g = g * *c;
1732 b = b * *c;
1733 a = a * *c;
1734}
1735STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
1736 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1737
1738 auto scales = load<U8>(ptr, tail);
1739 auto c = from_byte(scales);
1740
1741 r = r * c;
1742 g = g * c;
1743 b = b * c;
1744 a = a * c;
1745}
1746STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
1747 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1748
1749 F cr,cg,cb;
1750 from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1751
1752 F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
1753
1754 r = r * cr;
1755 g = g * cg;
1756 b = b * cb;
1757 a = a * ca;
1758}
1759
1760SI F lerp(F from, F to, F t) {
1761 return mad(to-from, t, from);
1762}
1763
1764STAGE(lerp_1_float, const float* c) {
1765 r = lerp(dr, r, *c);
1766 g = lerp(dg, g, *c);
1767 b = lerp(db, b, *c);
1768 a = lerp(da, a, *c);
1769}
1770STAGE(scale_native, const float scales[]) {
1771 auto c = sk_unaligned_load<F>(scales);
1772 r = r * c;
1773 g = g * c;
1774 b = b * c;
1775 a = a * c;
1776}
1777STAGE(lerp_native, const float scales[]) {
1778 auto c = sk_unaligned_load<F>(scales);
1779 r = lerp(dr, r, c);
1780 g = lerp(dg, g, c);
1781 b = lerp(db, b, c);
1782 a = lerp(da, a, c);
1783}
1784STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
1785 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1786
1787 auto scales = load<U8>(ptr, tail);
1788 auto c = from_byte(scales);
1789
1790 r = lerp(dr, r, c);
1791 g = lerp(dg, g, c);
1792 b = lerp(db, b, c);
1793 a = lerp(da, a, c);
1794}
1795STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
1796 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1797
1798 F cr,cg,cb;
1799 from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1800
1801 F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
1802
1803 r = lerp(dr, r, cr);
1804 g = lerp(dg, g, cg);
1805 b = lerp(db, b, cb);
1806 a = lerp(da, a, ca);
1807}
1808
1809STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
1810 auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy),
1811 aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy);
1812
1813 F mul = from_byte(load<U8>(mptr, tail)),
1814 add = from_byte(load<U8>(aptr, tail));
1815
1816 r = mad(r, mul, add);
1817 g = mad(g, mul, add);
1818 b = mad(b, mul, add);
1819}
1820
1821STAGE(byte_tables, const void* ctx) {  // TODO: rename Tables to SkRasterPipeline_ByteTablesCtx

1822 struct Tables { const uint8_t *r, *g, *b, *a; };
1823 auto tables = (const Tables*)ctx;
1824
1825 r = from_byte(gather(tables->r, to_unorm(r, 255)));
1826 g = from_byte(gather(tables->g, to_unorm(g, 255)));
1827 b = from_byte(gather(tables->b, to_unorm(b, 255)));
1828 a = from_byte(gather(tables->a, to_unorm(a, 255)));
1829}
1830
1831SI F strip_sign(F x, U32* sign) {
1832 U32 bits = bit_cast<U32>(x);
1833 *sign = bits & 0x80000000;
1834 return bit_cast<F>(bits ^ *sign);
1835}
1836
1837SI F apply_sign(F x, U32 sign) {
1838 return bit_cast<F>(sign | bit_cast<U32>(x));
1839}
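// strip_sign()/apply_sign() let the transfer-function stages below work on |v| and then restore
// the sign bit, i.e. each curve is applied as an odd function, which keeps negative
// (out-of-gamut or extended-range) values well behaved.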
1840
1841STAGE(parametric, const skcms_TransferFunction* ctx) {
1842 auto fn = [&](F v) {
1843 U32 sign;
1844 v = strip_sign(v, &sign);
1845
1846 F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f)
1847 , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e);
1848 return apply_sign(r, sign);
1849 };
1850 r = fn(r);
1851 g = fn(g);
1852 b = fn(b);
1853}
1854
1855STAGE(gamma_, const float* G) {
1856 auto fn = [&](F v) {
1857 U32 sign;
1858 v = strip_sign(v, &sign);
1859 return apply_sign(approx_powf(v, *G), sign);
1860 };
1861 r = fn(r);
1862 g = fn(g);
1863 b = fn(b);
1864}
1865
1866STAGE(PQish, const skcms_TransferFunction* ctx) {
1867 auto fn = [&](F v) {
1868 U32 sign;
1869 v = strip_sign(v, &sign);
1870
1871 F r = approx_powf(max(mad(ctx->b, approx_powf(v, ctx->c), ctx->a), 0)
1872 / (mad(ctx->e, approx_powf(v, ctx->c), ctx->d)),
1873 ctx->f);
1874
1875 return apply_sign(r, sign);
1876 };
1877 r = fn(r);
1878 g = fn(g);
1879 b = fn(b);
1880}
1881
1882STAGE(HLGish, const skcms_TransferFunction* ctx) {
1883 auto fn = [&](F v) {
1884 U32 sign;
1885 v = strip_sign(v, &sign);
1886
1887 const float R = ctx->a, G = ctx->b,
1888 a = ctx->c, b = ctx->d, c = ctx->e;
1889
1890 F r = if_then_else(v*R <= 1, approx_powf(v*R, G)
1891 , approx_exp((v-c)*a) + b);
1892
1893 return apply_sign(r, sign);
1894 };
1895 r = fn(r);
1896 g = fn(g);
1897 b = fn(b);
1898}
1899
1900STAGE(HLGinvish, const skcms_TransferFunction* ctx) {
1901 auto fn = [&](F v) {
1902 U32 sign;
1903 v = strip_sign(v, &sign);
1904
1905 const float R = ctx->a, G = ctx->b,
1906 a = ctx->c, b = ctx->d, c = ctx->e;
1907
1908 F r = if_then_else(v <= 1, R * approx_powf(v, G)
1909 , a * approx_log(v - b) + c);
1910
1911 return apply_sign(r, sign);
1912 };
1913 r = fn(r);
1914 g = fn(g);
1915 b = fn(b);
1916}
1917
1918STAGE(from_srgb, Ctx::None) {
1919 auto fn = [](F s) {
1920 U32 sign;
1921 s = strip_sign(s, &sign);
1922 auto lo = s * (1/12.92f);
1923 auto hi = mad(s*s, mad(s, 0.3000f, 0.6975f), 0.0025f);
1924 return apply_sign(if_then_else(s < 0.055f, lo, hi), sign);
1925 };
1926 r = fn(r);
1927 g = fn(g);
1928 b = fn(b);
1929}
1930STAGE(to_srgb, Ctx::None) {
1931 auto fn = [](F l) {
1932 U32 sign;
1933 l = strip_sign(l, &sign);
1934 // We tweak c and d for each instruction set to make sure fn(1) is exactly 1.
1935 #if defined(JUMPER_IS_AVX512)
1936 const float c = 1.130026340485f,
1937 d = 0.141387879848f;
1938 #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || \
1939 defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_HSW )
1940 const float c = 1.130048394203f,
1941 d = 0.141357362270f;
1942 #elif defined(JUMPER_IS_NEON)
1943 const float c = 1.129999995232f,
1944 d = 0.141381442547f;
1945 #else
1946 const float c = 1.129999995232f,
1947 d = 0.141377761960f;
1948 #endif
1949 F t = rsqrt(l);
1950 auto lo = l * 12.92f;
1951 auto hi = mad(t, mad(t, -0.0024542345f, 0.013832027f), c)
1952 * rcp(d + t);
1953 return apply_sign(if_then_else(l < 0.00465985f, lo, hi), sign);
1954 };
1955 r = fn(r);
1956 g = fn(g);
1957 b = fn(b);
1958}
1959
1960STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
1961 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1962
1963 r = g = b = 0.0f;
1964 a = from_byte(load<U8>(ptr, tail));
1965}
1966STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1967 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1968
1969 dr = dg = db = 0.0f;
1970 da = from_byte(load<U8>(ptr, tail));
1971}
1972STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
1973 const uint8_t* ptr;
1974 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1975 r = g = b = 0.0f;
1976 a = from_byte(gather(ptr, ix));
1977}
1978STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
1979 auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
1980
1981 U8 packed = pack(pack(to_unorm(a, 255)));
1982 store(ptr, packed, tail);
1983}
1984
1985STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
1986 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1987
1988 from_565(load<U16>(ptr, tail), &r,&g,&b);
1989 a = 1.0f;
1990}
1991STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
1992 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1993
1994 from_565(load<U16>(ptr, tail), &dr,&dg,&db);
1995 da = 1.0f;
1996}
1997STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
1998 const uint16_t* ptr;
1999 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2000 from_565(gather(ptr, ix), &r,&g,&b);
2001 a = 1.0f;
2002}
2003STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
2004 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2005
2006 U16 px = pack( to_unorm(r, 31) << 11
2007 | to_unorm(g, 63) << 5
2008 | to_unorm(b, 31) );
2009 store(ptr, px, tail);
2010}
2011
2012STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
2013 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2014 from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
2015}
2016STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2017 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2018 from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da);
2019}
2020STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
2021 const uint16_t* ptr;
2022 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2023 from_4444(gather(ptr, ix), &r,&g,&b,&a);
2024}
2025STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
2026 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2027 U16 px = pack( to_unorm(r, 15) << 12
2028 | to_unorm(g, 15) << 8
2029 | to_unorm(b, 15) << 4
2030 | to_unorm(a, 15) );
2031 store(ptr, px, tail);
2032}
2033
2034STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2035 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2036 from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
2037}
2038STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2039 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2040 from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2041}
2042STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
2043 const uint32_t* ptr;
2044 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2045 from_8888(gather(ptr, ix), &r,&g,&b,&a);
2046}
2047STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2048 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2049
2050 U32 px = to_unorm(r, 255)
2051 | to_unorm(g, 255) << 8
2052 | to_unorm(b, 255) << 16
2053 | to_unorm(a, 255) << 24;
2054 store(ptr, px, tail);
2055}
2056
2057STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
2058 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2059 from_88(load<U16>(ptr, tail), &r, &g);
2060 b = 0;
2061 a = 1;
2062}
2063STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2064 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2065 from_88(load<U16>(ptr, tail), &dr, &dg);
2066 db = 0;
2067 da = 1;
2068}
2069STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
2070 const uint16_t* ptr;
2071 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2072 from_88(gather(ptr, ix), &r, &g);
2073 b = 0;
2074 a = 1;
2075}
2076STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
2077 auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy);
2078 U16 px = pack( to_unorm(r, 255) | to_unorm(g, 255) << 8 );
2079 store(ptr, px, tail);
2080}
2081
2082STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2083 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2084 r = g = b = 0;
2085 a = from_short(load<U16>(ptr, tail));
2086}
2087STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2088 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2089 dr = dg = db = 0.0f;
2090 da = from_short(load<U16>(ptr, tail));
2091}
2092STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) {
2093 const uint16_t* ptr;
2094 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2095 r = g = b = 0.0f;
2096 a = from_short(gather(ptr, ix));
2097}
2098STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2099 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2100
2101 U16 px = pack(to_unorm(a, 65535));
2102 store(ptr, px, tail);
2103}
2104
2105STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2106 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2107 b = 0; a = 1;
2108 from_1616(load<U32>(ptr, tail), &r,&g);
2109}
2110STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2111 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2112 from_1616(load<U32>(ptr, tail), &dr, &dg);
2113 db = 0;
2114 da = 1;
2115}
2116STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) {
2117 const uint32_t* ptr;
2118 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2119 from_1616(gather(ptr, ix), &r, &g);
2120 b = 0;
2121 a = 1;
2122}
2123STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2124 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2125
2126 U32 px = to_unorm(r, 65535)
2127 | to_unorm(g, 65535) << 16;
2128 store(ptr, px, tail);
2129}
2130
2131STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2132 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2133 from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a);
2134}
2135STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2136 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2137 from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da);
2138}
2139STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) {
2140 const uint64_t* ptr;
2141 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2142 from_16161616(gather(ptr, ix), &r, &g, &b, &a);
2143}
2144STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2145 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
2146
2147 U16 R = pack(to_unorm(r, 65535)),
2148 G = pack(to_unorm(g, 65535)),
2149 B = pack(to_unorm(b, 65535)),
2150 A = pack(to_unorm(a, 65535));
2151
2152 store4(ptr,tail, R,G,B,A);
2153}
2154
2155
2156STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
2157 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2158 from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a);
2159}
2160STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2161 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2162 from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2163}
2164STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) {
2165 const uint32_t* ptr;
2166 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2167 from_1010102(gather(ptr, ix), &r,&g,&b,&a);
2168}
2169STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
2170 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2171
2172 U32 px = to_unorm(r, 1023)
2173 | to_unorm(g, 1023) << 10
2174 | to_unorm(b, 1023) << 20
2175 | to_unorm(a, 3) << 30;
2176 store(ptr, px, tail);
2177}
2178
2179STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) {
2180 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2181
2182 U16 R,G,B,A;
2183 load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2184 r = from_half(R);
2185 g = from_half(G);
2186 b = from_half(B);
2187 a = from_half(A);
2188}
2189STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2190 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2191
2192 U16 R,G,B,A;
2193 load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2194 dr = from_half(R);
2195 dg = from_half(G);
2196 db = from_half(B);
2197 da = from_half(A);
2198}
2199STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) {
2200 const uint64_t* ptr;
2201 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2202 auto px = gather(ptr, ix);
2203
2204 U16 R,G,B,A;
2205 load4((const uint16_t*)&px,0, &R,&G,&B,&A);
2206 r = from_half(R);
2207 g = from_half(G);
2208 b = from_half(B);
2209 a = from_half(A);
2210}
2211STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) {
2212 auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);
2213 store4((uint16_t*)ptr,tail, to_half(r)
2214 , to_half(g)
2215 , to_half(b)
2216 , to_half(a));
2217}
2218
2219STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) {
2220 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy);
2221
2222 U16 R = bswap(pack(to_unorm(r, 65535))),
2223 G = bswap(pack(to_unorm(g, 65535))),
2224 B = bswap(pack(to_unorm(b, 65535))),
2225 A = bswap(pack(to_unorm(a, 65535)));
2226
2227 store4(ptr,tail, R,G,B,A);
2228}
2229
2230STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) {
2231 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2232
2233 U16 A = load<U16>((const uint16_t*)ptr, tail);
2234 r = 0;
2235 g = 0;
2236 b = 0;
2237 a = from_half(A);
2238}
2239STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2240 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2241
2242 U16 A = load<U16>((const uint16_t*)ptr, tail);
2243 dr = dg = db = 0.0f;
2244 da = from_half(A);
2245}
2246STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) {
2247 const uint16_t* ptr;
2248 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2249 r = g = b = 0.0f;
2250 a = from_half(gather(ptr, ix));
2251}
2252STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) {
2253 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2254 store(ptr, to_half(a), tail);
2255}
2256
2257STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
2258 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2259
2260 U16 R,G;
2261 load2((const uint16_t*)ptr, tail, &R, &G);
2262 r = from_half(R);
2263 g = from_half(G);
2264 b = 0;
2265 a = 1;
2266}
2267STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2268 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2269
2270 U16 R,G;
2271 load2((const uint16_t*)ptr, tail, &R, &G);
2272 dr = from_half(R);
2273 dg = from_half(G);
2274 db = 0;
2275 da = 1;
2276}
2277STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) {
2278 const uint32_t* ptr;
2279 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2280 auto px = gather(ptr, ix);
2281
2282 U16 R,G;
2283 load2((const uint16_t*)&px, 0, &R, &G);
2284 r = from_half(R);
2285 g = from_half(G);
2286 b = 0;
2287 a = 1;
2288}
2289STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
2290 auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy);
2291 store2((uint16_t*)ptr, tail, to_half(r)
2292 , to_half(g));
2293}
2294
2295STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) {
2296 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
2297 load4(ptr,tail, &r,&g,&b,&a);
2298}
2299STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2300 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
2301 load4(ptr,tail, &dr,&dg,&db,&da);
2302}
2303STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) {
2304 const float* ptr;
2305 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2306 r = gather(ptr, 4*ix + 0);
2307 g = gather(ptr, 4*ix + 1);
2308 b = gather(ptr, 4*ix + 2);
2309 a = gather(ptr, 4*ix + 3);
2310}
2311STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) {
2312 auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy);
2313 store4(ptr,tail, r,g,b,a);
2314}
2315
2316STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
2317 auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy);
2318 load2(ptr, tail, &r, &g);
2319 b = 0;
2320 a = 1;
2321}
2322STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) {
2323 auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy);
2324 store2(ptr, tail, r, g);
2325}
2326
2327SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) {
2328 return v - floor_(v*ctx->invScale)*ctx->scale;
2329}
2330SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) {
2331 auto limit = ctx->scale;
2332 auto invLimit = ctx->invScale;
2333 return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit );
2334}
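// For example, with scale = 4 (invScale = 1/4):
//   exclusive_repeat: 5 -> 5 - floor(5/4)*4 = 1, and -1 -> -1 - floor(-1/4)*4 = 3;
//   exclusive_mirror: 5 -> |(5-4) - 8*floor((5-4)/8) - 4| = 3, and -1 -> |(-1-4) - 8*floor((-1-4)/8) - 4| = 1.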
2335// Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
2336// The gather stages will hard clamp the output of these stages to [0,limit)...
2337// we just need to do the basic repeat or mirroring.
2338STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); }
2339STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); }
2340STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); }
2341STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); }
2342
2343STAGE( clamp_x_1, Ctx::None) { r = clamp_01(r); }
2344STAGE(repeat_x_1, Ctx::None) { r = clamp_01(r - floor_(r)); }
2345STAGE(mirror_x_1, Ctx::None) { r = clamp_01(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); }
2346
2347// Decal stores a 32-bit mask after checking the coordinate (x and/or y) against its domain:
2348// mask == 0x00000000 if the coordinate(s) are out of bounds
2349// mask == 0xFFFFFFFF if the coordinate(s) are in bounds
2350// After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0
2351// if either of the coordinates were out of bounds.
2352
2353STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
2354 auto w = ctx->limit_x;
2355 sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w)));
2356}
2357STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
2358 auto h = ctx->limit_y;
2359 sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h)));
2360}
2361STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
2362 auto w = ctx->limit_x;
2363 auto h = ctx->limit_y;
2364 sk_unaligned_store(ctx->mask,
2365 cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h)));
2366}
2367STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
2368 auto mask = sk_unaligned_load<U32>(ctx->mask);
2369 r = bit_cast<F>( bit_cast<U32>(r) & mask );
2370 g = bit_cast<F>( bit_cast<U32>(g) & mask );
2371 b = bit_cast<F>( bit_cast<U32>(b) & mask );
2372 a = bit_cast<F>( bit_cast<U32>(a) & mask );
2373}
2374
2375STAGE(alpha_to_gray, Ctx::None) {
2376 r = g = b = a;
2377 a = 1;
2378}
2379STAGE(alpha_to_gray_dst, Ctx::None) {
2380 dr = dg = db = da;
2381 da = 1;
2382}
2383STAGE(bt709_luminance_or_luma_to_alpha, Ctx::None) {
2384 a = r*0.2126f + g*0.7152f + b*0.0722f;
2385 r = g = b = 0;
2386}
2387
2388STAGE(matrix_translate, const float* m) {
2389 r += m[0];
2390 g += m[1];
2391}
2392STAGE(matrix_scale_translate, const float* m) {
2393 r = mad(r,m[0], m[2]);
2394 g = mad(g,m[1], m[3]);
2395}
2396STAGE(matrix_2x3, const float* m) {
2397 auto R = mad(r,m[0], mad(g,m[2], m[4])),
2398 G = mad(r,m[1], mad(g,m[3], m[5]));
2399 r = R;
2400 g = G;
2401}
2402STAGE(matrix_3x3, const float* m) {
2403 auto R = mad(r,m[0], mad(g,m[3], b*m[6])),
2404 G = mad(r,m[1], mad(g,m[4], b*m[7])),
2405 B = mad(r,m[2], mad(g,m[5], b*m[8]));
2406 r = R;
2407 g = G;
2408 b = B;
2409}
2410STAGE(matrix_3x4, const float* m) {
2411 auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
2412 G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
2413 B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
2414 r = R;
2415 g = G;
2416 b = B;
2417}
2418STAGE(matrix_4x5, const float* m) {
2419 auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))),
2420 G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))),
2421 B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))),
2422 A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19]))));
2423 r = R;
2424 g = G;
2425 b = B;
2426 a = A;
2427}
2428STAGE(matrix_4x3, const float* m) {
2429 auto X = r,
2430 Y = g;
2431
2432 r = mad(X, m[0], mad(Y, m[4], m[ 8]));
2433 g = mad(X, m[1], mad(Y, m[5], m[ 9]));
2434 b = mad(X, m[2], mad(Y, m[6], m[10]));
2435 a = mad(X, m[3], mad(Y, m[7], m[11]));
2436}
2437STAGE(matrix_perspective, const float* m) {
2438 // N.B. Unlike the other matrix_ stages, this matrix is row-major.
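    // It maps (x,y) to ( (m[0]*x + m[1]*y + m[2]) / (m[6]*x + m[7]*y + m[8]),
    //                    (m[3]*x + m[4]*y + m[5]) / (m[6]*x + m[7]*y + m[8]) ),
    // with rcp() standing in for the divide.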
2439 auto R = mad(r,m[0], mad(g,m[1], m[2])),
2440 G = mad(r,m[3], mad(g,m[4], m[5])),
2441 Z = mad(r,m[6], mad(g,m[7], m[8]));
2442 r = R * rcp(Z);
2443 g = G * rcp(Z);
2444}
2445
2446SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
2447 F* r, F* g, F* b, F* a) {
2448 F fr, br, fg, bg, fb, bb, fa, ba;
2449#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
2450 if (c->stopCount <=8) {
2451 fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx);
2452 br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx);
2453 fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx);
2454 bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx);
2455 fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx);
2456 bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx);
2457 fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx);
2458 ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx);
2459 } else
2460#endif
2461 {
2462 fr = gather(c->fs[0], idx);
2463 br = gather(c->bs[0], idx);
2464 fg = gather(c->fs[1], idx);
2465 bg = gather(c->bs[1], idx);
2466 fb = gather(c->fs[2], idx);
2467 bb = gather(c->bs[2], idx);
2468 fa = gather(c->fs[3], idx);
2469 ba = gather(c->bs[3], idx);
2470 }
2471
2472 *r = mad(t, fr, br);
2473 *g = mad(t, fg, bg);
2474 *b = mad(t, fb, bb);
2475 *a = mad(t, fa, ba);
2476}
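// Gradient stops are stored per channel as (f,b) pairs chosen so that the color on stop
// interval idx is just f*t + b; gradient_lookup() gathers the right pair for each lane and
// evaluates that line with mad(). When stopCount <= 8 on AVX2/AVX-512, each channel's table
// fits in one 256-bit register, so vector permutes replace the gathers.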
2477
2478STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
2479 auto t = r;
2480 auto idx = trunc_(t * (c->stopCount-1));
2481 gradient_lookup(c, idx, t, &r, &g, &b, &a);
2482}
2483
2484STAGE(gradient, const SkRasterPipeline_GradientCtx* c) {
2485 auto t = r;
2486 U32 idx = 0;
2487
2488 // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
2489 for (size_t i = 1; i < c->stopCount; i++) {
2490 idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
2491 }
2492
2493 gradient_lookup(c, idx, t, &r, &g, &b, &a);
2494}
2495
2496STAGE(evenly_spaced_2_stop_gradient, const void* ctx) {
2497    // TODO: Rename Ctx to SkRasterPipeline_EvenlySpaced2StopGradientCtx.
2498 struct Ctx { float f[4], b[4]; };
2499 auto c = (const Ctx*)ctx;
2500
2501 auto t = r;
2502 r = mad(t, c->f[0], c->b[0]);
2503 g = mad(t, c->f[1], c->b[1]);
2504 b = mad(t, c->f[2], c->b[2]);
2505 a = mad(t, c->f[3], c->b[3]);
2506}
2507
2508STAGE(xy_to_unit_angle, Ctx::None) {
2509 F X = r,
2510 Y = g;
2511 F xabs = abs_(X),
2512 yabs = abs_(Y);
2513
2514 F slope = min(xabs, yabs)/max(xabs, yabs);
2515 F s = slope * slope;
2516
2517 // Use a 7th degree polynomial to approximate atan.
2518 // This was generated using sollya.gforge.inria.fr.
2519 // A float optimized polynomial was generated using the following command.
2520 // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
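    // Sanity check: at a 45-degree input the slope is 1, the polynomial sums to ~0.125
    // (atan(1)/(2*pi)), and none of the reflections below fire, so r comes out at 1/8 of a turn.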
2521 F phi = slope
2522 * (0.15912117063999176025390625f + s
2523 * (-5.185396969318389892578125e-2f + s
2524 * (2.476101927459239959716796875e-2f + s
2525 * (-7.0547382347285747528076171875e-3f))));
2526
2527 phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
2528 phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi);
2529 phi = if_then_else(Y < 0.0f , 1.0f - phi , phi);
2530 phi = if_then_else(phi != phi , 0 , phi); // Check for NaN.
2531 r = phi;
2532}
2533
2534STAGE(xy_to_radius, Ctx::None) {
2535 F X2 = r * r,
2536 Y2 = g * g;
2537 r = sqrt_(X2 + Y2);
2538}
2539
2540// Please see https://skia.org/dev/design/conical for how our 2pt conical shader works.
2541
2542STAGE(negate_x, Ctx::None) { r = -r; }
2543
2544STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) {
2545 F x = r, y = g, &t = r;
2546 t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0
2547}
2548
2549STAGE(xy_to_2pt_conical_focal_on_circle, Ctx::None) {
2550 F x = r, y = g, &t = r;
2551 t = x + y*y / x; // (x^2 + y^2) / x
2552}
2553
2554STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) {
2555 F x = r, y = g, &t = r;
2556 t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
2557}
2558
2559STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) {
2560 F x = r, y = g, &t = r;
2561 t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
2562}
2563
2564STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) {
2565 F x = r, y = g, &t = r;
2566 t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
2567}
2568
2569STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) {
2570 F& t = r;
2571 t = t + ctx->fP1; // ctx->fP1 = f
2572}
2573
2574STAGE(alter_2pt_conical_unswap, Ctx::None) {
2575 F& t = r;
2576 t = 1 - t;
2577}
2578
2579STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) {
2580 F& t = r;
2581 auto is_degenerate = (t != t); // NaN
2582 t = if_then_else(is_degenerate, F(0), t);
2583 sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
2584}
2585
2586STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) {
2587 F& t = r;
2588 auto is_degenerate = (t <= 0) | (t != t);
2589 t = if_then_else(is_degenerate, F(0), t);
2590 sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
2591}
2592
2593STAGE(apply_vector_mask, const uint32_t* ctx) {
2594 const U32 mask = sk_unaligned_load<U32>(ctx);
2595 r = bit_cast<F>(bit_cast<U32>(r) & mask);
2596 g = bit_cast<F>(bit_cast<U32>(g) & mask);
2597 b = bit_cast<F>(bit_cast<U32>(b) & mask);
2598 a = bit_cast<F>(bit_cast<U32>(a) & mask);
2599}
2600
2601STAGE(save_xy, SkRasterPipeline_SamplerCtx* c) {
2602 // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
2603 // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
2604 // surrounding (x,y) at (0.5,0.5) off-center.
2605 F fx = fract(r + 0.5f),
2606 fy = fract(g + 0.5f);
2607
2608 // Samplers will need to load x and fx, or y and fy.
2609 sk_unaligned_store(c->x, r);
2610 sk_unaligned_store(c->y, g);
2611 sk_unaligned_store(c->fx, fx);
2612 sk_unaligned_store(c->fy, fy);
2613}
2614
2615STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) {
2616 // Bilinear and bicubic filters are both separable, so we produce independent contributions
2617 // from x and y, multiplying them together here to get each pixel's total scale factor.
2618 auto scale = sk_unaligned_load<F>(c->scalex)
2619 * sk_unaligned_load<F>(c->scaley);
2620 dr = mad(scale, r, dr);
2621 dg = mad(scale, g, dg);
2622 db = mad(scale, b, db);
2623 da = mad(scale, a, da);
2624}
2625
2626// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
2627// are combined in direct proportion to their area overlapping that logical query pixel.
2628// At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
2629// The y-axis is symmetric.
2630
2631template <int kScale>
2632SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
2633 *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
2634 F fx = sk_unaligned_load<F>(ctx->fx);
2635
2636 F scalex;
2637 if (kScale == -1) { scalex = 1.0f - fx; }
2638 if (kScale == +1) { scalex = fx; }
2639 sk_unaligned_store(ctx->scalex, scalex);
2640}
2641template <int kScale>
2642SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
2643 *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
2644 F fy = sk_unaligned_load<F>(ctx->fy);
2645
2646 F scaley;
2647 if (kScale == -1) { scaley = 1.0f - fy; }
2648 if (kScale == +1) { scaley = fy; }
2649 sk_unaligned_store(ctx->scaley, scaley);
2650}
2651
2652STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); }
2653STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); }
2654STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); }
2655STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); }
2656
2657
2658// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
2659// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
2660//
2661// We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.
2662// See GrCubicEffect for details of this particular filter.
2663
2664SI F bicubic_near(F t) {
2665 // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
2666 return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f));
2667}
2668SI F bicubic_far(F t) {
2669 // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
2670 return (t*t)*mad((7/18.0f), t, (-6/18.0f));
2671}
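// Note that bicubic_near(t) + bicubic_far(t) + bicubic_near(1-t) + bicubic_far(1-t) == 1 for
// any t, so the four taps along each axis always sum to one and constant colors pass through
// the filter unchanged.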
2672
2673template <int kScale>
2674SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
2675 *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
2676 F fx = sk_unaligned_load<F>(ctx->fx);
2677
2678 F scalex;
2679 if (kScale == -3) { scalex = bicubic_far (1.0f - fx); }
2680 if (kScale == -1) { scalex = bicubic_near(1.0f - fx); }
2681 if (kScale == +1) { scalex = bicubic_near( fx); }
2682 if (kScale == +3) { scalex = bicubic_far ( fx); }
2683 sk_unaligned_store(ctx->scalex, scalex);
2684}
2685template <int kScale>
2686SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
2687 *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
2688 F fy = sk_unaligned_load<F>(ctx->fy);
2689
2690 F scaley;
2691 if (kScale == -3) { scaley = bicubic_far (1.0f - fy); }
2692 if (kScale == -1) { scaley = bicubic_near(1.0f - fy); }
2693 if (kScale == +1) { scaley = bicubic_near( fy); }
2694 if (kScale == +3) { scaley = bicubic_far ( fy); }
2695 sk_unaligned_store(ctx->scaley, scaley);
2696}
2697
2698STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); }
2699STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); }
2700STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); }
2701STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); }
2702
2703STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); }
2704STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); }
2705STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); }
2706STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); }
2707
2708STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
2709 store4(c->rgba,0, r,g,b,a);
2710 c->fn(c, tail ? tail : N);
2711 load4(c->read_from,0, &r,&g,&b,&a);
2712}
2713
2714// shader: void main(float2 p, inout half4 color)
2715// colorfilter: void main(inout half4 color)
2716STAGE(interpreter, SkRasterPipeline_InterpreterCtx* c) {
2717 // If N is less than the interpreter's VecWidth, then we are doing more work than necessary in
2718 // the interpreter. This is a known issue, and will be addressed at some point.
2719 float xx[N], yy[N],
2720 rr[N], gg[N], bb[N], aa[N];
2721
2722 float* args[] = { xx, yy, rr, gg, bb, aa };
2723 float** in_args = args;
2724 int in_count = 6;
2725
2726 if (c->shaderConvention) {
2727 // our caller must have called seed_shader to set these
2728 sk_unaligned_store(xx, r);
2729 sk_unaligned_store(yy, g);
2730 sk_unaligned_store(rr, F(c->paintColor.fR));
2731 sk_unaligned_store(gg, F(c->paintColor.fG));
2732 sk_unaligned_store(bb, F(c->paintColor.fB));
2733 sk_unaligned_store(aa, F(c->paintColor.fA));
2734 } else {
2735 in_args += 2; // skip x,y
2736 in_count = 4;
2737 sk_unaligned_store(rr, r);
2738 sk_unaligned_store(gg, g);
2739 sk_unaligned_store(bb, b);
2740 sk_unaligned_store(aa, a);
2741 }
2742
2743 SkAssertResult(c->byteCode->runStriped(c->fn, tail ? tail : N, in_args, in_count,
2744 nullptr, 0, (const float*)c->inputs, c->ninputs));
2745
2746 r = sk_unaligned_load<F>(rr);
2747 g = sk_unaligned_load<F>(gg);
2748 b = sk_unaligned_load<F>(bb);
2749 a = sk_unaligned_load<F>(aa);
2750}
2751
2752STAGE(gauss_a_to_rgba, Ctx::None) {
2753    // With x = 1 - a, the target curve is exp(-x * x * 4) - 0.018f;
2754    // approximate it with a quartic in a:
2755    //
2757 const float c4 = -2.26661229133605957031f;
2758 const float c3 = 2.89795351028442382812f;
2759 const float c2 = 0.21345567703247070312f;
2760 const float c1 = 0.15489584207534790039f;
2761 const float c0 = 0.00030726194381713867f;
2762 a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0);
2763 r = a;
2764 g = a;
2765 b = a;
2766}
2767
2768SI F tile(F v, SkTileMode mode, float limit, float invLimit) {
2769 // The ix_and_ptr() calls in sample() will clamp tile()'s output, so no need to clamp here.
2770 switch (mode) {
2771 case SkTileMode::kDecal: // TODO, for now fallthrough to clamp
2772 case SkTileMode::kClamp: return v;
2773 case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit;
2774 case SkTileMode::kMirror:
2775 return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit );
2776 }
2777 SkUNREACHABLE;
2778}
2779
2780SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y,
2781 F* r, F* g, F* b, F* a) {
2782 x = tile(x, ctx->tileX, ctx->width , ctx->invWidth );
2783 y = tile(y, ctx->tileY, ctx->height, ctx->invHeight);
2784
2785 switch (ctx->ct) {
2786 default: *r = *g = *b = *a = 0; // TODO
2787 break;
2788
2789 case kRGBA_8888_SkColorType:
2790 case kBGRA_8888_SkColorType: {
2791 const uint32_t* ptr;
2792 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
2793 from_8888(gather(ptr, ix), r,g,b,a);
2794 if (ctx->ct == kBGRA_8888_SkColorType) {
2795 std::swap(*r,*b);
2796 }
2797 } break;
2798 }
2799}
2800
2801template <int D>
2802SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx,
2803 F cx, F cy, const F (&wx)[D], const F (&wy)[D],
2804 F* r, F* g, F* b, F* a) {
2805
2806 float start = -0.5f*(D-1);
2807
2808 *r = *g = *b = *a = 0;
2809 F y = cy + start;
2810 for (int j = 0; j < D; j++, y += 1.0f) {
2811 F x = cx + start;
2812 for (int i = 0; i < D; i++, x += 1.0f) {
2813 F R,G,B,A;
2814 sample(ctx, x,y, &R,&G,&B,&A);
2815
2816 F w = wx[i] * wy[j];
2817 *r = mad(w,R,*r);
2818 *g = mad(w,G,*g);
2819 *b = mad(w,B,*b);
2820 *a = mad(w,A,*a);
2821 }
2822 }
2823}
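// sampler() above applies a separable DxD filter: it walks the DxD neighborhood starting at
// offset -(D-1)/2 from the center, weights each sample by wx[i]*wy[j], and accumulates into
// r,g,b,a. The bilinear and bicubic stages below feed it D == 2 and D == 4 weights.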
2824
2825STAGE(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) {
2826 F x = r, fx = fract(x + 0.5f),
2827 y = g, fy = fract(y + 0.5f);
2828 const F wx[] = {1.0f - fx, fx};
2829 const F wy[] = {1.0f - fy, fy};
2830
2831 sampler(ctx, x,y, wx,wy, &r,&g,&b,&a);
2832}
2833STAGE(bicubic, SkRasterPipeline_SamplerCtx2* ctx) {
2834 F x = r, fx = fract(x + 0.5f),
2835 y = g, fy = fract(y + 0.5f);
2836 const F wx[] = { bicubic_far(1-fx), bicubic_near(1-fx), bicubic_near(fx), bicubic_far(fx) };
2837 const F wy[] = { bicubic_far(1-fy), bicubic_near(1-fy), bicubic_near(fy), bicubic_far(fy) };
2838
2839 sampler(ctx, x,y, wx,wy, &r,&g,&b,&a);
2840}
2841
2842// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
2843STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
2844 // (cx,cy) are the center of our sample.
2845 F cx = r,
2846 cy = g;
2847
2848 // All sample points are at the same fractional offset (fx,fy).
2849 // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
2850 F fx = fract(cx + 0.5f),
2851 fy = fract(cy + 0.5f);
2852
2853 // We'll accumulate the color of all four samples into {r,g,b,a} directly.
2854 r = g = b = a = 0;
2855
2856 for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f)
2857 for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) {
2858 // (x,y) are the coordinates of this sample point.
2859 F x = cx + dx,
2860 y = cy + dy;
2861
2862 // ix_and_ptr() will clamp to the image's bounds for us.
2863 const uint32_t* ptr;
2864 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
2865
2866 F sr,sg,sb,sa;
2867 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
2868
2869 // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
2870 // are combined in direct proportion to their area overlapping that logical query pixel.
2871 // At positive offsets, the x-axis contribution to that rectangle is fx,
2872 // or (1-fx) at negative x. Same deal for y.
2873 F sx = (dx > 0) ? fx : 1.0f - fx,
2874 sy = (dy > 0) ? fy : 1.0f - fy,
2875 area = sx * sy;
2876
2877 r += sr * area;
2878 g += sg * area;
2879 b += sb * area;
2880 a += sa * area;
2881 }
2882}
2883
2884// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
2885STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
2886 // (cx,cy) are the center of our sample.
2887 F cx = r,
2888 cy = g;
2889
2890 // All sample points are at the same fractional offset (fx,fy).
2891 // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
2892 F fx = fract(cx + 0.5f),
2893 fy = fract(cy + 0.5f);
2894
2895 // We'll accumulate the color of all four samples into {r,g,b,a} directly.
2896 r = g = b = a = 0;
2897
2898 const F scaley[4] = {
2899 bicubic_far (1.0f - fy), bicubic_near(1.0f - fy),
2900 bicubic_near( fy), bicubic_far ( fy),
2901 };
2902 const F scalex[4] = {
2903 bicubic_far (1.0f - fx), bicubic_near(1.0f - fx),
2904 bicubic_near( fx), bicubic_far ( fx),
2905 };
2906
2907 F sample_y = cy - 1.5f;
2908 for (int yy = 0; yy <= 3; ++yy) {
2909 F sample_x = cx - 1.5f;
2910 for (int xx = 0; xx <= 3; ++xx) {
2911 F scale = scalex[xx] * scaley[yy];
2912
2913 // ix_and_ptr() will clamp to the image's bounds for us.
2914 const uint32_t* ptr;
2915 U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y);
2916
2917 F sr,sg,sb,sa;
2918 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
2919
2920 r = mad(scale, sr, r);
2921 g = mad(scale, sg, g);
2922 b = mad(scale, sb, b);
2923 a = mad(scale, sa, a);
2924
2925 sample_x += 1;
2926 }
2927 sample_y += 1;
2928 }
2929}
2930
2931// ~~~~~~ GrSwizzle stage ~~~~~~ //
2932
2933STAGE(swizzle, void* ctx) {
2934 auto ir = r, ig = g, ib = b, ia = a;
2935 F* o[] = {&r, &g, &b, &a};
2936 char swiz[4];
2937 memcpy(swiz, &ctx, sizeof(swiz));
2938
2939 for (int i = 0; i < 4; ++i) {
2940 switch (swiz[i]) {
2941 case 'r': *o[i] = ir; break;
2942 case 'g': *o[i] = ig; break;
2943 case 'b': *o[i] = ib; break;
2944 case 'a': *o[i] = ia; break;
2945 case '0': *o[i] = F(0); break;
2946 case '1': *o[i] = F(1); break;
2947 default: break;
2948 }
2949 }
2950}
2951
2952namespace lowp {
2953#if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
2954 // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually),
2955 // we don't generate lowp stages. All these nullptrs will tell SkJumper.cpp to always use the
2956 // highp float pipeline.
2957 #define M(st) static void (*st)(void) = nullptr;
2958 SK_RASTER_PIPELINE_STAGES(M)
2959 #undef M
2960 static void (*just_return)(void) = nullptr;
2961
2962 static void start_pipeline(size_t,size_t,size_t,size_t, void**) {}
2963
2964#else // We are compiling vector code with Clang... let's make some lowp stages!
2965
2966#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
2967 using U8 = uint8_t __attribute__((ext_vector_type(16)));
2968 using U16 = uint16_t __attribute__((ext_vector_type(16)));
2969 using I16 = int16_t __attribute__((ext_vector_type(16)));
2970 using I32 = int32_t __attribute__((ext_vector_type(16)));
2971 using U32 = uint32_t __attribute__((ext_vector_type(16)));
2972 using F = float __attribute__((ext_vector_type(16)));
2973#else
2974 using U8 = uint8_t __attribute__((ext_vector_type(8)));
2975 using U16 = uint16_t __attribute__((ext_vector_type(8)));
2976 using I16 = int16_t __attribute__((ext_vector_type(8)));
2977 using I32 = int32_t __attribute__((ext_vector_type(8)));
2978 using U32 = uint32_t __attribute__((ext_vector_type(8)));
2979 using F = float __attribute__((ext_vector_type(8)));
2980#endif
2981
2982static const size_t N = sizeof(U16) / sizeof(uint16_t);
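// Each lowp stage works on N pixels at a time (8, or 16 when the AVX2/AVX-512 vector types
// above are in use), carrying each color channel as a 16-bit value in which 255 means 1.0.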
2983
2984// Once again, some platforms benefit from a restricted Stage calling convention,
2985// but others can pass tons and tons of registers and we're happy to exploit that.
2986// It's exactly the same decision and implementation strategy as the F stages above.
2987#if JUMPER_NARROW_STAGES
2988 struct Params {
2989 size_t dx, dy, tail;
2990 U16 dr,dg,db,da;
2991 };
2992 using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a);
2993#else
2994 // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
2995 using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
2996 U16 r, U16 g, U16 b, U16 a,
2997 U16 dr, U16 dg, U16 db, U16 da);
2998#endif
2999
3000static void start_pipeline(const size_t x0, const size_t y0,
3001 const size_t xlimit, const size_t ylimit, void** program) {
3002 auto start = (Stage)load_and_inc(program);
3003 for (size_t dy = y0; dy < ylimit; dy++) {
3004 #if JUMPER_NARROW_STAGES
3005 Params params = { x0,dy,0, 0,0,0,0 };
3006 for (; params.dx + N <= xlimit; params.dx += N) {
3007 start(&params,program, 0,0,0,0);
3008 }
3009 if (size_t tail = xlimit - params.dx) {
3010 params.tail = tail;
3011 start(&params,program, 0,0,0,0);
3012 }
3013 #else
3014 size_t dx = x0;
3015 for (; dx + N <= xlimit; dx += N) {
3016 start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0);
3017 }
3018 if (size_t tail = xlimit - dx) {
3019 start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
3020 }
3021 #endif
3022 }
3023}
3024
3025#if JUMPER_NARROW_STAGES
3026 static void ABI just_return(Params*, void**, U16,U16,U16,U16) {}
3027#else
3028 static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
3029#endif
3030
3031// All stages use the same function call ABI to chain into each other, but there are three types:
3032// GG: geometry in, geometry out -- think, a matrix
3033// GP: geometry in, pixels out. -- think, a memory gather
3034// PP: pixels in, pixels out. -- think, a blend mode
3035//
3036// (Some stages ignore their inputs or produce no logical output. That's perfectly fine.)
3037//
3038// These three STAGE_ macros let you define each type of stage,
3039// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
3040
3041#if JUMPER_NARROW_STAGES
3042 #define STAGE_GG(name, ...) \
3043 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
3044 static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
3045 auto x = join<F>(r,g), \
3046 y = join<F>(b,a); \
3047 name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \
3048 split(x, &r,&g); \
3049 split(y, &b,&a); \
3050 auto next = (Stage)load_and_inc(program); \
3051 next(params,program, r,g,b,a); \
3052 } \
3053 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
3054
3055 #define STAGE_GP(name, ...) \
3056 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
3057 U16& r, U16& g, U16& b, U16& a, \
3058 U16& dr, U16& dg, U16& db, U16& da); \
3059 static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
3060 auto x = join<F>(r,g), \
3061 y = join<F>(b,a); \
3062 name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \
3063 params->dr,params->dg,params->db,params->da); \
3064 auto next = (Stage)load_and_inc(program); \
3065 next(params,program, r,g,b,a); \
3066 } \
3067 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
3068 U16& r, U16& g, U16& b, U16& a, \
3069 U16& dr, U16& dg, U16& db, U16& da)
3070
3071 #define STAGE_PP(name, ...) \
3072 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
3073 U16& r, U16& g, U16& b, U16& a, \
3074 U16& dr, U16& dg, U16& db, U16& da); \
3075 static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
3076 name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \
3077 params->dr,params->dg,params->db,params->da); \
3078 auto next = (Stage)load_and_inc(program); \
3079 next(params,program, r,g,b,a); \
3080 } \
3081 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
3082 U16& r, U16& g, U16& b, U16& a, \
3083 U16& dr, U16& dg, U16& db, U16& da)
3084#else
3085 #define STAGE_GG(name, ...) \
3086 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
3087 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
3088 U16 r, U16 g, U16 b, U16 a, \
3089 U16 dr, U16 dg, U16 db, U16 da) { \
3090 auto x = join<F>(r,g), \
3091 y = join<F>(b,a); \
3092 name##_k(Ctx{program}, dx,dy,tail, x,y); \
3093 split(x, &r,&g); \
3094 split(y, &b,&a); \
3095 auto next = (Stage)load_and_inc(program); \
3096 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
3097 } \
3098 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
3099
3100 #define STAGE_GP(name, ...) \
3101 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
3102 U16& r, U16& g, U16& b, U16& a, \
3103 U16& dr, U16& dg, U16& db, U16& da); \
3104 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
3105 U16 r, U16 g, U16 b, U16 a, \
3106 U16 dr, U16 dg, U16 db, U16 da) { \
3107 auto x = join<F>(r,g), \
3108 y = join<F>(b,a); \
3109 name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
3110 auto next = (Stage)load_and_inc(program); \
3111 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
3112 } \
3113 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
3114 U16& r, U16& g, U16& b, U16& a, \
3115 U16& dr, U16& dg, U16& db, U16& da)
3116
3117 #define STAGE_PP(name, ...) \
3118 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
3119 U16& r, U16& g, U16& b, U16& a, \
3120 U16& dr, U16& dg, U16& db, U16& da); \
3121 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
3122 U16 r, U16 g, U16 b, U16 a, \
3123 U16 dr, U16 dg, U16 db, U16 da) { \
3124 name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
3125 auto next = (Stage)load_and_inc(program); \
3126 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
3127 } \
3128 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
3129 U16& r, U16& g, U16& b, U16& a, \
3130 U16& dr, U16& dg, U16& db, U16& da)
3131#endif
3132
3133// ~~~~~~ Commonly used helper functions ~~~~~~ //
3134
3135SI U16 div255(U16 v) {
3136#if 0
3137 return (v+127)/255; // The ideal rounding divide by 255.
3138#elif 1 && defined(JUMPER_IS_NEON)
3139 // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8
3140 // just as fast as we can do the approximation below, so might as well be correct!
3141 // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up.
3142 return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
3143#else
3144 return (v+255)/256; // A good approximation of (v+127)/255.
3145#endif
3146}
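// The (v+255)/256 approximation stays within one of the exact rounding divide, though it can be
// off by one in either direction (e.g. v == 1 yields 1 instead of 0). The NEON expression above
// matches (v+127)/255 exactly for every product of two 8-bit values.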
3147
3148SI U16 inv(U16 v) { return 255-v; }
3149
3150SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
3151SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }
3152
3153SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
3154SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
3155
3156SI U16 from_float(float f) { return f * 255.0f + 0.5f; }
3157
3158SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
3159
3160template <typename D, typename S>
3161SI D cast(S src) {
3162 return __builtin_convertvector(src, D);
3163}
3164
3165template <typename D, typename S>
3166SI void split(S v, D* lo, D* hi) {
3167 static_assert(2*sizeof(D) == sizeof(S), "");
3168 memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
3169 memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
3170}
3171template <typename D, typename S>
3172SI D join(S lo, S hi) {
3173 static_assert(sizeof(D) == 2*sizeof(S), "");
3174 D v;
3175 memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
3176 memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
3177 return v;
3178}
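// split()/join() reinterpret one full-width register as two half-width ones and back. The GG
// stages use them to smuggle the float x,y coordinates through the four U16 argument slots:
// join<F>(r,g) reassembles x and join<F>(b,a) reassembles y.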
3179
3180SI F if_then_else(I32 c, F t, F e) {
3181 return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
3182}
3183SI F max(F x, F y) { return if_then_else(x < y, y, x); }
3184SI F min(F x, F y) { return if_then_else(x < y, x, y); }
3185
3186SI F mad(F f, F m, F a) { return f*m+a; }
3187SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
3188
3189SI F rcp(F x) {
3190#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3191 __m256 lo,hi;
3192 split(x, &lo,&hi);
3193 return join<F>(_mm256_rcp_ps(lo), _mm256_rcp_ps(hi));
3194#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
3195 __m128 lo,hi;
3196 split(x, &lo,&hi);
3197 return join<F>(_mm_rcp_ps(lo), _mm_rcp_ps(hi));
3198#elif defined(JUMPER_IS_NEON)
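    // vrecpsq_f32(v, est) computes (2 - v*est), so multiplying by est performs one
    // Newton-Raphson refinement of the hardware estimate.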
3199 auto rcp = [](float32x4_t v) {
3200 auto est = vrecpeq_f32(v);
3201 return vrecpsq_f32(v,est)*est;
3202 };
3203 float32x4_t lo,hi;
3204 split(x, &lo,&hi);
3205 return join<F>(rcp(lo), rcp(hi));
3206#else
3207 return 1.0f / x;
3208#endif
3209}
3210SI F sqrt_(F x) {
3211#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3212 __m256 lo,hi;
3213 split(x, &lo,&hi);
3214 return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
3215#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
3216 __m128 lo,hi;
3217 split(x, &lo,&hi);
3218 return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi));
3219#elif defined(SK_CPU_ARM64)
3220 float32x4_t lo,hi;
3221 split(x, &lo,&hi);
3222 return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
3223#elif defined(JUMPER_IS_NEON)
3224 auto sqrt = [](float32x4_t v) {
3225 auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
3226 est *= vrsqrtsq_f32(v,est*est);
3227 est *= vrsqrtsq_f32(v,est*est);
3228 return v*est; // sqrt(v) == v*rsqrt(v).
3229 };
3230 float32x4_t lo,hi;
3231 split(x, &lo,&hi);
3232 return join<F>(sqrt(lo), sqrt(hi));
3233#else
3234 return F{
3235 sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
3236 sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
3237 };
3238#endif
3239}
3240
3241SI F floor_(F x) {
3242#if defined(SK_CPU_ARM64)
3243 float32x4_t lo,hi;
3244 split(x, &lo,&hi);
3245 return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
3246#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3247 __m256 lo,hi;
3248 split(x, &lo,&hi);
3249 return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi));
3250#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
3251 __m128 lo,hi;
3252 split(x, &lo,&hi);
3253 return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi));
3254#else
3255 F roundtrip = cast<F>(cast<I32>(x));
3256 return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
3257#endif
3258}
3259SI F fract(F x) { return x - floor_(x); }
3260SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }
3261
3262// ~~~~~~ Basic / misc. stages ~~~~~~ //
3263
3264STAGE_GG(seed_shader, Ctx::None) {
3265 static const float iota[] = {
3266 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
3267 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
3268 };
3269 x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota);
3270 y = cast<F>(I32(dy)) + 0.5f;
3271}
3272
3273STAGE_GG(matrix_translate, const float* m) {
3274 x += m[0];
3275 y += m[1];
3276}
3277STAGE_GG(matrix_scale_translate, const float* m) {
3278 x = mad(x,m[0], m[2]);
3279 y = mad(y,m[1], m[3]);
3280}
3281STAGE_GG(matrix_2x3, const float* m) {
3282 auto X = mad(x,m[0], mad(y,m[2], m[4])),
3283 Y = mad(x,m[1], mad(y,m[3], m[5]));
3284 x = X;
3285 y = Y;
3286}
3287STAGE_GG(matrix_perspective, const float* m) {
3288 // N.B. Unlike the other matrix_ stages, this matrix is row-major.
3289 auto X = mad(x,m[0], mad(y,m[1], m[2])),
3290 Y = mad(x,m[3], mad(y,m[4], m[5])),
3291 Z = mad(x,m[6], mad(y,m[7], m[8]));
3292 x = X * rcp(Z);
3293 y = Y * rcp(Z);
3294}
3295
3296STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
3297 r = c->rgba[0];
3298 g = c->rgba[1];
3299 b = c->rgba[2];
3300 a = c->rgba[3];
3301}
3302STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
3303 dr = c->rgba[0];
3304 dg = c->rgba[1];
3305 db = c->rgba[2];
3306 da = c->rgba[3];
3307}
3308STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; }
3309STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }
3310
3311STAGE_PP(set_rgb, const float rgb[3]) {
3312 r = from_float(rgb[0]);
3313 g = from_float(rgb[1]);
3314 b = from_float(rgb[2]);
3315}
3316
3317STAGE_PP(clamp_0, Ctx::None) { /*definitely a noop*/ }
3318STAGE_PP(clamp_1, Ctx::None) { /*_should_ be a noop*/ }
3319
3320STAGE_PP(clamp_a, Ctx::None) {
3321 r = min(r, a);
3322 g = min(g, a);
3323 b = min(b, a);
3324}
3325
3326STAGE_PP(clamp_gamut, Ctx::None) {
3327 // It shouldn't be possible to get out-of-gamut
3328 // colors when working in lowp.
3329}
3330
3331STAGE_PP(premul, Ctx::None) {
3332 r = div255(r * a);
3333 g = div255(g * a);
3334 b = div255(b * a);
3335}
3336STAGE_PP(premul_dst, Ctx::None) {
3337 dr = div255(dr * da);
3338 dg = div255(dg * da);
3339 db = div255(db * da);
3340}
3341
3342STAGE_PP(force_opaque , Ctx::None) { a = 255; }
3343STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; }
3344
3345STAGE_PP(swap_rb, Ctx::None) {
3346 auto tmp = r;
3347 r = b;
3348 b = tmp;
3349}
3350STAGE_PP(swap_rb_dst, Ctx::None) {
3351 auto tmp = dr;
3352 dr = db;
3353 db = tmp;
3354}
3355
3356STAGE_PP(move_src_dst, Ctx::None) {
3357 dr = r;
3358 dg = g;
3359 db = b;
3360 da = a;
3361}
3362
3363STAGE_PP(move_dst_src, Ctx::None) {
3364 r = dr;
3365 g = dg;
3366 b = db;
3367 a = da;
3368}
3369
3370// ~~~~~~ Blend modes ~~~~~~ //
3371
3372// The same logic applied to all 4 channels.
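// Here inv(x) is 255-x and div255(v) is a rounded v/255, so e.g. srcover below evaluates
// the usual Porter-Duff formula s + d*(255-sa)/255 per channel in 8-bit fixed point.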
3373#define BLEND_MODE(name) \
3374 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
3375 STAGE_PP(name, Ctx::None) { \
3376 r = name##_channel(r,dr,a,da); \
3377 g = name##_channel(g,dg,a,da); \
3378 b = name##_channel(b,db,a,da); \
3379 a = name##_channel(a,da,a,da); \
3380 } \
3381 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
3382
3383 BLEND_MODE(clear) { return 0; }
3384 BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
3385 BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
3386 BLEND_MODE(srcin) { return div255( s*da ); }
3387 BLEND_MODE(dstin) { return div255( d*sa ); }
3388 BLEND_MODE(srcout) { return div255( s*inv(da) ); }
3389 BLEND_MODE(dstout) { return div255( d*inv(sa) ); }
3390 BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); }
3391 BLEND_MODE(dstover) { return d + div255( s*inv(da) ); }
3392 BLEND_MODE(modulate) { return div255( s*d ); }
3393 BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
3394 BLEND_MODE(plus_) { return min(s+d, 255); }
3395 BLEND_MODE(screen) { return s + d - div255( s*d ); }
3396 BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
3397#undef BLEND_MODE
3398
3399// The same logic applied to color, and srcover for alpha.
3400#define BLEND_MODE(name) \
3401 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
3402 STAGE_PP(name, Ctx::None) { \
3403 r = name##_channel(r,dr,a,da); \
3404 g = name##_channel(g,dg,a,da); \
3405 b = name##_channel(b,db,a,da); \
3406 a = a + div255( da*inv(a) ); \
3407 } \
3408 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
3409
3410 BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); }
3411 BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); }
3412 BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
3413 BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); }
3414
3415 BLEND_MODE(hardlight) {
3416 return div255( s*inv(da) + d*inv(sa) +
3417 if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
3418 }
3419 BLEND_MODE(overlay) {
3420 return div255( s*inv(da) + d*inv(sa) +
3421 if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
3422 }
3423#undef BLEND_MODE
3424
3425// ~~~~~~ Helpers for interacting with memory ~~~~~~ //
3426
3427template <typename T>
3428SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
3429 return (T*)ctx->pixels + dy*ctx->stride + dx;
3430}
3431
3432template <typename T>
3433SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
3434 auto clamp = [](F v, F limit) {
        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive: the largest float strictly less than limit.
3436 return min(max(0, v), limit);
3437 };
3438 x = clamp(x, ctx->width);
3439 y = clamp(y, ctx->height);
3440
3441 *ptr = (const T*)ctx->pixels;
3442 return trunc_(y)*ctx->stride + trunc_(x);
3443}
3444
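// load()/store() handle the tail of a row: tail == 0 means a full N-wide register, otherwise
// only the low `tail` lanes are touched. The switch cases intentionally fall through,
// peeling off odd lanes before a bulk memcpy of the rest.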
3445template <typename V, typename T>
3446SI V load(const T* ptr, size_t tail) {
3447 V v = 0;
3448 switch (tail & (N-1)) {
3449 case 0: memcpy(&v, ptr, sizeof(v)); break;
3450 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3451 case 15: v[14] = ptr[14];
3452 case 14: v[13] = ptr[13];
3453 case 13: v[12] = ptr[12];
3454 case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
3455 case 11: v[10] = ptr[10];
3456 case 10: v[ 9] = ptr[ 9];
3457 case 9: v[ 8] = ptr[ 8];
3458 case 8: memcpy(&v, ptr, 8*sizeof(T)); break;
3459 #endif
3460 case 7: v[ 6] = ptr[ 6];
3461 case 6: v[ 5] = ptr[ 5];
3462 case 5: v[ 4] = ptr[ 4];
3463 case 4: memcpy(&v, ptr, 4*sizeof(T)); break;
3464 case 3: v[ 2] = ptr[ 2];
3465 case 2: memcpy(&v, ptr, 2*sizeof(T)); break;
3466 case 1: v[ 0] = ptr[ 0];
3467 }
3468 return v;
3469}
3470template <typename V, typename T>
3471SI void store(T* ptr, size_t tail, V v) {
3472 switch (tail & (N-1)) {
3473 case 0: memcpy(ptr, &v, sizeof(v)); break;
3474 #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3475 case 15: ptr[14] = v[14];
3476 case 14: ptr[13] = v[13];
3477 case 13: ptr[12] = v[12];
3478 case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
3479 case 11: ptr[10] = v[10];
3480 case 10: ptr[ 9] = v[ 9];
3481 case 9: ptr[ 8] = v[ 8];
3482 case 8: memcpy(ptr, &v, 8*sizeof(T)); break;
3483 #endif
3484 case 7: ptr[ 6] = v[ 6];
3485 case 6: ptr[ 5] = v[ 5];
3486 case 5: ptr[ 4] = v[ 4];
3487 case 4: memcpy(ptr, &v, 4*sizeof(T)); break;
3488 case 3: ptr[ 2] = v[ 2];
3489 case 2: memcpy(ptr, &v, 2*sizeof(T)); break;
3490 case 1: ptr[ 0] = v[ 0];
3491 }
3492}
3493
3494#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3495 template <typename V, typename T>
3496 SI V gather(const T* ptr, U32 ix) {
3497 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
3498 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
3499 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
3500 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
3501 }
3502
3503 template<>
3504 F gather(const float* ptr, U32 ix) {
3505 __m256i lo, hi;
3506 split(ix, &lo, &hi);
3507
3508 return join<F>(_mm256_i32gather_ps(ptr, lo, 4),
3509 _mm256_i32gather_ps(ptr, hi, 4));
3510 }
3511
3512 template<>
3513 U32 gather(const uint32_t* ptr, U32 ix) {
3514 __m256i lo, hi;
3515 split(ix, &lo, &hi);
3516
3517 return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4),
3518 _mm256_i32gather_epi32(ptr, hi, 4));
3519 }
3520#else
3521 template <typename V, typename T>
3522 SI V gather(const T* ptr, U32 ix) {
3523 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
3524 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
3525 }
3526#endif
3527
3528
3529// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
3530
3531SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if 1 && (defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512))
3533 // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
3534 __m256i _01,_23;
3535 split(rgba, &_01, &_23);
3536 __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
3537 _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
3538 rgba = join<U32>(_02, _13);
3539
3540 auto cast_U16 = [](U32 v) -> U16 {
3541 __m256i _02,_13;
3542 split(v, &_02,&_13);
3543 return _mm256_packus_epi32(_02,_13);
3544 };
3545#else
3546 auto cast_U16 = [](U32 v) -> U16 {
3547 return cast<U16>(v);
3548 };
3549#endif
3550 *r = cast_U16(rgba & 65535) & 255;
3551 *g = cast_U16(rgba & 65535) >> 8;
3552 *b = cast_U16(rgba >> 16) & 255;
3553 *a = cast_U16(rgba >> 16) >> 8;
3554}
3555
3556SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3557#if 1 && defined(JUMPER_IS_NEON)
3558 uint8x8x4_t rgba;
3559 switch (tail & (N-1)) {
3560 case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break;
3561 case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
3562 case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
3563 case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
3564 case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
3565 case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
3566 case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
3567 case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
3568 }
3569 *r = cast<U16>(rgba.val[0]);
3570 *g = cast<U16>(rgba.val[1]);
3571 *b = cast<U16>(rgba.val[2]);
3572 *a = cast<U16>(rgba.val[3]);
3573#else
3574 from_8888(load<U32>(ptr, tail), r,g,b,a);
3575#endif
3576}
3577SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3578#if 1 && defined(JUMPER_IS_NEON)
3579 uint8x8x4_t rgba = {{
3580 cast<U8>(r),
3581 cast<U8>(g),
3582 cast<U8>(b),
3583 cast<U8>(a),
3584 }};
3585 switch (tail & (N-1)) {
3586 case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break;
3587 case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
3588 case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
3589 case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
3590 case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
3591 case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
3592 case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
3593 case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
3594 }
3595#else
3596 store(ptr, tail, cast<U32>(r | (g<<8)) << 0
3597 | cast<U32>(b | (a<<8)) << 16);
3598#endif
3599}
3600
3601STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
3602 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3603}
3604STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3605 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3606}
3607STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
3608 store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
3609}
3610STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
3611 const uint32_t* ptr;
3612 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3613 from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
3614}
3615
3616// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
3617
3618SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
3619 // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
3620 U16 R = (rgb >> 11) & 31,
3621 G = (rgb >> 5) & 63,
3622 B = (rgb >> 0) & 31;
3623
3624 // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
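    // e.g. R = 22 (0b10110): (22 << 3) | (22 >> 2) = 176 | 5 = 181, matching 22 * 255/31 ≈ 181.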
3625 *r = (R << 3) | (R >> 2);
3626 *g = (G << 2) | (G >> 4);
3627 *b = (B << 3) | (B >> 2);
3628}
3629SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
3630 from_565(load<U16>(ptr, tail), r,g,b);
3631}
3632SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
3633 // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f.
3634 // (Don't feel like you need to find some fundamental truth in these...
3635 // they were brute-force searched.)
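    // e.g. r = 255: (255*9 + 36)/74 = 31 in integer math; g = 255: (255*21 + 42)/85 = 63.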
3636 U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half.
3637 G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly.
3638 B = (b * 9 + 36) / 74;
3639 // Pack them back into 15|rrrrr gggggg bbbbb|0.
3640 store(ptr, tail, R << 11
3641 | G << 5
3642 | B << 0);
3643}
3644
3645STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
3646 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
3647 a = 255;
3648}
3649STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3650 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
3651 da = 255;
3652}
3653STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
3654 store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
3655}
3656STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
3657 const uint16_t* ptr;
3658 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3659 from_565(gather<U16>(ptr, ix), &r, &g, &b);
3660 a = 255;
3661}
3662
3663SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
3664 // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
3665 U16 R = (rgba >> 12) & 15,
3666 G = (rgba >> 8) & 15,
3667 B = (rgba >> 4) & 15,
3668 A = (rgba >> 0) & 15;
3669
3670 // Scale [0,15] to [0,255].
3671 *r = (R << 4) | R;
3672 *g = (G << 4) | G;
3673 *b = (B << 4) | B;
3674 *a = (A << 4) | A;
3675}
3676SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3677 from_4444(load<U16>(ptr, tail), r,g,b,a);
3678}
3679SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3680 // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f).
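    // e.g. r = 255: (255 + 8)/17 = 15; r = 128: (128 + 8)/17 = 8.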
3681 U16 R = (r + 8) / 17,
3682 G = (g + 8) / 17,
3683 B = (b + 8) / 17,
3684 A = (a + 8) / 17;
3685 // Pack them back into 15|rrrr gggg bbbb aaaa|0.
3686 store(ptr, tail, R << 12
3687 | G << 8
3688 | B << 4
3689 | A << 0);
3690}
3691
3692STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
3693 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3694}
3695STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3696 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3697}
3698STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
3699 store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
3700}
3701STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
3702 const uint16_t* ptr;
3703 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3704 from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
3705}
3706
3707SI void from_88(U16 rg, U16* r, U16* g) {
3708 *r = (rg & 0xFF);
3709 *g = (rg >> 8);
3710}
3711
3712SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
3713#if 1 && defined(JUMPER_IS_NEON)
3714 uint8x8x2_t rg;
3715 switch (tail & (N-1)) {
3716 case 0: rg = vld2_u8 ((const uint8_t*)(ptr+0) ); break;
3717 case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6);
3718 case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5);
3719 case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4);
3720 case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3);
3721 case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2);
3722 case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1);
3723 case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0);
3724 }
3725 *r = cast<U16>(rg.val[0]);
3726 *g = cast<U16>(rg.val[1]);
3727#else
3728 from_88(load<U16>(ptr, tail), r,g);
3729#endif
3730}
3731
3732SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) {
3733#if 1 && defined(JUMPER_IS_NEON)
3734 uint8x8x2_t rg = {{
3735 cast<U8>(r),
3736 cast<U8>(g),
3737 }};
3738 switch (tail & (N-1)) {
3739 case 0: vst2_u8 ((uint8_t*)(ptr+0), rg ); break;
3740 case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6);
3741 case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5);
3742 case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4);
3743 case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3);
3744 case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2);
3745 case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1);
3746 case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0);
3747 }
3748#else
3749 store(ptr, tail, cast<U16>(r | (g<<8)) << 0);
3750#endif
3751}
3752
3753STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
3754 load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g);
3755 b = 0;
3756 a = 255;
3757}
3758STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3759 load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg);
3760 db = 0;
3761 da = 255;
3762}
3763STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
3764 store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g);
3765}
3766STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
3767 const uint16_t* ptr;
3768 U32 ix = ix_and_ptr(&ptr, ctx, x, y);
3769 from_88(gather<U16>(ptr, ix), &r, &g);
3770 b = 0;
3771 a = 255;
3772}
3773
3774// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //
3775
3776SI U16 load_8(const uint8_t* ptr, size_t tail) {
3777 return cast<U16>(load<U8>(ptr, tail));
3778}
3779SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
3780 store(ptr, tail, cast<U8>(v));
3781}
3782
3783STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
3784 r = g = b = 0;
3785 a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3786}
3787STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3788 dr = dg = db = 0;
3789 da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3790}
3791STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
3792 store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
3793}
3794STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
3795 const uint8_t* ptr;
3796 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3797 r = g = b = 0;
3798 a = cast<U16>(gather<U8>(ptr, ix));
3799}
3800
3801STAGE_PP(alpha_to_gray, Ctx::None) {
3802 r = g = b = a;
3803 a = 255;
3804}
3805STAGE_PP(alpha_to_gray_dst, Ctx::None) {
3806 dr = dg = db = da;
3807 da = 255;
3808}
3809STAGE_PP(bt709_luminance_or_luma_to_alpha, Ctx::None) {
3810 a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator.
3811 r = g = b = 0;
3812}
3813
// ~~~~~~ src/dst save and restore ~~~~~~ //
3815
3816STAGE_PP(load_src, const uint16_t* ptr) {
3817 r = sk_unaligned_load<U16>(ptr + 0*N);
3818 g = sk_unaligned_load<U16>(ptr + 1*N);
3819 b = sk_unaligned_load<U16>(ptr + 2*N);
3820 a = sk_unaligned_load<U16>(ptr + 3*N);
3821}
3822STAGE_PP(store_src, uint16_t* ptr) {
3823 sk_unaligned_store(ptr + 0*N, r);
3824 sk_unaligned_store(ptr + 1*N, g);
3825 sk_unaligned_store(ptr + 2*N, b);
3826 sk_unaligned_store(ptr + 3*N, a);
3827}
3828STAGE_PP(store_src_a, uint16_t* ptr) {
3829 sk_unaligned_store(ptr, a);
3830}
3831STAGE_PP(load_dst, const uint16_t* ptr) {
3832 dr = sk_unaligned_load<U16>(ptr + 0*N);
3833 dg = sk_unaligned_load<U16>(ptr + 1*N);
3834 db = sk_unaligned_load<U16>(ptr + 2*N);
3835 da = sk_unaligned_load<U16>(ptr + 3*N);
3836}
3837STAGE_PP(store_dst, uint16_t* ptr) {
3838 sk_unaligned_store(ptr + 0*N, dr);
3839 sk_unaligned_store(ptr + 1*N, dg);
3840 sk_unaligned_store(ptr + 2*N, db);
3841 sk_unaligned_store(ptr + 3*N, da);
3842}
3843
3844// ~~~~~~ Coverage scales / lerps ~~~~~~ //
3845
3846STAGE_PP(scale_1_float, const float* f) {
3847 U16 c = from_float(*f);
3848 r = div255( r * c );
3849 g = div255( g * c );
3850 b = div255( b * c );
3851 a = div255( a * c );
3852}
3853STAGE_PP(lerp_1_float, const float* f) {
3854 U16 c = from_float(*f);
3855 r = lerp(dr, r, c);
3856 g = lerp(dg, g, c);
3857 b = lerp(db, b, c);
3858 a = lerp(da, a, c);
3859}
3860STAGE_PP(scale_native, const uint16_t scales[]) {
3861 auto c = sk_unaligned_load<U16>(scales);
3862 r = div255( r * c );
3863 g = div255( g * c );
3864 b = div255( b * c );
3865 a = div255( a * c );
3866}
3867
3868STAGE_PP(lerp_native, const uint16_t scales[]) {
3869 auto c = sk_unaligned_load<U16>(scales);
3870 r = lerp(dr, r, c);
3871 g = lerp(dg, g, c);
3872 b = lerp(db, b, c);
3873 a = lerp(da, a, c);
3874}
3875
3876STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
3877 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3878 r = div255( r * c );
3879 g = div255( g * c );
3880 b = div255( b * c );
3881 a = div255( a * c );
3882}
3883STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
3884 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3885 r = lerp(dr, r, c);
3886 g = lerp(dg, g, c);
3887 b = lerp(db, b, c);
3888 a = lerp(da, a, c);
3889}
3890
3891// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
3892SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
3893 return if_then_else(a < da, min(cr, min(cg,cb))
3894 , max(cr, max(cg,cb)));
3895}
3896STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
3897 U16 cr,cg,cb;
3898 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3899 U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
3900
3901 r = div255( r * cr );
3902 g = div255( g * cg );
3903 b = div255( b * cb );
3904 a = div255( a * ca );
3905}
3906STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
3907 U16 cr,cg,cb;
3908 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3909 U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
3910
3911 r = lerp(dr, r, cr);
3912 g = lerp(dg, g, cg);
3913 b = lerp(db, b, cb);
3914 a = lerp(da, a, ca);
3915}
3916
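// emboss: scale each channel by a "mul" coverage mask, add an "add" mask,
// then clamp to alpha so the result stays premultiplied.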
3917STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
3918 U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail),
3919 add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail);
3920
3921 r = min(div255(r*mul) + add, a);
3922 g = min(div255(g*mul) + add, a);
3923 b = min(div255(b*mul) + add, a);
3924}
3925
3926
3927// ~~~~~~ Gradient stages ~~~~~~ //
3928
3929// Clamp x to [0,1], both sides inclusive (think, gradients).
3930// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
3931SI F clamp_01(F v) { return min(max(0, v), 1); }
3932
3933STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
3934STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
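// Reflect x into [0,1] with period 2: e.g. x = 1.3 maps to 0.7, x = 2.4 maps to 0.4.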
3935STAGE_GG(mirror_x_1, Ctx::None) {
3936 auto two = [](F x){ return x+x; };
3937 x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
3938}
3939
// Narrow a per-lane I32 condition (0 or -1) down to a 16-bit mask (0 or 0xffff).
SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }
3941
3942STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
3943 auto w = ctx->limit_x;
3944 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
3945}
3946STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
3947 auto h = ctx->limit_y;
3948 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
3949}
3950STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
3951 auto w = ctx->limit_x;
3952 auto h = ctx->limit_y;
3953 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
3954}
3955STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
3956 auto mask = sk_unaligned_load<U16>(ctx->mask);
3957 r = r & mask;
3958 g = g & mask;
3959 b = b & mask;
3960 a = a & mask;
3961}
3962
3963SI void round_F_to_U16(F R, F G, F B, F A, bool interpolatedInPremul,
3964 U16* r, U16* g, U16* b, U16* a) {
3965 auto round = [](F x) { return cast<U16>(x * 255.0f + 0.5f); };
3966
3967 F limit = interpolatedInPremul ? A
3968 : 1;
3969 *r = round(min(max(0,R), limit));
3970 *g = round(min(max(0,G), limit));
3971 *b = round(min(max(0,B), limit));
3972 *a = round(A); // we assume alpha is already in [0,1].
3973}
3974
3975SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
3976 U16* r, U16* g, U16* b, U16* a) {
3977
3978 F fr, fg, fb, fa, br, bg, bb, ba;
3979#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
3980 if (c->stopCount <=8) {
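        // With at most 8 stops, each channel's f and b tables fit in one 8-float register,
        // so an in-register permute replaces the general gather below.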
3981 __m256i lo, hi;
3982 split(idx, &lo, &hi);
3983
3984 fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
3985 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
3986 br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
3987 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
3988 fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
3989 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
3990 bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
3991 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
3992 fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
3993 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
3994 bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
3995 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
3996 fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
3997 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
3998 ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
3999 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
4000 } else
4001#endif
4002 {
4003 fr = gather<F>(c->fs[0], idx);
4004 fg = gather<F>(c->fs[1], idx);
4005 fb = gather<F>(c->fs[2], idx);
4006 fa = gather<F>(c->fs[3], idx);
4007 br = gather<F>(c->bs[0], idx);
4008 bg = gather<F>(c->bs[1], idx);
4009 bb = gather<F>(c->bs[2], idx);
4010 ba = gather<F>(c->bs[3], idx);
4011 }
4012 round_F_to_U16(mad(t, fr, br),
4013 mad(t, fg, bg),
4014 mad(t, fb, bb),
4015 mad(t, fa, ba),
4016 c->interpolatedInPremul,
4017 r,g,b,a);
4018}
4019
4020STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) {
4021 auto t = x;
4022 U32 idx = 0;
4023
4024 // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
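    // idx ends up counting how many stop positions t has passed, which selects the
    // piecewise segment whose f/b coefficients gradient_lookup() interpolates.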
4025 for (size_t i = 1; i < c->stopCount; i++) {
4026 idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
4027 }
4028
4029 gradient_lookup(c, idx, t, &r, &g, &b, &a);
4030}
4031
4032STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
4033 auto t = x;
4034 auto idx = trunc_(t * (c->stopCount-1));
4035 gradient_lookup(c, idx, t, &r, &g, &b, &a);
4036}
4037
4038STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
4039 auto t = x;
4040 round_F_to_U16(mad(t, c->f[0], c->b[0]),
4041 mad(t, c->f[1], c->b[1]),
4042 mad(t, c->f[2], c->b[2]),
4043 mad(t, c->f[3], c->b[3]),
4044 c->interpolatedInPremul,
4045 &r,&g,&b,&a);
4046}
4047
4048STAGE_GG(xy_to_unit_angle, Ctx::None) {
4049 F xabs = abs_(x),
4050 yabs = abs_(y);
4051
4052 F slope = min(xabs, yabs)/max(xabs, yabs);
4053 F s = slope * slope;
4054
4055 // Use a 7th degree polynomial to approximate atan.
4056 // This was generated using sollya.gforge.inria.fr.
4057 // A float optimized polynomial was generated using the following command.
4058 // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
4059 F phi = slope
4060 * (0.15912117063999176025390625f + s
4061 * (-5.185396969318389892578125e-2f + s
4062 * (2.476101927459239959716796875e-2f + s
4063 * (-7.0547382347285747528076171875e-3f))));
4064
4065 phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
4066 phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi);
4067 phi = if_then_else(y < 0.0f , 1.0f - phi , phi);
4068 phi = if_then_else(phi != phi , 0 , phi); // Check for NaN.
4069 x = phi;
4070}
4071STAGE_GG(xy_to_radius, Ctx::None) {
4072 x = sqrt_(x*x + y*y);
4073}
4074
4075// ~~~~~~ Compound stages ~~~~~~ //
4076
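// srcover_rgba_8888 fuses load_8888_dst, the srcover blend, and store_8888 into one stage.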
4077STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
4078 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
4079
4080 load_8888_(ptr, tail, &dr,&dg,&db,&da);
4081 r = r + div255( dr*inv(a) );
4082 g = g + div255( dg*inv(a) );
4083 b = b + div255( db*inv(a) );
4084 a = a + div255( da*inv(a) );
4085 store_8888_(ptr, tail, r,g,b,a);
4086}
4087
4088#if defined(SK_DISABLE_LOWP_BILERP_CLAMP_CLAMP_STAGE)
4089 static void(*bilerp_clamp_8888)(void) = nullptr;
4090 static void(*bilinear)(void) = nullptr;
4091#else
4092STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
4093 // (cx,cy) are the center of our sample.
4094 F cx = x,
4095 cy = y;
4096
4097 // All sample points are at the same fractional offset (fx,fy).
4098 // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
4099 F fx = fract(cx + 0.5f),
4100 fy = fract(cy + 0.5f);
4101
4102 // We'll accumulate the color of all four samples into {r,g,b,a} directly.
4103 r = g = b = a = 0;
4104
4105 // The first three sample points will calculate their area using math
4106 // just like in the float code above, but the fourth will take up all the rest.
4107 //
4108 // Logically this is the same as doing the math for the fourth pixel too,
4109 // but rounding error makes this a better strategy, keeping opaque opaque, etc.
4110 //
4111 // We can keep up to 8 bits of fractional precision without overflowing 16-bit,
4112 // so our "1.0" area is 256.
4113 const uint16_t bias = 256;
4114 U16 remaining = bias;
4115
4116 for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f)
4117 for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) {
4118 // (x,y) are the coordinates of this sample point.
4119 F x = cx + dx,
4120 y = cy + dy;
4121
4122 // ix_and_ptr() will clamp to the image's bounds for us.
4123 const uint32_t* ptr;
4124 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
4125
4126 U16 sr,sg,sb,sa;
4127 from_8888(gather<U32>(ptr, ix), &sr,&sg,&sb,&sa);
4128
4129 // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
4130 // are combined in direct proportion to their area overlapping that logical query pixel.
4131 // At positive offsets, the x-axis contribution to that rectangle is fx,
4132 // or (1-fx) at negative x. Same deal for y.
4133 F sx = (dx > 0) ? fx : 1.0f - fx,
4134 sy = (dy > 0) ? fy : 1.0f - fy;
4135
4136 U16 area = (dy == 0.5f && dx == 0.5f) ? remaining
4137 : cast<U16>(sx * sy * bias);
4138 for (size_t i = 0; i < N; i++) {
4139 SkASSERT(remaining[i] >= area[i]);
4140 }
4141 remaining -= area;
4142
4143 r += sr * area;
4144 g += sg * area;
4145 b += sb * area;
4146 a += sa * area;
4147 }
4148
4149 r = (r + bias/2) / bias;
4150 g = (g + bias/2) / bias;
4151 b = (b + bias/2) / bias;
4152 a = (a + bias/2) / bias;
4153}
4154
4155// TODO: lowp::tile() is identical to the highp tile()... share?
4156SI F tile(F v, SkTileMode mode, float limit, float invLimit) {
    // ix_and_ptr() will clamp the output of tile() afterward, so we need not clamp here.
4158 switch (mode) {
4159 case SkTileMode::kDecal: // TODO, for now fallthrough to clamp
4160 case SkTileMode::kClamp: return v;
4161 case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit;
4162 case SkTileMode::kMirror:
4163 return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit );
4164 }
4165 SkUNREACHABLE;
4166}
4167
4168SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y,
4169 U16* r, U16* g, U16* b, U16* a) {
4170 x = tile(x, ctx->tileX, ctx->width , ctx->invWidth );
4171 y = tile(y, ctx->tileY, ctx->height, ctx->invHeight);
4172
4173 switch (ctx->ct) {
4174 default: *r = *g = *b = *a = 0; // TODO
4175 break;
4176
4177 case kRGBA_8888_SkColorType:
4178 case kBGRA_8888_SkColorType: {
4179 const uint32_t* ptr;
4180 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
4181 from_8888(gather<U32>(ptr, ix), r,g,b,a);
4182 if (ctx->ct == kBGRA_8888_SkColorType) {
4183 std::swap(*r,*b);
4184 }
4185 } break;
4186 }
4187}
4188
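// Accumulate a DxD weighted neighborhood of samples around (cx,cy) in 8-bit fixed point:
// weights are scaled by bias = 256, and the final tap takes whatever weight remains,
// so the weights always sum to exactly 256.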
4189template <int D>
4190SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx,
4191 F cx, F cy, const F (&wx)[D], const F (&wy)[D],
4192 U16* r, U16* g, U16* b, U16* a) {
4193
4194 float start = -0.5f*(D-1);
4195
4196 const uint16_t bias = 256;
4197 U16 remaining = bias;
4198
4199 *r = *g = *b = *a = 0;
4200 F y = cy + start;
4201 for (int j = 0; j < D; j++, y += 1.0f) {
4202 F x = cx + start;
4203 for (int i = 0; i < D; i++, x += 1.0f) {
4204 U16 R,G,B,A;
4205 sample(ctx, x,y, &R,&G,&B,&A);
4206
4207 U16 w = (i == D-1 && j == D-1) ? remaining
4208 : cast<U16>(wx[i]*wy[j]*bias);
4209 remaining -= w;
4210 *r += w*R;
4211 *g += w*G;
4212 *b += w*B;
4213 *a += w*A;
4214 }
4215 }
4216 *r = (*r + bias/2) / bias;
4217 *g = (*g + bias/2) / bias;
4218 *b = (*b + bias/2) / bias;
4219 *a = (*a + bias/2) / bias;
4220}
4221
4222STAGE_GP(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) {
4223 F fx = fract(x + 0.5f),
4224 fy = fract(y + 0.5f);
4225 const F wx[] = {1.0f - fx, fx};
4226 const F wy[] = {1.0f - fy, fy};
4227
4228 sampler(ctx, x,y, wx,wy, &r,&g,&b,&a);
4229}
4230#endif
4231
4232// ~~~~~~ GrSwizzle stage ~~~~~~ //
4233
4234STAGE_PP(swizzle, void* ctx) {
4235 auto ir = r, ig = g, ib = b, ia = a;
4236 U16* o[] = {&r, &g, &b, &a};
    char swiz[4];
    memcpy(swiz, &ctx, sizeof(swiz));   // The four swizzle characters are packed into the context pointer itself.
4239
4240 for (int i = 0; i < 4; ++i) {
4241 switch (swiz[i]) {
4242 case 'r': *o[i] = ir; break;
4243 case 'g': *o[i] = ig; break;
4244 case 'b': *o[i] = ib; break;
4245 case 'a': *o[i] = ia; break;
4246 case '0': *o[i] = U16(0); break;
4247 case '1': *o[i] = U16(255); break;
4248 default: break;
4249 }
4250 }
4251}
4252
4253// Now we'll add null stand-ins for stages we haven't implemented in lowp.
4254// If a pipeline uses these stages, it'll boot it out of lowp into highp.
4255#define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr;
4256 NOT_IMPLEMENTED(callback)
4257 NOT_IMPLEMENTED(interpreter)
4258 NOT_IMPLEMENTED(unbounded_set_rgb)
4259 NOT_IMPLEMENTED(unbounded_uniform_color)
4260 NOT_IMPLEMENTED(unpremul)
4261 NOT_IMPLEMENTED(dither) // TODO
4262 NOT_IMPLEMENTED(from_srgb)
4263 NOT_IMPLEMENTED(to_srgb)
4264 NOT_IMPLEMENTED(load_16161616)
4265 NOT_IMPLEMENTED(load_16161616_dst)
4266 NOT_IMPLEMENTED(store_16161616)
4267 NOT_IMPLEMENTED(gather_16161616)
4268 NOT_IMPLEMENTED(load_a16)
4269 NOT_IMPLEMENTED(load_a16_dst)
4270 NOT_IMPLEMENTED(store_a16)
4271 NOT_IMPLEMENTED(gather_a16)
4272 NOT_IMPLEMENTED(load_rg1616)
4273 NOT_IMPLEMENTED(load_rg1616_dst)
4274 NOT_IMPLEMENTED(store_rg1616)
4275 NOT_IMPLEMENTED(gather_rg1616)
4276 NOT_IMPLEMENTED(load_f16)
4277 NOT_IMPLEMENTED(load_f16_dst)
4278 NOT_IMPLEMENTED(store_f16)
4279 NOT_IMPLEMENTED(gather_f16)
4280 NOT_IMPLEMENTED(load_af16)
4281 NOT_IMPLEMENTED(load_af16_dst)
4282 NOT_IMPLEMENTED(store_af16)
4283 NOT_IMPLEMENTED(gather_af16)
4284 NOT_IMPLEMENTED(load_rgf16)
4285 NOT_IMPLEMENTED(load_rgf16_dst)
4286 NOT_IMPLEMENTED(store_rgf16)
4287 NOT_IMPLEMENTED(gather_rgf16)
4288 NOT_IMPLEMENTED(load_f32)
4289 NOT_IMPLEMENTED(load_f32_dst)
4290 NOT_IMPLEMENTED(store_f32)
4291 NOT_IMPLEMENTED(gather_f32)
4292 NOT_IMPLEMENTED(load_rgf32)
4293 NOT_IMPLEMENTED(store_rgf32)
4294 NOT_IMPLEMENTED(load_1010102)
4295 NOT_IMPLEMENTED(load_1010102_dst)
4296 NOT_IMPLEMENTED(store_1010102)
4297 NOT_IMPLEMENTED(gather_1010102)
4298 NOT_IMPLEMENTED(store_u16_be)
4299 NOT_IMPLEMENTED(byte_tables) // TODO
4300 NOT_IMPLEMENTED(colorburn)
4301 NOT_IMPLEMENTED(colordodge)
4302 NOT_IMPLEMENTED(softlight)
4303 NOT_IMPLEMENTED(hue)
4304 NOT_IMPLEMENTED(saturation)
4305 NOT_IMPLEMENTED(color)
4306 NOT_IMPLEMENTED(luminosity)
4307 NOT_IMPLEMENTED(matrix_3x3)
4308 NOT_IMPLEMENTED(matrix_3x4)
4309 NOT_IMPLEMENTED(matrix_4x5) // TODO
4310 NOT_IMPLEMENTED(matrix_4x3) // TODO
4311 NOT_IMPLEMENTED(parametric)
4312 NOT_IMPLEMENTED(gamma_)
4313 NOT_IMPLEMENTED(PQish)
4314 NOT_IMPLEMENTED(HLGish)
4315 NOT_IMPLEMENTED(HLGinvish)
4316 NOT_IMPLEMENTED(rgb_to_hsl)
4317 NOT_IMPLEMENTED(hsl_to_rgb)
4318 NOT_IMPLEMENTED(gauss_a_to_rgba) // TODO
4319 NOT_IMPLEMENTED(mirror_x) // TODO
4320 NOT_IMPLEMENTED(repeat_x) // TODO
4321 NOT_IMPLEMENTED(mirror_y) // TODO
4322 NOT_IMPLEMENTED(repeat_y) // TODO
4323 NOT_IMPLEMENTED(negate_x)
4324 NOT_IMPLEMENTED(bicubic) // TODO if I can figure out negative weights
4325 NOT_IMPLEMENTED(bicubic_clamp_8888)
4326 NOT_IMPLEMENTED(bilinear_nx) // TODO
4327 NOT_IMPLEMENTED(bilinear_ny) // TODO
4328 NOT_IMPLEMENTED(bilinear_px) // TODO
4329 NOT_IMPLEMENTED(bilinear_py) // TODO
4330 NOT_IMPLEMENTED(bicubic_n3x) // TODO
4331 NOT_IMPLEMENTED(bicubic_n1x) // TODO
4332 NOT_IMPLEMENTED(bicubic_p1x) // TODO
4333 NOT_IMPLEMENTED(bicubic_p3x) // TODO
4334 NOT_IMPLEMENTED(bicubic_n3y) // TODO
4335 NOT_IMPLEMENTED(bicubic_n1y) // TODO
4336 NOT_IMPLEMENTED(bicubic_p1y) // TODO
4337 NOT_IMPLEMENTED(bicubic_p3y) // TODO
4338 NOT_IMPLEMENTED(save_xy) // TODO
4339 NOT_IMPLEMENTED(accumulate) // TODO
4340 NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved)
4341 NOT_IMPLEMENTED(xy_to_2pt_conical_strip)
4342 NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle)
4343 NOT_IMPLEMENTED(xy_to_2pt_conical_smaller)
4344 NOT_IMPLEMENTED(xy_to_2pt_conical_greater)
4345 NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal)
4346 NOT_IMPLEMENTED(alter_2pt_conical_unswap)
4347 NOT_IMPLEMENTED(mask_2pt_conical_nan)
4348 NOT_IMPLEMENTED(mask_2pt_conical_degenerates)
4349 NOT_IMPLEMENTED(apply_vector_mask)
4350#undef NOT_IMPLEMENTED
4351
4352#endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages
4353} // namespace lowp
4354
4355} // namespace SK_OPTS_NS
4356
4357#endif//SkRasterPipeline_opts_DEFINED
4358