1 | /* |
2 | * Copyright 2018 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #ifndef SkRasterPipeline_opts_DEFINED |
9 | #define SkRasterPipeline_opts_DEFINED |
10 | |
11 | #include "include/core/SkTypes.h" |
12 | #include "src/core/SkUtils.h" // unaligned_{load,store} |
13 | #include "src/sksl/SkSLByteCode.h" |
14 | |
15 | // Every function in this file should be marked static and inline using SI. |
16 | #if defined(__clang__) |
17 | #define SI __attribute__((always_inline)) static inline |
18 | #else |
19 | #define SI static inline |
20 | #endif |
21 | |
22 | template <typename Dst, typename Src> |
23 | SI Dst bit_cast(const Src& src) { |
24 | static_assert(sizeof(Dst) == sizeof(Src), "" ); |
25 | return sk_unaligned_load<Dst>(&src); |
26 | } |
27 | |
28 | template <typename Dst, typename Src> |
29 | SI Dst widen_cast(const Src& src) { |
30 | static_assert(sizeof(Dst) > sizeof(Src), "" ); |
31 | Dst dst; |
32 | memcpy(&dst, &src, sizeof(Src)); |
33 | return dst; |
34 | } |
35 | |
36 | // Our program is an array of void*, either |
37 | // - 1 void* per stage with no context pointer, the next stage; |
38 | // - 2 void* per stage with a context pointer, first the context pointer, then the next stage. |
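//
// For example (stage and context names here are purely illustrative), a pipeline made of a
// context-free stage A, a stage B that takes a context ctxB, and the terminating just_return
// stage would be laid out roughly as
//
//   void* program[] = { (void*)A, (void*)B, (void*)ctxB, (void*)just_return };
//
// Each stage consumes its own slots with load_and_inc() and then calls the next stage.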
39 | |
40 | // load_and_inc() steps the program forward by 1 void*, returning that pointer. |
41 | SI void* load_and_inc(void**& program) { |
42 | #if defined(__GNUC__) && defined(__x86_64__) |
43 | // If program is in %rsi (we try to make this likely) then this is a single instruction. |
44 | void* rax; |
45 | asm("lodsq" : "=a" (rax), "+S" (program)); // Write-only %rax, read-write %rsi. |
46 | return rax; |
47 | #else |
48 | // On ARM *program++ compiles into pretty ideal code without any handholding. |
49 | return *program++; |
50 | #endif |
51 | } |
52 | |
53 | // Lazily resolved on first cast. Does nothing if cast to Ctx::None. |
54 | struct Ctx { |
55 | struct None {}; |
56 | |
57 | void* ptr; |
58 | void**& program; |
59 | |
60 | explicit Ctx(void**& p) : ptr(nullptr), program(p) {} |
61 | |
62 | template <typename T> |
63 | operator T*() { |
64 | if (!ptr) { ptr = load_and_inc(program); } |
65 | return (T*)ptr; |
66 | } |
67 | operator None() { return None{}; } |
68 | }; |
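
// A sketch of how Ctx is consumed by the STAGE macros further below: the stage body's declared
// context parameter is initialized from Ctx{program}, so a stage written as
//
//   STAGE(my_stage, const SomeCtx* ctx) { ... }   // names illustrative
//
// lazily pulls exactly one extra void* off the program via operator T*(), while a stage
// declared with Ctx::None consumes none.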
69 | |
70 | |
71 | #if !defined(__clang__) |
72 | #define JUMPER_IS_SCALAR |
73 | #elif defined(SK_ARM_HAS_NEON) |
74 | #define JUMPER_IS_NEON |
75 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX512 |
76 | #define JUMPER_IS_AVX512 |
77 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
78 | #define JUMPER_IS_HSW |
79 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX |
80 | #define JUMPER_IS_AVX |
81 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
82 | #define JUMPER_IS_SSE41 |
83 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
84 | #define JUMPER_IS_SSE2 |
85 | #else |
86 | #define JUMPER_IS_SCALAR |
87 | #endif |
88 | |
89 | // Older Clangs seem to crash when generating non-optimized NEON code for ARMv7. |
90 | #if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32) |
91 | // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative. |
92 | #if defined(__apple_build_version__) && __clang_major__ < 9 |
93 | #define JUMPER_IS_SCALAR |
94 | #elif __clang_major__ < 5 |
95 | #define JUMPER_IS_SCALAR |
96 | #endif |
97 | |
98 | #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR) |
99 | #undef JUMPER_IS_NEON |
100 | #endif |
101 | #endif |
102 | |
103 | #if defined(JUMPER_IS_SCALAR) |
104 | #include <math.h> |
105 | #elif defined(JUMPER_IS_NEON) |
106 | #include <arm_neon.h> |
107 | #else |
108 | #include <immintrin.h> |
109 | #endif |
110 | |
111 | namespace SK_OPTS_NS { |
112 | |
113 | #if defined(JUMPER_IS_SCALAR) |
114 | // This path should lead to portable scalar code. |
115 | using F = float ; |
116 | using I32 = int32_t; |
117 | using U64 = uint64_t; |
118 | using U32 = uint32_t; |
119 | using U16 = uint16_t; |
120 | using U8 = uint8_t ; |
121 | |
122 | SI F mad(F f, F m, F a) { return f*m+a; } |
123 | SI F min(F a, F b) { return fminf(a,b); } |
124 | SI F max(F a, F b) { return fmaxf(a,b); } |
125 | SI F abs_ (F v) { return fabsf(v); } |
126 | SI F floor_(F v) { return floorf(v); } |
127 | SI F rcp (F v) { return 1.0f / v; } |
128 | SI F rsqrt (F v) { return 1.0f / sqrtf(v); } |
129 | SI F sqrt_(F v) { return sqrtf(v); } |
130 | SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); } |
131 | SI U16 pack(U32 v) { return (U16)v; } |
132 | SI U8 pack(U16 v) { return (U8)v; } |
133 | |
134 | SI F if_then_else(I32 c, F t, F e) { return c ? t : e; } |
135 | |
136 | template <typename T> |
137 | SI T gather(const T* p, U32 ix) { return p[ix]; } |
138 | |
139 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
140 | *r = ptr[0]; |
141 | *g = ptr[1]; |
142 | } |
143 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
144 | ptr[0] = r; |
145 | ptr[1] = g; |
146 | } |
147 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
148 | *r = ptr[0]; |
149 | *g = ptr[1]; |
150 | *b = ptr[2]; |
151 | } |
152 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
153 | *r = ptr[0]; |
154 | *g = ptr[1]; |
155 | *b = ptr[2]; |
156 | *a = ptr[3]; |
157 | } |
158 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
159 | ptr[0] = r; |
160 | ptr[1] = g; |
161 | ptr[2] = b; |
162 | ptr[3] = a; |
163 | } |
164 | |
165 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
166 | *r = ptr[0]; |
167 | *g = ptr[1]; |
168 | } |
169 | SI void store2(float* ptr, size_t tail, F r, F g) { |
170 | ptr[0] = r; |
171 | ptr[1] = g; |
172 | } |
173 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
174 | *r = ptr[0]; |
175 | *g = ptr[1]; |
176 | *b = ptr[2]; |
177 | *a = ptr[3]; |
178 | } |
179 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
180 | ptr[0] = r; |
181 | ptr[1] = g; |
182 | ptr[2] = b; |
183 | ptr[3] = a; |
184 | } |
185 | |
186 | #elif defined(JUMPER_IS_NEON) |
187 | // Since we know we're using Clang, we can use its vector extensions. |
188 | template <typename T> using V = T __attribute__((ext_vector_type(4))); |
189 | using F = V<float >; |
190 | using I32 = V< int32_t>; |
191 | using U64 = V<uint64_t>; |
192 | using U32 = V<uint32_t>; |
193 | using U16 = V<uint16_t>; |
194 | using U8 = V<uint8_t >; |
195 | |
196 | // We polyfill a few routines that Clang doesn't build into ext_vector_types. |
197 | SI F min(F a, F b) { return vminq_f32(a,b); } |
198 | SI F max(F a, F b) { return vmaxq_f32(a,b); } |
199 | SI F abs_ (F v) { return vabsq_f32(v); } |
200 | SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; } |
201 | SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; } |
202 | SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); } |
203 | SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); } |
204 | |
205 | SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); } |
206 | |
207 | #if defined(SK_CPU_ARM64) |
208 | SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); } |
209 | SI F floor_(F v) { return vrndmq_f32(v); } |
210 | SI F sqrt_(F v) { return vsqrtq_f32(v); } |
211 | SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); } |
212 | #else |
213 | SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); } |
214 | SI F floor_(F v) { |
215 | F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); |
216 | return roundtrip - if_then_else(roundtrip > v, 1, 0); |
217 | } |
218 | |
219 | SI F sqrt_(F v) { |
220 | auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v). |
221 | e *= vrsqrtsq_f32(v,e*e); |
222 | e *= vrsqrtsq_f32(v,e*e); |
223 | return v*e; // sqrt(v) == v*rsqrt(v). |
224 | } |
225 | |
226 | SI U32 round(F v, F scale) { |
227 | return vcvtq_u32_f32(mad(v,scale,0.5f)); |
228 | } |
229 | #endif |
230 | |
231 | |
232 | template <typename T> |
233 | SI V<T> gather(const T* p, U32 ix) { |
234 | return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; |
235 | } |
236 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
237 | uint16x4x2_t rg; |
238 | if (__builtin_expect(tail,0)) { |
239 | if ( true ) { rg = vld2_lane_u16(ptr + 0, rg, 0); } |
240 | if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); } |
241 | if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); } |
242 | } else { |
243 | rg = vld2_u16(ptr); |
244 | } |
245 | *r = rg.val[0]; |
246 | *g = rg.val[1]; |
247 | } |
248 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
249 | if (__builtin_expect(tail,0)) { |
250 | if ( true ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); } |
251 | if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); } |
252 | if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); } |
253 | } else { |
254 | vst2_u16(ptr, (uint16x4x2_t{{r,g}})); |
255 | } |
256 | } |
257 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
258 | uint16x4x3_t rgb; |
259 | if (__builtin_expect(tail,0)) { |
260 | if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); } |
261 | if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); } |
262 | if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); } |
263 | } else { |
264 | rgb = vld3_u16(ptr); |
265 | } |
266 | *r = rgb.val[0]; |
267 | *g = rgb.val[1]; |
268 | *b = rgb.val[2]; |
269 | } |
270 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
271 | uint16x4x4_t rgba; |
272 | if (__builtin_expect(tail,0)) { |
273 | if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); } |
274 | if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); } |
275 | if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); } |
276 | } else { |
277 | rgba = vld4_u16(ptr); |
278 | } |
279 | *r = rgba.val[0]; |
280 | *g = rgba.val[1]; |
281 | *b = rgba.val[2]; |
282 | *a = rgba.val[3]; |
283 | } |
284 | |
285 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
286 | if (__builtin_expect(tail,0)) { |
287 | if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); } |
288 | if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); } |
289 | if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); } |
290 | } else { |
291 | vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}})); |
292 | } |
293 | } |
294 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
295 | float32x4x2_t rg; |
296 | if (__builtin_expect(tail,0)) { |
297 | if ( true ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); } |
298 | if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); } |
299 | if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); } |
300 | } else { |
301 | rg = vld2q_f32(ptr); |
302 | } |
303 | *r = rg.val[0]; |
304 | *g = rg.val[1]; |
305 | } |
306 | SI void store2(float* ptr, size_t tail, F r, F g) { |
307 | if (__builtin_expect(tail,0)) { |
308 | if ( true ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); } |
309 | if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); } |
310 | if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); } |
311 | } else { |
312 | vst2q_f32(ptr, (float32x4x2_t{{r,g}})); |
313 | } |
314 | } |
315 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
316 | float32x4x4_t rgba; |
317 | if (__builtin_expect(tail,0)) { |
318 | if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); } |
319 | if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); } |
320 | if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); } |
321 | } else { |
322 | rgba = vld4q_f32(ptr); |
323 | } |
324 | *r = rgba.val[0]; |
325 | *g = rgba.val[1]; |
326 | *b = rgba.val[2]; |
327 | *a = rgba.val[3]; |
328 | } |
329 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
330 | if (__builtin_expect(tail,0)) { |
331 | if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); } |
332 | if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); } |
333 | if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); } |
334 | } else { |
335 | vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}})); |
336 | } |
337 | } |
338 | |
339 | #elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
340 | // These are __m256 and __m256i, but friendlier and strongly-typed. |
341 | template <typename T> using V = T __attribute__((ext_vector_type(8))); |
342 | using F = V<float >; |
343 | using I32 = V< int32_t>; |
344 | using U64 = V<uint64_t>; |
345 | using U32 = V<uint32_t>; |
346 | using U16 = V<uint16_t>; |
347 | using U8 = V<uint8_t >; |
348 | |
349 | SI F mad(F f, F m, F a) { |
350 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
351 | return _mm256_fmadd_ps(f,m,a); |
352 | #else |
353 | return f*m+a; |
354 | #endif |
355 | } |
356 | |
357 | SI F min(F a, F b) { return _mm256_min_ps(a,b); } |
358 | SI F max(F a, F b) { return _mm256_max_ps(a,b); } |
359 | SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); } |
360 | SI F floor_(F v) { return _mm256_floor_ps(v); } |
361 | SI F rcp (F v) { return _mm256_rcp_ps (v); } |
362 | SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); } |
363 | SI F sqrt_(F v) { return _mm256_sqrt_ps (v); } |
364 | SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); } |
365 | |
366 | SI U16 pack(U32 v) { |
367 | return _mm_packus_epi32(_mm256_extractf128_si256(v, 0), |
368 | _mm256_extractf128_si256(v, 1)); |
369 | } |
370 | SI U8 pack(U16 v) { |
371 | auto r = _mm_packus_epi16(v,v); |
372 | return sk_unaligned_load<U8>(&r); |
373 | } |
374 | |
375 | SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } |
376 | |
377 | template <typename T> |
378 | SI V<T> gather(const T* p, U32 ix) { |
379 | return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]], |
380 | p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], }; |
381 | } |
382 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
383 | SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); } |
384 | SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); } |
385 | SI U64 gather(const uint64_t* p, U32 ix) { |
386 | __m256i parts[] = { |
387 | _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8), |
388 | _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8), |
389 | }; |
390 | return bit_cast<U64>(parts); |
391 | } |
392 | #endif |
393 | |
394 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
395 | U16 _0123, _4567; |
396 | if (__builtin_expect(tail,0)) { |
397 | _0123 = _4567 = _mm_setzero_si128(); |
398 | auto* d = &_0123; |
399 | if (tail > 3) { |
400 | *d = _mm_loadu_si128(((__m128i*)ptr) + 0); |
401 | tail -= 4; |
402 | ptr += 8; |
403 | d = &_4567; |
404 | } |
405 | bool high = false; |
406 | if (tail > 1) { |
407 | *d = _mm_loadu_si64(ptr); |
408 | tail -= 2; |
409 | ptr += 4; |
410 | high = true; |
411 | } |
412 | if (tail > 0) { |
413 | (*d)[high ? 4 : 0] = *(ptr + 0); |
414 | (*d)[high ? 5 : 1] = *(ptr + 1); |
415 | } |
416 | } else { |
417 | _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0); |
418 | _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1); |
419 | } |
420 | *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16), |
421 | _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16)); |
422 | *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16), |
423 | _mm_srai_epi32(_4567, 16)); |
424 | } |
425 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
426 | auto _0123 = _mm_unpacklo_epi16(r, g), |
427 | _4567 = _mm_unpackhi_epi16(r, g); |
428 | if (__builtin_expect(tail,0)) { |
429 | const auto* s = &_0123; |
430 | if (tail > 3) { |
431 | _mm_storeu_si128((__m128i*)ptr, *s); |
432 | s = &_4567; |
433 | tail -= 4; |
434 | ptr += 8; |
435 | } |
436 | bool high = false; |
437 | if (tail > 1) { |
438 | _mm_storel_epi64((__m128i*)ptr, *s); |
439 | ptr += 4; |
440 | tail -= 2; |
441 | high = true; |
442 | } |
443 | if (tail > 0) { |
444 | if (high) { |
445 | *(int32_t*)ptr = _mm_extract_epi32(*s, 2); |
446 | } else { |
447 | *(int32_t*)ptr = _mm_cvtsi128_si32(*s); |
448 | } |
449 | } |
450 | } else { |
451 | _mm_storeu_si128((__m128i*)ptr + 0, _0123); |
452 | _mm_storeu_si128((__m128i*)ptr + 1, _4567); |
453 | } |
454 | } |
455 | |
456 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
457 | __m128i _0,_1,_2,_3,_4,_5,_6,_7; |
458 | if (__builtin_expect(tail,0)) { |
459 | auto load_rgb = [](const uint16_t* src) { |
460 | auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); |
461 | return _mm_insert_epi16(v, src[2], 2); |
462 | }; |
463 | _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128(); |
464 | if ( true ) { _0 = load_rgb(ptr + 0); } |
465 | if (tail > 1) { _1 = load_rgb(ptr + 3); } |
466 | if (tail > 2) { _2 = load_rgb(ptr + 6); } |
467 | if (tail > 3) { _3 = load_rgb(ptr + 9); } |
468 | if (tail > 4) { _4 = load_rgb(ptr + 12); } |
469 | if (tail > 5) { _5 = load_rgb(ptr + 15); } |
470 | if (tail > 6) { _6 = load_rgb(ptr + 18); } |
471 | } else { |
472 | // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over. |
473 | auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ; |
474 | auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ; |
475 | auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ; |
476 | auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4); |
477 | _0 = _01; _1 = _mm_srli_si128(_01, 6); |
478 | _2 = _23; _3 = _mm_srli_si128(_23, 6); |
479 | _4 = _45; _5 = _mm_srli_si128(_45, 6); |
480 | _6 = _67; _7 = _mm_srli_si128(_67, 6); |
481 | } |
482 | |
483 | auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx |
484 | _13 = _mm_unpacklo_epi16(_1, _3), |
485 | _46 = _mm_unpacklo_epi16(_4, _6), |
486 | _57 = _mm_unpacklo_epi16(_5, _7); |
487 | |
488 | auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
489 | bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx |
490 | rg4567 = _mm_unpacklo_epi16(_46, _57), |
491 | bx4567 = _mm_unpackhi_epi16(_46, _57); |
492 | |
493 | *r = _mm_unpacklo_epi64(rg0123, rg4567); |
494 | *g = _mm_unpackhi_epi64(rg0123, rg4567); |
495 | *b = _mm_unpacklo_epi64(bx0123, bx4567); |
496 | } |
497 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
498 | __m128i _01, _23, _45, _67; |
499 | if (__builtin_expect(tail,0)) { |
500 | auto src = (const double*)ptr; |
501 | _01 = _23 = _45 = _67 = _mm_setzero_si128(); |
502 | if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } |
503 | if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } |
504 | if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } |
505 | if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } |
506 | if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } |
507 | if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } |
508 | if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } |
509 | } else { |
510 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); |
511 | _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); |
512 | _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); |
513 | _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); |
514 | } |
515 | |
516 | auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 |
517 | _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 |
518 | _46 = _mm_unpacklo_epi16(_45, _67), |
519 | _57 = _mm_unpackhi_epi16(_45, _67); |
520 | |
521 | auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
522 | ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 |
523 | rg4567 = _mm_unpacklo_epi16(_46, _57), |
524 | ba4567 = _mm_unpackhi_epi16(_46, _57); |
525 | |
526 | *r = _mm_unpacklo_epi64(rg0123, rg4567); |
527 | *g = _mm_unpackhi_epi64(rg0123, rg4567); |
528 | *b = _mm_unpacklo_epi64(ba0123, ba4567); |
529 | *a = _mm_unpackhi_epi64(ba0123, ba4567); |
530 | } |
531 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
532 | auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3 |
533 | rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7 |
534 | ba0123 = _mm_unpacklo_epi16(b, a), |
535 | ba4567 = _mm_unpackhi_epi16(b, a); |
536 | |
537 | auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), |
538 | _23 = _mm_unpackhi_epi32(rg0123, ba0123), |
539 | _45 = _mm_unpacklo_epi32(rg4567, ba4567), |
540 | _67 = _mm_unpackhi_epi32(rg4567, ba4567); |
541 | |
542 | if (__builtin_expect(tail,0)) { |
543 | auto dst = (double*)ptr; |
544 | if (tail > 0) { _mm_storel_pd(dst+0, _01); } |
545 | if (tail > 1) { _mm_storeh_pd(dst+1, _01); } |
546 | if (tail > 2) { _mm_storel_pd(dst+2, _23); } |
547 | if (tail > 3) { _mm_storeh_pd(dst+3, _23); } |
548 | if (tail > 4) { _mm_storel_pd(dst+4, _45); } |
549 | if (tail > 5) { _mm_storeh_pd(dst+5, _45); } |
550 | if (tail > 6) { _mm_storel_pd(dst+6, _67); } |
551 | } else { |
552 | _mm_storeu_si128((__m128i*)ptr + 0, _01); |
553 | _mm_storeu_si128((__m128i*)ptr + 1, _23); |
554 | _mm_storeu_si128((__m128i*)ptr + 2, _45); |
555 | _mm_storeu_si128((__m128i*)ptr + 3, _67); |
556 | } |
557 | } |
558 | |
559 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
560 | F _0123, _4567; |
561 | if (__builtin_expect(tail, 0)) { |
562 | _0123 = _4567 = _mm256_setzero_ps(); |
563 | F* d = &_0123; |
564 | if (tail > 3) { |
565 | *d = _mm256_loadu_ps(ptr); |
566 | ptr += 8; |
567 | tail -= 4; |
568 | d = &_4567; |
569 | } |
570 | bool high = false; |
571 | if (tail > 1) { |
572 | *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr)); |
573 | ptr += 4; |
574 | tail -= 2; |
575 | high = true; |
576 | } |
577 | if (tail > 0) { |
578 | *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1) |
579 | : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0); |
580 | } |
581 | } else { |
582 | _0123 = _mm256_loadu_ps(ptr + 0); |
583 | _4567 = _mm256_loadu_ps(ptr + 8); |
584 | } |
585 | |
586 | F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20), |
587 | _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31); |
588 | |
589 | *r = _mm256_shuffle_ps(_0145, _2367, 0x88); |
590 | *g = _mm256_shuffle_ps(_0145, _2367, 0xDD); |
591 | } |
592 | SI void store2(float* ptr, size_t tail, F r, F g) { |
593 | F _0145 = _mm256_unpacklo_ps(r, g), |
594 | _2367 = _mm256_unpackhi_ps(r, g); |
595 | F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20), |
596 | _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31); |
597 | |
598 | if (__builtin_expect(tail, 0)) { |
599 | const __m256* s = &_0123; |
600 | if (tail > 3) { |
601 | _mm256_storeu_ps(ptr, *s); |
602 | s = &_4567; |
603 | tail -= 4; |
604 | ptr += 8; |
605 | } |
606 | bool high = false; |
607 | if (tail > 1) { |
608 | _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0)); |
609 | ptr += 4; |
610 | tail -= 2; |
611 | high = true; |
612 | } |
613 | if (tail > 0) { |
614 | *(ptr + 0) = (*s)[ high ? 4 : 0]; |
615 | *(ptr + 1) = (*s)[ high ? 5 : 1]; |
616 | } |
617 | } else { |
618 | _mm256_storeu_ps(ptr + 0, _0123); |
619 | _mm256_storeu_ps(ptr + 8, _4567); |
620 | } |
621 | } |
622 | |
623 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
624 | F _04, _15, _26, _37; |
625 | _04 = _15 = _26 = _37 = 0; |
626 | switch (tail) { |
627 | case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1); |
628 | case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1); |
629 | case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1); |
630 | case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1); |
631 | case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0); |
632 | case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0); |
633 | case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0); |
634 | case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0); |
635 | } |
636 | |
637 | F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5 |
638 | ba0145 = _mm256_unpackhi_ps(_04,_15), |
639 | rg2367 = _mm256_unpacklo_ps(_26,_37), |
640 | ba2367 = _mm256_unpackhi_ps(_26,_37); |
641 | |
642 | *r = _mm256_unpacklo_pd(rg0145, rg2367); |
643 | *g = _mm256_unpackhi_pd(rg0145, rg2367); |
644 | *b = _mm256_unpacklo_pd(ba0145, ba2367); |
645 | *a = _mm256_unpackhi_pd(ba0145, ba2367); |
646 | } |
647 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
648 | F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5 |
649 | rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ... |
650 | ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5 |
651 | ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ... |
652 | |
653 | F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4 |
654 | _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... |
655 | _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... |
656 | _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... |
657 | |
658 | if (__builtin_expect(tail, 0)) { |
659 | if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); } |
660 | if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); } |
661 | if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); } |
662 | if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); } |
663 | if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); } |
664 | if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); } |
665 | if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); } |
666 | } else { |
667 | F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo |
668 | _23 = _mm256_permute2f128_ps(_26, _37, 32), |
669 | _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi |
670 | _67 = _mm256_permute2f128_ps(_26, _37, 49); |
671 | _mm256_storeu_ps(ptr+ 0, _01); |
672 | _mm256_storeu_ps(ptr+ 8, _23); |
673 | _mm256_storeu_ps(ptr+16, _45); |
674 | _mm256_storeu_ps(ptr+24, _67); |
675 | } |
676 | } |
677 | |
678 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) |
679 | template <typename T> using V = T __attribute__((ext_vector_type(4))); |
680 | using F = V<float >; |
681 | using I32 = V< int32_t>; |
682 | using U64 = V<uint64_t>; |
683 | using U32 = V<uint32_t>; |
684 | using U16 = V<uint16_t>; |
685 | using U8 = V<uint8_t >; |
686 | |
687 | SI F mad(F f, F m, F a) { return f*m+a; } |
688 | SI F min(F a, F b) { return _mm_min_ps(a,b); } |
689 | SI F max(F a, F b) { return _mm_max_ps(a,b); } |
690 | SI F abs_(F v) { return _mm_and_ps(v, 0-v); } |
691 | SI F rcp (F v) { return _mm_rcp_ps (v); } |
692 | SI F rsqrt (F v) { return _mm_rsqrt_ps(v); } |
693 | SI F sqrt_(F v) { return _mm_sqrt_ps (v); } |
694 | SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); } |
695 | |
696 | SI U16 pack(U32 v) { |
697 | #if defined(JUMPER_IS_SSE41) |
698 | auto p = _mm_packus_epi32(v,v); |
699 | #else |
700 | // Sign extend so that _mm_packs_epi32() does the pack we want. |
701 | auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16); |
702 | p = _mm_packs_epi32(p,p); |
703 | #endif |
704 | return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one. |
705 | } |
706 | SI U8 pack(U16 v) { |
707 | auto r = widen_cast<__m128i>(v); |
708 | r = _mm_packus_epi16(r,r); |
709 | return sk_unaligned_load<U8>(&r); |
710 | } |
711 | |
712 | SI F if_then_else(I32 c, F t, F e) { |
713 | return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e)); |
714 | } |
715 | |
716 | SI F floor_(F v) { |
717 | #if defined(JUMPER_IS_SSE41) |
718 | return _mm_floor_ps(v); |
719 | #else |
720 | F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); |
721 | return roundtrip - if_then_else(roundtrip > v, 1, 0); |
722 | #endif |
723 | } |
724 | |
725 | template <typename T> |
726 | SI V<T> gather(const T* p, U32 ix) { |
727 | return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; |
728 | } |
729 | |
730 | // TODO: these loads and stores are incredibly difficult to follow. |
731 | |
732 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
733 | __m128i _01; |
734 | if (__builtin_expect(tail,0)) { |
735 | _01 = _mm_setzero_si128(); |
736 | if (tail > 1) { |
737 | _01 = _mm_loadl_pd(_01, (const double*)ptr); // r0 g0 r1 g1 00 00 00 00 |
738 | if (tail > 2) { |
739 | _01 = _mm_insert_epi16(_01, *(ptr+4), 4); // r0 g0 r1 g1 r2 00 00 00 |
740 | _01 = _mm_insert_epi16(_01, *(ptr+5), 5); // r0 g0 r1 g1 r2 g2 00 00 |
741 | } |
742 | } else { |
743 | _01 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); // r0 g0 00 00 00 00 00 00 |
744 | } |
745 | } else { |
746 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3 |
747 | } |
748 | auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3 |
749 | auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3 |
750 | |
751 | auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3 |
752 | auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3 |
753 | *r = sk_unaligned_load<U16>(&R); |
754 | *g = sk_unaligned_load<U16>(&G); |
755 | } |
756 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
757 | U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)); |
758 | if (__builtin_expect(tail, 0)) { |
759 | if (tail > 1) { |
760 | _mm_storel_epi64((__m128i*)ptr, rg); |
761 | if (tail > 2) { |
762 | int32_t rgpair = rg[2]; |
763 | memcpy(ptr + 4, &rgpair, sizeof(rgpair)); |
764 | } |
765 | } else { |
766 | int32_t rgpair = rg[0]; |
767 | memcpy(ptr, &rgpair, sizeof(rgpair)); |
768 | } |
769 | } else { |
770 | _mm_storeu_si128((__m128i*)ptr + 0, rg); |
771 | } |
772 | } |
773 | |
774 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
775 | __m128i _0, _1, _2, _3; |
776 | if (__builtin_expect(tail,0)) { |
777 | _1 = _2 = _3 = _mm_setzero_si128(); |
778 | auto load_rgb = [](const uint16_t* src) { |
779 | auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); |
780 | return _mm_insert_epi16(v, src[2], 2); |
781 | }; |
782 | if ( true ) { _0 = load_rgb(ptr + 0); } |
783 | if (tail > 1) { _1 = load_rgb(ptr + 3); } |
784 | if (tail > 2) { _2 = load_rgb(ptr + 6); } |
785 | } else { |
786 | // Load slightly weirdly to make sure we don't load past the end of 4x48 bits. |
787 | auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) , |
788 | _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4); |
789 | |
790 | // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored). |
791 | _0 = _01; |
792 | _1 = _mm_srli_si128(_01, 6); |
793 | _2 = _23; |
794 | _3 = _mm_srli_si128(_23, 6); |
795 | } |
796 | |
797 | // De-interlace to R,G,B. |
798 | auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx |
799 | _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx |
800 | |
801 | auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
802 | G = _mm_srli_si128(R, 8), |
803 | B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx |
804 | |
805 | *r = sk_unaligned_load<U16>(&R); |
806 | *g = sk_unaligned_load<U16>(&G); |
807 | *b = sk_unaligned_load<U16>(&B); |
808 | } |
809 | |
810 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
811 | __m128i _01, _23; |
812 | if (__builtin_expect(tail,0)) { |
813 | _01 = _23 = _mm_setzero_si128(); |
814 | auto src = (const double*)ptr; |
815 | if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00 |
816 | if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1 |
817 | if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00 |
818 | } else { |
819 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1 |
820 | _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3 |
821 | } |
822 | |
823 | auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 |
824 | _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 |
825 | |
826 | auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
827 | ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 |
828 | |
829 | *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0); |
830 | *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4); |
831 | *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0); |
832 | *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4); |
833 | } |
834 | |
835 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
836 | auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)), |
837 | ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a)); |
838 | |
839 | if (__builtin_expect(tail, 0)) { |
840 | auto dst = (double*)ptr; |
841 | if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); } |
842 | if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); } |
843 | if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); } |
844 | } else { |
845 | _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); |
846 | _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); |
847 | } |
848 | } |
849 | |
850 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
851 | F _01, _23; |
852 | if (__builtin_expect(tail, 0)) { |
853 | _01 = _23 = _mm_setzero_si128(); |
854 | if ( true ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); } |
855 | if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); } |
856 | if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); } |
857 | } else { |
858 | _01 = _mm_loadu_ps(ptr + 0); |
859 | _23 = _mm_loadu_ps(ptr + 4); |
860 | } |
861 | *r = _mm_shuffle_ps(_01, _23, 0x88); |
862 | *g = _mm_shuffle_ps(_01, _23, 0xDD); |
863 | } |
864 | SI void store2(float* ptr, size_t tail, F r, F g) { |
865 | F _01 = _mm_unpacklo_ps(r, g), |
866 | _23 = _mm_unpackhi_ps(r, g); |
867 | if (__builtin_expect(tail, 0)) { |
868 | if ( true ) { _mm_storel_pi((__m64*)(ptr + 0), _01); } |
869 | if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); } |
870 | if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); } |
871 | } else { |
872 | _mm_storeu_ps(ptr + 0, _01); |
873 | _mm_storeu_ps(ptr + 4, _23); |
874 | } |
875 | } |
876 | |
877 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
878 | F _0, _1, _2, _3; |
879 | if (__builtin_expect(tail, 0)) { |
880 | _1 = _2 = _3 = _mm_setzero_si128(); |
881 | if ( true ) { _0 = _mm_loadu_ps(ptr + 0); } |
882 | if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); } |
883 | if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); } |
884 | } else { |
885 | _0 = _mm_loadu_ps(ptr + 0); |
886 | _1 = _mm_loadu_ps(ptr + 4); |
887 | _2 = _mm_loadu_ps(ptr + 8); |
888 | _3 = _mm_loadu_ps(ptr +12); |
889 | } |
890 | _MM_TRANSPOSE4_PS(_0,_1,_2,_3); |
891 | *r = _0; |
892 | *g = _1; |
893 | *b = _2; |
894 | *a = _3; |
895 | } |
896 | |
897 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
898 | _MM_TRANSPOSE4_PS(r,g,b,a); |
899 | if (__builtin_expect(tail, 0)) { |
900 | if ( true ) { _mm_storeu_ps(ptr + 0, r); } |
901 | if (tail > 1) { _mm_storeu_ps(ptr + 4, g); } |
902 | if (tail > 2) { _mm_storeu_ps(ptr + 8, b); } |
903 | } else { |
904 | _mm_storeu_ps(ptr + 0, r); |
905 | _mm_storeu_ps(ptr + 4, g); |
906 | _mm_storeu_ps(ptr + 8, b); |
907 | _mm_storeu_ps(ptr +12, a); |
908 | } |
909 | } |
910 | #endif |
911 | |
// We need to be careful with casts.
913 | // (F)x means cast x to float in the portable path, but bit_cast x to float in the others. |
914 | // These named casts and bit_cast() are always what they seem to be. |
915 | #if defined(JUMPER_IS_SCALAR) |
916 | SI F cast (U32 v) { return (F)v; } |
917 | SI F cast64(U64 v) { return (F)v; } |
918 | SI U32 trunc_(F v) { return (U32)v; } |
919 | SI U32 expand(U16 v) { return (U32)v; } |
920 | SI U32 expand(U8 v) { return (U32)v; } |
921 | #else |
922 | SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); } |
923 | SI F cast64(U64 v) { return __builtin_convertvector( v, F); } |
924 | SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); } |
925 | SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); } |
926 | SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); } |
927 | #endif |
928 | |
929 | template <typename V> |
930 | SI V if_then_else(I32 c, V t, V e) { |
931 | return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e))); |
932 | } |
933 | |
934 | SI U16 bswap(U16 x) { |
935 | #if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) |
936 | // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes |
937 | // when generating code for SSE2 and SSE4.1. We'll do it manually... |
938 | auto v = widen_cast<__m128i>(x); |
939 | v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8); |
940 | return sk_unaligned_load<U16>(&v); |
941 | #else |
942 | return (x<<8) | (x>>8); |
943 | #endif |
944 | } |
945 | |
946 | SI F fract(F v) { return v - floor_(v); } |
947 | |
948 | // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. |
949 | SI F approx_log2(F x) { |
950 | // e - 127 is a fair approximation of log2(x) in its own right... |
951 | F e = cast(bit_cast<U32>(x)) * (1.0f / (1<<23)); |
952 | |
953 | // ... but using the mantissa to refine its error is _much_ better. |
954 | F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000); |
955 | return e |
956 | - 124.225514990f |
957 | - 1.498030302f * m |
958 | - 1.725879990f / (0.3520887068f + m); |
959 | } |
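// A quick sanity check of the fit above (not exhaustive): for x = 8.0f the bits are 0x41000000,
// so e = 0x41000000 * (1/2^23) = 130.0 and m = 0.5, and the expression evaluates to
// 130 - 124.225514990 - 1.498030302*0.5 - 1.725879990/0.8520887068, which is 3.0 to within
// float precision, matching log2(8) = 3.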
960 | |
961 | SI F approx_log(F x) { |
962 | const float ln2 = 0.69314718f; |
963 | return ln2 * approx_log2(x); |
964 | } |
965 | |
966 | SI F approx_pow2(F x) { |
967 | F f = fract(x); |
968 | return bit_cast<F>(round(1.0f * (1<<23), |
969 | x + 121.274057500f |
970 | - 1.490129070f * f |
971 | + 27.728023300f / (4.84252568f - f))); |
972 | } |
973 | |
974 | SI F approx_exp(F x) { |
975 | const float log2_e = 1.4426950408889634074f; |
976 | return approx_pow2(log2_e * x); |
977 | } |
978 | |
979 | SI F approx_powf(F x, F y) { |
980 | #if defined(SK_LEGACY_APPROX_POWF_SPECIALCASE) |
981 | return if_then_else((x == 0) , 0 |
982 | #else |
983 | return if_then_else((x == 0)|(x == 1), x |
984 | #endif |
985 | , approx_pow2(approx_log2(x) * y)); |
986 | } |
987 | |
988 | SI F from_half(U16 h) { |
989 | #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ |
990 | && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. |
991 | return vcvt_f32_f16(h); |
992 | |
993 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
994 | return _mm256_cvtph_ps(h); |
995 | |
996 | #else |
997 | // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias. |
998 | U32 sem = expand(h), |
999 | s = sem & 0x8000, |
1000 | em = sem ^ s; |
1001 | |
1002 | // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero. |
1003 | auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here. |
1004 | return if_then_else(denorm, F(0) |
1005 | , bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) )); |
1006 | #endif |
1007 | } |
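
// As a concrete example of the fallback path above, the half 1.0f (bits 0x3C00) has s = 0 and
// em = 0x3C00, is not denormal, and (s<<16) + (em<<13) + ((127-15)<<23) == 0x3F800000, i.e. 1.0f.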
1008 | |
1009 | SI U16 to_half(F f) { |
1010 | #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ |
1011 | && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. |
1012 | return vcvt_f16_f32(f); |
1013 | |
1014 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
1015 | return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION); |
1016 | |
1017 | #else |
1018 | // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias. |
1019 | U32 sem = bit_cast<U32>(f), |
1020 | s = sem & 0x80000000, |
1021 | em = sem ^ s; |
1022 | |
1023 | // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero. |
1024 | auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here. |
1025 | return pack(if_then_else(denorm, U32(0) |
1026 | , (s>>16) + (em>>13) - ((127-15)<<10))); |
1027 | #endif |
1028 | } |
1029 | |
1030 | // Our fundamental vector depth is our pixel stride. |
1031 | static const size_t N = sizeof(F) / sizeof(float); |
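// With the types above, N works out to 1 on the scalar path, 4 for NEON and SSE2/SSE4.1,
// and 8 for AVX/HSW/AVX-512.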
1032 | |
1033 | // We're finally going to get to what a Stage function looks like! |
1034 | // tail == 0 ~~> work on a full N pixels |
1035 | // tail != 0 ~~> work on only the first tail pixels |
1036 | // tail is always < N. |
1037 | |
1038 | // Any custom ABI to use for all (non-externally-facing) stage functions? |
1039 | // Also decide here whether to use narrow (compromise) or wide (ideal) stages. |
1040 | #if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON) |
1041 | // This lets us pass vectors more efficiently on 32-bit ARM. |
1042 | // We can still only pass 16 floats, so best as 4x {r,g,b,a}. |
1043 | #define ABI __attribute__((pcs("aapcs-vfp"))) |
1044 | #define JUMPER_NARROW_STAGES 1 |
1045 | #elif 0 && defined(_MSC_VER) && defined(__clang__) && defined(__x86_64__) |
1046 | // SysV ABI makes it very sensible to use wide stages with clang-cl. |
1047 | // TODO: crashes during compilation :( |
1048 | #define ABI __attribute__((sysv_abi)) |
1049 | #define JUMPER_NARROW_STAGES 0 |
1050 | #elif defined(_MSC_VER) |
1051 | // Even if not vectorized, this lets us pass {r,g,b,a} as registers, |
1052 | // instead of {b,a} on the stack. Narrow stages work best for __vectorcall. |
1053 | #define ABI __vectorcall |
1054 | #define JUMPER_NARROW_STAGES 1 |
1055 | #elif defined(__x86_64__) || defined(SK_CPU_ARM64) |
1056 | // These platforms are ideal for wider stages, and their default ABI is ideal. |
1057 | #define ABI |
1058 | #define JUMPER_NARROW_STAGES 0 |
1059 | #else |
1060 | // 32-bit or unknown... shunt them down the narrow path. |
1061 | // Odds are these have few registers and are better off there. |
1062 | #define ABI |
1063 | #define JUMPER_NARROW_STAGES 1 |
1064 | #endif |
1065 | |
1066 | #if JUMPER_NARROW_STAGES |
1067 | struct Params { |
1068 | size_t dx, dy, tail; |
1069 | F dr,dg,db,da; |
1070 | }; |
1071 | using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a); |
1072 | #else |
1073 | // We keep program the second argument, so that it's passed in rsi for load_and_inc(). |
1074 | using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F); |
1075 | #endif |
1076 | |
1077 | |
1078 | static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) { |
1079 | auto start = (Stage)load_and_inc(program); |
1080 | const size_t x0 = dx; |
1081 | for (; dy < ylimit; dy++) { |
1082 | #if JUMPER_NARROW_STAGES |
1083 | Params params = { x0,dy,0, 0,0,0,0 }; |
1084 | while (params.dx + N <= xlimit) { |
1085 | start(¶ms,program, 0,0,0,0); |
1086 | params.dx += N; |
1087 | } |
1088 | if (size_t tail = xlimit - params.dx) { |
1089 | params.tail = tail; |
1090 | start(¶ms,program, 0,0,0,0); |
1091 | } |
1092 | #else |
1093 | dx = x0; |
1094 | while (dx + N <= xlimit) { |
1095 | start(0,program,dx,dy, 0,0,0,0, 0,0,0,0); |
1096 | dx += N; |
1097 | } |
1098 | if (size_t tail = xlimit - dx) { |
1099 | start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); |
1100 | } |
1101 | #endif |
1102 | } |
1103 | } |
1104 | |
1105 | #if JUMPER_NARROW_STAGES |
1106 | #define STAGE(name, ...) \ |
1107 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1108 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ |
1109 | static void ABI name(Params* params, void** program, \ |
1110 | F r, F g, F b, F a) { \ |
1111 | name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \ |
1112 | params->dr, params->dg, params->db, params->da); \ |
1113 | auto next = (Stage)load_and_inc(program); \ |
1114 | next(params,program, r,g,b,a); \ |
1115 | } \ |
1116 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1117 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) |
1118 | #else |
1119 | #define STAGE(name, ...) \ |
1120 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1121 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ |
1122 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
1123 | F r, F g, F b, F a, F dr, F dg, F db, F da) { \ |
1124 | name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \ |
1125 | auto next = (Stage)load_and_inc(program); \ |
1126 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
1127 | } \ |
1128 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1129 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) |
1130 | #endif |
1131 | |
1132 | |
1133 | // just_return() is a simple no-op stage that only exists to end the chain, |
1134 | // returning back up to start_pipeline(), and from there to the caller. |
1135 | #if JUMPER_NARROW_STAGES |
1136 | static void ABI just_return(Params*, void**, F,F,F,F) {} |
1137 | #else |
1138 | static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {} |
1139 | #endif |
1140 | |
1141 | |
1142 | // We could start defining normal Stages now. But first, some helper functions. |
1143 | |
1144 | // These load() and store() methods are tail-aware, |
1145 | // but focus mainly on keeping the at-stride tail==0 case fast. |
1146 | |
1147 | template <typename V, typename T> |
1148 | SI V load(const T* src, size_t tail) { |
1149 | #if !defined(JUMPER_IS_SCALAR) |
1150 | __builtin_assume(tail < N); |
1151 | if (__builtin_expect(tail, 0)) { |
1152 | V v{}; // Any inactive lanes are zeroed. |
1153 | switch (tail) { |
1154 | case 7: v[6] = src[6]; |
1155 | case 6: v[5] = src[5]; |
1156 | case 5: v[4] = src[4]; |
1157 | case 4: memcpy(&v, src, 4*sizeof(T)); break; |
1158 | case 3: v[2] = src[2]; |
1159 | case 2: memcpy(&v, src, 2*sizeof(T)); break; |
1160 | case 1: memcpy(&v, src, 1*sizeof(T)); break; |
1161 | } |
1162 | return v; |
1163 | } |
1164 | #endif |
1165 | return sk_unaligned_load<V>(src); |
1166 | } |
1167 | |
1168 | template <typename V, typename T> |
1169 | SI void store(T* dst, V v, size_t tail) { |
1170 | #if !defined(JUMPER_IS_SCALAR) |
1171 | __builtin_assume(tail < N); |
1172 | if (__builtin_expect(tail, 0)) { |
1173 | switch (tail) { |
1174 | case 7: dst[6] = v[6]; |
1175 | case 6: dst[5] = v[5]; |
1176 | case 5: dst[4] = v[4]; |
1177 | case 4: memcpy(dst, &v, 4*sizeof(T)); break; |
1178 | case 3: dst[2] = v[2]; |
1179 | case 2: memcpy(dst, &v, 2*sizeof(T)); break; |
1180 | case 1: memcpy(dst, &v, 1*sizeof(T)); break; |
1181 | } |
1182 | return; |
1183 | } |
1184 | #endif |
1185 | sk_unaligned_store(dst, v); |
1186 | } |
1187 | |
1188 | SI F from_byte(U8 b) { |
1189 | return cast(expand(b)) * (1/255.0f); |
1190 | } |
1191 | SI F from_short(U16 s) { |
1192 | return cast(expand(s)) * (1/65535.0f); |
1193 | } |
1194 | SI void from_565(U16 _565, F* r, F* g, F* b) { |
1195 | U32 wide = expand(_565); |
1196 | *r = cast(wide & (31<<11)) * (1.0f / (31<<11)); |
1197 | *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5)); |
1198 | *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0)); |
1199 | } |
1200 | SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) { |
1201 | U32 wide = expand(_4444); |
1202 | *r = cast(wide & (15<<12)) * (1.0f / (15<<12)); |
1203 | *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8)); |
1204 | *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4)); |
1205 | *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0)); |
1206 | } |
1207 | SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) { |
1208 | *r = cast((_8888 ) & 0xff) * (1/255.0f); |
1209 | *g = cast((_8888 >> 8) & 0xff) * (1/255.0f); |
1210 | *b = cast((_8888 >> 16) & 0xff) * (1/255.0f); |
1211 | *a = cast((_8888 >> 24) ) * (1/255.0f); |
1212 | } |
1213 | SI void from_88(U16 _88, F* r, F* g) { |
1214 | U32 wide = expand(_88); |
1215 | *r = cast((wide ) & 0xff) * (1/255.0f); |
1216 | *g = cast((wide >> 8) & 0xff) * (1/255.0f); |
1217 | } |
1218 | SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) { |
1219 | *r = cast((rgba ) & 0x3ff) * (1/1023.0f); |
1220 | *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f); |
1221 | *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f); |
1222 | *a = cast((rgba >> 30) ) * (1/ 3.0f); |
1223 | } |
1224 | SI void from_1616(U32 _1616, F* r, F* g) { |
1225 | *r = cast((_1616 ) & 0xffff) * (1/65535.0f); |
1226 | *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f); |
1227 | } |
1228 | SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) { |
1229 | *r = cast64((_16161616 ) & 0xffff) * (1/65535.0f); |
1230 | *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f); |
1231 | *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f); |
1232 | *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f); |
1233 | } |
1234 | |
1235 | // Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory. |
1236 | template <typename T> |
1237 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { |
1238 | return (T*)ctx->pixels + dy*ctx->stride + dx; |
1239 | } |
1240 | |
1241 | // clamp v to [0,limit). |
1242 | SI F clamp(F v, F limit) { |
1243 | F inclusive = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive. |
1244 | return min(max(0, v), inclusive); |
1245 | } |
1246 | |
1247 | // Used by gather_ stages to calculate the base pointer and a vector of indices to load. |
1248 | template <typename T> |
1249 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { |
1250 | x = clamp(x, ctx->width); |
1251 | y = clamp(y, ctx->height); |
1252 | |
1253 | *ptr = (const T*)ctx->pixels; |
1254 | return trunc_(y)*ctx->stride + trunc_(x); |
1255 | } |
1256 | |
1257 | // We often have a nominally [0,1] float value we need to scale and convert to an integer, |
1258 | // whether for a table lookup or to pack back down into bytes for storage. |
1259 | // |
1260 | // In practice, especially when dealing with interesting color spaces, that notionally |
1261 | // [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp. |
1262 | // |
1263 | // You can adjust the expected input to [0,bias] by tweaking that parameter. |
1264 | SI U32 to_unorm(F v, F scale, F bias = 1.0f) { |
// TODO: platform-specific implementations of to_unorm(), removing round() entirely?
1266 | // Any time we use round() we probably want to use to_unorm(). |
1267 | return round(min(max(0, v), bias), scale); |
1268 | } |
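// For example, to_unorm(v, 255.0f) clamps v to [0,1] and rounds v*255 to an integer in [0,255],
// ready to be packed back down into bytes.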
1269 | |
1270 | SI I32 cond_to_mask(I32 cond) { return if_then_else(cond, I32(~0), I32(0)); } |
1271 | |
1272 | // Now finally, normal Stages! |
1273 | |
1274 | STAGE(seed_shader, Ctx::None) { |
1275 | static const float iota[] = { |
1276 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, |
1277 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, |
1278 | }; |
1279 | // It's important for speed to explicitly cast(dx) and cast(dy), |
1280 | // which has the effect of splatting them to vectors before converting to floats. |
1281 | // On Intel this breaks a data dependency on previous loop iterations' registers. |
1282 | r = cast(dx) + sk_unaligned_load<F>(iota); |
1283 | g = cast(dy) + 0.5f; |
1284 | b = 1.0f; |
1285 | a = 0; |
1286 | dr = dg = db = da = 0; |
1287 | } |
1288 | |
1289 | STAGE(dither, const float* rate) { |
1290 | // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors. |
1291 | uint32_t iota[] = {0,1,2,3,4,5,6,7}; |
1292 | U32 X = dx + sk_unaligned_load<U32>(iota), |
1293 | Y = dy; |
1294 | |
1295 | // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering. |
1296 | // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ]. |
1297 | |
1298 | // We only need X and X^Y from here on, so it's easier to just think of that as "Y". |
1299 | Y ^= X; |
1300 | |
1301 | // We'll mix the bottom 3 bits of each of X and Y to make 6 bits, |
1302 | // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda. |
1303 | U32 M = (Y & 1) << 5 | (X & 1) << 4 |
1304 | | (Y & 2) << 2 | (X & 2) << 1 |
1305 | | (Y & 4) >> 1 | (X & 4) >> 2; |
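// (For example, if the low three bits of X are 0b011 and of Y are 0b101, M == 0b110110 == 54.)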
1306 | |
1307 | // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon. |
1308 | // We want to make sure our dither is less than 0.5 in either direction to keep exact values |
1309 | // like 0 and 1 unchanged after rounding. |
1310 | F dither = cast(M) * (2/128.0f) - (63/128.0f); |
1311 | |
1312 | r += *rate*dither; |
1313 | g += *rate*dither; |
1314 | b += *rate*dither; |
1315 | |
1316 | r = max(0, min(r, a)); |
1317 | g = max(0, min(g, a)); |
1318 | b = max(0, min(b, a)); |
1319 | } |
1320 | |
1321 | // load 4 floats from memory, and splat them into r,g,b,a |
1322 | STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
1323 | r = c->r; |
1324 | g = c->g; |
1325 | b = c->b; |
1326 | a = c->a; |
1327 | } |
1328 | STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
1329 | r = c->r; |
1330 | g = c->g; |
1331 | b = c->b; |
1332 | a = c->a; |
1333 | } |
1334 | // load 4 floats from memory, and splat them into dr,dg,db,da |
1335 | STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { |
1336 | dr = c->r; |
1337 | dg = c->g; |
1338 | db = c->b; |
1339 | da = c->a; |
1340 | } |
1341 | |
1342 | // splats opaque-black into r,g,b,a |
1343 | STAGE(black_color, Ctx::None) { |
1344 | r = g = b = 0.0f; |
1345 | a = 1.0f; |
1346 | } |
1347 | |
1348 | STAGE(white_color, Ctx::None) { |
1349 | r = g = b = a = 1.0f; |
1350 | } |
1351 | |
// load registers r,g,b,a from context (mirrors store_src)
1353 | STAGE(load_src, const float* ptr) { |
1354 | r = sk_unaligned_load<F>(ptr + 0*N); |
1355 | g = sk_unaligned_load<F>(ptr + 1*N); |
1356 | b = sk_unaligned_load<F>(ptr + 2*N); |
1357 | a = sk_unaligned_load<F>(ptr + 3*N); |
1358 | } |
1359 | |
// store registers r,g,b,a into context (mirrors load_src)
1361 | STAGE(store_src, float* ptr) { |
1362 | sk_unaligned_store(ptr + 0*N, r); |
1363 | sk_unaligned_store(ptr + 1*N, g); |
1364 | sk_unaligned_store(ptr + 2*N, b); |
1365 | sk_unaligned_store(ptr + 3*N, a); |
1366 | } |
1367 | STAGE(store_src_a, float* ptr) { |
1368 | sk_unaligned_store(ptr, a); |
1369 | } |
1370 | |
1371 | // load registers dr,dg,db,da from context (mirrors store_dst) |
1372 | STAGE(load_dst, const float* ptr) { |
1373 | dr = sk_unaligned_load<F>(ptr + 0*N); |
1374 | dg = sk_unaligned_load<F>(ptr + 1*N); |
1375 | db = sk_unaligned_load<F>(ptr + 2*N); |
1376 | da = sk_unaligned_load<F>(ptr + 3*N); |
1377 | } |
1378 | |
1379 | // store registers dr,dg,db,da into context (mirrors load_dst) |
1380 | STAGE(store_dst, float* ptr) { |
1381 | sk_unaligned_store(ptr + 0*N, dr); |
1382 | sk_unaligned_store(ptr + 1*N, dg); |
1383 | sk_unaligned_store(ptr + 2*N, db); |
1384 | sk_unaligned_store(ptr + 3*N, da); |
1385 | } |
1386 | |
1387 | // Most blend modes apply the same logic to each channel. |
1388 | #define BLEND_MODE(name) \ |
1389 | SI F name##_channel(F s, F d, F sa, F da); \ |
1390 | STAGE(name, Ctx::None) { \ |
1391 | r = name##_channel(r,dr,a,da); \ |
1392 | g = name##_channel(g,dg,a,da); \ |
1393 | b = name##_channel(b,db,a,da); \ |
1394 | a = name##_channel(a,da,a,da); \ |
1395 | } \ |
1396 | SI F name##_channel(F s, F d, F sa, F da) |
1397 | |
1398 | SI F inv(F x) { return 1.0f - x; } |
1399 | SI F two(F x) { return x + x; } |
1400 | |
1401 | |
1402 | BLEND_MODE(clear) { return 0; } |
1403 | BLEND_MODE(srcatop) { return s*da + d*inv(sa); } |
1404 | BLEND_MODE(dstatop) { return d*sa + s*inv(da); } |
1405 | BLEND_MODE(srcin) { return s * da; } |
1406 | BLEND_MODE(dstin) { return d * sa; } |
1407 | BLEND_MODE(srcout) { return s * inv(da); } |
1408 | BLEND_MODE(dstout) { return d * inv(sa); } |
1409 | BLEND_MODE(srcover) { return mad(d, inv(sa), s); } |
1410 | BLEND_MODE(dstover) { return mad(s, inv(da), d); } |
1411 | |
1412 | BLEND_MODE(modulate) { return s*d; } |
1413 | BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; } |
1414 | BLEND_MODE(plus_) { return min(s + d, 1.0f); } // We can clamp to either 1 or sa. |
1415 | BLEND_MODE(screen) { return s + d - s*d; } |
1416 | BLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); } |
1417 | #undef BLEND_MODE |
1418 | |
1419 | // Most other blend modes apply the same logic to colors, and srcover to alpha. |
1420 | #define BLEND_MODE(name) \ |
1421 | SI F name##_channel(F s, F d, F sa, F da); \ |
1422 | STAGE(name, Ctx::None) { \ |
1423 | r = name##_channel(r,dr,a,da); \ |
1424 | g = name##_channel(g,dg,a,da); \ |
1425 | b = name##_channel(b,db,a,da); \ |
1426 | a = mad(da, inv(a), a); \ |
1427 | } \ |
1428 | SI F name##_channel(F s, F d, F sa, F da) |
1429 | |
1430 | BLEND_MODE(darken) { return s + d - max(s*da, d*sa) ; } |
1431 | BLEND_MODE(lighten) { return s + d - min(s*da, d*sa) ; } |
1432 | BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); } |
1433 | BLEND_MODE(exclusion) { return s + d - two(s*d); } |
1434 | |
1435 | BLEND_MODE(colorburn) { |
1436 | return if_then_else(d == da, d + s*inv(da), |
1437 | if_then_else(s == 0, /* s + */ d*inv(sa), |
1438 | sa*(da - min(da, (da-d)*sa*rcp(s))) + s*inv(da) + d*inv(sa))); |
1439 | } |
1440 | BLEND_MODE(colordodge) { |
1441 | return if_then_else(d == 0, /* d + */ s*inv(da), |
1442 | if_then_else(s == sa, s + d*inv(sa), |
1443 | sa*min(da, (d*sa)*rcp(sa - s)) + s*inv(da) + d*inv(sa))); |
1444 | } |
1445 | BLEND_MODE(hardlight) { |
1446 | return s*inv(da) + d*inv(sa) |
1447 | + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s))); |
1448 | } |
1449 | BLEND_MODE(overlay) { |
1450 | return s*inv(da) + d*inv(sa) |
1451 | + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s))); |
1452 | } |
1453 | |
1454 | BLEND_MODE(softlight) { |
1455 | F m = if_then_else(da > 0, d / da, 0), |
1456 | s2 = two(s), |
1457 | m4 = two(two(m)); |
1458 | |
1459 | // The logic forks three ways: |
1460 | // 1. dark src? |
1461 | // 2. light src, dark dst? |
1462 | // 3. light src, light dst? |
1463 | F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)), // Used in case 1. |
1464 | darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m, // Used in case 2. |
1465 | liteDst = rcp(rsqrt(m)) - m, // Used in case 3. |
1466 | liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3? |
1467 | return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)? |
1468 | } |
1469 | #undef BLEND_MODE |
1470 | |
// We're basing our implementation of non-separable blend modes on
1472 | // https://www.w3.org/TR/compositing-1/#blendingnonseparable. |
1473 | // and |
1474 | // https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf |
1475 | // They're equivalent, but ES' math has been better simplified. |
1476 | // |
1477 | // Anything extra we add beyond that is to make the math work with premul inputs. |
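//
// For reference, the spec's non-separable helpers map onto the routines below roughly as
//     Lum(C)       = 0.30*R + 0.59*G + 0.11*B                        -> lum()
//     Sat(C)       = max(R,G,B) - min(R,G,B)                         -> sat()
//     SetSat(C, s) : min channel -> 0, max -> s, middle scaled       -> set_sat()
//     SetLum(C, l) : add (l - Lum(C)) to each channel                -> set_lum()
//     ClipColor(C) : pull channels back toward Lum(C) into range     -> clip_color()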
1478 | |
1479 | SI F sat(F r, F g, F b) { return max(r, max(g,b)) - min(r, min(g,b)); } |
1480 | SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; } |
1481 | |
1482 | SI void set_sat(F* r, F* g, F* b, F s) { |
1483 | F mn = min(*r, min(*g,*b)), |
1484 | mx = max(*r, max(*g,*b)), |
1485 | sat = mx - mn; |
1486 | |
1487 | // Map min channel to 0, max channel to s, and scale the middle proportionally. |
1488 | auto scale = [=](F c) { |
1489 | return if_then_else(sat == 0, 0, (c - mn) * s / sat); |
1490 | }; |
1491 | *r = scale(*r); |
1492 | *g = scale(*g); |
1493 | *b = scale(*b); |
1494 | } |
1495 | SI void set_lum(F* r, F* g, F* b, F l) { |
1496 | F diff = l - lum(*r, *g, *b); |
1497 | *r += diff; |
1498 | *g += diff; |
1499 | *b += diff; |
1500 | } |
1501 | SI void clip_color(F* r, F* g, F* b, F a) { |
1502 | F mn = min(*r, min(*g, *b)), |
1503 | mx = max(*r, max(*g, *b)), |
1504 | l = lum(*r, *g, *b); |
1505 | |
1506 | auto clip = [=](F c) { |
1507 | c = if_then_else(mn >= 0, c, l + (c - l) * ( l) / (l - mn) ); |
1508 | c = if_then_else(mx > a, l + (c - l) * (a - l) / (mx - l), c); |
1509 | c = max(c, 0); // Sometimes without this we may dip just a little negative. |
1510 | return c; |
1511 | }; |
1512 | *r = clip(*r); |
1513 | *g = clip(*g); |
1514 | *b = clip(*b); |
1515 | } |
1516 | |
1517 | STAGE(hue, Ctx::None) { |
1518 | F R = r*a, |
1519 | G = g*a, |
1520 | B = b*a; |
1521 | |
1522 | set_sat(&R, &G, &B, sat(dr,dg,db)*a); |
1523 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); |
1524 | clip_color(&R,&G,&B, a*da); |
1525 | |
1526 | r = r*inv(da) + dr*inv(a) + R; |
1527 | g = g*inv(da) + dg*inv(a) + G; |
1528 | b = b*inv(da) + db*inv(a) + B; |
1529 | a = a + da - a*da; |
1530 | } |
1531 | STAGE(saturation, Ctx::None) { |
1532 | F R = dr*a, |
1533 | G = dg*a, |
1534 | B = db*a; |
1535 | |
1536 | set_sat(&R, &G, &B, sat( r, g, b)*da); |
1537 | set_lum(&R, &G, &B, lum(dr,dg,db)* a); // (This is not redundant.) |
1538 | clip_color(&R,&G,&B, a*da); |
1539 | |
1540 | r = r*inv(da) + dr*inv(a) + R; |
1541 | g = g*inv(da) + dg*inv(a) + G; |
1542 | b = b*inv(da) + db*inv(a) + B; |
1543 | a = a + da - a*da; |
1544 | } |
1545 | STAGE(color, Ctx::None) { |
1546 | F R = r*da, |
1547 | G = g*da, |
1548 | B = b*da; |
1549 | |
1550 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); |
1551 | clip_color(&R,&G,&B, a*da); |
1552 | |
1553 | r = r*inv(da) + dr*inv(a) + R; |
1554 | g = g*inv(da) + dg*inv(a) + G; |
1555 | b = b*inv(da) + db*inv(a) + B; |
1556 | a = a + da - a*da; |
1557 | } |
1558 | STAGE(luminosity, Ctx::None) { |
1559 | F R = dr*a, |
1560 | G = dg*a, |
1561 | B = db*a; |
1562 | |
1563 | set_lum(&R, &G, &B, lum(r,g,b)*da); |
1564 | clip_color(&R,&G,&B, a*da); |
1565 | |
1566 | r = r*inv(da) + dr*inv(a) + R; |
1567 | g = g*inv(da) + dg*inv(a) + G; |
1568 | b = b*inv(da) + db*inv(a) + B; |
1569 | a = a + da - a*da; |
1570 | } |
1571 | |
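// Fused load_8888_dst + srcover + store_8888: the destination stays in [0,255] and the source is
// scaled up by 255 inside the mad(), so we skip converting the destination through [0,1] and back.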
1572 | STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
1573 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
1574 | |
1575 | U32 dst = load<U32>(ptr, tail); |
1576 | dr = cast((dst ) & 0xff); |
1577 | dg = cast((dst >> 8) & 0xff); |
1578 | db = cast((dst >> 16) & 0xff); |
1579 | da = cast((dst >> 24) ); |
1580 | // {dr,dg,db,da} are in [0,255] |
1581 | // { r, g, b, a} are in [0, 1] (but may be out of gamut) |
1582 | |
1583 | r = mad(dr, inv(a), r*255.0f); |
1584 | g = mad(dg, inv(a), g*255.0f); |
1585 | b = mad(db, inv(a), b*255.0f); |
1586 | a = mad(da, inv(a), a*255.0f); |
1587 | // { r, g, b, a} are now in [0,255] (but may be out of gamut) |
1588 | |
1589 | // to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased. |
1590 | dst = to_unorm(r, 1, 255) |
1591 | | to_unorm(g, 1, 255) << 8 |
1592 | | to_unorm(b, 1, 255) << 16 |
1593 | | to_unorm(a, 1, 255) << 24; |
1594 | store(ptr, dst, tail); |
1595 | } |
1596 | |
1597 | STAGE(clamp_0, Ctx::None) { |
1598 | r = max(r, 0); |
1599 | g = max(g, 0); |
1600 | b = max(b, 0); |
1601 | a = max(a, 0); |
1602 | } |
1603 | |
1604 | STAGE(clamp_1, Ctx::None) { |
1605 | r = min(r, 1.0f); |
1606 | g = min(g, 1.0f); |
1607 | b = min(b, 1.0f); |
1608 | a = min(a, 1.0f); |
1609 | } |
1610 | |
1611 | STAGE(clamp_a, Ctx::None) { |
1612 | a = min(a, 1.0f); |
1613 | r = min(r, a); |
1614 | g = min(g, a); |
1615 | b = min(b, a); |
1616 | } |
1617 | |
1618 | STAGE(clamp_gamut, Ctx::None) { |
1619 | a = min(max(a, 0), 1.0f); |
1620 | r = min(max(r, 0), a); |
1621 | g = min(max(g, 0), a); |
1622 | b = min(max(b, 0), a); |
1623 | } |
1624 | |
1625 | STAGE(set_rgb, const float* rgb) { |
1626 | r = rgb[0]; |
1627 | g = rgb[1]; |
1628 | b = rgb[2]; |
1629 | } |
1630 | STAGE(unbounded_set_rgb, const float* rgb) { |
1631 | r = rgb[0]; |
1632 | g = rgb[1]; |
1633 | b = rgb[2]; |
1634 | } |
1635 | |
1636 | STAGE(swap_rb, Ctx::None) { |
1637 | auto tmp = r; |
1638 | r = b; |
1639 | b = tmp; |
1640 | } |
1641 | STAGE(swap_rb_dst, Ctx::None) { |
1642 | auto tmp = dr; |
1643 | dr = db; |
1644 | db = tmp; |
1645 | } |
1646 | |
1647 | STAGE(move_src_dst, Ctx::None) { |
1648 | dr = r; |
1649 | dg = g; |
1650 | db = b; |
1651 | da = a; |
1652 | } |
1653 | STAGE(move_dst_src, Ctx::None) { |
1654 | r = dr; |
1655 | g = dg; |
1656 | b = db; |
1657 | a = da; |
1658 | } |
1659 | |
1660 | STAGE(premul, Ctx::None) { |
1661 | r = r * a; |
1662 | g = g * a; |
1663 | b = b * a; |
1664 | } |
1665 | STAGE(premul_dst, Ctx::None) { |
1666 | dr = dr * da; |
1667 | dg = dg * da; |
1668 | db = db * da; |
1669 | } |
1670 | STAGE(unpremul, Ctx::None) { |
1671 | float inf = bit_cast<float>(0x7f800000); |
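    // When a == 0 (or is small enough that 1/a overflows), 1/a is not finite; use scale = 0 there.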
1672 | auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0); |
1673 | r *= scale; |
1674 | g *= scale; |
1675 | b *= scale; |
1676 | } |
1677 | |
1678 | STAGE(force_opaque , Ctx::None) { a = 1; } |
1679 | STAGE(force_opaque_dst, Ctx::None) { da = 1; } |
1680 | |
1681 | // Clamp x to [0,1], both sides inclusive (think, gradients). |
1682 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. |
1683 | SI F clamp_01(F v) { return min(max(0, v), 1); } |
1684 | |
1685 | STAGE(rgb_to_hsl, Ctx::None) { |
1686 | F mx = max(r, max(g,b)), |
1687 | mn = min(r, min(g,b)), |
1688 | d = mx - mn, |
1689 | d_rcp = 1.0f / d; |
1690 | |
1691 | F h = (1/6.0f) * |
1692 | if_then_else(mx == mn, 0, |
1693 | if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0), |
1694 | if_then_else(mx == g, (b-r)*d_rcp + 2.0f, |
1695 | (r-g)*d_rcp + 4.0f))); |
1696 | |
1697 | F l = (mx + mn) * 0.5f; |
1698 | F s = if_then_else(mx == mn, 0, |
1699 | d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn)); |
1700 | |
1701 | r = h; |
1702 | g = s; |
1703 | b = l; |
1704 | } |
1705 | STAGE(hsl_to_rgb, Ctx::None) { |
    // See GrHSLToRGBFilterEffect.fp
1707 | |
1708 | F h = r, |
1709 | s = g, |
1710 | l = b, |
1711 | c = (1.0f - abs_(2.0f * l - 1)) * s; |
1712 | |
1713 | auto hue_to_rgb = [&](F hue) { |
1714 | F q = clamp_01(abs_(fract(hue) * 6.0f - 3.0f) - 1.0f); |
1715 | return (q - 0.5f) * c + l; |
1716 | }; |
1717 | |
1718 | r = hue_to_rgb(h + 0.0f/3.0f); |
1719 | g = hue_to_rgb(h + 2.0f/3.0f); |
1720 | b = hue_to_rgb(h + 1.0f/3.0f); |
1721 | } |
1722 | |
1723 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. |
1724 | SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) { |
1725 | return if_then_else(a < da, min(cr, min(cg,cb)) |
1726 | , max(cr, max(cg,cb))); |
1727 | } |
1728 | |
1729 | STAGE(scale_1_float, const float* c) { |
1730 | r = r * *c; |
1731 | g = g * *c; |
1732 | b = b * *c; |
1733 | a = a * *c; |
1734 | } |
1735 | STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
1736 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1737 | |
1738 | auto scales = load<U8>(ptr, tail); |
1739 | auto c = from_byte(scales); |
1740 | |
1741 | r = r * c; |
1742 | g = g * c; |
1743 | b = b * c; |
1744 | a = a * c; |
1745 | } |
1746 | STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1747 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1748 | |
1749 | F cr,cg,cb; |
1750 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); |
1751 | |
1752 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
1753 | |
1754 | r = r * cr; |
1755 | g = g * cg; |
1756 | b = b * cb; |
1757 | a = a * ca; |
1758 | } |
1759 | |
1760 | SI F lerp(F from, F to, F t) { |
1761 | return mad(to-from, t, from); |
1762 | } |
1763 | |
1764 | STAGE(lerp_1_float, const float* c) { |
1765 | r = lerp(dr, r, *c); |
1766 | g = lerp(dg, g, *c); |
1767 | b = lerp(db, b, *c); |
1768 | a = lerp(da, a, *c); |
1769 | } |
1770 | STAGE(scale_native, const float scales[]) { |
1771 | auto c = sk_unaligned_load<F>(scales); |
1772 | r = r * c; |
1773 | g = g * c; |
1774 | b = b * c; |
1775 | a = a * c; |
1776 | } |
1777 | STAGE(lerp_native, const float scales[]) { |
1778 | auto c = sk_unaligned_load<F>(scales); |
1779 | r = lerp(dr, r, c); |
1780 | g = lerp(dg, g, c); |
1781 | b = lerp(db, b, c); |
1782 | a = lerp(da, a, c); |
1783 | } |
1784 | STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
1785 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1786 | |
1787 | auto scales = load<U8>(ptr, tail); |
1788 | auto c = from_byte(scales); |
1789 | |
1790 | r = lerp(dr, r, c); |
1791 | g = lerp(dg, g, c); |
1792 | b = lerp(db, b, c); |
1793 | a = lerp(da, a, c); |
1794 | } |
1795 | STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1796 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1797 | |
1798 | F cr,cg,cb; |
1799 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); |
1800 | |
1801 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
1802 | |
1803 | r = lerp(dr, r, cr); |
1804 | g = lerp(dg, g, cg); |
1805 | b = lerp(db, b, cb); |
1806 | a = lerp(da, a, ca); |
1807 | } |
1808 | |
1809 | STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) { |
1810 | auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), |
1811 | aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy); |
1812 | |
1813 | F mul = from_byte(load<U8>(mptr, tail)), |
1814 | add = from_byte(load<U8>(aptr, tail)); |
1815 | |
1816 | r = mad(r, mul, add); |
1817 | g = mad(g, mul, add); |
1818 | b = mad(b, mul, add); |
1819 | } |
1820 | |
STAGE(byte_tables, const void* ctx) {  // TODO: rename Tables to SkRasterPipeline_ByteTablesCtx
1822 | struct Tables { const uint8_t *r, *g, *b, *a; }; |
1823 | auto tables = (const Tables*)ctx; |
1824 | |
1825 | r = from_byte(gather(tables->r, to_unorm(r, 255))); |
1826 | g = from_byte(gather(tables->g, to_unorm(g, 255))); |
1827 | b = from_byte(gather(tables->b, to_unorm(b, 255))); |
1828 | a = from_byte(gather(tables->a, to_unorm(a, 255))); |
1829 | } |
1830 | |
1831 | SI F strip_sign(F x, U32* sign) { |
1832 | U32 bits = bit_cast<U32>(x); |
1833 | *sign = bits & 0x80000000; |
1834 | return bit_cast<F>(bits ^ *sign); |
1835 | } |
1836 | |
1837 | SI F apply_sign(F x, U32 sign) { |
1838 | return bit_cast<F>(sign | bit_cast<U32>(x)); |
1839 | } |
1840 | |
1841 | STAGE(parametric, const skcms_TransferFunction* ctx) { |
1842 | auto fn = [&](F v) { |
1843 | U32 sign; |
1844 | v = strip_sign(v, &sign); |
1845 | |
1846 | F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f) |
1847 | , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e); |
1848 | return apply_sign(r, sign); |
1849 | }; |
1850 | r = fn(r); |
1851 | g = fn(g); |
1852 | b = fn(b); |
1853 | } |
1854 | |
1855 | STAGE(gamma_, const float* G) { |
1856 | auto fn = [&](F v) { |
1857 | U32 sign; |
1858 | v = strip_sign(v, &sign); |
1859 | return apply_sign(approx_powf(v, *G), sign); |
1860 | }; |
1861 | r = fn(r); |
1862 | g = fn(g); |
1863 | b = fn(b); |
1864 | } |
1865 | |
1866 | STAGE(PQish, const skcms_TransferFunction* ctx) { |
1867 | auto fn = [&](F v) { |
1868 | U32 sign; |
1869 | v = strip_sign(v, &sign); |
1870 | |
1871 | F r = approx_powf(max(mad(ctx->b, approx_powf(v, ctx->c), ctx->a), 0) |
1872 | / (mad(ctx->e, approx_powf(v, ctx->c), ctx->d)), |
1873 | ctx->f); |
1874 | |
1875 | return apply_sign(r, sign); |
1876 | }; |
1877 | r = fn(r); |
1878 | g = fn(g); |
1879 | b = fn(b); |
1880 | } |
1881 | |
1882 | STAGE(HLGish, const skcms_TransferFunction* ctx) { |
1883 | auto fn = [&](F v) { |
1884 | U32 sign; |
1885 | v = strip_sign(v, &sign); |
1886 | |
1887 | const float R = ctx->a, G = ctx->b, |
1888 | a = ctx->c, b = ctx->d, c = ctx->e; |
1889 | |
1890 | F r = if_then_else(v*R <= 1, approx_powf(v*R, G) |
1891 | , approx_exp((v-c)*a) + b); |
1892 | |
1893 | return apply_sign(r, sign); |
1894 | }; |
1895 | r = fn(r); |
1896 | g = fn(g); |
1897 | b = fn(b); |
1898 | } |
1899 | |
1900 | STAGE(HLGinvish, const skcms_TransferFunction* ctx) { |
1901 | auto fn = [&](F v) { |
1902 | U32 sign; |
1903 | v = strip_sign(v, &sign); |
1904 | |
1905 | const float R = ctx->a, G = ctx->b, |
1906 | a = ctx->c, b = ctx->d, c = ctx->e; |
1907 | |
1908 | F r = if_then_else(v <= 1, R * approx_powf(v, G) |
1909 | , a * approx_log(v - b) + c); |
1910 | |
1911 | return apply_sign(r, sign); |
1912 | }; |
1913 | r = fn(r); |
1914 | g = fn(g); |
1915 | b = fn(b); |
1916 | } |
1917 | |
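// from_srgb() / to_srgb() approximate the sRGB transfer functions
//     decode: s <= 0.04045 ? s*(1/12.92) : ((s+0.055)/1.055)^2.4
//     encode: l <= 0.0031308 ? l*12.92 : 1.055*l^(1/2.4) - 0.055
// with a cubic polynomial (decode) and a rational function of t = rsqrt(l) (encode).
// The branch cutoffs used below differ from the spec's, presumably tuned to fit the approximations.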
1918 | STAGE(from_srgb, Ctx::None) { |
1919 | auto fn = [](F s) { |
1920 | U32 sign; |
1921 | s = strip_sign(s, &sign); |
1922 | auto lo = s * (1/12.92f); |
1923 | auto hi = mad(s*s, mad(s, 0.3000f, 0.6975f), 0.0025f); |
1924 | return apply_sign(if_then_else(s < 0.055f, lo, hi), sign); |
1925 | }; |
1926 | r = fn(r); |
1927 | g = fn(g); |
1928 | b = fn(b); |
1929 | } |
1930 | STAGE(to_srgb, Ctx::None) { |
1931 | auto fn = [](F l) { |
1932 | U32 sign; |
1933 | l = strip_sign(l, &sign); |
1934 | // We tweak c and d for each instruction set to make sure fn(1) is exactly 1. |
1935 | #if defined(JUMPER_IS_AVX512) |
1936 | const float c = 1.130026340485f, |
1937 | d = 0.141387879848f; |
1938 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || \ |
1939 | defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_HSW ) |
1940 | const float c = 1.130048394203f, |
1941 | d = 0.141357362270f; |
1942 | #elif defined(JUMPER_IS_NEON) |
1943 | const float c = 1.129999995232f, |
1944 | d = 0.141381442547f; |
1945 | #else |
1946 | const float c = 1.129999995232f, |
1947 | d = 0.141377761960f; |
1948 | #endif |
1949 | F t = rsqrt(l); |
1950 | auto lo = l * 12.92f; |
1951 | auto hi = mad(t, mad(t, -0.0024542345f, 0.013832027f), c) |
1952 | * rcp(d + t); |
1953 | return apply_sign(if_then_else(l < 0.00465985f, lo, hi), sign); |
1954 | }; |
1955 | r = fn(r); |
1956 | g = fn(g); |
1957 | b = fn(b); |
1958 | } |
1959 | |
1960 | STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
1961 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1962 | |
1963 | r = g = b = 0.0f; |
1964 | a = from_byte(load<U8>(ptr, tail)); |
1965 | } |
1966 | STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1967 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1968 | |
1969 | dr = dg = db = 0.0f; |
1970 | da = from_byte(load<U8>(ptr, tail)); |
1971 | } |
1972 | STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { |
1973 | const uint8_t* ptr; |
1974 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
1975 | r = g = b = 0.0f; |
1976 | a = from_byte(gather(ptr, ix)); |
1977 | } |
1978 | STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
1979 | auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy); |
1980 | |
1981 | U8 packed = pack(pack(to_unorm(a, 255))); |
1982 | store(ptr, packed, tail); |
1983 | } |
1984 | |
1985 | STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1986 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1987 | |
1988 | from_565(load<U16>(ptr, tail), &r,&g,&b); |
1989 | a = 1.0f; |
1990 | } |
1991 | STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1992 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1993 | |
1994 | from_565(load<U16>(ptr, tail), &dr,&dg,&db); |
1995 | da = 1.0f; |
1996 | } |
1997 | STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) { |
1998 | const uint16_t* ptr; |
1999 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2000 | from_565(gather(ptr, ix), &r,&g,&b); |
2001 | a = 1.0f; |
2002 | } |
2003 | STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) { |
2004 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
2005 | |
2006 | U16 px = pack( to_unorm(r, 31) << 11 |
2007 | | to_unorm(g, 63) << 5 |
2008 | | to_unorm(b, 31) ); |
2009 | store(ptr, px, tail); |
2010 | } |
2011 | |
2012 | STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
2013 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
2014 | from_4444(load<U16>(ptr, tail), &r,&g,&b,&a); |
2015 | } |
2016 | STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2017 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
2018 | from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da); |
2019 | } |
2020 | STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { |
2021 | const uint16_t* ptr; |
2022 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2023 | from_4444(gather(ptr, ix), &r,&g,&b,&a); |
2024 | } |
2025 | STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
2026 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
2027 | U16 px = pack( to_unorm(r, 15) << 12 |
2028 | | to_unorm(g, 15) << 8 |
2029 | | to_unorm(b, 15) << 4 |
2030 | | to_unorm(a, 15) ); |
2031 | store(ptr, px, tail); |
2032 | } |
2033 | |
2034 | STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
2035 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
2036 | from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); |
2037 | } |
2038 | STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2039 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
2040 | from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da); |
2041 | } |
2042 | STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { |
2043 | const uint32_t* ptr; |
2044 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2045 | from_8888(gather(ptr, ix), &r,&g,&b,&a); |
2046 | } |
2047 | STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
2048 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
2049 | |
2050 | U32 px = to_unorm(r, 255) |
2051 | | to_unorm(g, 255) << 8 |
2052 | | to_unorm(b, 255) << 16 |
2053 | | to_unorm(a, 255) << 24; |
2054 | store(ptr, px, tail); |
2055 | } |
2056 | |
2057 | STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
2058 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2059 | from_88(load<U16>(ptr, tail), &r, &g); |
2060 | b = 0; |
2061 | a = 1; |
2062 | } |
2063 | STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2064 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2065 | from_88(load<U16>(ptr, tail), &dr, &dg); |
2066 | db = 0; |
2067 | da = 1; |
2068 | } |
2069 | STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { |
2070 | const uint16_t* ptr; |
2071 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2072 | from_88(gather(ptr, ix), &r, &g); |
2073 | b = 0; |
2074 | a = 1; |
2075 | } |
2076 | STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
2077 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy); |
2078 | U16 px = pack( to_unorm(r, 255) | to_unorm(g, 255) << 8 ); |
2079 | store(ptr, px, tail); |
2080 | } |
2081 | |
2082 | STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) { |
2083 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
2084 | r = g = b = 0; |
2085 | a = from_short(load<U16>(ptr, tail)); |
2086 | } |
2087 | STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2088 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2089 | dr = dg = db = 0.0f; |
2090 | da = from_short(load<U16>(ptr, tail)); |
2091 | } |
2092 | STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) { |
2093 | const uint16_t* ptr; |
2094 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2095 | r = g = b = 0.0f; |
2096 | a = from_short(gather(ptr, ix)); |
2097 | } |
2098 | STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) { |
2099 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
2100 | |
2101 | U16 px = pack(to_unorm(a, 65535)); |
2102 | store(ptr, px, tail); |
2103 | } |
2104 | |
2105 | STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { |
2106 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2107 | b = 0; a = 1; |
2108 | from_1616(load<U32>(ptr, tail), &r,&g); |
2109 | } |
2110 | STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2111 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2112 | from_1616(load<U32>(ptr, tail), &dr, &dg); |
2113 | db = 0; |
2114 | da = 1; |
2115 | } |
2116 | STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) { |
2117 | const uint32_t* ptr; |
2118 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2119 | from_1616(gather(ptr, ix), &r, &g); |
2120 | b = 0; |
2121 | a = 1; |
2122 | } |
2123 | STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { |
2124 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
2125 | |
2126 | U32 px = to_unorm(r, 65535) |
2127 | | to_unorm(g, 65535) << 16; |
2128 | store(ptr, px, tail); |
2129 | } |
2130 | |
2131 | STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) { |
2132 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); |
2133 | from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a); |
2134 | } |
2135 | STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2136 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); |
2137 | from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da); |
2138 | } |
2139 | STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) { |
2140 | const uint64_t* ptr; |
2141 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2142 | from_16161616(gather(ptr, ix), &r, &g, &b, &a); |
2143 | } |
2144 | STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) { |
2145 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy); |
2146 | |
2147 | U16 R = pack(to_unorm(r, 65535)), |
2148 | G = pack(to_unorm(g, 65535)), |
2149 | B = pack(to_unorm(b, 65535)), |
2150 | A = pack(to_unorm(a, 65535)); |
2151 | |
2152 | store4(ptr,tail, R,G,B,A); |
2153 | } |
2154 | |
2155 | |
2156 | STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) { |
2157 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
2158 | from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a); |
2159 | } |
2160 | STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2161 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
2162 | from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da); |
2163 | } |
2164 | STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) { |
2165 | const uint32_t* ptr; |
2166 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2167 | from_1010102(gather(ptr, ix), &r,&g,&b,&a); |
2168 | } |
2169 | STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) { |
2170 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
2171 | |
2172 | U32 px = to_unorm(r, 1023) |
2173 | | to_unorm(g, 1023) << 10 |
2174 | | to_unorm(b, 1023) << 20 |
2175 | | to_unorm(a, 3) << 30; |
2176 | store(ptr, px, tail); |
2177 | } |
2178 | |
2179 | STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) { |
2180 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); |
2181 | |
2182 | U16 R,G,B,A; |
2183 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); |
2184 | r = from_half(R); |
2185 | g = from_half(G); |
2186 | b = from_half(B); |
2187 | a = from_half(A); |
2188 | } |
2189 | STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2190 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); |
2191 | |
2192 | U16 R,G,B,A; |
2193 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); |
2194 | dr = from_half(R); |
2195 | dg = from_half(G); |
2196 | db = from_half(B); |
2197 | da = from_half(A); |
2198 | } |
2199 | STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) { |
2200 | const uint64_t* ptr; |
2201 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2202 | auto px = gather(ptr, ix); |
2203 | |
2204 | U16 R,G,B,A; |
2205 | load4((const uint16_t*)&px,0, &R,&G,&B,&A); |
2206 | r = from_half(R); |
2207 | g = from_half(G); |
2208 | b = from_half(B); |
2209 | a = from_half(A); |
2210 | } |
2211 | STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) { |
2212 | auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy); |
2213 | store4((uint16_t*)ptr,tail, to_half(r) |
2214 | , to_half(g) |
2215 | , to_half(b) |
2216 | , to_half(a)); |
2217 | } |
2218 | |
2219 | STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) { |
2220 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy); |
2221 | |
2222 | U16 R = bswap(pack(to_unorm(r, 65535))), |
2223 | G = bswap(pack(to_unorm(g, 65535))), |
2224 | B = bswap(pack(to_unorm(b, 65535))), |
2225 | A = bswap(pack(to_unorm(a, 65535))); |
2226 | |
2227 | store4(ptr,tail, R,G,B,A); |
2228 | } |
2229 | |
2230 | STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) { |
2231 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
2232 | |
2233 | U16 A = load<U16>((const uint16_t*)ptr, tail); |
2234 | r = 0; |
2235 | g = 0; |
2236 | b = 0; |
2237 | a = from_half(A); |
2238 | } |
2239 | STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2240 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2241 | |
2242 | U16 A = load<U16>((const uint16_t*)ptr, tail); |
2243 | dr = dg = db = 0.0f; |
2244 | da = from_half(A); |
2245 | } |
2246 | STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) { |
2247 | const uint16_t* ptr; |
2248 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2249 | r = g = b = 0.0f; |
2250 | a = from_half(gather(ptr, ix)); |
2251 | } |
2252 | STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) { |
2253 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
2254 | store(ptr, to_half(a), tail); |
2255 | } |
2256 | |
2257 | STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { |
2258 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2259 | |
2260 | U16 R,G; |
2261 | load2((const uint16_t*)ptr, tail, &R, &G); |
2262 | r = from_half(R); |
2263 | g = from_half(G); |
2264 | b = 0; |
2265 | a = 1; |
2266 | } |
2267 | STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2268 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2269 | |
2270 | U16 R,G; |
2271 | load2((const uint16_t*)ptr, tail, &R, &G); |
2272 | dr = from_half(R); |
2273 | dg = from_half(G); |
2274 | db = 0; |
2275 | da = 1; |
2276 | } |
2277 | STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) { |
2278 | const uint32_t* ptr; |
2279 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2280 | auto px = gather(ptr, ix); |
2281 | |
2282 | U16 R,G; |
2283 | load2((const uint16_t*)&px, 0, &R, &G); |
2284 | r = from_half(R); |
2285 | g = from_half(G); |
2286 | b = 0; |
2287 | a = 1; |
2288 | } |
2289 | STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { |
2290 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy); |
2291 | store2((uint16_t*)ptr, tail, to_half(r) |
2292 | , to_half(g)); |
2293 | } |
2294 | |
2295 | STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) { |
2296 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); |
2297 | load4(ptr,tail, &r,&g,&b,&a); |
2298 | } |
2299 | STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2300 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); |
2301 | load4(ptr,tail, &dr,&dg,&db,&da); |
2302 | } |
2303 | STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) { |
2304 | const float* ptr; |
2305 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2306 | r = gather(ptr, 4*ix + 0); |
2307 | g = gather(ptr, 4*ix + 1); |
2308 | b = gather(ptr, 4*ix + 2); |
2309 | a = gather(ptr, 4*ix + 3); |
2310 | } |
2311 | STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) { |
2312 | auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy); |
2313 | store4(ptr,tail, r,g,b,a); |
2314 | } |
2315 | |
2316 | STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { |
2317 | auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy); |
2318 | load2(ptr, tail, &r, &g); |
2319 | b = 0; |
2320 | a = 1; |
2321 | } |
2322 | STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { |
2323 | auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy); |
2324 | store2(ptr, tail, r, g); |
2325 | } |
2326 | |
2327 | SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) { |
2328 | return v - floor_(v*ctx->invScale)*ctx->scale; |
2329 | } |
2330 | SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) { |
2331 | auto limit = ctx->scale; |
2332 | auto invLimit = ctx->invScale; |
2333 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
2334 | } |
2335 | // Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images). |
2336 | // The gather stages will hard clamp the output of these stages to [0,limit)... |
2337 | // we just need to do the basic repeat or mirroring. |
2338 | STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); } |
2339 | STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); } |
2340 | STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); } |
2341 | STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); } |
2342 | |
2343 | STAGE( clamp_x_1, Ctx::None) { r = clamp_01(r); } |
2344 | STAGE(repeat_x_1, Ctx::None) { r = clamp_01(r - floor_(r)); } |
2345 | STAGE(mirror_x_1, Ctx::None) { r = clamp_01(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); } |
2346 | |
2347 | // Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain: |
2348 | // mask == 0x00000000 if the coordinate(s) are out of bounds |
2349 | // mask == 0xFFFFFFFF if the coordinate(s) are in bounds |
2350 | // After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0 |
2351 | // if either of the coordinates were out of bounds. |
2352 | |
2353 | STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { |
2354 | auto w = ctx->limit_x; |
2355 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w))); |
2356 | } |
2357 | STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { |
2358 | auto h = ctx->limit_y; |
2359 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h))); |
2360 | } |
2361 | STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { |
2362 | auto w = ctx->limit_x; |
2363 | auto h = ctx->limit_y; |
2364 | sk_unaligned_store(ctx->mask, |
2365 | cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h))); |
2366 | } |
2367 | STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { |
2368 | auto mask = sk_unaligned_load<U32>(ctx->mask); |
2369 | r = bit_cast<F>( bit_cast<U32>(r) & mask ); |
2370 | g = bit_cast<F>( bit_cast<U32>(g) & mask ); |
2371 | b = bit_cast<F>( bit_cast<U32>(b) & mask ); |
2372 | a = bit_cast<F>( bit_cast<U32>(a) & mask ); |
2373 | } |
2374 | |
2375 | STAGE(alpha_to_gray, Ctx::None) { |
2376 | r = g = b = a; |
2377 | a = 1; |
2378 | } |
2379 | STAGE(alpha_to_gray_dst, Ctx::None) { |
2380 | dr = dg = db = da; |
2381 | da = 1; |
2382 | } |
2383 | STAGE(bt709_luminance_or_luma_to_alpha, Ctx::None) { |
2384 | a = r*0.2126f + g*0.7152f + b*0.0722f; |
2385 | r = g = b = 0; |
2386 | } |
2387 | |
2388 | STAGE(matrix_translate, const float* m) { |
2389 | r += m[0]; |
2390 | g += m[1]; |
2391 | } |
2392 | STAGE(matrix_scale_translate, const float* m) { |
2393 | r = mad(r,m[0], m[2]); |
2394 | g = mad(g,m[1], m[3]); |
2395 | } |
2396 | STAGE(matrix_2x3, const float* m) { |
2397 | auto R = mad(r,m[0], mad(g,m[2], m[4])), |
2398 | G = mad(r,m[1], mad(g,m[3], m[5])); |
2399 | r = R; |
2400 | g = G; |
2401 | } |
2402 | STAGE(matrix_3x3, const float* m) { |
2403 | auto R = mad(r,m[0], mad(g,m[3], b*m[6])), |
2404 | G = mad(r,m[1], mad(g,m[4], b*m[7])), |
2405 | B = mad(r,m[2], mad(g,m[5], b*m[8])); |
2406 | r = R; |
2407 | g = G; |
2408 | b = B; |
2409 | } |
2410 | STAGE(matrix_3x4, const float* m) { |
2411 | auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))), |
2412 | G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))), |
2413 | B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11]))); |
2414 | r = R; |
2415 | g = G; |
2416 | b = B; |
2417 | } |
2418 | STAGE(matrix_4x5, const float* m) { |
2419 | auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))), |
2420 | G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))), |
2421 | B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))), |
2422 | A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19])))); |
2423 | r = R; |
2424 | g = G; |
2425 | b = B; |
2426 | a = A; |
2427 | } |
2428 | STAGE(matrix_4x3, const float* m) { |
2429 | auto X = r, |
2430 | Y = g; |
2431 | |
2432 | r = mad(X, m[0], mad(Y, m[4], m[ 8])); |
2433 | g = mad(X, m[1], mad(Y, m[5], m[ 9])); |
2434 | b = mad(X, m[2], mad(Y, m[6], m[10])); |
2435 | a = mad(X, m[3], mad(Y, m[7], m[11])); |
2436 | } |
2437 | STAGE(matrix_perspective, const float* m) { |
2438 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. |
2439 | auto R = mad(r,m[0], mad(g,m[1], m[2])), |
2440 | G = mad(r,m[3], mad(g,m[4], m[5])), |
2441 | Z = mad(r,m[6], mad(g,m[7], m[8])); |
2442 | r = R * rcp(Z); |
2443 | g = G * rcp(Z); |
2444 | } |
2445 | |
2446 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, |
2447 | F* r, F* g, F* b, F* a) { |
2448 | F fr, br, fg, bg, fb, bb, fa, ba; |
2449 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
2450 | if (c->stopCount <=8) { |
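        // With at most 8 stops, each channel's fs[]/bs[] row fits in one 256-bit register,
        // so a single in-register permute per row replaces the general gather path below.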
2451 | fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx); |
2452 | br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx); |
2453 | fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx); |
2454 | bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx); |
2455 | fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx); |
2456 | bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx); |
2457 | fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx); |
2458 | ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx); |
2459 | } else |
2460 | #endif |
2461 | { |
2462 | fr = gather(c->fs[0], idx); |
2463 | br = gather(c->bs[0], idx); |
2464 | fg = gather(c->fs[1], idx); |
2465 | bg = gather(c->bs[1], idx); |
2466 | fb = gather(c->fs[2], idx); |
2467 | bb = gather(c->bs[2], idx); |
2468 | fa = gather(c->fs[3], idx); |
2469 | ba = gather(c->bs[3], idx); |
2470 | } |
2471 | |
2472 | *r = mad(t, fr, br); |
2473 | *g = mad(t, fg, bg); |
2474 | *b = mad(t, fb, bb); |
2475 | *a = mad(t, fa, ba); |
2476 | } |
2477 | |
2478 | STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { |
2479 | auto t = r; |
2480 | auto idx = trunc_(t * (c->stopCount-1)); |
2481 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
2482 | } |
2483 | |
2484 | STAGE(gradient, const SkRasterPipeline_GradientCtx* c) { |
2485 | auto t = r; |
2486 | U32 idx = 0; |
2487 | |
2488 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
2489 | for (size_t i = 1; i < c->stopCount; i++) { |
2490 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); |
2491 | } |
2492 | |
2493 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
2494 | } |
2495 | |
2496 | STAGE(evenly_spaced_2_stop_gradient, const void* ctx) { |
    // TODO: Rename Ctx to SkRasterPipeline_EvenlySpaced2StopGradientCtx.
2498 | struct Ctx { float f[4], b[4]; }; |
2499 | auto c = (const Ctx*)ctx; |
2500 | |
2501 | auto t = r; |
2502 | r = mad(t, c->f[0], c->b[0]); |
2503 | g = mad(t, c->f[1], c->b[1]); |
2504 | b = mad(t, c->f[2], c->b[2]); |
2505 | a = mad(t, c->f[3], c->b[3]); |
2506 | } |
2507 | |
2508 | STAGE(xy_to_unit_angle, Ctx::None) { |
2509 | F X = r, |
2510 | Y = g; |
2511 | F xabs = abs_(X), |
2512 | yabs = abs_(Y); |
2513 | |
2514 | F slope = min(xabs, yabs)/max(xabs, yabs); |
2515 | F s = slope * slope; |
2516 | |
2517 | // Use a 7th degree polynomial to approximate atan. |
2518 | // This was generated using sollya.gforge.inria.fr. |
2519 | // A float optimized polynomial was generated using the following command. |
2520 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); |
2521 | F phi = slope |
2522 | * (0.15912117063999176025390625f + s |
2523 | * (-5.185396969318389892578125e-2f + s |
2524 | * (2.476101927459239959716796875e-2f + s |
2525 | * (-7.0547382347285747528076171875e-3f)))); |
2526 | |
2527 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); |
2528 | phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi); |
2529 | phi = if_then_else(Y < 0.0f , 1.0f - phi , phi); |
2530 | phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. |
2531 | r = phi; |
2532 | } |
2533 | |
2534 | STAGE(xy_to_radius, Ctx::None) { |
2535 | F X2 = r * r, |
2536 | Y2 = g * g; |
2537 | r = sqrt_(X2 + Y2); |
2538 | } |
2539 | |
2540 | // Please see https://skia.org/dev/design/conical for how our 2pt conical shader works. |
2541 | |
2542 | STAGE(negate_x, Ctx::None) { r = -r; } |
2543 | |
2544 | STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2545 | F x = r, y = g, &t = r; |
2546 | t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0 |
2547 | } |
2548 | |
2549 | STAGE(xy_to_2pt_conical_focal_on_circle, Ctx::None) { |
2550 | F x = r, y = g, &t = r; |
2551 | t = x + y*y / x; // (x^2 + y^2) / x |
2552 | } |
2553 | |
2554 | STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2555 | F x = r, y = g, &t = r; |
2556 | t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
2557 | } |
2558 | |
2559 | STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2560 | F x = r, y = g, &t = r; |
2561 | t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
2562 | } |
2563 | |
2564 | STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2565 | F x = r, y = g, &t = r; |
2566 | t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
2567 | } |
2568 | |
2569 | STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2570 | F& t = r; |
2571 | t = t + ctx->fP1; // ctx->fP1 = f |
2572 | } |
2573 | |
2574 | STAGE(alter_2pt_conical_unswap, Ctx::None) { |
2575 | F& t = r; |
2576 | t = 1 - t; |
2577 | } |
2578 | |
2579 | STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) { |
2580 | F& t = r; |
2581 | auto is_degenerate = (t != t); // NaN |
2582 | t = if_then_else(is_degenerate, F(0), t); |
2583 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); |
2584 | } |
2585 | |
2586 | STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) { |
2587 | F& t = r; |
2588 | auto is_degenerate = (t <= 0) | (t != t); |
2589 | t = if_then_else(is_degenerate, F(0), t); |
2590 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); |
2591 | } |
2592 | |
2593 | STAGE(apply_vector_mask, const uint32_t* ctx) { |
2594 | const U32 mask = sk_unaligned_load<U32>(ctx); |
2595 | r = bit_cast<F>(bit_cast<U32>(r) & mask); |
2596 | g = bit_cast<F>(bit_cast<U32>(g) & mask); |
2597 | b = bit_cast<F>(bit_cast<U32>(b) & mask); |
2598 | a = bit_cast<F>(bit_cast<U32>(a) & mask); |
2599 | } |
2600 | |
2601 | STAGE(save_xy, SkRasterPipeline_SamplerCtx* c) { |
2602 | // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy). |
2603 | // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid |
2604 | // surrounding (x,y) at (0.5,0.5) off-center. |
2605 | F fx = fract(r + 0.5f), |
2606 | fy = fract(g + 0.5f); |
2607 | |
2608 | // Samplers will need to load x and fx, or y and fy. |
2609 | sk_unaligned_store(c->x, r); |
2610 | sk_unaligned_store(c->y, g); |
2611 | sk_unaligned_store(c->fx, fx); |
2612 | sk_unaligned_store(c->fy, fy); |
2613 | } |
2614 | |
2615 | STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) { |
2616 | // Bilinear and bicubic filters are both separable, so we produce independent contributions |
2617 | // from x and y, multiplying them together here to get each pixel's total scale factor. |
2618 | auto scale = sk_unaligned_load<F>(c->scalex) |
2619 | * sk_unaligned_load<F>(c->scaley); |
2620 | dr = mad(scale, r, dr); |
2621 | dg = mad(scale, g, dg); |
2622 | db = mad(scale, b, db); |
2623 | da = mad(scale, a, da); |
2624 | } |
2625 | |
2626 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
2627 | // are combined in direct proportion to their area overlapping that logical query pixel. |
2628 | // At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x. |
2629 | // The y-axis is symmetric. |
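// For example, with fx = 0.25 the -x tap is weighted 0.75 and the +x tap 0.25; combined with
// the matching y weights in accumulate(), the four taps' scale factors always sum to 1.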
2630 | |
2631 | template <int kScale> |
2632 | SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { |
2633 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
2634 | F fx = sk_unaligned_load<F>(ctx->fx); |
2635 | |
2636 | F scalex; |
2637 | if (kScale == -1) { scalex = 1.0f - fx; } |
2638 | if (kScale == +1) { scalex = fx; } |
2639 | sk_unaligned_store(ctx->scalex, scalex); |
2640 | } |
2641 | template <int kScale> |
2642 | SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { |
2643 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
2644 | F fy = sk_unaligned_load<F>(ctx->fy); |
2645 | |
2646 | F scaley; |
2647 | if (kScale == -1) { scaley = 1.0f - fy; } |
2648 | if (kScale == +1) { scaley = fy; } |
2649 | sk_unaligned_store(ctx->scaley, scaley); |
2650 | } |
2651 | |
2652 | STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); } |
2653 | STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); } |
2654 | STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); } |
2655 | STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); } |
2656 | |
2657 | |
// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
2659 | // pixel center are combined with a non-uniform cubic filter, with higher values near the center. |
2660 | // |
2661 | // We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets. |
2662 | // See GrCubicEffect for details of this particular filter. |
2663 | |
2664 | SI F bicubic_near(F t) { |
2665 | // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18 |
2666 | return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f)); |
2667 | } |
2668 | SI F bicubic_far(F t) { |
2669 | // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18) |
2670 | return (t*t)*mad((7/18.0f), t, (-6/18.0f)); |
2671 | } |
2672 | |
2673 | template <int kScale> |
2674 | SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { |
2675 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
2676 | F fx = sk_unaligned_load<F>(ctx->fx); |
2677 | |
2678 | F scalex; |
2679 | if (kScale == -3) { scalex = bicubic_far (1.0f - fx); } |
2680 | if (kScale == -1) { scalex = bicubic_near(1.0f - fx); } |
2681 | if (kScale == +1) { scalex = bicubic_near( fx); } |
2682 | if (kScale == +3) { scalex = bicubic_far ( fx); } |
2683 | sk_unaligned_store(ctx->scalex, scalex); |
2684 | } |
2685 | template <int kScale> |
2686 | SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { |
2687 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
2688 | F fy = sk_unaligned_load<F>(ctx->fy); |
2689 | |
2690 | F scaley; |
2691 | if (kScale == -3) { scaley = bicubic_far (1.0f - fy); } |
2692 | if (kScale == -1) { scaley = bicubic_near(1.0f - fy); } |
2693 | if (kScale == +1) { scaley = bicubic_near( fy); } |
2694 | if (kScale == +3) { scaley = bicubic_far ( fy); } |
2695 | sk_unaligned_store(ctx->scaley, scaley); |
2696 | } |
2697 | |
2698 | STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); } |
2699 | STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); } |
2700 | STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); } |
2701 | STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); } |
2702 | |
2703 | STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); } |
2704 | STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); } |
2705 | STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); } |
2706 | STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); } |
2707 | |
2708 | STAGE(callback, SkRasterPipeline_CallbackCtx* c) { |
2709 | store4(c->rgba,0, r,g,b,a); |
2710 | c->fn(c, tail ? tail : N); |
2711 | load4(c->read_from,0, &r,&g,&b,&a); |
2712 | } |
2713 | |
2714 | // shader: void main(float2 p, inout half4 color) |
2715 | // colorfilter: void main(inout half4 color) |
2716 | STAGE(interpreter, SkRasterPipeline_InterpreterCtx* c) { |
2717 | // If N is less than the interpreter's VecWidth, then we are doing more work than necessary in |
2718 | // the interpreter. This is a known issue, and will be addressed at some point. |
2719 | float xx[N], yy[N], |
2720 | rr[N], gg[N], bb[N], aa[N]; |
2721 | |
2722 | float* args[] = { xx, yy, rr, gg, bb, aa }; |
2723 | float** in_args = args; |
2724 | int in_count = 6; |
2725 | |
2726 | if (c->shaderConvention) { |
2727 | // our caller must have called seed_shader to set these |
2728 | sk_unaligned_store(xx, r); |
2729 | sk_unaligned_store(yy, g); |
2730 | sk_unaligned_store(rr, F(c->paintColor.fR)); |
2731 | sk_unaligned_store(gg, F(c->paintColor.fG)); |
2732 | sk_unaligned_store(bb, F(c->paintColor.fB)); |
2733 | sk_unaligned_store(aa, F(c->paintColor.fA)); |
2734 | } else { |
2735 | in_args += 2; // skip x,y |
2736 | in_count = 4; |
2737 | sk_unaligned_store(rr, r); |
2738 | sk_unaligned_store(gg, g); |
2739 | sk_unaligned_store(bb, b); |
2740 | sk_unaligned_store(aa, a); |
2741 | } |
2742 | |
2743 | SkAssertResult(c->byteCode->runStriped(c->fn, tail ? tail : N, in_args, in_count, |
2744 | nullptr, 0, (const float*)c->inputs, c->ninputs)); |
2745 | |
2746 | r = sk_unaligned_load<F>(rr); |
2747 | g = sk_unaligned_load<F>(gg); |
2748 | b = sk_unaligned_load<F>(bb); |
2749 | a = sk_unaligned_load<F>(aa); |
2750 | } |
2751 | |
2752 | STAGE(gauss_a_to_rgba, Ctx::None) { |
2753 | // x = 1 - x; |
2754 | // exp(-x * x * 4) - 0.018f; |
2755 | // ... now approximate with quartic |
2756 | // |
2757 | const float c4 = -2.26661229133605957031f; |
2758 | const float c3 = 2.89795351028442382812f; |
2759 | const float c2 = 0.21345567703247070312f; |
2760 | const float c1 = 0.15489584207534790039f; |
2761 | const float c0 = 0.00030726194381713867f; |
2762 | a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0); |
2763 | r = a; |
2764 | g = a; |
2765 | b = a; |
2766 | } |
2767 | |
2768 | SI F tile(F v, SkTileMode mode, float limit, float invLimit) { |
2769 | // The ix_and_ptr() calls in sample() will clamp tile()'s output, so no need to clamp here. |
2770 | switch (mode) { |
2771 | case SkTileMode::kDecal: // TODO, for now fallthrough to clamp |
2772 | case SkTileMode::kClamp: return v; |
2773 | case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; |
2774 | case SkTileMode::kMirror: |
2775 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
2776 | } |
2777 | SkUNREACHABLE; |
2778 | } |
2779 | |
2780 | SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, |
2781 | F* r, F* g, F* b, F* a) { |
2782 | x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); |
2783 | y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); |
2784 | |
2785 | switch (ctx->ct) { |
2786 | default: *r = *g = *b = *a = 0; // TODO |
2787 | break; |
2788 | |
2789 | case kRGBA_8888_SkColorType: |
2790 | case kBGRA_8888_SkColorType: { |
2791 | const uint32_t* ptr; |
2792 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
2793 | from_8888(gather(ptr, ix), r,g,b,a); |
2794 | if (ctx->ct == kBGRA_8888_SkColorType) { |
2795 | std::swap(*r,*b); |
2796 | } |
2797 | } break; |
2798 | } |
2799 | } |
2800 | |
2801 | template <int D> |
2802 | SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, |
2803 | F cx, F cy, const F (&wx)[D], const F (&wy)[D], |
2804 | F* r, F* g, F* b, F* a) { |
2805 | |
2806 | float start = -0.5f*(D-1); |
2807 | |
2808 | *r = *g = *b = *a = 0; |
2809 | F y = cy + start; |
2810 | for (int j = 0; j < D; j++, y += 1.0f) { |
2811 | F x = cx + start; |
2812 | for (int i = 0; i < D; i++, x += 1.0f) { |
2813 | F R,G,B,A; |
2814 | sample(ctx, x,y, &R,&G,&B,&A); |
2815 | |
2816 | F w = wx[i] * wy[j]; |
2817 | *r = mad(w,R,*r); |
2818 | *g = mad(w,G,*g); |
2819 | *b = mad(w,B,*b); |
2820 | *a = mad(w,A,*a); |
2821 | } |
2822 | } |
2823 | } |
2824 | |
2825 | STAGE(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { |
2826 | F x = r, fx = fract(x + 0.5f), |
2827 | y = g, fy = fract(y + 0.5f); |
2828 | const F wx[] = {1.0f - fx, fx}; |
2829 | const F wy[] = {1.0f - fy, fy}; |
2830 | |
2831 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
2832 | } |
2833 | STAGE(bicubic, SkRasterPipeline_SamplerCtx2* ctx) { |
2834 | F x = r, fx = fract(x + 0.5f), |
2835 | y = g, fy = fract(y + 0.5f); |
2836 | const F wx[] = { bicubic_far(1-fx), bicubic_near(1-fx), bicubic_near(fx), bicubic_far(fx) }; |
2837 | const F wy[] = { bicubic_far(1-fy), bicubic_near(1-fy), bicubic_near(fy), bicubic_far(fy) }; |
2838 | |
2839 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
2840 | } |
2841 | |
2842 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. |
2843 | STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
2844 | // (cx,cy) are the center of our sample. |
2845 | F cx = r, |
2846 | cy = g; |
2847 | |
2848 | // All sample points are at the same fractional offset (fx,fy). |
2849 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. |
2850 | F fx = fract(cx + 0.5f), |
2851 | fy = fract(cy + 0.5f); |
2852 | |
2853 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. |
2854 | r = g = b = a = 0; |
2855 | |
2856 | for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f) |
2857 | for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) { |
2858 | // (x,y) are the coordinates of this sample point. |
2859 | F x = cx + dx, |
2860 | y = cy + dy; |
2861 | |
2862 | // ix_and_ptr() will clamp to the image's bounds for us. |
2863 | const uint32_t* ptr; |
2864 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
2865 | |
2866 | F sr,sg,sb,sa; |
2867 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); |
2868 | |
2869 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
2870 | // are combined in direct proportion to their area overlapping that logical query pixel. |
2871 | // At positive offsets, the x-axis contribution to that rectangle is fx, |
2872 | // or (1-fx) at negative x. Same deal for y. |
2873 | F sx = (dx > 0) ? fx : 1.0f - fx, |
2874 | sy = (dy > 0) ? fy : 1.0f - fy, |
2875 | area = sx * sy; |
2876 | |
2877 | r += sr * area; |
2878 | g += sg * area; |
2879 | b += sb * area; |
2880 | a += sa * area; |
2881 | } |
2882 | } |
2883 | |
2884 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. |
2885 | STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
2886 | // (cx,cy) are the center of our sample. |
2887 | F cx = r, |
2888 | cy = g; |
2889 | |
    // All 16 sample points share the same fractional offset (fx,fy).
    // They sit at +/- 0.5 and +/- 1.5 offsets from (cx,cy) along each axis.
2892 | F fx = fract(cx + 0.5f), |
2893 | fy = fract(cy + 0.5f); |
2894 | |
    // We'll accumulate the color of all sixteen samples into {r,g,b,a} directly.
2896 | r = g = b = a = 0; |
2897 | |
2898 | const F scaley[4] = { |
2899 | bicubic_far (1.0f - fy), bicubic_near(1.0f - fy), |
2900 | bicubic_near( fy), bicubic_far ( fy), |
2901 | }; |
2902 | const F scalex[4] = { |
2903 | bicubic_far (1.0f - fx), bicubic_near(1.0f - fx), |
2904 | bicubic_near( fx), bicubic_far ( fx), |
2905 | }; |
2906 | |
2907 | F sample_y = cy - 1.5f; |
2908 | for (int yy = 0; yy <= 3; ++yy) { |
2909 | F sample_x = cx - 1.5f; |
2910 | for (int xx = 0; xx <= 3; ++xx) { |
2911 | F scale = scalex[xx] * scaley[yy]; |
2912 | |
2913 | // ix_and_ptr() will clamp to the image's bounds for us. |
2914 | const uint32_t* ptr; |
2915 | U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y); |
2916 | |
2917 | F sr,sg,sb,sa; |
2918 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); |
2919 | |
2920 | r = mad(scale, sr, r); |
2921 | g = mad(scale, sg, g); |
2922 | b = mad(scale, sb, b); |
2923 | a = mad(scale, sa, a); |
2924 | |
2925 | sample_x += 1; |
2926 | } |
2927 | sample_y += 1; |
2928 | } |
2929 | } |
2930 | |
2931 | // ~~~~~~ GrSwizzle stage ~~~~~~ // |
2932 | |
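// The four swizzle characters (e.g. "rgba", "bgra", or "rr01") are packed directly into the
// void* context itself; each output channel copies an input channel or the constant 0 or 1.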
2933 | STAGE(swizzle, void* ctx) { |
2934 | auto ir = r, ig = g, ib = b, ia = a; |
2935 | F* o[] = {&r, &g, &b, &a}; |
2936 | char swiz[4]; |
2937 | memcpy(swiz, &ctx, sizeof(swiz)); |
2938 | |
2939 | for (int i = 0; i < 4; ++i) { |
2940 | switch (swiz[i]) { |
2941 | case 'r': *o[i] = ir; break; |
2942 | case 'g': *o[i] = ig; break; |
2943 | case 'b': *o[i] = ib; break; |
2944 | case 'a': *o[i] = ia; break; |
2945 | case '0': *o[i] = F(0); break; |
2946 | case '1': *o[i] = F(1); break; |
2947 | default: break; |
2948 | } |
2949 | } |
2950 | } |
2951 | |
2952 | namespace lowp { |
2953 | #if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE) |
2954 | // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually), |
2955 | // we don't generate lowp stages. All these nullptrs will tell SkJumper.cpp to always use the |
2956 | // highp float pipeline. |
2957 | #define M(st) static void (*st)(void) = nullptr; |
2958 | SK_RASTER_PIPELINE_STAGES(M) |
2959 | #undef M |
2960 | static void (*just_return)(void) = nullptr; |
2961 | |
2962 | static void start_pipeline(size_t,size_t,size_t,size_t, void**) {} |
2963 | |
2964 | #else // We are compiling vector code with Clang... let's make some lowp stages! |
2965 | |
2966 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
2967 | using U8 = uint8_t __attribute__((ext_vector_type(16))); |
2968 | using U16 = uint16_t __attribute__((ext_vector_type(16))); |
2969 | using I16 = int16_t __attribute__((ext_vector_type(16))); |
2970 | using I32 = int32_t __attribute__((ext_vector_type(16))); |
2971 | using U32 = uint32_t __attribute__((ext_vector_type(16))); |
2972 | using F = float __attribute__((ext_vector_type(16))); |
2973 | #else |
2974 | using U8 = uint8_t __attribute__((ext_vector_type(8))); |
2975 | using U16 = uint16_t __attribute__((ext_vector_type(8))); |
2976 | using I16 = int16_t __attribute__((ext_vector_type(8))); |
2977 | using I32 = int32_t __attribute__((ext_vector_type(8))); |
2978 | using U32 = uint32_t __attribute__((ext_vector_type(8))); |
2979 | using F = float __attribute__((ext_vector_type(8))); |
2980 | #endif |
2981 | |
2982 | static const size_t N = sizeof(U16) / sizeof(uint16_t); |
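// i.e. the lowp pipeline works on N = 16 pixels at a time with AVX2/AVX-512 (256-bit U16),
// and N = 8 pixels at a time otherwise (128-bit U16).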
2983 | |
2984 | // Once again, some platforms benefit from a restricted Stage calling convention, |
2985 | // but others can pass tons and tons of registers and we're happy to exploit that. |
2986 | // It's exactly the same decision and implementation strategy as the F stages above. |
2987 | #if JUMPER_NARROW_STAGES |
2988 | struct Params { |
2989 | size_t dx, dy, tail; |
2990 | U16 dr,dg,db,da; |
2991 | }; |
2992 | using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a); |
2993 | #else |
2994 | // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64. |
2995 | using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy, |
2996 | U16 r, U16 g, U16 b, U16 a, |
2997 | U16 dr, U16 dg, U16 db, U16 da); |
2998 | #endif |
2999 | |
3000 | static void start_pipeline(const size_t x0, const size_t y0, |
3001 | const size_t xlimit, const size_t ylimit, void** program) { |
3002 | auto start = (Stage)load_and_inc(program); |
3003 | for (size_t dy = y0; dy < ylimit; dy++) { |
3004 | #if JUMPER_NARROW_STAGES |
3005 | Params params = { x0,dy,0, 0,0,0,0 }; |
3006 | for (; params.dx + N <= xlimit; params.dx += N) { |
3007 | start(¶ms,program, 0,0,0,0); |
3008 | } |
3009 | if (size_t tail = xlimit - params.dx) { |
3010 | params.tail = tail; |
3011 | start(¶ms,program, 0,0,0,0); |
3012 | } |
3013 | #else |
3014 | size_t dx = x0; |
3015 | for (; dx + N <= xlimit; dx += N) { |
3016 | start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0); |
3017 | } |
3018 | if (size_t tail = xlimit - dx) { |
3019 | start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); |
3020 | } |
3021 | #endif |
3022 | } |
3023 | } |
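// e.g. with N == 8, x0 == 0, and xlimit == 21, each row runs two full batches at dx = 0 and
// dx = 8, then one 5-pixel tail batch at dx = 16.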
3024 | |
3025 | #if JUMPER_NARROW_STAGES |
3026 | static void ABI just_return(Params*, void**, U16,U16,U16,U16) {} |
3027 | #else |
3028 | static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {} |
3029 | #endif |
3030 | |
3031 | // All stages use the same function call ABI to chain into each other, but there are three types: |
//   GG: geometry in, geometry out   -- think, a matrix
//   GP: geometry in, pixels out     -- think, a memory gather
//   PP: pixels in, pixels out       -- think, a blend mode
3035 | // |
3036 | // (Some stages ignore their inputs or produce no logical output. That's perfectly fine.) |
3037 | // |
3038 | // These three STAGE_ macros let you define each type of stage, |
3039 | // and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate. |
3040 | |
3041 | #if JUMPER_NARROW_STAGES |
3042 | #define STAGE_GG(name, ...) \ |
3043 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ |
3044 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
3045 | auto x = join<F>(r,g), \ |
3046 | y = join<F>(b,a); \ |
3047 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \ |
3048 | split(x, &r,&g); \ |
3049 | split(y, &b,&a); \ |
3050 | auto next = (Stage)load_and_inc(program); \ |
3051 | next(params,program, r,g,b,a); \ |
3052 | } \ |
3053 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) |
3054 | |
3055 | #define STAGE_GP(name, ...) \ |
3056 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
3057 | U16& r, U16& g, U16& b, U16& a, \ |
3058 | U16& dr, U16& dg, U16& db, U16& da); \ |
3059 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
3060 | auto x = join<F>(r,g), \ |
3061 | y = join<F>(b,a); \ |
3062 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \ |
3063 | params->dr,params->dg,params->db,params->da); \ |
3064 | auto next = (Stage)load_and_inc(program); \ |
3065 | next(params,program, r,g,b,a); \ |
3066 | } \ |
3067 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
3068 | U16& r, U16& g, U16& b, U16& a, \ |
3069 | U16& dr, U16& dg, U16& db, U16& da) |
3070 | |
3071 | #define STAGE_PP(name, ...) \ |
3072 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
3073 | U16& r, U16& g, U16& b, U16& a, \ |
3074 | U16& dr, U16& dg, U16& db, U16& da); \ |
3075 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
3076 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \ |
3077 | params->dr,params->dg,params->db,params->da); \ |
3078 | auto next = (Stage)load_and_inc(program); \ |
3079 | next(params,program, r,g,b,a); \ |
3080 | } \ |
3081 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
3082 | U16& r, U16& g, U16& b, U16& a, \ |
3083 | U16& dr, U16& dg, U16& db, U16& da) |
3084 | #else |
3085 | #define STAGE_GG(name, ...) \ |
3086 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ |
3087 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
3088 | U16 r, U16 g, U16 b, U16 a, \ |
3089 | U16 dr, U16 dg, U16 db, U16 da) { \ |
3090 | auto x = join<F>(r,g), \ |
3091 | y = join<F>(b,a); \ |
3092 | name##_k(Ctx{program}, dx,dy,tail, x,y); \ |
3093 | split(x, &r,&g); \ |
3094 | split(y, &b,&a); \ |
3095 | auto next = (Stage)load_and_inc(program); \ |
3096 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
3097 | } \ |
3098 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) |
3099 | |
3100 | #define STAGE_GP(name, ...) \ |
3101 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
3102 | U16& r, U16& g, U16& b, U16& a, \ |
3103 | U16& dr, U16& dg, U16& db, U16& da); \ |
3104 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
3105 | U16 r, U16 g, U16 b, U16 a, \ |
3106 | U16 dr, U16 dg, U16 db, U16 da) { \ |
3107 | auto x = join<F>(r,g), \ |
3108 | y = join<F>(b,a); \ |
3109 | name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \ |
3110 | auto next = (Stage)load_and_inc(program); \ |
3111 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
3112 | } \ |
3113 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
3114 | U16& r, U16& g, U16& b, U16& a, \ |
3115 | U16& dr, U16& dg, U16& db, U16& da) |
3116 | |
3117 | #define STAGE_PP(name, ...) \ |
3118 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
3119 | U16& r, U16& g, U16& b, U16& a, \ |
3120 | U16& dr, U16& dg, U16& db, U16& da); \ |
3121 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
3122 | U16 r, U16 g, U16 b, U16 a, \ |
3123 | U16 dr, U16 dg, U16 db, U16 da) { \ |
3124 | name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \ |
3125 | auto next = (Stage)load_and_inc(program); \ |
3126 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
3127 | } \ |
3128 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
3129 | U16& r, U16& g, U16& b, U16& a, \ |
3130 | U16& dr, U16& dg, U16& db, U16& da) |
3131 | #endif |
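// So, roughly, STAGE_PP(premul, Ctx::None) below expands to a kernel
//     premul_k(Ctx::None, dx,dy,tail, r,g,b,a, dr,dg,db,da)
// plus an ABI wrapper premul(...) that unpacks its arguments, runs that kernel,
// and then tail-calls the next stage fetched with load_and_inc().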
3132 | |
3133 | // ~~~~~~ Commonly used helper functions ~~~~~~ // |
3134 | |
3135 | SI U16 div255(U16 v) { |
3136 | #if 0 |
3137 | return (v+127)/255; // The ideal rounding divide by 255. |
3138 | #elif 1 && defined(JUMPER_IS_NEON) |
3139 | // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8 |
3140 | // just as fast as we can do the approximation below, so might as well be correct! |
3141 | // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up. |
3142 | return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8); |
3143 | #else |
3144 | return (v+255)/256; // A good approximation of (v+127)/255. |
3145 | #endif |
3146 | } |
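// Quick check of the three div255() formulas at v = 172*192 = 33024 (33024/255 ≈ 129.5, want 130):
//   ideal:  (33024 + 127) / 255                    = 130
//   NEON:   (33024 + ((33024+128)>>8) + 128) >> 8  = (33024 + 129 + 128) >> 8 = 130
//   approx: (33024 + 255) / 256                    = 129   (off by one here, but close enough)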
3147 | |
3148 | SI U16 inv(U16 v) { return 255-v; } |
3149 | |
3150 | SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); } |
3151 | SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); } |
3152 | |
3153 | SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); } |
3154 | SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); } |
3155 | |
3156 | SI U16 from_float(float f) { return f * 255.0f + 0.5f; } |
3157 | |
3158 | SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); } |
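// e.g. lerp(0, 255, 128) = div255(0*inv(128) + 255*128) = div255(32640) = 128, just past halfway.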
3159 | |
3160 | template <typename D, typename S> |
3161 | SI D cast(S src) { |
3162 | return __builtin_convertvector(src, D); |
3163 | } |
3164 | |
3165 | template <typename D, typename S> |
3166 | SI void split(S v, D* lo, D* hi) { |
3167 | static_assert(2*sizeof(D) == sizeof(S), "" ); |
3168 | memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D)); |
3169 | memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D)); |
3170 | } |
3171 | template <typename D, typename S> |
3172 | SI D join(S lo, S hi) { |
3173 | static_assert(sizeof(D) == 2*sizeof(S), "" ); |
3174 | D v; |
3175 | memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S)); |
3176 | memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S)); |
3177 | return v; |
3178 | } |
3179 | |
3180 | SI F if_then_else(I32 c, F t, F e) { |
3181 | return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) ); |
3182 | } |
3183 | SI F max(F x, F y) { return if_then_else(x < y, y, x); } |
3184 | SI F min(F x, F y) { return if_then_else(x < y, x, y); } |
3185 | |
3186 | SI F mad(F f, F m, F a) { return f*m+a; } |
3187 | SI U32 trunc_(F x) { return (U32)cast<I32>(x); } |
3188 | |
3189 | SI F rcp(F x) { |
3190 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
3191 | __m256 lo,hi; |
3192 | split(x, &lo,&hi); |
3193 | return join<F>(_mm256_rcp_ps(lo), _mm256_rcp_ps(hi)); |
3194 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
3195 | __m128 lo,hi; |
3196 | split(x, &lo,&hi); |
3197 | return join<F>(_mm_rcp_ps(lo), _mm_rcp_ps(hi)); |
3198 | #elif defined(JUMPER_IS_NEON) |
3199 | auto rcp = [](float32x4_t v) { |
3200 | auto est = vrecpeq_f32(v); |
3201 | return vrecpsq_f32(v,est)*est; |
3202 | }; |
3203 | float32x4_t lo,hi; |
3204 | split(x, &lo,&hi); |
3205 | return join<F>(rcp(lo), rcp(hi)); |
3206 | #else |
3207 | return 1.0f / x; |
3208 | #endif |
3209 | } |
3210 | SI F sqrt_(F x) { |
3211 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
3212 | __m256 lo,hi; |
3213 | split(x, &lo,&hi); |
3214 | return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi)); |
3215 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
3216 | __m128 lo,hi; |
3217 | split(x, &lo,&hi); |
3218 | return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi)); |
3219 | #elif defined(SK_CPU_ARM64) |
3220 | float32x4_t lo,hi; |
3221 | split(x, &lo,&hi); |
3222 | return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi)); |
3223 | #elif defined(JUMPER_IS_NEON) |
3224 | auto sqrt = [](float32x4_t v) { |
3225 | auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v). |
3226 | est *= vrsqrtsq_f32(v,est*est); |
3227 | est *= vrsqrtsq_f32(v,est*est); |
3228 | return v*est; // sqrt(v) == v*rsqrt(v). |
3229 | }; |
3230 | float32x4_t lo,hi; |
3231 | split(x, &lo,&hi); |
3232 | return join<F>(sqrt(lo), sqrt(hi)); |
3233 | #else |
3234 | return F{ |
3235 | sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]), |
3236 | sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]), |
3237 | }; |
3238 | #endif |
3239 | } |
3240 | |
3241 | SI F floor_(F x) { |
3242 | #if defined(SK_CPU_ARM64) |
3243 | float32x4_t lo,hi; |
3244 | split(x, &lo,&hi); |
3245 | return join<F>(vrndmq_f32(lo), vrndmq_f32(hi)); |
3246 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
3247 | __m256 lo,hi; |
3248 | split(x, &lo,&hi); |
3249 | return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi)); |
3250 | #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
3251 | __m128 lo,hi; |
3252 | split(x, &lo,&hi); |
3253 | return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi)); |
3254 | #else |
3255 | F roundtrip = cast<F>(cast<I32>(x)); |
3256 | return roundtrip - if_then_else(roundtrip > x, F(1), F(0)); |
3257 | #endif |
3258 | } |
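// floor_()'s fallback works because cast<I32>() truncates toward zero: floor_(-1.5) computes
// roundtrip = -1.0, which is > x, so we subtract 1 to get -2.0; floor_(+1.5) computes
// roundtrip = +1.0, which is not > x, so it stays +1.0.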
3259 | SI F fract(F x) { return x - floor_(x); } |
3260 | SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); } |
3261 | |
3262 | // ~~~~~~ Basic / misc. stages ~~~~~~ // |
3263 | |
3264 | STAGE_GG(seed_shader, Ctx::None) { |
3265 | static const float iota[] = { |
3266 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, |
3267 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, |
3268 | }; |
3269 | x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota); |
3270 | y = cast<F>(I32(dy)) + 0.5f; |
3271 | } |
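// e.g. with N == 8 and dx == 16, x = {16.5, 17.5, ..., 23.5} and y = dy + 0.5:
// one sample through the center of each pixel in this batch.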
3272 | |
3273 | STAGE_GG(matrix_translate, const float* m) { |
3274 | x += m[0]; |
3275 | y += m[1]; |
3276 | } |
3277 | STAGE_GG(matrix_scale_translate, const float* m) { |
3278 | x = mad(x,m[0], m[2]); |
3279 | y = mad(y,m[1], m[3]); |
3280 | } |
3281 | STAGE_GG(matrix_2x3, const float* m) { |
3282 | auto X = mad(x,m[0], mad(y,m[2], m[4])), |
3283 | Y = mad(x,m[1], mad(y,m[3], m[5])); |
3284 | x = X; |
3285 | y = Y; |
3286 | } |
3287 | STAGE_GG(matrix_perspective, const float* m) { |
3288 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. |
3289 | auto X = mad(x,m[0], mad(y,m[1], m[2])), |
3290 | Y = mad(x,m[3], mad(y,m[4], m[5])), |
3291 | Z = mad(x,m[6], mad(y,m[7], m[8])); |
3292 | x = X * rcp(Z); |
3293 | y = Y * rcp(Z); |
3294 | } |
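// In other words,
//   x' = (m[0]*x + m[1]*y + m[2]) / (m[6]*x + m[7]*y + m[8])
//   y' = (m[3]*x + m[4]*y + m[5]) / (m[6]*x + m[7]*y + m[8])
// with the divide approximated by rcp().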
3295 | |
3296 | STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
3297 | r = c->rgba[0]; |
3298 | g = c->rgba[1]; |
3299 | b = c->rgba[2]; |
3300 | a = c->rgba[3]; |
3301 | } |
3302 | STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { |
3303 | dr = c->rgba[0]; |
3304 | dg = c->rgba[1]; |
3305 | db = c->rgba[2]; |
3306 | da = c->rgba[3]; |
3307 | } |
3308 | STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; } |
3309 | STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; } |
3310 | |
3311 | STAGE_PP(set_rgb, const float rgb[3]) { |
3312 | r = from_float(rgb[0]); |
3313 | g = from_float(rgb[1]); |
3314 | b = from_float(rgb[2]); |
3315 | } |
3316 | |
3317 | STAGE_PP(clamp_0, Ctx::None) { /*definitely a noop*/ } |
3318 | STAGE_PP(clamp_1, Ctx::None) { /*_should_ be a noop*/ } |
3319 | |
3320 | STAGE_PP(clamp_a, Ctx::None) { |
3321 | r = min(r, a); |
3322 | g = min(g, a); |
3323 | b = min(b, a); |
3324 | } |
3325 | |
3326 | STAGE_PP(clamp_gamut, Ctx::None) { |
3327 | // It shouldn't be possible to get out-of-gamut |
3328 | // colors when working in lowp. |
3329 | } |
3330 | |
3331 | STAGE_PP(premul, Ctx::None) { |
3332 | r = div255(r * a); |
3333 | g = div255(g * a); |
3334 | b = div255(b * a); |
3335 | } |
3336 | STAGE_PP(premul_dst, Ctx::None) { |
3337 | dr = div255(dr * da); |
3338 | dg = div255(dg * da); |
3339 | db = div255(db * da); |
3340 | } |
3341 | |
3342 | STAGE_PP(force_opaque , Ctx::None) { a = 255; } |
3343 | STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; } |
3344 | |
3345 | STAGE_PP(swap_rb, Ctx::None) { |
3346 | auto tmp = r; |
3347 | r = b; |
3348 | b = tmp; |
3349 | } |
3350 | STAGE_PP(swap_rb_dst, Ctx::None) { |
3351 | auto tmp = dr; |
3352 | dr = db; |
3353 | db = tmp; |
3354 | } |
3355 | |
3356 | STAGE_PP(move_src_dst, Ctx::None) { |
3357 | dr = r; |
3358 | dg = g; |
3359 | db = b; |
3360 | da = a; |
3361 | } |
3362 | |
3363 | STAGE_PP(move_dst_src, Ctx::None) { |
3364 | r = dr; |
3365 | g = dg; |
3366 | b = db; |
3367 | a = da; |
3368 | } |
3369 | |
3370 | // ~~~~~~ Blend modes ~~~~~~ // |
3371 | |
3372 | // The same logic applied to all 4 channels. |
3373 | #define BLEND_MODE(name) \ |
3374 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
3375 | STAGE_PP(name, Ctx::None) { \ |
3376 | r = name##_channel(r,dr,a,da); \ |
3377 | g = name##_channel(g,dg,a,da); \ |
3378 | b = name##_channel(b,db,a,da); \ |
3379 | a = name##_channel(a,da,a,da); \ |
3380 | } \ |
3381 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
3382 | |
3383 | BLEND_MODE(clear) { return 0; } |
3384 | BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); } |
3385 | BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); } |
3386 | BLEND_MODE(srcin) { return div255( s*da ); } |
3387 | BLEND_MODE(dstin) { return div255( d*sa ); } |
3388 | BLEND_MODE(srcout) { return div255( s*inv(da) ); } |
3389 | BLEND_MODE(dstout) { return div255( d*inv(sa) ); } |
3390 | BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); } |
3391 | BLEND_MODE(dstover) { return d + div255( s*inv(da) ); } |
3392 | BLEND_MODE(modulate) { return div255( s*d ); } |
3393 | BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); } |
3394 | BLEND_MODE(plus_) { return min(s+d, 255); } |
3395 | BLEND_MODE(screen) { return s + d - div255( s*d ); } |
3396 | BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); } |
3397 | #undef BLEND_MODE |
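// e.g. srcover with premultiplied src r = 100, a = 128 over dst r = 200:
//   100 + div255(200 * inv(128)) = 100 + div255(25400) = 100 + 100 = 200.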
3398 | |
3399 | // The same logic applied to color, and srcover for alpha. |
3400 | #define BLEND_MODE(name) \ |
3401 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
3402 | STAGE_PP(name, Ctx::None) { \ |
3403 | r = name##_channel(r,dr,a,da); \ |
3404 | g = name##_channel(g,dg,a,da); \ |
3405 | b = name##_channel(b,db,a,da); \ |
3406 | a = a + div255( da*inv(a) ); \ |
3407 | } \ |
3408 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
3409 | |
3410 | BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); } |
3411 | BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); } |
3412 | BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); } |
3413 | BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); } |
3414 | |
3415 | BLEND_MODE(hardlight) { |
3416 | return div255( s*inv(da) + d*inv(sa) + |
3417 | if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
3418 | } |
3419 | BLEND_MODE(overlay) { |
3420 | return div255( s*inv(da) + d*inv(sa) + |
3421 | if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
3422 | } |
3423 | #undef BLEND_MODE |
3424 | |
3425 | // ~~~~~~ Helpers for interacting with memory ~~~~~~ // |
3426 | |
3427 | template <typename T> |
3428 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { |
3429 | return (T*)ctx->pixels + dy*ctx->stride + dx; |
3430 | } |
3431 | |
3432 | template <typename T> |
3433 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { |
3434 | auto clamp = [](F v, F limit) { |
3435 | limit = bit_cast<F>( bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive. |
3436 | return min(max(0, v), limit); |
3437 | }; |
3438 | x = clamp(x, ctx->width); |
3439 | y = clamp(y, ctx->height); |
3440 | |
3441 | *ptr = (const T*)ctx->pixels; |
3442 | return trunc_(y)*ctx->stride + trunc_(x); |
3443 | } |
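// The bit_cast<U32>(limit) - 1 trick turns ix_and_ptr()'s exclusive float bound into an inclusive
// one: for width == 100.0f it yields the largest float just below 100, so after clamping,
// trunc_() can produce at most 99, a valid column index.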
3444 | |
3445 | template <typename V, typename T> |
3446 | SI V load(const T* ptr, size_t tail) { |
3447 | V v = 0; |
3448 | switch (tail & (N-1)) { |
3449 | case 0: memcpy(&v, ptr, sizeof(v)); break; |
3450 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
3451 | case 15: v[14] = ptr[14]; |
3452 | case 14: v[13] = ptr[13]; |
3453 | case 13: v[12] = ptr[12]; |
3454 | case 12: memcpy(&v, ptr, 12*sizeof(T)); break; |
3455 | case 11: v[10] = ptr[10]; |
3456 | case 10: v[ 9] = ptr[ 9]; |
3457 | case 9: v[ 8] = ptr[ 8]; |
3458 | case 8: memcpy(&v, ptr, 8*sizeof(T)); break; |
3459 | #endif |
3460 | case 7: v[ 6] = ptr[ 6]; |
3461 | case 6: v[ 5] = ptr[ 5]; |
3462 | case 5: v[ 4] = ptr[ 4]; |
3463 | case 4: memcpy(&v, ptr, 4*sizeof(T)); break; |
3464 | case 3: v[ 2] = ptr[ 2]; |
3465 | case 2: memcpy(&v, ptr, 2*sizeof(T)); break; |
3466 | case 1: v[ 0] = ptr[ 0]; |
3467 | } |
3468 | return v; |
3469 | } |
3470 | template <typename V, typename T> |
3471 | SI void store(T* ptr, size_t tail, V v) { |
3472 | switch (tail & (N-1)) { |
3473 | case 0: memcpy(ptr, &v, sizeof(v)); break; |
3474 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
3475 | case 15: ptr[14] = v[14]; |
3476 | case 14: ptr[13] = v[13]; |
3477 | case 13: ptr[12] = v[12]; |
3478 | case 12: memcpy(ptr, &v, 12*sizeof(T)); break; |
3479 | case 11: ptr[10] = v[10]; |
3480 | case 10: ptr[ 9] = v[ 9]; |
3481 | case 9: ptr[ 8] = v[ 8]; |
3482 | case 8: memcpy(ptr, &v, 8*sizeof(T)); break; |
3483 | #endif |
3484 | case 7: ptr[ 6] = v[ 6]; |
3485 | case 6: ptr[ 5] = v[ 5]; |
3486 | case 5: ptr[ 4] = v[ 4]; |
3487 | case 4: memcpy(ptr, &v, 4*sizeof(T)); break; |
3488 | case 3: ptr[ 2] = v[ 2]; |
3489 | case 2: memcpy(ptr, &v, 2*sizeof(T)); break; |
3490 | case 1: ptr[ 0] = v[ 0]; |
3491 | } |
3492 | } |
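// N.B. the case fall-throughs in load() and store() are deliberate: e.g. with N == 8 and
// tail == 3, case 3 copies element 2 and then falls into case 2, which memcpy's elements
// 0 and 1.  A tail of 0 means a full batch of N.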
3493 | |
3494 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
3495 | template <typename V, typename T> |
3496 | SI V gather(const T* ptr, U32 ix) { |
3497 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], |
3498 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], |
3499 | ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]], |
3500 | ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], }; |
3501 | } |
3502 | |
3503 | template<> |
3504 | F gather(const float* ptr, U32 ix) { |
3505 | __m256i lo, hi; |
3506 | split(ix, &lo, &hi); |
3507 | |
3508 | return join<F>(_mm256_i32gather_ps(ptr, lo, 4), |
3509 | _mm256_i32gather_ps(ptr, hi, 4)); |
3510 | } |
3511 | |
3512 | template<> |
3513 | U32 gather(const uint32_t* ptr, U32 ix) { |
3514 | __m256i lo, hi; |
3515 | split(ix, &lo, &hi); |
3516 | |
3517 | return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4), |
3518 | _mm256_i32gather_epi32(ptr, hi, 4)); |
3519 | } |
3520 | #else |
3521 | template <typename V, typename T> |
3522 | SI V gather(const T* ptr, U32 ix) { |
3523 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], |
3524 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], }; |
3525 | } |
3526 | #endif |
3527 | |
3528 | |
3529 | // ~~~~~~ 32-bit memory loads and stores ~~~~~~ // |
3530 | |
3531 | SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) { |
#if 1 && (defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512))
3533 | // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely. |
3534 | __m256i _01,_23; |
3535 | split(rgba, &_01, &_23); |
3536 | __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20), |
3537 | _13 = _mm256_permute2x128_si256(_01,_23, 0x31); |
3538 | rgba = join<U32>(_02, _13); |
3539 | |
3540 | auto cast_U16 = [](U32 v) -> U16 { |
3541 | __m256i _02,_13; |
3542 | split(v, &_02,&_13); |
3543 | return _mm256_packus_epi32(_02,_13); |
3544 | }; |
3545 | #else |
3546 | auto cast_U16 = [](U32 v) -> U16 { |
3547 | return cast<U16>(v); |
3548 | }; |
3549 | #endif |
3550 | *r = cast_U16(rgba & 65535) & 255; |
3551 | *g = cast_U16(rgba & 65535) >> 8; |
3552 | *b = cast_U16(rgba >> 16) & 255; |
3553 | *a = cast_U16(rgba >> 16) >> 8; |
3554 | } |
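// e.g. from_8888(0xFF00CC33) unpacks to r = 0x33, g = 0xCC, b = 0x00, a = 0xFF;
// r comes from the low byte of each 32-bit pixel and a from the high byte.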
3555 | |
3556 | SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
3557 | #if 1 && defined(JUMPER_IS_NEON) |
3558 | uint8x8x4_t rgba; |
3559 | switch (tail & (N-1)) { |
3560 | case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break; |
3561 | case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6); |
3562 | case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5); |
3563 | case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4); |
3564 | case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3); |
3565 | case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2); |
3566 | case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1); |
3567 | case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0); |
3568 | } |
3569 | *r = cast<U16>(rgba.val[0]); |
3570 | *g = cast<U16>(rgba.val[1]); |
3571 | *b = cast<U16>(rgba.val[2]); |
3572 | *a = cast<U16>(rgba.val[3]); |
3573 | #else |
3574 | from_8888(load<U32>(ptr, tail), r,g,b,a); |
3575 | #endif |
3576 | } |
3577 | SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
3578 | #if 1 && defined(JUMPER_IS_NEON) |
3579 | uint8x8x4_t rgba = {{ |
3580 | cast<U8>(r), |
3581 | cast<U8>(g), |
3582 | cast<U8>(b), |
3583 | cast<U8>(a), |
3584 | }}; |
3585 | switch (tail & (N-1)) { |
3586 | case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break; |
3587 | case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6); |
3588 | case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5); |
3589 | case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4); |
3590 | case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3); |
3591 | case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2); |
3592 | case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1); |
3593 | case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0); |
3594 | } |
3595 | #else |
3596 | store(ptr, tail, cast<U32>(r | (g<<8)) << 0 |
3597 | | cast<U32>(b | (a<<8)) << 16); |
3598 | #endif |
3599 | } |
3600 | |
3601 | STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
3602 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a); |
3603 | } |
3604 | STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3605 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); |
3606 | } |
3607 | STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
3608 | store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a); |
3609 | } |
3610 | STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { |
3611 | const uint32_t* ptr; |
3612 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3613 | from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a); |
3614 | } |
3615 | |
3616 | // ~~~~~~ 16-bit memory loads and stores ~~~~~~ // |
3617 | |
3618 | SI void from_565(U16 rgb, U16* r, U16* g, U16* b) { |
3619 | // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0 |
3620 | U16 R = (rgb >> 11) & 31, |
3621 | G = (rgb >> 5) & 63, |
3622 | B = (rgb >> 0) & 31; |
3623 | |
3624 | // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit. |
3625 | *r = (R << 3) | (R >> 2); |
3626 | *g = (G << 2) | (G >> 4); |
3627 | *b = (B << 3) | (B >> 2); |
3628 | } |
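// e.g. from_565(0x07E0) (pure green) gives R = B = 0 and G = 63, so *g = (63<<2)|(63>>4) = 255.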
3629 | SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
3630 | from_565(load<U16>(ptr, tail), r,g,b); |
3631 | } |
3632 | SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) { |
    // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f (or 63/255.0f for green).
3634 | // (Don't feel like you need to find some fundamental truth in these... |
3635 | // they were brute-force searched.) |
3636 | U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half. |
3637 | G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly. |
3638 | B = (b * 9 + 36) / 74; |
3639 | // Pack them back into 15|rrrrr gggggg bbbbb|0. |
3640 | store(ptr, tail, R << 11 |
3641 | | G << 5 |
3642 | | B << 0); |
3643 | } |
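// Spot check: r = 255 -> (255*9 + 36)/74 = 2331/74 = 31, g = 255 -> (255*21 + 42)/85 = 5397/85 = 63,
// so full intensity survives the round trip through store_565_/from_565.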
3644 | |
3645 | STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3646 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b); |
3647 | a = 255; |
3648 | } |
3649 | STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3650 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db); |
3651 | da = 255; |
3652 | } |
3653 | STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3654 | store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b); |
3655 | } |
3656 | STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) { |
3657 | const uint16_t* ptr; |
3658 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3659 | from_565(gather<U16>(ptr, ix), &r, &g, &b); |
3660 | a = 255; |
3661 | } |
3662 | |
3663 | SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) { |
3664 | // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0. |
3665 | U16 R = (rgba >> 12) & 15, |
3666 | G = (rgba >> 8) & 15, |
3667 | B = (rgba >> 4) & 15, |
3668 | A = (rgba >> 0) & 15; |
3669 | |
3670 | // Scale [0,15] to [0,255]. |
3671 | *r = (R << 4) | R; |
3672 | *g = (G << 4) | G; |
3673 | *b = (B << 4) | B; |
3674 | *a = (A << 4) | A; |
3675 | } |
3676 | SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
3677 | from_4444(load<U16>(ptr, tail), r,g,b,a); |
3678 | } |
3679 | SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
3680 | // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f). |
3681 | U16 R = (r + 8) / 17, |
3682 | G = (g + 8) / 17, |
3683 | B = (b + 8) / 17, |
3684 | A = (a + 8) / 17; |
3685 | // Pack them back into 15|rrrr gggg bbbb aaaa|0. |
3686 | store(ptr, tail, R << 12 |
3687 | | G << 8 |
3688 | | B << 4 |
3689 | | A << 0); |
3690 | } |
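// e.g. r = 255 -> (255+8)/17 = 15 and r = 128 -> 136/17 = 8, matching round(x * 15/255.0f).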
3691 | |
3692 | STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
3693 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a); |
3694 | } |
3695 | STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3696 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); |
3697 | } |
3698 | STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
3699 | store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a); |
3700 | } |
3701 | STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { |
3702 | const uint16_t* ptr; |
3703 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3704 | from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a); |
3705 | } |
3706 | |
3707 | SI void from_88(U16 rg, U16* r, U16* g) { |
3708 | *r = (rg & 0xFF); |
3709 | *g = (rg >> 8); |
3710 | } |
3711 | |
3712 | SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
3713 | #if 1 && defined(JUMPER_IS_NEON) |
3714 | uint8x8x2_t rg; |
3715 | switch (tail & (N-1)) { |
3716 | case 0: rg = vld2_u8 ((const uint8_t*)(ptr+0) ); break; |
3717 | case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6); |
3718 | case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5); |
3719 | case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4); |
3720 | case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3); |
3721 | case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2); |
3722 | case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1); |
3723 | case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0); |
3724 | } |
3725 | *r = cast<U16>(rg.val[0]); |
3726 | *g = cast<U16>(rg.val[1]); |
3727 | #else |
3728 | from_88(load<U16>(ptr, tail), r,g); |
3729 | #endif |
3730 | } |
3731 | |
3732 | SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
3733 | #if 1 && defined(JUMPER_IS_NEON) |
3734 | uint8x8x2_t rg = {{ |
3735 | cast<U8>(r), |
3736 | cast<U8>(g), |
3737 | }}; |
3738 | switch (tail & (N-1)) { |
3739 | case 0: vst2_u8 ((uint8_t*)(ptr+0), rg ); break; |
3740 | case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6); |
3741 | case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5); |
3742 | case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4); |
3743 | case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3); |
3744 | case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2); |
3745 | case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1); |
3746 | case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0); |
3747 | } |
3748 | #else |
3749 | store(ptr, tail, cast<U16>(r | (g<<8)) << 0); |
3750 | #endif |
3751 | } |
3752 | |
3753 | STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
3754 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g); |
3755 | b = 0; |
3756 | a = 255; |
3757 | } |
3758 | STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3759 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg); |
3760 | db = 0; |
3761 | da = 255; |
3762 | } |
3763 | STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
3764 | store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g); |
3765 | } |
3766 | STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { |
3767 | const uint16_t* ptr; |
3768 | U32 ix = ix_and_ptr(&ptr, ctx, x, y); |
3769 | from_88(gather<U16>(ptr, ix), &r, &g); |
3770 | b = 0; |
3771 | a = 255; |
3772 | } |
3773 | |
3774 | // ~~~~~~ 8-bit memory loads and stores ~~~~~~ // |
3775 | |
3776 | SI U16 load_8(const uint8_t* ptr, size_t tail) { |
3777 | return cast<U16>(load<U8>(ptr, tail)); |
3778 | } |
3779 | SI void store_8(uint8_t* ptr, size_t tail, U16 v) { |
3780 | store(ptr, tail, cast<U8>(v)); |
3781 | } |
3782 | |
3783 | STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
3784 | r = g = b = 0; |
3785 | a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3786 | } |
3787 | STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3788 | dr = dg = db = 0; |
3789 | da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3790 | } |
3791 | STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
3792 | store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a); |
3793 | } |
3794 | STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { |
3795 | const uint8_t* ptr; |
3796 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3797 | r = g = b = 0; |
3798 | a = cast<U16>(gather<U8>(ptr, ix)); |
3799 | } |
3800 | |
3801 | STAGE_PP(alpha_to_gray, Ctx::None) { |
3802 | r = g = b = a; |
3803 | a = 255; |
3804 | } |
3805 | STAGE_PP(alpha_to_gray_dst, Ctx::None) { |
3806 | dr = dg = db = da; |
3807 | da = 255; |
3808 | } |
3809 | STAGE_PP(bt709_luminance_or_luma_to_alpha, Ctx::None) { |
3810 | a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator. |
3811 | r = g = b = 0; |
3812 | } |
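// N.B. 54 + 183 + 19 == 256, so opaque white (r = g = b = 255) maps to exactly a = 255.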
3813 | |
// ~~~~~~ Stashing and restoring the src/dst color registers ~~~~~~ //
3815 | |
3816 | STAGE_PP(load_src, const uint16_t* ptr) { |
3817 | r = sk_unaligned_load<U16>(ptr + 0*N); |
3818 | g = sk_unaligned_load<U16>(ptr + 1*N); |
3819 | b = sk_unaligned_load<U16>(ptr + 2*N); |
3820 | a = sk_unaligned_load<U16>(ptr + 3*N); |
3821 | } |
3822 | STAGE_PP(store_src, uint16_t* ptr) { |
3823 | sk_unaligned_store(ptr + 0*N, r); |
3824 | sk_unaligned_store(ptr + 1*N, g); |
3825 | sk_unaligned_store(ptr + 2*N, b); |
3826 | sk_unaligned_store(ptr + 3*N, a); |
3827 | } |
3828 | STAGE_PP(store_src_a, uint16_t* ptr) { |
3829 | sk_unaligned_store(ptr, a); |
3830 | } |
3831 | STAGE_PP(load_dst, const uint16_t* ptr) { |
3832 | dr = sk_unaligned_load<U16>(ptr + 0*N); |
3833 | dg = sk_unaligned_load<U16>(ptr + 1*N); |
3834 | db = sk_unaligned_load<U16>(ptr + 2*N); |
3835 | da = sk_unaligned_load<U16>(ptr + 3*N); |
3836 | } |
3837 | STAGE_PP(store_dst, uint16_t* ptr) { |
3838 | sk_unaligned_store(ptr + 0*N, dr); |
3839 | sk_unaligned_store(ptr + 1*N, dg); |
3840 | sk_unaligned_store(ptr + 2*N, db); |
3841 | sk_unaligned_store(ptr + 3*N, da); |
3842 | } |
3843 | |
3844 | // ~~~~~~ Coverage scales / lerps ~~~~~~ // |
3845 | |
3846 | STAGE_PP(scale_1_float, const float* f) { |
3847 | U16 c = from_float(*f); |
3848 | r = div255( r * c ); |
3849 | g = div255( g * c ); |
3850 | b = div255( b * c ); |
3851 | a = div255( a * c ); |
3852 | } |
3853 | STAGE_PP(lerp_1_float, const float* f) { |
3854 | U16 c = from_float(*f); |
3855 | r = lerp(dr, r, c); |
3856 | g = lerp(dg, g, c); |
3857 | b = lerp(db, b, c); |
3858 | a = lerp(da, a, c); |
3859 | } |
3860 | STAGE_PP(scale_native, const uint16_t scales[]) { |
3861 | auto c = sk_unaligned_load<U16>(scales); |
3862 | r = div255( r * c ); |
3863 | g = div255( g * c ); |
3864 | b = div255( b * c ); |
3865 | a = div255( a * c ); |
3866 | } |
3867 | |
3868 | STAGE_PP(lerp_native, const uint16_t scales[]) { |
3869 | auto c = sk_unaligned_load<U16>(scales); |
3870 | r = lerp(dr, r, c); |
3871 | g = lerp(dg, g, c); |
3872 | b = lerp(db, b, c); |
3873 | a = lerp(da, a, c); |
3874 | } |
3875 | |
3876 | STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
3877 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3878 | r = div255( r * c ); |
3879 | g = div255( g * c ); |
3880 | b = div255( b * c ); |
3881 | a = div255( a * c ); |
3882 | } |
3883 | STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
3884 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3885 | r = lerp(dr, r, c); |
3886 | g = lerp(dg, g, c); |
3887 | b = lerp(db, b, c); |
3888 | a = lerp(da, a, c); |
3889 | } |
3890 | |
3891 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. |
3892 | SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) { |
3893 | return if_then_else(a < da, min(cr, min(cg,cb)) |
3894 | , max(cr, max(cg,cb))); |
3895 | } |
3896 | STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3897 | U16 cr,cg,cb; |
3898 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); |
3899 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
3900 | |
3901 | r = div255( r * cr ); |
3902 | g = div255( g * cg ); |
3903 | b = div255( b * cb ); |
3904 | a = div255( a * ca ); |
3905 | } |
3906 | STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3907 | U16 cr,cg,cb; |
3908 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); |
3909 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
3910 | |
3911 | r = lerp(dr, r, cr); |
3912 | g = lerp(dg, g, cg); |
3913 | b = lerp(db, b, cb); |
3914 | a = lerp(da, a, ca); |
3915 | } |
3916 | |
3917 | STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) { |
3918 | U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail), |
3919 | add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail); |
3920 | |
3921 | r = min(div255(r*mul) + add, a); |
3922 | g = min(div255(g*mul) + add, a); |
3923 | b = min(div255(b*mul) + add, a); |
3924 | } |
3925 | |
3926 | |
3927 | // ~~~~~~ Gradient stages ~~~~~~ // |
3928 | |
3929 | // Clamp x to [0,1], both sides inclusive (think, gradients). |
3930 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. |
3931 | SI F clamp_01(F v) { return min(max(0, v), 1); } |
3932 | |
3933 | STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); } |
3934 | STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); } |
3935 | STAGE_GG(mirror_x_1, Ctx::None) { |
3936 | auto two = [](F x){ return x+x; }; |
3937 | x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f )); |
3938 | } |
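// e.g. mirror_x_1 at x = 1.3: (0.3) - 2*floor(0.15) - 1 = -0.7, and abs_() reflects that to 0.7,
// i.e. 1.3 mirrored back across 1 into [0,1].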
3939 | |
3940 | SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); } |
3941 | |
3942 | STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { |
3943 | auto w = ctx->limit_x; |
3944 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w))); |
3945 | } |
3946 | STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { |
3947 | auto h = ctx->limit_y; |
3948 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h))); |
3949 | } |
3950 | STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { |
3951 | auto w = ctx->limit_x; |
3952 | auto h = ctx->limit_y; |
3953 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h))); |
3954 | } |
3955 | STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { |
3956 | auto mask = sk_unaligned_load<U16>(ctx->mask); |
3957 | r = r & mask; |
3958 | g = g & mask; |
3959 | b = b & mask; |
3960 | a = a & mask; |
3961 | } |
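// e.g. decal_x with limit_x = 100 stores an all-1s lane mask wherever 0 <= x < 100 and all-0s
// elsewhere; check_decal_mask then ANDs r,g,b,a with it, forcing out-of-range lanes to
// transparent black.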
3962 | |
3963 | SI void round_F_to_U16(F R, F G, F B, F A, bool interpolatedInPremul, |
3964 | U16* r, U16* g, U16* b, U16* a) { |
3965 | auto round = [](F x) { return cast<U16>(x * 255.0f + 0.5f); }; |
3966 | |
3967 | F limit = interpolatedInPremul ? A |
3968 | : 1; |
3969 | *r = round(min(max(0,R), limit)); |
3970 | *g = round(min(max(0,G), limit)); |
3971 | *b = round(min(max(0,B), limit)); |
3972 | *a = round(A); // we assume alpha is already in [0,1]. |
3973 | } |
3974 | |
3975 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, |
3976 | U16* r, U16* g, U16* b, U16* a) { |
3977 | |
3978 | F fr, fg, fb, fa, br, bg, bb, ba; |
3979 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) |
3980 | if (c->stopCount <=8) { |
3981 | __m256i lo, hi; |
3982 | split(idx, &lo, &hi); |
3983 | |
3984 | fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo), |
3985 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi)); |
3986 | br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo), |
3987 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi)); |
3988 | fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo), |
3989 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi)); |
3990 | bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo), |
3991 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi)); |
3992 | fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo), |
3993 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi)); |
3994 | bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo), |
3995 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi)); |
3996 | fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo), |
3997 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi)); |
3998 | ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo), |
3999 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi)); |
4000 | } else |
4001 | #endif |
4002 | { |
4003 | fr = gather<F>(c->fs[0], idx); |
4004 | fg = gather<F>(c->fs[1], idx); |
4005 | fb = gather<F>(c->fs[2], idx); |
4006 | fa = gather<F>(c->fs[3], idx); |
4007 | br = gather<F>(c->bs[0], idx); |
4008 | bg = gather<F>(c->bs[1], idx); |
4009 | bb = gather<F>(c->bs[2], idx); |
4010 | ba = gather<F>(c->bs[3], idx); |
4011 | } |
4012 | round_F_to_U16(mad(t, fr, br), |
4013 | mad(t, fg, bg), |
4014 | mad(t, fb, bb), |
4015 | mad(t, fa, ba), |
4016 | c->interpolatedInPremul, |
4017 | r,g,b,a); |
4018 | } |
4019 | |
4020 | STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) { |
4021 | auto t = x; |
4022 | U32 idx = 0; |
4023 | |
4024 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
4025 | for (size_t i = 1; i < c->stopCount; i++) { |
4026 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); |
4027 | } |
4028 | |
4029 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
4030 | } |
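// e.g. with stops ts = {0, 0.25, 0.5, 1} and t = 0.6, the loop adds 1 for ts[1] = 0.25 and
// ts[2] = 0.5 but not ts[3] = 1, so idx = 2 and we lerp with that span's scale/bias (f/b) pair.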
4031 | |
4032 | STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { |
4033 | auto t = x; |
4034 | auto idx = trunc_(t * (c->stopCount-1)); |
4035 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
4036 | } |
4037 | |
4038 | STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) { |
4039 | auto t = x; |
4040 | round_F_to_U16(mad(t, c->f[0], c->b[0]), |
4041 | mad(t, c->f[1], c->b[1]), |
4042 | mad(t, c->f[2], c->b[2]), |
4043 | mad(t, c->f[3], c->b[3]), |
4044 | c->interpolatedInPremul, |
4045 | &r,&g,&b,&a); |
4046 | } |
4047 | |
4048 | STAGE_GG(xy_to_unit_angle, Ctx::None) { |
4049 | F xabs = abs_(x), |
4050 | yabs = abs_(y); |
4051 | |
4052 | F slope = min(xabs, yabs)/max(xabs, yabs); |
4053 | F s = slope * slope; |
4054 | |
4055 | // Use a 7th degree polynomial to approximate atan. |
4056 | // This was generated using sollya.gforge.inria.fr. |
4057 | // A float optimized polynomial was generated using the following command. |
4058 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); |
4059 | F phi = slope |
4060 | * (0.15912117063999176025390625f + s |
4061 | * (-5.185396969318389892578125e-2f + s |
4062 | * (2.476101927459239959716796875e-2f + s |
4063 | * (-7.0547382347285747528076171875e-3f)))); |
4064 | |
4065 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); |
4066 | phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi); |
4067 | phi = if_then_else(y < 0.0f , 1.0f - phi , phi); |
4068 | phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. |
4069 | x = phi; |
4070 | } |
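// Sanity check for xy_to_unit_angle: (x,y) = (1,1) gives slope = 1, so
// phi ≈ 0.15912 - 0.05185 + 0.02476 - 0.00705 ≈ 0.12497, very nearly 1/8 of a turn (45 degrees).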
4071 | STAGE_GG(xy_to_radius, Ctx::None) { |
4072 | x = sqrt_(x*x + y*y); |
4073 | } |
4074 | |
4075 | // ~~~~~~ Compound stages ~~~~~~ // |
4076 | |
4077 | STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
4078 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
4079 | |
4080 | load_8888_(ptr, tail, &dr,&dg,&db,&da); |
4081 | r = r + div255( dr*inv(a) ); |
4082 | g = g + div255( dg*inv(a) ); |
4083 | b = b + div255( db*inv(a) ); |
4084 | a = a + div255( da*inv(a) ); |
4085 | store_8888_(ptr, tail, r,g,b,a); |
4086 | } |
4087 | |
4088 | #if defined(SK_DISABLE_LOWP_BILERP_CLAMP_CLAMP_STAGE) |
4089 | static void(*bilerp_clamp_8888)(void) = nullptr; |
4090 | static void(*bilinear)(void) = nullptr; |
4091 | #else |
4092 | STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
4093 | // (cx,cy) are the center of our sample. |
4094 | F cx = x, |
4095 | cy = y; |
4096 | |
4097 | // All sample points are at the same fractional offset (fx,fy). |
4098 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. |
4099 | F fx = fract(cx + 0.5f), |
4100 | fy = fract(cy + 0.5f); |
4101 | |
4102 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. |
4103 | r = g = b = a = 0; |
4104 | |
4105 | // The first three sample points will calculate their area using math |
4106 | // just like in the float code above, but the fourth will take up all the rest. |
4107 | // |
4108 | // Logically this is the same as doing the math for the fourth pixel too, |
4109 | // but rounding error makes this a better strategy, keeping opaque opaque, etc. |
4110 | // |
4111 | // We can keep up to 8 bits of fractional precision without overflowing 16-bit, |
4112 | // so our "1.0" area is 256. |
4113 | const uint16_t bias = 256; |
4114 | U16 remaining = bias; |
4115 | |
4116 | for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f) |
4117 | for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) { |
4118 | // (x,y) are the coordinates of this sample point. |
4119 | F x = cx + dx, |
4120 | y = cy + dy; |
4121 | |
4122 | // ix_and_ptr() will clamp to the image's bounds for us. |
4123 | const uint32_t* ptr; |
4124 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
4125 | |
4126 | U16 sr,sg,sb,sa; |
4127 | from_8888(gather<U32>(ptr, ix), &sr,&sg,&sb,&sa); |
4128 | |
4129 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
4130 | // are combined in direct proportion to their area overlapping that logical query pixel. |
4131 | // At positive offsets, the x-axis contribution to that rectangle is fx, |
4132 | // or (1-fx) at negative x. Same deal for y. |
4133 | F sx = (dx > 0) ? fx : 1.0f - fx, |
4134 | sy = (dy > 0) ? fy : 1.0f - fy; |
4135 | |
4136 | U16 area = (dy == 0.5f && dx == 0.5f) ? remaining |
4137 | : cast<U16>(sx * sy * bias); |
4138 | for (size_t i = 0; i < N; i++) { |
4139 | SkASSERT(remaining[i] >= area[i]); |
4140 | } |
4141 | remaining -= area; |
4142 | |
4143 | r += sr * area; |
4144 | g += sg * area; |
4145 | b += sb * area; |
4146 | a += sa * area; |
4147 | } |
4148 | |
4149 | r = (r + bias/2) / bias; |
4150 | g = (g + bias/2) / bias; |
4151 | b = (b + bias/2) / bias; |
4152 | a = (a + bias/2) / bias; |
4153 | } |
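// e.g. fx = 0.25, fy = 0.75: the first three samples get areas 0.75*0.25*256 = 48,
// 0.25*0.25*256 = 16, and 0.75*0.75*256 = 144, and the last takes remaining = 256-208 = 48,
// exactly the 0.25*0.75*256 it would have computed anyway.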
4154 | |
4155 | // TODO: lowp::tile() is identical to the highp tile()... share? |
4156 | SI F tile(F v, SkTileMode mode, float limit, float invLimit) { |
    // Afterwards, ix_and_ptr() will clamp the output of tile(), so we need not clamp here.
4158 | switch (mode) { |
4159 | case SkTileMode::kDecal: // TODO, for now fallthrough to clamp |
4160 | case SkTileMode::kClamp: return v; |
4161 | case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; |
4162 | case SkTileMode::kMirror: |
4163 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
4164 | } |
4165 | SkUNREACHABLE; |
4166 | } |
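// e.g. with limit = 2 (invLimit = 0.5): kRepeat maps v = 2.5 to 2.5 - floor(1.25)*2 = 0.5,
// and kMirror maps v = 2.5 to abs(0.5 - 4*floor(0.125) - 2) = 1.5.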
4167 | |
4168 | SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, |
4169 | U16* r, U16* g, U16* b, U16* a) { |
4170 | x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); |
4171 | y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); |
4172 | |
4173 | switch (ctx->ct) { |
4174 | default: *r = *g = *b = *a = 0; // TODO |
4175 | break; |
4176 | |
4177 | case kRGBA_8888_SkColorType: |
4178 | case kBGRA_8888_SkColorType: { |
4179 | const uint32_t* ptr; |
4180 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
4181 | from_8888(gather<U32>(ptr, ix), r,g,b,a); |
4182 | if (ctx->ct == kBGRA_8888_SkColorType) { |
4183 | std::swap(*r,*b); |
4184 | } |
4185 | } break; |
4186 | } |
4187 | } |
4188 | |
4189 | template <int D> |
4190 | SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, |
4191 | F cx, F cy, const F (&wx)[D], const F (&wy)[D], |
4192 | U16* r, U16* g, U16* b, U16* a) { |
4193 | |
4194 | float start = -0.5f*(D-1); |
4195 | |
4196 | const uint16_t bias = 256; |
4197 | U16 remaining = bias; |
4198 | |
4199 | *r = *g = *b = *a = 0; |
4200 | F y = cy + start; |
4201 | for (int j = 0; j < D; j++, y += 1.0f) { |
4202 | F x = cx + start; |
4203 | for (int i = 0; i < D; i++, x += 1.0f) { |
4204 | U16 R,G,B,A; |
4205 | sample(ctx, x,y, &R,&G,&B,&A); |
4206 | |
4207 | U16 w = (i == D-1 && j == D-1) ? remaining |
4208 | : cast<U16>(wx[i]*wy[j]*bias); |
4209 | remaining -= w; |
4210 | *r += w*R; |
4211 | *g += w*G; |
4212 | *b += w*B; |
4213 | *a += w*A; |
4214 | } |
4215 | } |
4216 | *r = (*r + bias/2) / bias; |
4217 | *g = (*g + bias/2) / bias; |
4218 | *b = (*b + bias/2) / bias; |
4219 | *a = (*a + bias/2) / bias; |
4220 | } |
4221 | |
4222 | STAGE_GP(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { |
4223 | F fx = fract(x + 0.5f), |
4224 | fy = fract(y + 0.5f); |
4225 | const F wx[] = {1.0f - fx, fx}; |
4226 | const F wy[] = {1.0f - fy, fy}; |
4227 | |
4228 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
4229 | } |
4230 | #endif |
4231 | |
4232 | // ~~~~~~ GrSwizzle stage ~~~~~~ // |
4233 | |
4234 | STAGE_PP(swizzle, void* ctx) { |
4235 | auto ir = r, ig = g, ib = b, ia = a; |
4236 | U16* o[] = {&r, &g, &b, &a}; |
4237 | char swiz[4]; |
4238 | memcpy(swiz, &ctx, sizeof(swiz)); |
4239 | |
4240 | for (int i = 0; i < 4; ++i) { |
4241 | switch (swiz[i]) { |
4242 | case 'r': *o[i] = ir; break; |
4243 | case 'g': *o[i] = ig; break; |
4244 | case 'b': *o[i] = ib; break; |
4245 | case 'a': *o[i] = ia; break; |
4246 | case '0': *o[i] = U16(0); break; |
4247 | case '1': *o[i] = U16(255); break; |
4248 | default: break; |
4249 | } |
4250 | } |
4251 | } |
4252 | |
4253 | // Now we'll add null stand-ins for stages we haven't implemented in lowp. |
// If a pipeline uses any of these stages, it'll be booted out of lowp into highp.
4255 | #define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr; |
4256 | NOT_IMPLEMENTED(callback) |
4257 | NOT_IMPLEMENTED(interpreter) |
4258 | NOT_IMPLEMENTED(unbounded_set_rgb) |
4259 | NOT_IMPLEMENTED(unbounded_uniform_color) |
4260 | NOT_IMPLEMENTED(unpremul) |
4261 | NOT_IMPLEMENTED(dither) // TODO |
4262 | NOT_IMPLEMENTED(from_srgb) |
4263 | NOT_IMPLEMENTED(to_srgb) |
4264 | NOT_IMPLEMENTED(load_16161616) |
4265 | NOT_IMPLEMENTED(load_16161616_dst) |
4266 | NOT_IMPLEMENTED(store_16161616) |
4267 | NOT_IMPLEMENTED(gather_16161616) |
4268 | NOT_IMPLEMENTED(load_a16) |
4269 | NOT_IMPLEMENTED(load_a16_dst) |
4270 | NOT_IMPLEMENTED(store_a16) |
4271 | NOT_IMPLEMENTED(gather_a16) |
4272 | NOT_IMPLEMENTED(load_rg1616) |
4273 | NOT_IMPLEMENTED(load_rg1616_dst) |
4274 | NOT_IMPLEMENTED(store_rg1616) |
4275 | NOT_IMPLEMENTED(gather_rg1616) |
4276 | NOT_IMPLEMENTED(load_f16) |
4277 | NOT_IMPLEMENTED(load_f16_dst) |
4278 | NOT_IMPLEMENTED(store_f16) |
4279 | NOT_IMPLEMENTED(gather_f16) |
4280 | NOT_IMPLEMENTED(load_af16) |
4281 | NOT_IMPLEMENTED(load_af16_dst) |
4282 | NOT_IMPLEMENTED(store_af16) |
4283 | NOT_IMPLEMENTED(gather_af16) |
4284 | NOT_IMPLEMENTED(load_rgf16) |
4285 | NOT_IMPLEMENTED(load_rgf16_dst) |
4286 | NOT_IMPLEMENTED(store_rgf16) |
4287 | NOT_IMPLEMENTED(gather_rgf16) |
4288 | NOT_IMPLEMENTED(load_f32) |
4289 | NOT_IMPLEMENTED(load_f32_dst) |
4290 | NOT_IMPLEMENTED(store_f32) |
4291 | NOT_IMPLEMENTED(gather_f32) |
4292 | NOT_IMPLEMENTED(load_rgf32) |
4293 | NOT_IMPLEMENTED(store_rgf32) |
4294 | NOT_IMPLEMENTED(load_1010102) |
4295 | NOT_IMPLEMENTED(load_1010102_dst) |
4296 | NOT_IMPLEMENTED(store_1010102) |
4297 | NOT_IMPLEMENTED(gather_1010102) |
4298 | NOT_IMPLEMENTED(store_u16_be) |
4299 | NOT_IMPLEMENTED(byte_tables) // TODO |
4300 | NOT_IMPLEMENTED(colorburn) |
4301 | NOT_IMPLEMENTED(colordodge) |
4302 | NOT_IMPLEMENTED(softlight) |
4303 | NOT_IMPLEMENTED(hue) |
4304 | NOT_IMPLEMENTED(saturation) |
4305 | NOT_IMPLEMENTED(color) |
4306 | NOT_IMPLEMENTED(luminosity) |
4307 | NOT_IMPLEMENTED(matrix_3x3) |
4308 | NOT_IMPLEMENTED(matrix_3x4) |
4309 | NOT_IMPLEMENTED(matrix_4x5) // TODO |
4310 | NOT_IMPLEMENTED(matrix_4x3) // TODO |
4311 | NOT_IMPLEMENTED(parametric) |
4312 | NOT_IMPLEMENTED(gamma_) |
4313 | NOT_IMPLEMENTED(PQish) |
4314 | NOT_IMPLEMENTED(HLGish) |
4315 | NOT_IMPLEMENTED(HLGinvish) |
4316 | NOT_IMPLEMENTED(rgb_to_hsl) |
4317 | NOT_IMPLEMENTED(hsl_to_rgb) |
4318 | NOT_IMPLEMENTED(gauss_a_to_rgba) // TODO |
4319 | NOT_IMPLEMENTED(mirror_x) // TODO |
4320 | NOT_IMPLEMENTED(repeat_x) // TODO |
4321 | NOT_IMPLEMENTED(mirror_y) // TODO |
4322 | NOT_IMPLEMENTED(repeat_y) // TODO |
4323 | NOT_IMPLEMENTED(negate_x) |
4324 | NOT_IMPLEMENTED(bicubic) // TODO if I can figure out negative weights |
4325 | NOT_IMPLEMENTED(bicubic_clamp_8888) |
4326 | NOT_IMPLEMENTED(bilinear_nx) // TODO |
4327 | NOT_IMPLEMENTED(bilinear_ny) // TODO |
4328 | NOT_IMPLEMENTED(bilinear_px) // TODO |
4329 | NOT_IMPLEMENTED(bilinear_py) // TODO |
4330 | NOT_IMPLEMENTED(bicubic_n3x) // TODO |
4331 | NOT_IMPLEMENTED(bicubic_n1x) // TODO |
4332 | NOT_IMPLEMENTED(bicubic_p1x) // TODO |
4333 | NOT_IMPLEMENTED(bicubic_p3x) // TODO |
4334 | NOT_IMPLEMENTED(bicubic_n3y) // TODO |
4335 | NOT_IMPLEMENTED(bicubic_n1y) // TODO |
4336 | NOT_IMPLEMENTED(bicubic_p1y) // TODO |
4337 | NOT_IMPLEMENTED(bicubic_p3y) // TODO |
4338 | NOT_IMPLEMENTED(save_xy) // TODO |
4339 | NOT_IMPLEMENTED(accumulate) // TODO |
4340 | NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved) |
4341 | NOT_IMPLEMENTED(xy_to_2pt_conical_strip) |
4342 | NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle) |
4343 | NOT_IMPLEMENTED(xy_to_2pt_conical_smaller) |
4344 | NOT_IMPLEMENTED(xy_to_2pt_conical_greater) |
4345 | NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal) |
4346 | NOT_IMPLEMENTED(alter_2pt_conical_unswap) |
4347 | NOT_IMPLEMENTED(mask_2pt_conical_nan) |
4348 | NOT_IMPLEMENTED(mask_2pt_conical_degenerates) |
4349 | NOT_IMPLEMENTED(apply_vector_mask) |
4350 | #undef NOT_IMPLEMENTED |
4351 | |
4352 | #endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages |
4353 | } // namespace lowp |
4354 | |
4355 | } // namespace SK_OPTS_NS |
4356 | |
4357 | #endif//SkRasterPipeline_opts_DEFINED |
4358 | |