1 | /* |
2 | * Copyright 2018 Google Inc. |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #ifndef SkRasterPipeline_opts_DEFINED |
9 | #define SkRasterPipeline_opts_DEFINED |
10 | |
11 | #include "include/core/SkData.h" |
12 | #include "include/core/SkTypes.h" |
13 | #include "src/core/SkUtils.h" // unaligned_{load,store} |
14 | |
15 | // Every function in this file should be marked static and inline using SI. |
16 | #if defined(__clang__) |
17 | #define SI __attribute__((always_inline)) static inline |
18 | #else |
19 | #define SI static inline |
20 | #endif |
21 | |
22 | template <typename Dst, typename Src> |
23 | SI Dst widen_cast(const Src& src) { |
24 | static_assert(sizeof(Dst) > sizeof(Src)); |
25 | static_assert(std::is_trivially_copyable<Dst>::value); |
26 | static_assert(std::is_trivially_copyable<Src>::value); |
27 | Dst dst; |
28 | memcpy(&dst, &src, sizeof(Src)); |
29 | return dst; |
30 | } |
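// For example, the SSE2/SSE4.1 paths below use widen_cast<__m128i>(v) to place a 64-bit
// U16 vector into the low 8 bytes of an XMM register (the upper bytes are left unspecified).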
31 | |
32 | // Our program is an array of void*, either |
33 | // - 1 void* per stage with no context pointer, the next stage; |
34 | // - 2 void* per stage with a context pointer, first the context pointer, then the next stage. |
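// For example (an illustrative layout; uc_ctx stands in for a real context pointer), a pipeline of
// seed_shader -> uniform_color -> srcover -> just_return would be laid out as
//     { &seed_shader, &uniform_color, uc_ctx, &srcover, &just_return }.
// seed_shader and srcover take no context, so each contributes only the next-stage pointer;
// uniform_color's context pointer comes first, followed by the pointer to srcover.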
35 | |
36 | // load_and_inc() steps the program forward by 1 void*, returning that pointer. |
37 | SI void* load_and_inc(void**& program) { |
38 | #if defined(__GNUC__) && defined(__x86_64__) |
39 | // If program is in %rsi (we try to make this likely) then this is a single instruction. |
40 | void* rax; |
41 | asm("lodsq" : "=a" (rax), "+S" (program)); // Write-only %rax, read-write %rsi. |
42 | return rax; |
43 | #else |
44 | // On ARM *program++ compiles into pretty ideal code without any handholding. |
45 | return *program++; |
46 | #endif |
47 | } |
48 | |
49 | // Lazily resolved on first cast. Does nothing if cast to Ctx::None. |
50 | struct Ctx { |
51 | struct None {}; |
52 | |
53 | void* ptr; |
54 | void**& program; |
55 | |
56 | explicit Ctx(void**& p) : ptr(nullptr), program(p) {} |
57 | |
58 | template <typename T> |
59 | operator T*() { |
60 | if (!ptr) { ptr = load_and_inc(program); } |
61 | return (T*)ptr; |
62 | } |
63 | operator None() { return None{}; } |
64 | }; |
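// For example, a stage declared later as STAGE(dither, const float* rate) converts its Ctx to
// const float*, consuming one extra void* from the program, while STAGE(seed_shader, Ctx::None)
// consumes nothing extra.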
65 | |
66 | |
67 | #if !defined(__clang__) |
68 | #define JUMPER_IS_SCALAR |
69 | #elif defined(SK_ARM_HAS_NEON) |
70 | #define JUMPER_IS_NEON |
71 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX |
72 | #define JUMPER_IS_SKX |
73 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
74 | #define JUMPER_IS_HSW |
75 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX |
76 | #define JUMPER_IS_AVX |
77 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
78 | #define JUMPER_IS_SSE41 |
79 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
80 | #define JUMPER_IS_SSE2 |
81 | #else |
82 | #define JUMPER_IS_SCALAR |
83 | #endif |
84 | |
85 | // Older Clangs seem to crash when generating non-optimized NEON code for ARMv7. |
86 | #if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32) |
87 | // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative. |
88 | #if defined(__apple_build_version__) && __clang_major__ < 9 |
89 | #define JUMPER_IS_SCALAR |
90 | #elif __clang_major__ < 5 |
91 | #define JUMPER_IS_SCALAR |
92 | #endif |
93 | |
94 | #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR) |
95 | #undef JUMPER_IS_NEON |
96 | #endif |
97 | #endif |
98 | |
99 | #if defined(JUMPER_IS_SCALAR) |
100 | #include <math.h> |
101 | #elif defined(JUMPER_IS_NEON) |
102 | #include <arm_neon.h> |
103 | #else |
104 | #include <immintrin.h> |
105 | #endif |
106 | |
107 | namespace SK_OPTS_NS { |
108 | |
109 | #if defined(JUMPER_IS_SCALAR) |
110 | // This path should lead to portable scalar code. |
111 | using F = float ; |
112 | using I32 = int32_t; |
113 | using U64 = uint64_t; |
114 | using U32 = uint32_t; |
115 | using U16 = uint16_t; |
116 | using U8 = uint8_t ; |
117 | |
118 | SI F mad(F f, F m, F a) { return f*m+a; } |
119 | SI F min(F a, F b) { return fminf(a,b); } |
120 | SI F max(F a, F b) { return fmaxf(a,b); } |
121 | SI F abs_ (F v) { return fabsf(v); } |
122 | SI F floor_(F v) { return floorf(v); } |
123 | SI F rcp (F v) { return 1.0f / v; } |
124 | SI F rsqrt (F v) { return 1.0f / sqrtf(v); } |
125 | SI F sqrt_(F v) { return sqrtf(v); } |
126 | SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); } |
127 | SI U16 pack(U32 v) { return (U16)v; } |
128 | SI U8 pack(U16 v) { return (U8)v; } |
129 | |
130 | SI F if_then_else(I32 c, F t, F e) { return c ? t : e; } |
131 | |
132 | template <typename T> |
133 | SI T gather(const T* p, U32 ix) { return p[ix]; } |
134 | |
135 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
136 | *r = ptr[0]; |
137 | *g = ptr[1]; |
138 | } |
139 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
140 | ptr[0] = r; |
141 | ptr[1] = g; |
142 | } |
143 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
144 | *r = ptr[0]; |
145 | *g = ptr[1]; |
146 | *b = ptr[2]; |
147 | } |
148 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
149 | *r = ptr[0]; |
150 | *g = ptr[1]; |
151 | *b = ptr[2]; |
152 | *a = ptr[3]; |
153 | } |
154 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
155 | ptr[0] = r; |
156 | ptr[1] = g; |
157 | ptr[2] = b; |
158 | ptr[3] = a; |
159 | } |
160 | |
161 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
162 | *r = ptr[0]; |
163 | *g = ptr[1]; |
164 | } |
165 | SI void store2(float* ptr, size_t tail, F r, F g) { |
166 | ptr[0] = r; |
167 | ptr[1] = g; |
168 | } |
169 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
170 | *r = ptr[0]; |
171 | *g = ptr[1]; |
172 | *b = ptr[2]; |
173 | *a = ptr[3]; |
174 | } |
175 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
176 | ptr[0] = r; |
177 | ptr[1] = g; |
178 | ptr[2] = b; |
179 | ptr[3] = a; |
180 | } |
181 | |
182 | #elif defined(JUMPER_IS_NEON) |
183 | // Since we know we're using Clang, we can use its vector extensions. |
184 | template <typename T> using V = T __attribute__((ext_vector_type(4))); |
185 | using F = V<float >; |
186 | using I32 = V< int32_t>; |
187 | using U64 = V<uint64_t>; |
188 | using U32 = V<uint32_t>; |
189 | using U16 = V<uint16_t>; |
190 | using U8 = V<uint8_t >; |
191 | |
192 | // We polyfill a few routines that Clang doesn't build into ext_vector_types. |
193 | SI F min(F a, F b) { return vminq_f32(a,b); } |
194 | SI F max(F a, F b) { return vmaxq_f32(a,b); } |
195 | SI F abs_ (F v) { return vabsq_f32(v); } |
196 | SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; } |
197 | SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; } |
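    // (vrecpeq/vrsqrteq produce low-precision estimates; each vrecpsq/vrsqrtsq multiply above is
    // one Newton-Raphson refinement step.)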
198 | SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); } |
199 | SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); } |
200 | |
201 | SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); } |
202 | |
203 | #if defined(SK_CPU_ARM64) |
204 | SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); } |
205 | SI F floor_(F v) { return vrndmq_f32(v); } |
206 | SI F sqrt_(F v) { return vsqrtq_f32(v); } |
207 | SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); } |
208 | #else |
209 | SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); } |
210 | SI F floor_(F v) { |
211 | F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); |
212 | return roundtrip - if_then_else(roundtrip > v, 1, 0); |
213 | } |
214 | |
215 | SI F sqrt_(F v) { |
216 | auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v). |
217 | e *= vrsqrtsq_f32(v,e*e); |
218 | e *= vrsqrtsq_f32(v,e*e); |
219 | return v*e; // sqrt(v) == v*rsqrt(v). |
220 | } |
221 | |
222 | SI U32 round(F v, F scale) { |
223 | return vcvtq_u32_f32(mad(v,scale,0.5f)); |
224 | } |
225 | #endif |
226 | |
227 | |
228 | template <typename T> |
229 | SI V<T> gather(const T* p, U32 ix) { |
230 | return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; |
231 | } |
232 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
233 | uint16x4x2_t rg; |
234 | if (__builtin_expect(tail,0)) { |
235 | if ( true ) { rg = vld2_lane_u16(ptr + 0, rg, 0); } |
236 | if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); } |
237 | if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); } |
238 | } else { |
239 | rg = vld2_u16(ptr); |
240 | } |
241 | *r = rg.val[0]; |
242 | *g = rg.val[1]; |
243 | } |
244 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
245 | if (__builtin_expect(tail,0)) { |
246 | if ( true ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); } |
247 | if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); } |
248 | if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); } |
249 | } else { |
250 | vst2_u16(ptr, (uint16x4x2_t{{r,g}})); |
251 | } |
252 | } |
253 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
254 | uint16x4x3_t rgb; |
255 | if (__builtin_expect(tail,0)) { |
256 | if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); } |
257 | if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); } |
258 | if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); } |
259 | } else { |
260 | rgb = vld3_u16(ptr); |
261 | } |
262 | *r = rgb.val[0]; |
263 | *g = rgb.val[1]; |
264 | *b = rgb.val[2]; |
265 | } |
266 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
267 | uint16x4x4_t rgba; |
268 | if (__builtin_expect(tail,0)) { |
269 | if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); } |
270 | if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); } |
271 | if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); } |
272 | } else { |
273 | rgba = vld4_u16(ptr); |
274 | } |
275 | *r = rgba.val[0]; |
276 | *g = rgba.val[1]; |
277 | *b = rgba.val[2]; |
278 | *a = rgba.val[3]; |
279 | } |
280 | |
281 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
282 | if (__builtin_expect(tail,0)) { |
283 | if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); } |
284 | if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); } |
285 | if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); } |
286 | } else { |
287 | vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}})); |
288 | } |
289 | } |
290 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
291 | float32x4x2_t rg; |
292 | if (__builtin_expect(tail,0)) { |
293 | if ( true ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); } |
294 | if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); } |
295 | if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); } |
296 | } else { |
297 | rg = vld2q_f32(ptr); |
298 | } |
299 | *r = rg.val[0]; |
300 | *g = rg.val[1]; |
301 | } |
302 | SI void store2(float* ptr, size_t tail, F r, F g) { |
303 | if (__builtin_expect(tail,0)) { |
304 | if ( true ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); } |
305 | if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); } |
306 | if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); } |
307 | } else { |
308 | vst2q_f32(ptr, (float32x4x2_t{{r,g}})); |
309 | } |
310 | } |
311 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
312 | float32x4x4_t rgba; |
313 | if (__builtin_expect(tail,0)) { |
314 | if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); } |
315 | if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); } |
316 | if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); } |
317 | } else { |
318 | rgba = vld4q_f32(ptr); |
319 | } |
320 | *r = rgba.val[0]; |
321 | *g = rgba.val[1]; |
322 | *b = rgba.val[2]; |
323 | *a = rgba.val[3]; |
324 | } |
325 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
326 | if (__builtin_expect(tail,0)) { |
327 | if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); } |
328 | if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); } |
329 | if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); } |
330 | } else { |
331 | vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}})); |
332 | } |
333 | } |
334 | |
335 | #elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
336 | // These are __m256 and __m256i, but friendlier and strongly-typed. |
337 | template <typename T> using V = T __attribute__((ext_vector_type(8))); |
338 | using F = V<float >; |
339 | using I32 = V< int32_t>; |
340 | using U64 = V<uint64_t>; |
341 | using U32 = V<uint32_t>; |
342 | using U16 = V<uint16_t>; |
343 | using U8 = V<uint8_t >; |
344 | |
345 | SI F mad(F f, F m, F a) { |
346 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
347 | return _mm256_fmadd_ps(f,m,a); |
348 | #else |
349 | return f*m+a; |
350 | #endif |
351 | } |
352 | |
353 | SI F min(F a, F b) { return _mm256_min_ps(a,b); } |
354 | SI F max(F a, F b) { return _mm256_max_ps(a,b); } |
355 | SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); } |
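    // (The AND trick above clears the sign bit, since v and 0-v can only disagree in the sign.)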
356 | SI F floor_(F v) { return _mm256_floor_ps(v); } |
357 | SI F rcp (F v) { return _mm256_rcp_ps (v); } |
358 | SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); } |
359 | SI F sqrt_(F v) { return _mm256_sqrt_ps (v); } |
360 | SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); } |
361 | |
362 | SI U16 pack(U32 v) { |
363 | return _mm_packus_epi32(_mm256_extractf128_si256(v, 0), |
364 | _mm256_extractf128_si256(v, 1)); |
365 | } |
366 | SI U8 pack(U16 v) { |
367 | auto r = _mm_packus_epi16(v,v); |
368 | return sk_unaligned_load<U8>(&r); |
369 | } |
370 | |
371 | SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } |
372 | |
373 | template <typename T> |
374 | SI V<T> gather(const T* p, U32 ix) { |
375 | return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]], |
376 | p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], }; |
377 | } |
378 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
379 | SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); } |
380 | SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); } |
381 | SI U64 gather(const uint64_t* p, U32 ix) { |
382 | __m256i parts[] = { |
383 | _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8), |
384 | _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8), |
385 | }; |
386 | return sk_bit_cast<U64>(parts); |
387 | } |
388 | #endif |
389 | |
390 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
391 | U16 _0123, _4567; |
392 | if (__builtin_expect(tail,0)) { |
393 | _0123 = _4567 = _mm_setzero_si128(); |
394 | auto* d = &_0123; |
395 | if (tail > 3) { |
396 | *d = _mm_loadu_si128(((__m128i*)ptr) + 0); |
397 | tail -= 4; |
398 | ptr += 8; |
399 | d = &_4567; |
400 | } |
401 | bool high = false; |
402 | if (tail > 1) { |
403 | *d = _mm_loadu_si64(ptr); |
404 | tail -= 2; |
405 | ptr += 4; |
406 | high = true; |
407 | } |
408 | if (tail > 0) { |
409 | (*d)[high ? 4 : 0] = *(ptr + 0); |
410 | (*d)[high ? 5 : 1] = *(ptr + 1); |
411 | } |
412 | } else { |
413 | _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0); |
414 | _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1); |
415 | } |
416 | *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16), |
417 | _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16)); |
418 | *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16), |
419 | _mm_srai_epi32(_4567, 16)); |
420 | } |
421 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
422 | auto _0123 = _mm_unpacklo_epi16(r, g), |
423 | _4567 = _mm_unpackhi_epi16(r, g); |
424 | if (__builtin_expect(tail,0)) { |
425 | const auto* s = &_0123; |
426 | if (tail > 3) { |
427 | _mm_storeu_si128((__m128i*)ptr, *s); |
428 | s = &_4567; |
429 | tail -= 4; |
430 | ptr += 8; |
431 | } |
432 | bool high = false; |
433 | if (tail > 1) { |
434 | _mm_storel_epi64((__m128i*)ptr, *s); |
435 | ptr += 4; |
436 | tail -= 2; |
437 | high = true; |
438 | } |
439 | if (tail > 0) { |
440 | if (high) { |
441 | *(int32_t*)ptr = _mm_extract_epi32(*s, 2); |
442 | } else { |
443 | *(int32_t*)ptr = _mm_cvtsi128_si32(*s); |
444 | } |
445 | } |
446 | } else { |
447 | _mm_storeu_si128((__m128i*)ptr + 0, _0123); |
448 | _mm_storeu_si128((__m128i*)ptr + 1, _4567); |
449 | } |
450 | } |
451 | |
452 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
453 | __m128i _0,_1,_2,_3,_4,_5,_6,_7; |
454 | if (__builtin_expect(tail,0)) { |
455 | auto load_rgb = [](const uint16_t* src) { |
456 | auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); |
457 | return _mm_insert_epi16(v, src[2], 2); |
458 | }; |
459 | _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128(); |
460 | if ( true ) { _0 = load_rgb(ptr + 0); } |
461 | if (tail > 1) { _1 = load_rgb(ptr + 3); } |
462 | if (tail > 2) { _2 = load_rgb(ptr + 6); } |
463 | if (tail > 3) { _3 = load_rgb(ptr + 9); } |
464 | if (tail > 4) { _4 = load_rgb(ptr + 12); } |
465 | if (tail > 5) { _5 = load_rgb(ptr + 15); } |
466 | if (tail > 6) { _6 = load_rgb(ptr + 18); } |
467 | } else { |
468 | // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over. |
469 | auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ; |
470 | auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ; |
471 | auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ; |
472 | auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4); |
473 | _0 = _01; _1 = _mm_srli_si128(_01, 6); |
474 | _2 = _23; _3 = _mm_srli_si128(_23, 6); |
475 | _4 = _45; _5 = _mm_srli_si128(_45, 6); |
476 | _6 = _67; _7 = _mm_srli_si128(_67, 6); |
477 | } |
478 | |
479 | auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx |
480 | _13 = _mm_unpacklo_epi16(_1, _3), |
481 | _46 = _mm_unpacklo_epi16(_4, _6), |
482 | _57 = _mm_unpacklo_epi16(_5, _7); |
483 | |
484 | auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
485 | bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx |
486 | rg4567 = _mm_unpacklo_epi16(_46, _57), |
487 | bx4567 = _mm_unpackhi_epi16(_46, _57); |
488 | |
489 | *r = _mm_unpacklo_epi64(rg0123, rg4567); |
490 | *g = _mm_unpackhi_epi64(rg0123, rg4567); |
491 | *b = _mm_unpacklo_epi64(bx0123, bx4567); |
492 | } |
493 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
494 | __m128i _01, _23, _45, _67; |
495 | if (__builtin_expect(tail,0)) { |
496 | auto src = (const double*)ptr; |
497 | _01 = _23 = _45 = _67 = _mm_setzero_si128(); |
498 | if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } |
499 | if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } |
500 | if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } |
501 | if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } |
502 | if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } |
503 | if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } |
504 | if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } |
505 | } else { |
506 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); |
507 | _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); |
508 | _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); |
509 | _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); |
510 | } |
511 | |
512 | auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 |
513 | _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 |
514 | _46 = _mm_unpacklo_epi16(_45, _67), |
515 | _57 = _mm_unpackhi_epi16(_45, _67); |
516 | |
517 | auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
518 | ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 |
519 | rg4567 = _mm_unpacklo_epi16(_46, _57), |
520 | ba4567 = _mm_unpackhi_epi16(_46, _57); |
521 | |
522 | *r = _mm_unpacklo_epi64(rg0123, rg4567); |
523 | *g = _mm_unpackhi_epi64(rg0123, rg4567); |
524 | *b = _mm_unpacklo_epi64(ba0123, ba4567); |
525 | *a = _mm_unpackhi_epi64(ba0123, ba4567); |
526 | } |
527 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
528 | auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3 |
529 | rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7 |
530 | ba0123 = _mm_unpacklo_epi16(b, a), |
531 | ba4567 = _mm_unpackhi_epi16(b, a); |
532 | |
533 | auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), |
534 | _23 = _mm_unpackhi_epi32(rg0123, ba0123), |
535 | _45 = _mm_unpacklo_epi32(rg4567, ba4567), |
536 | _67 = _mm_unpackhi_epi32(rg4567, ba4567); |
537 | |
538 | if (__builtin_expect(tail,0)) { |
539 | auto dst = (double*)ptr; |
540 | if (tail > 0) { _mm_storel_pd(dst+0, _01); } |
541 | if (tail > 1) { _mm_storeh_pd(dst+1, _01); } |
542 | if (tail > 2) { _mm_storel_pd(dst+2, _23); } |
543 | if (tail > 3) { _mm_storeh_pd(dst+3, _23); } |
544 | if (tail > 4) { _mm_storel_pd(dst+4, _45); } |
545 | if (tail > 5) { _mm_storeh_pd(dst+5, _45); } |
546 | if (tail > 6) { _mm_storel_pd(dst+6, _67); } |
547 | } else { |
548 | _mm_storeu_si128((__m128i*)ptr + 0, _01); |
549 | _mm_storeu_si128((__m128i*)ptr + 1, _23); |
550 | _mm_storeu_si128((__m128i*)ptr + 2, _45); |
551 | _mm_storeu_si128((__m128i*)ptr + 3, _67); |
552 | } |
553 | } |
554 | |
555 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
556 | F _0123, _4567; |
557 | if (__builtin_expect(tail, 0)) { |
558 | _0123 = _4567 = _mm256_setzero_ps(); |
559 | F* d = &_0123; |
560 | if (tail > 3) { |
561 | *d = _mm256_loadu_ps(ptr); |
562 | ptr += 8; |
563 | tail -= 4; |
564 | d = &_4567; |
565 | } |
566 | bool high = false; |
567 | if (tail > 1) { |
568 | *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr)); |
569 | ptr += 4; |
570 | tail -= 2; |
571 | high = true; |
572 | } |
573 | if (tail > 0) { |
574 | *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1) |
575 | : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0); |
576 | } |
577 | } else { |
578 | _0123 = _mm256_loadu_ps(ptr + 0); |
579 | _4567 = _mm256_loadu_ps(ptr + 8); |
580 | } |
581 | |
582 | F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20), |
583 | _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31); |
584 | |
585 | *r = _mm256_shuffle_ps(_0145, _2367, 0x88); |
586 | *g = _mm256_shuffle_ps(_0145, _2367, 0xDD); |
587 | } |
588 | SI void store2(float* ptr, size_t tail, F r, F g) { |
589 | F _0145 = _mm256_unpacklo_ps(r, g), |
590 | _2367 = _mm256_unpackhi_ps(r, g); |
591 | F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20), |
592 | _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31); |
593 | |
594 | if (__builtin_expect(tail, 0)) { |
595 | const __m256* s = &_0123; |
596 | if (tail > 3) { |
597 | _mm256_storeu_ps(ptr, *s); |
598 | s = &_4567; |
599 | tail -= 4; |
600 | ptr += 8; |
601 | } |
602 | bool high = false; |
603 | if (tail > 1) { |
604 | _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0)); |
605 | ptr += 4; |
606 | tail -= 2; |
607 | high = true; |
608 | } |
609 | if (tail > 0) { |
610 | *(ptr + 0) = (*s)[ high ? 4 : 0]; |
611 | *(ptr + 1) = (*s)[ high ? 5 : 1]; |
612 | } |
613 | } else { |
614 | _mm256_storeu_ps(ptr + 0, _0123); |
615 | _mm256_storeu_ps(ptr + 8, _4567); |
616 | } |
617 | } |
618 | |
619 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
620 | F _04, _15, _26, _37; |
621 | _04 = _15 = _26 = _37 = 0; |
622 | switch (tail) { |
623 | case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1); [[fallthrough]]; |
624 | case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1); [[fallthrough]]; |
625 | case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1); [[fallthrough]]; |
626 | case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1); [[fallthrough]]; |
627 | case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0); [[fallthrough]]; |
628 | case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0); [[fallthrough]]; |
629 | case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0); [[fallthrough]]; |
630 | case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0); |
631 | } |
632 | |
633 | F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5 |
634 | ba0145 = _mm256_unpackhi_ps(_04,_15), |
635 | rg2367 = _mm256_unpacklo_ps(_26,_37), |
636 | ba2367 = _mm256_unpackhi_ps(_26,_37); |
637 | |
638 | *r = _mm256_unpacklo_pd(rg0145, rg2367); |
639 | *g = _mm256_unpackhi_pd(rg0145, rg2367); |
640 | *b = _mm256_unpacklo_pd(ba0145, ba2367); |
641 | *a = _mm256_unpackhi_pd(ba0145, ba2367); |
642 | } |
643 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
644 | F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5 |
645 | rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ... |
646 | ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5 |
647 | ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ... |
648 | |
649 | F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4 |
650 | _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... |
651 | _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... |
652 | _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... |
653 | |
654 | if (__builtin_expect(tail, 0)) { |
655 | if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); } |
656 | if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); } |
657 | if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); } |
658 | if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); } |
659 | if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); } |
660 | if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); } |
661 | if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); } |
662 | } else { |
663 | F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo |
664 | _23 = _mm256_permute2f128_ps(_26, _37, 32), |
665 | _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi |
666 | _67 = _mm256_permute2f128_ps(_26, _37, 49); |
667 | _mm256_storeu_ps(ptr+ 0, _01); |
668 | _mm256_storeu_ps(ptr+ 8, _23); |
669 | _mm256_storeu_ps(ptr+16, _45); |
670 | _mm256_storeu_ps(ptr+24, _67); |
671 | } |
672 | } |
673 | |
674 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) |
675 | template <typename T> using V = T __attribute__((ext_vector_type(4))); |
676 | using F = V<float >; |
677 | using I32 = V< int32_t>; |
678 | using U64 = V<uint64_t>; |
679 | using U32 = V<uint32_t>; |
680 | using U16 = V<uint16_t>; |
681 | using U8 = V<uint8_t >; |
682 | |
683 | SI F mad(F f, F m, F a) { return f*m+a; } |
684 | SI F min(F a, F b) { return _mm_min_ps(a,b); } |
685 | SI F max(F a, F b) { return _mm_max_ps(a,b); } |
686 | SI F abs_(F v) { return _mm_and_ps(v, 0-v); } |
687 | SI F rcp (F v) { return _mm_rcp_ps (v); } |
688 | SI F rsqrt (F v) { return _mm_rsqrt_ps(v); } |
689 | SI F sqrt_(F v) { return _mm_sqrt_ps (v); } |
690 | SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); } |
691 | |
692 | SI U16 pack(U32 v) { |
693 | #if defined(JUMPER_IS_SSE41) |
694 | auto p = _mm_packus_epi32(v,v); |
695 | #else |
696 | // Sign extend so that _mm_packs_epi32() does the pack we want. |
697 | auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16); |
698 | p = _mm_packs_epi32(p,p); |
699 | #endif |
700 | return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one. |
701 | } |
702 | SI U8 pack(U16 v) { |
703 | auto r = widen_cast<__m128i>(v); |
704 | r = _mm_packus_epi16(r,r); |
705 | return sk_unaligned_load<U8>(&r); |
706 | } |
707 | |
708 | SI F if_then_else(I32 c, F t, F e) { |
709 | return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e)); |
710 | } |
711 | |
712 | SI F floor_(F v) { |
713 | #if defined(JUMPER_IS_SSE41) |
714 | return _mm_floor_ps(v); |
715 | #else |
716 | F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); |
717 | return roundtrip - if_then_else(roundtrip > v, 1, 0); |
718 | #endif |
719 | } |
720 | |
721 | template <typename T> |
722 | SI V<T> gather(const T* p, U32 ix) { |
723 | return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; |
724 | } |
725 | |
726 | // TODO: these loads and stores are incredibly difficult to follow. |
727 | |
728 | SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
729 | __m128i _01; |
730 | if (__builtin_expect(tail,0)) { |
731 | _01 = _mm_setzero_si128(); |
732 | if (tail > 1) { |
733 | _01 = _mm_loadl_pd(_01, (const double*)ptr); // r0 g0 r1 g1 00 00 00 00 |
734 | if (tail > 2) { |
735 | _01 = _mm_insert_epi16(_01, *(ptr+4), 4); // r0 g0 r1 g1 r2 00 00 00 |
736 | _01 = _mm_insert_epi16(_01, *(ptr+5), 5); // r0 g0 r1 g1 r2 g2 00 00 |
737 | } |
738 | } else { |
739 | _01 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); // r0 g0 00 00 00 00 00 00 |
740 | } |
741 | } else { |
742 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3 |
743 | } |
744 | auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3 |
745 | auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3 |
746 | |
747 | auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3 |
748 | auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3 |
749 | *r = sk_unaligned_load<U16>(&R); |
750 | *g = sk_unaligned_load<U16>(&G); |
751 | } |
752 | SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
753 | U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)); |
754 | if (__builtin_expect(tail, 0)) { |
755 | if (tail > 1) { |
756 | _mm_storel_epi64((__m128i*)ptr, rg); |
757 | if (tail > 2) { |
758 | int32_t rgpair = rg[2]; |
759 | memcpy(ptr + 4, &rgpair, sizeof(rgpair)); |
760 | } |
761 | } else { |
762 | int32_t rgpair = rg[0]; |
763 | memcpy(ptr, &rgpair, sizeof(rgpair)); |
764 | } |
765 | } else { |
766 | _mm_storeu_si128((__m128i*)ptr + 0, rg); |
767 | } |
768 | } |
769 | |
770 | SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
771 | __m128i _0, _1, _2, _3; |
772 | if (__builtin_expect(tail,0)) { |
773 | _1 = _2 = _3 = _mm_setzero_si128(); |
774 | auto load_rgb = [](const uint16_t* src) { |
775 | auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); |
776 | return _mm_insert_epi16(v, src[2], 2); |
777 | }; |
778 | if ( true ) { _0 = load_rgb(ptr + 0); } |
779 | if (tail > 1) { _1 = load_rgb(ptr + 3); } |
780 | if (tail > 2) { _2 = load_rgb(ptr + 6); } |
781 | } else { |
782 | // Load slightly weirdly to make sure we don't load past the end of 4x48 bits. |
783 | auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) , |
784 | _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4); |
785 | |
786 | // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored). |
787 | _0 = _01; |
788 | _1 = _mm_srli_si128(_01, 6); |
789 | _2 = _23; |
790 | _3 = _mm_srli_si128(_23, 6); |
791 | } |
792 | |
793 | // De-interlace to R,G,B. |
794 | auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx |
795 | _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx |
796 | |
797 | auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
798 | G = _mm_srli_si128(R, 8), |
799 | B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx |
800 | |
801 | *r = sk_unaligned_load<U16>(&R); |
802 | *g = sk_unaligned_load<U16>(&G); |
803 | *b = sk_unaligned_load<U16>(&B); |
804 | } |
805 | |
806 | SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
807 | __m128i _01, _23; |
808 | if (__builtin_expect(tail,0)) { |
809 | _01 = _23 = _mm_setzero_si128(); |
810 | auto src = (const double*)ptr; |
811 | if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00 |
812 | if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1 |
813 | if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00 |
814 | } else { |
815 | _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1 |
816 | _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3 |
817 | } |
818 | |
819 | auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 |
820 | _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 |
821 | |
822 | auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 |
823 | ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 |
824 | |
825 | *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0); |
826 | *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4); |
827 | *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0); |
828 | *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4); |
829 | } |
830 | |
831 | SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
832 | auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)), |
833 | ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a)); |
834 | |
835 | if (__builtin_expect(tail, 0)) { |
836 | auto dst = (double*)ptr; |
837 | if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); } |
838 | if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); } |
839 | if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); } |
840 | } else { |
841 | _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); |
842 | _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); |
843 | } |
844 | } |
845 | |
846 | SI void load2(const float* ptr, size_t tail, F* r, F* g) { |
847 | F _01, _23; |
848 | if (__builtin_expect(tail, 0)) { |
849 | _01 = _23 = _mm_setzero_si128(); |
850 | if ( true ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); } |
851 | if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); } |
852 | if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); } |
853 | } else { |
854 | _01 = _mm_loadu_ps(ptr + 0); |
855 | _23 = _mm_loadu_ps(ptr + 4); |
856 | } |
857 | *r = _mm_shuffle_ps(_01, _23, 0x88); |
858 | *g = _mm_shuffle_ps(_01, _23, 0xDD); |
859 | } |
860 | SI void store2(float* ptr, size_t tail, F r, F g) { |
861 | F _01 = _mm_unpacklo_ps(r, g), |
862 | _23 = _mm_unpackhi_ps(r, g); |
863 | if (__builtin_expect(tail, 0)) { |
864 | if ( true ) { _mm_storel_pi((__m64*)(ptr + 0), _01); } |
865 | if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); } |
866 | if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); } |
867 | } else { |
868 | _mm_storeu_ps(ptr + 0, _01); |
869 | _mm_storeu_ps(ptr + 4, _23); |
870 | } |
871 | } |
872 | |
873 | SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { |
874 | F _0, _1, _2, _3; |
875 | if (__builtin_expect(tail, 0)) { |
876 | _1 = _2 = _3 = _mm_setzero_si128(); |
877 | if ( true ) { _0 = _mm_loadu_ps(ptr + 0); } |
878 | if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); } |
879 | if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); } |
880 | } else { |
881 | _0 = _mm_loadu_ps(ptr + 0); |
882 | _1 = _mm_loadu_ps(ptr + 4); |
883 | _2 = _mm_loadu_ps(ptr + 8); |
884 | _3 = _mm_loadu_ps(ptr +12); |
885 | } |
886 | _MM_TRANSPOSE4_PS(_0,_1,_2,_3); |
887 | *r = _0; |
888 | *g = _1; |
889 | *b = _2; |
890 | *a = _3; |
891 | } |
892 | |
893 | SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { |
894 | _MM_TRANSPOSE4_PS(r,g,b,a); |
895 | if (__builtin_expect(tail, 0)) { |
896 | if ( true ) { _mm_storeu_ps(ptr + 0, r); } |
897 | if (tail > 1) { _mm_storeu_ps(ptr + 4, g); } |
898 | if (tail > 2) { _mm_storeu_ps(ptr + 8, b); } |
899 | } else { |
900 | _mm_storeu_ps(ptr + 0, r); |
901 | _mm_storeu_ps(ptr + 4, g); |
902 | _mm_storeu_ps(ptr + 8, b); |
903 | _mm_storeu_ps(ptr +12, a); |
904 | } |
905 | } |
906 | #endif |
907 | |
908 | // We need to be careful with casts. |
909 | // (F)x means cast x to float in the portable path, but bit_cast x to float in the others. |
910 | // These named casts and bit_cast() are always what they seem to be. |
911 | #if defined(JUMPER_IS_SCALAR) |
912 | SI F cast (U32 v) { return (F)v; } |
913 | SI F cast64(U64 v) { return (F)v; } |
914 | SI U32 trunc_(F v) { return (U32)v; } |
915 | SI U32 expand(U16 v) { return (U32)v; } |
916 | SI U32 expand(U8 v) { return (U32)v; } |
917 | #else |
918 | SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); } |
919 | SI F cast64(U64 v) { return __builtin_convertvector( v, F); } |
920 | SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); } |
921 | SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); } |
922 | SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); } |
923 | #endif |
924 | |
925 | template <typename V> |
926 | SI V if_then_else(I32 c, V t, V e) { |
927 | return sk_bit_cast<V>(if_then_else(c, sk_bit_cast<F>(t), sk_bit_cast<F>(e))); |
928 | } |
929 | |
930 | SI U16 bswap(U16 x) { |
931 | #if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) |
932 | // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes |
933 | // when generating code for SSE2 and SSE4.1. We'll do it manually... |
934 | auto v = widen_cast<__m128i>(x); |
935 | v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8); |
936 | return sk_unaligned_load<U16>(&v); |
937 | #else |
938 | return (x<<8) | (x>>8); |
939 | #endif |
940 | } |
941 | |
942 | SI F fract(F v) { return v - floor_(v); } |
943 | |
944 | // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. |
945 | SI F approx_log2(F x) { |
946 | // e - 127 is a fair approximation of log2(x) in its own right... |
947 | F e = cast(sk_bit_cast<U32>(x)) * (1.0f / (1<<23)); |
948 | |
949 | // ... but using the mantissa to refine its error is _much_ better. |
950 | F m = sk_bit_cast<F>((sk_bit_cast<U32>(x) & 0x007fffff) | 0x3f000000); |
951 | return e |
952 | - 124.225514990f |
953 | - 1.498030302f * m |
954 | - 1.725879990f / (0.3520887068f + m); |
955 | } |
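// A quick sanity check (illustrative): for x == 1.0f, sk_bit_cast<U32>(x) == 0x3f800000, so
// e == 127.0 and m == 0.5, and the expression above works out to roughly 0, as log2(1) should.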
956 | |
957 | SI F approx_log(F x) { |
958 | const float ln2 = 0.69314718f; |
959 | return ln2 * approx_log2(x); |
960 | } |
961 | |
962 | SI F approx_pow2(F x) { |
963 | F f = fract(x); |
964 | return sk_bit_cast<F>(round(1.0f * (1<<23), |
965 | x + 121.274057500f |
966 | - 1.490129070f * f |
967 | + 27.728023300f / (4.84252568f - f))); |
968 | } |
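// Another sanity check (illustrative): for x == 0, f == 0 and the rounded value is about 127<<23,
// the bit pattern of 1.0f, so approx_pow2(0) comes out very close to 1.0f.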
969 | |
970 | SI F approx_exp(F x) { |
971 | const float log2_e = 1.4426950408889634074f; |
972 | return approx_pow2(log2_e * x); |
973 | } |
974 | |
975 | SI F approx_powf(F x, F y) { |
976 | return if_then_else((x == 0)|(x == 1), x |
977 | , approx_pow2(approx_log2(x) * y)); |
978 | } |
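// x == 0 and x == 1 are special-cased, presumably so those inputs map to themselves exactly;
// approx_log2() isn't meaningful at x == 0 anyway.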
979 | |
980 | SI F from_half(U16 h) { |
981 | #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ |
982 | && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. |
983 | return vcvt_f32_f16(h); |
984 | |
985 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
986 | return _mm256_cvtph_ps(h); |
987 | |
988 | #else |
989 | // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias. |
990 | U32 sem = expand(h), |
991 | s = sem & 0x8000, |
992 | em = sem ^ s; |
993 | |
994 | // Convert to 1-8-23 float with 127 bias, flushing denorm halves (including zero) to zero. |
995 | auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here. |
996 | return if_then_else(denorm, F(0) |
997 | , sk_bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) )); |
998 | #endif |
999 | } |
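// Worked example for the bit-twiddling path (illustrative): the half 1.0 is 0x3c00, so s == 0 and
// em == 0x3c00 (not denormal), and (em<<13) + ((127-15)<<23) == 0x07800000 + 0x38000000
// == 0x3f800000, which is exactly 1.0f.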
1000 | |
1001 | SI U16 to_half(F f) { |
1002 | #if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ |
1003 | && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. |
1004 | return vcvt_f16_f32(f); |
1005 | |
1006 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
1007 | return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION); |
1008 | |
1009 | #else |
1010 | // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias. |
1011 | U32 sem = sk_bit_cast<U32>(f), |
1012 | s = sem & 0x80000000, |
1013 | em = sem ^ s; |
1014 | |
1015 | // Convert to 1-5-10 half with 15 bias, flushing denorm halves (including zero) to zero. |
1016 | auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here. |
1017 | return pack(if_then_else(denorm, U32(0) |
1018 | , (s>>16) + (em>>13) - ((127-15)<<10))); |
1019 | #endif |
1020 | } |
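// And the other direction (illustrative): 1.0f is 0x3f800000, so (em>>13) - ((127-15)<<10)
// == 0x0001fc00 - 0x0001c000 == 0x3c00, the half encoding of 1.0.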
1021 | |
1022 | // Our fundamental vector depth is our pixel stride. |
1023 | static const size_t N = sizeof(F) / sizeof(float); |
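// (That's 1 on the scalar path, 4 for NEON and SSE2/SSE4.1, and 8 for the AVX, HSW, and SKX paths.)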
1024 | |
1025 | // We're finally going to get to what a Stage function looks like! |
1026 | // tail == 0 ~~> work on a full N pixels |
1027 | // tail != 0 ~~> work on only the first tail pixels |
1028 | // tail is always < N. |
1029 | |
1030 | // Any custom ABI to use for all (non-externally-facing) stage functions? |
1031 | // Also decide here whether to use narrow (compromise) or wide (ideal) stages. |
1032 | #if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON) |
1033 | // This lets us pass vectors more efficiently on 32-bit ARM. |
1034 | // We can still only pass 16 floats, so best as 4x {r,g,b,a}. |
1035 | #define ABI __attribute__((pcs("aapcs-vfp"))) |
1036 | #define JUMPER_NARROW_STAGES 1 |
1037 | #elif 0 && defined(_MSC_VER) && defined(__clang__) && defined(__x86_64__) |
1038 | // SysV ABI makes it very sensible to use wide stages with clang-cl. |
1039 | // TODO: crashes during compilation :( |
1040 | #define ABI __attribute__((sysv_abi)) |
1041 | #define JUMPER_NARROW_STAGES 0 |
1042 | #elif defined(_MSC_VER) |
1043 | // Even if not vectorized, this lets us pass {r,g,b,a} as registers, |
1044 | // instead of {b,a} on the stack. Narrow stages work best for __vectorcall. |
1045 | #define ABI __vectorcall |
1046 | #define JUMPER_NARROW_STAGES 1 |
1047 | #elif defined(__x86_64__) || defined(SK_CPU_ARM64) |
1048 | // These platforms are ideal for wider stages, and their default ABI is ideal. |
1049 | #define ABI |
1050 | #define JUMPER_NARROW_STAGES 0 |
1051 | #else |
1052 | // 32-bit or unknown... shunt them down the narrow path. |
1053 | // Odds are these have few registers and are better off there. |
1054 | #define ABI |
1055 | #define JUMPER_NARROW_STAGES 1 |
1056 | #endif |
1057 | |
1058 | #if JUMPER_NARROW_STAGES |
1059 | struct Params { |
1060 | size_t dx, dy, tail; |
1061 | F dr,dg,db,da; |
1062 | }; |
1063 | using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a); |
1064 | #else |
1065 | // We keep program the second argument, so that it's passed in rsi for load_and_inc(). |
1066 | using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F); |
1067 | #endif |
1068 | |
1069 | |
1070 | static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) { |
1071 | auto start = (Stage)load_and_inc(program); |
1072 | const size_t x0 = dx; |
1073 | for (; dy < ylimit; dy++) { |
1074 | #if JUMPER_NARROW_STAGES |
1075 | Params params = { x0,dy,0, 0,0,0,0 }; |
1076 | while (params.dx + N <= xlimit) { |
1077 | start(¶ms,program, 0,0,0,0); |
1078 | params.dx += N; |
1079 | } |
1080 | if (size_t tail = xlimit - params.dx) { |
1081 | params.tail = tail; |
1082 | start(¶ms,program, 0,0,0,0); |
1083 | } |
1084 | #else |
1085 | dx = x0; |
1086 | while (dx + N <= xlimit) { |
1087 | start(0,program,dx,dy, 0,0,0,0, 0,0,0,0); |
1088 | dx += N; |
1089 | } |
1090 | if (size_t tail = xlimit - dx) { |
1091 | start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); |
1092 | } |
1093 | #endif |
1094 | } |
1095 | } |
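// For example (illustrative numbers), with N == 8, dx == 0, and xlimit == 20, each row runs two
// full 8-pixel batches and then one final batch with tail == 4.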
1096 | |
1097 | #if JUMPER_NARROW_STAGES |
1098 | #define STAGE(name, ...) \ |
1099 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1100 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ |
1101 | static void ABI name(Params* params, void** program, \ |
1102 | F r, F g, F b, F a) { \ |
1103 | name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \ |
1104 | params->dr, params->dg, params->db, params->da); \ |
1105 | auto next = (Stage)load_and_inc(program); \ |
1106 | next(params,program, r,g,b,a); \ |
1107 | } \ |
1108 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1109 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) |
1110 | #else |
1111 | #define STAGE(name, ...) \ |
1112 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1113 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ |
1114 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
1115 | F r, F g, F b, F a, F dr, F dg, F db, F da) { \ |
1116 | name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \ |
1117 | auto next = (Stage)load_and_inc(program); \ |
1118 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
1119 | } \ |
1120 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
1121 | F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) |
1122 | #endif |
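// In other words, STAGE(name, ...) splits each stage in two: a thin ABI wrapper named `name` that
// unpacks its arguments, calls the body, and chains to the next stage, and the body itself,
// name##_k, whose definition follows the macro invocation.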
1123 | |
1124 | |
1125 | // just_return() is a simple no-op stage that only exists to end the chain, |
1126 | // returning back up to start_pipeline(), and from there to the caller. |
1127 | #if JUMPER_NARROW_STAGES |
1128 | static void ABI just_return(Params*, void**, F,F,F,F) {} |
1129 | #else |
1130 | static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {} |
1131 | #endif |
1132 | |
1133 | |
1134 | // We could start defining normal Stages now. But first, some helper functions. |
1135 | |
1136 | // These load() and store() methods are tail-aware, |
1137 | // but focus mainly on keeping the at-stride tail==0 case fast. |
1138 | |
1139 | template <typename V, typename T> |
1140 | SI V load(const T* src, size_t tail) { |
1141 | #if !defined(JUMPER_IS_SCALAR) |
1142 | __builtin_assume(tail < N); |
1143 | if (__builtin_expect(tail, 0)) { |
1144 | V v{}; // Any inactive lanes are zeroed. |
1145 | switch (tail) { |
1146 | case 7: v[6] = src[6]; [[fallthrough]]; |
1147 | case 6: v[5] = src[5]; [[fallthrough]]; |
1148 | case 5: v[4] = src[4]; [[fallthrough]]; |
1149 | case 4: memcpy(&v, src, 4*sizeof(T)); break; |
1150 | case 3: v[2] = src[2]; [[fallthrough]]; |
1151 | case 2: memcpy(&v, src, 2*sizeof(T)); break; |
1152 | case 1: memcpy(&v, src, 1*sizeof(T)); break; |
1153 | } |
1154 | return v; |
1155 | } |
1156 | #endif |
1157 | return sk_unaligned_load<V>(src); |
1158 | } |
1159 | |
1160 | template <typename V, typename T> |
1161 | SI void store(T* dst, V v, size_t tail) { |
1162 | #if !defined(JUMPER_IS_SCALAR) |
1163 | __builtin_assume(tail < N); |
1164 | if (__builtin_expect(tail, 0)) { |
1165 | switch (tail) { |
1166 | case 7: dst[6] = v[6]; [[fallthrough]]; |
1167 | case 6: dst[5] = v[5]; [[fallthrough]]; |
1168 | case 5: dst[4] = v[4]; [[fallthrough]]; |
1169 | case 4: memcpy(dst, &v, 4*sizeof(T)); break; |
1170 | case 3: dst[2] = v[2]; [[fallthrough]]; |
1171 | case 2: memcpy(dst, &v, 2*sizeof(T)); break; |
1172 | case 1: memcpy(dst, &v, 1*sizeof(T)); break; |
1173 | } |
1174 | return; |
1175 | } |
1176 | #endif |
1177 | sk_unaligned_store(dst, v); |
1178 | } |
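// Note the switch statements above serve both vector widths; cases 4 through 7 are only reachable
// when N == 8.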
1179 | |
1180 | SI F from_byte(U8 b) { |
1181 | return cast(expand(b)) * (1/255.0f); |
1182 | } |
1183 | SI F from_short(U16 s) { |
1184 | return cast(expand(s)) * (1/65535.0f); |
1185 | } |
1186 | SI void from_565(U16 _565, F* r, F* g, F* b) { |
1187 | U32 wide = expand(_565); |
1188 | *r = cast(wide & (31<<11)) * (1.0f / (31<<11)); |
1189 | *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5)); |
1190 | *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0)); |
1191 | } |
1192 | SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) { |
1193 | U32 wide = expand(_4444); |
1194 | *r = cast(wide & (15<<12)) * (1.0f / (15<<12)); |
1195 | *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8)); |
1196 | *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4)); |
1197 | *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0)); |
1198 | } |
1199 | SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) { |
1200 | *r = cast((_8888 ) & 0xff) * (1/255.0f); |
1201 | *g = cast((_8888 >> 8) & 0xff) * (1/255.0f); |
1202 | *b = cast((_8888 >> 16) & 0xff) * (1/255.0f); |
1203 | *a = cast((_8888 >> 24) ) * (1/255.0f); |
1204 | } |
1205 | SI void from_88(U16 _88, F* r, F* g) { |
1206 | U32 wide = expand(_88); |
1207 | *r = cast((wide ) & 0xff) * (1/255.0f); |
1208 | *g = cast((wide >> 8) & 0xff) * (1/255.0f); |
1209 | } |
1210 | SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) { |
1211 | *r = cast((rgba ) & 0x3ff) * (1/1023.0f); |
1212 | *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f); |
1213 | *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f); |
1214 | *a = cast((rgba >> 30) ) * (1/ 3.0f); |
1215 | } |
1216 | SI void from_1616(U32 _1616, F* r, F* g) { |
1217 | *r = cast((_1616 ) & 0xffff) * (1/65535.0f); |
1218 | *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f); |
1219 | } |
1220 | SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) { |
1221 | *r = cast64((_16161616 ) & 0xffff) * (1/65535.0f); |
1222 | *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f); |
1223 | *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f); |
1224 | *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f); |
1225 | } |
1226 | |
1227 | // Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory. |
1228 | template <typename T> |
1229 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { |
1230 | return (T*)ctx->pixels + dy*ctx->stride + dx; |
1231 | } |
1232 | |
1233 | // clamp v to [0,limit). |
1234 | SI F clamp(F v, F limit) { |
1235 | F inclusive = sk_bit_cast<F>( sk_bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive. |
1236 | return min(max(0, v), inclusive); |
1237 | } |
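// For example (illustrative), limit == 8.0f (0x41000000) becomes 7.9999995f (0x40ffffff), the
// largest float strictly less than 8.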
1238 | |
1239 | // Used by gather_ stages to calculate the base pointer and a vector of indices to load. |
1240 | template <typename T> |
1241 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { |
1242 | x = clamp(x, ctx->width); |
1243 | y = clamp(y, ctx->height); |
1244 | |
1245 | *ptr = (const T*)ctx->pixels; |
1246 | return trunc_(y)*ctx->stride + trunc_(x); |
1247 | } |
1248 | |
1249 | // We often have a nominally [0,1] float value we need to scale and convert to an integer, |
1250 | // whether for a table lookup or to pack back down into bytes for storage. |
1251 | // |
1252 | // In practice, especially when dealing with interesting color spaces, that notionally |
1253 | // [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp. |
1254 | // |
1255 | // You can adjust the expected input to [0,bias] by tweaking that parameter. |
1256 | SI U32 to_unorm(F v, F scale, F bias = 1.0f) { |
1257 | // TODO: platform-specific implementations of to_unorm(), removing round() entirely? |
1258 | // Any time we use round() we probably want to use to_unorm(). |
1259 | return round(min(max(0, v), bias), scale); |
1260 | } |
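// For example, a store stage packing 8-bit channels would call to_unorm(r, 255.0f) to get a
// clamped, rounded value in [0,255].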
1261 | |
1262 | SI I32 cond_to_mask(I32 cond) { return if_then_else(cond, I32(~0), I32(0)); } |
1263 | |
1264 | // Now finally, normal Stages! |
1265 | |
1266 | STAGE(seed_shader, Ctx::None) { |
1267 | static const float iota[] = { |
1268 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, |
1269 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, |
1270 | }; |
1271 | // It's important for speed to explicitly cast(dx) and cast(dy), |
1272 | // which has the effect of splatting them to vectors before converting to floats. |
1273 | // On Intel this breaks a data dependency on previous loop iterations' registers. |
1274 | r = cast(dx) + sk_unaligned_load<F>(iota); |
1275 | g = cast(dy) + 0.5f; |
1276 | b = 1.0f; |
1277 | a = 0; |
1278 | dr = dg = db = da = 0; |
1279 | } |
1280 | |
1281 | STAGE(dither, const float* rate) { |
1282 | // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors. |
1283 | uint32_t iota[] = {0,1,2,3,4,5,6,7}; |
1284 | U32 X = dx + sk_unaligned_load<U32>(iota), |
1285 | Y = dy; |
1286 | |
1287 | // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering. |
1288 | // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ]. |
1289 | |
1290 | // We only need X and X^Y from here on, so it's easier to just think of that as "Y". |
1291 | Y ^= X; |
1292 | |
1293 | // We'll mix the bottom 3 bits of each of X and Y to make 6 bits, |
1294 | // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda. |
1295 | U32 M = (Y & 1) << 5 | (X & 1) << 4 |
1296 | | (Y & 2) << 2 | (X & 2) << 1 |
1297 | | (Y & 4) >> 1 | (X & 4) >> 2; |
1298 | |
1299 | // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon. |
1300 | // We want to make sure our dither is less than 0.5 in either direction to keep exact values |
1301 | // like 0 and 1 unchanged after rounding. |
1302 | F dither = cast(M) * (2/128.0f) - (63/128.0f); |
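    // (For the top-left pixel of a tile, X == Y == 0 gives M == 0 and dither == -63/128, the most
    // negative value in the pattern.)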
1303 | |
1304 | r += *rate*dither; |
1305 | g += *rate*dither; |
1306 | b += *rate*dither; |
1307 | |
1308 | r = max(0, min(r, a)); |
1309 | g = max(0, min(g, a)); |
1310 | b = max(0, min(b, a)); |
1311 | } |
1312 | |
1313 | // load 4 floats from memory, and splat them into r,g,b,a |
1314 | STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
1315 | r = c->r; |
1316 | g = c->g; |
1317 | b = c->b; |
1318 | a = c->a; |
1319 | } |
1320 | STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
1321 | r = c->r; |
1322 | g = c->g; |
1323 | b = c->b; |
1324 | a = c->a; |
1325 | } |
1326 | // load 4 floats from memory, and splat them into dr,dg,db,da |
1327 | STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { |
1328 | dr = c->r; |
1329 | dg = c->g; |
1330 | db = c->b; |
1331 | da = c->a; |
1332 | } |
1333 | |
1334 | // splats opaque-black into r,g,b,a |
1335 | STAGE(black_color, Ctx::None) { |
1336 | r = g = b = 0.0f; |
1337 | a = 1.0f; |
1338 | } |
1339 | |
1340 | STAGE(white_color, Ctx::None) { |
1341 | r = g = b = a = 1.0f; |
1342 | } |
1343 | |
1344 | // load registers r,g,b,a from context (mirrors store_rgba) |
1345 | STAGE(load_src, const float* ptr) { |
1346 | r = sk_unaligned_load<F>(ptr + 0*N); |
1347 | g = sk_unaligned_load<F>(ptr + 1*N); |
1348 | b = sk_unaligned_load<F>(ptr + 2*N); |
1349 | a = sk_unaligned_load<F>(ptr + 3*N); |
1350 | } |
1351 | |
1352 | // store registers r,g,b,a into context (mirrors load_rgba) |
1353 | STAGE(store_src, float* ptr) { |
1354 | sk_unaligned_store(ptr + 0*N, r); |
1355 | sk_unaligned_store(ptr + 1*N, g); |
1356 | sk_unaligned_store(ptr + 2*N, b); |
1357 | sk_unaligned_store(ptr + 3*N, a); |
1358 | } |
1359 | STAGE(store_src_a, float* ptr) { |
1360 | sk_unaligned_store(ptr, a); |
1361 | } |
1362 | |
1363 | // load registers dr,dg,db,da from context (mirrors store_dst) |
1364 | STAGE(load_dst, const float* ptr) { |
1365 | dr = sk_unaligned_load<F>(ptr + 0*N); |
1366 | dg = sk_unaligned_load<F>(ptr + 1*N); |
1367 | db = sk_unaligned_load<F>(ptr + 2*N); |
1368 | da = sk_unaligned_load<F>(ptr + 3*N); |
1369 | } |
1370 | |
1371 | // store registers dr,dg,db,da into context (mirrors load_dst) |
1372 | STAGE(store_dst, float* ptr) { |
1373 | sk_unaligned_store(ptr + 0*N, dr); |
1374 | sk_unaligned_store(ptr + 1*N, dg); |
1375 | sk_unaligned_store(ptr + 2*N, db); |
1376 | sk_unaligned_store(ptr + 3*N, da); |
1377 | } |
1378 | |
1379 | // Most blend modes apply the same logic to each channel. |
1380 | #define BLEND_MODE(name) \ |
1381 | SI F name##_channel(F s, F d, F sa, F da); \ |
1382 | STAGE(name, Ctx::None) { \ |
1383 | r = name##_channel(r,dr,a,da); \ |
1384 | g = name##_channel(g,dg,a,da); \ |
1385 | b = name##_channel(b,db,a,da); \ |
1386 | a = name##_channel(a,da,a,da); \ |
1387 | } \ |
1388 | SI F name##_channel(F s, F d, F sa, F da) |
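// So, for example, BLEND_MODE(srcover) below defines both srcover_channel() and an srcover stage
// that applies it to r, g, b, and a.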
1389 | |
1390 | SI F inv(F x) { return 1.0f - x; } |
1391 | SI F two(F x) { return x + x; } |
1392 | |
1393 | |
1394 | BLEND_MODE(clear) { return 0; } |
1395 | BLEND_MODE(srcatop) { return s*da + d*inv(sa); } |
1396 | BLEND_MODE(dstatop) { return d*sa + s*inv(da); } |
1397 | BLEND_MODE(srcin) { return s * da; } |
1398 | BLEND_MODE(dstin) { return d * sa; } |
1399 | BLEND_MODE(srcout) { return s * inv(da); } |
1400 | BLEND_MODE(dstout) { return d * inv(sa); } |
1401 | BLEND_MODE(srcover) { return mad(d, inv(sa), s); } |
1402 | BLEND_MODE(dstover) { return mad(s, inv(da), d); } |
1403 | |
1404 | BLEND_MODE(modulate) { return s*d; } |
1405 | BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; } |
1406 | BLEND_MODE(plus_) { return min(s + d, 1.0f); } // We can clamp to either 1 or sa. |
1407 | BLEND_MODE(screen) { return s + d - s*d; } |
1408 | BLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); } |
1409 | #undef BLEND_MODE |
1410 | |
1411 | // Most other blend modes apply the same logic to colors, and srcover to alpha. |
1412 | #define BLEND_MODE(name) \ |
1413 | SI F name##_channel(F s, F d, F sa, F da); \ |
1414 | STAGE(name, Ctx::None) { \ |
1415 | r = name##_channel(r,dr,a,da); \ |
1416 | g = name##_channel(g,dg,a,da); \ |
1417 | b = name##_channel(b,db,a,da); \ |
1418 | a = mad(da, inv(a), a); \ |
1419 | } \ |
1420 | SI F name##_channel(F s, F d, F sa, F da) |
1421 | |
1422 | BLEND_MODE(darken) { return s + d - max(s*da, d*sa) ; } |
1423 | BLEND_MODE(lighten) { return s + d - min(s*da, d*sa) ; } |
1424 | BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); } |
1425 | BLEND_MODE(exclusion) { return s + d - two(s*d); } |
1426 | |
1427 | BLEND_MODE(colorburn) { |
1428 | return if_then_else(d == da, d + s*inv(da), |
1429 | if_then_else(s == 0, /* s + */ d*inv(sa), |
1430 | sa*(da - min(da, (da-d)*sa*rcp(s))) + s*inv(da) + d*inv(sa))); |
1431 | } |
1432 | BLEND_MODE(colordodge) { |
1433 | return if_then_else(d == 0, /* d + */ s*inv(da), |
1434 | if_then_else(s == sa, s + d*inv(sa), |
1435 | sa*min(da, (d*sa)*rcp(sa - s)) + s*inv(da) + d*inv(sa))); |
1436 | } |
1437 | BLEND_MODE(hardlight) { |
1438 | return s*inv(da) + d*inv(sa) |
1439 | + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s))); |
1440 | } |
1441 | BLEND_MODE(overlay) { |
1442 | return s*inv(da) + d*inv(sa) |
1443 | + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s))); |
1444 | } |
1445 | |
1446 | BLEND_MODE(softlight) { |
1447 | F m = if_then_else(da > 0, d / da, 0), |
1448 | s2 = two(s), |
1449 | m4 = two(two(m)); |
1450 | |
1451 | // The logic forks three ways: |
1452 | // 1. dark src? |
1453 | // 2. light src, dark dst? |
1454 | // 3. light src, light dst? |
1455 | F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)), // Used in case 1. |
1456 | darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m, // Used in case 2. |
1457 | liteDst = rcp(rsqrt(m)) - m, // Used in case 3. |
1458 | liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3? |
1459 | return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)? |
1460 | } |
1461 | #undef BLEND_MODE |
1462 | |
// We're basing our implementation of non-separable blend modes on
// https://www.w3.org/TR/compositing-1/#blendingnonseparable
// and
// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
// They're equivalent, but the ES spec's math has been simplified further.
1468 | // |
1469 | // Anything extra we add beyond that is to make the math work with premul inputs. |
1470 | |
1471 | SI F sat(F r, F g, F b) { return max(r, max(g,b)) - min(r, min(g,b)); } |
1472 | SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; } |
1473 | |
1474 | SI void set_sat(F* r, F* g, F* b, F s) { |
1475 | F mn = min(*r, min(*g,*b)), |
1476 | mx = max(*r, max(*g,*b)), |
1477 | sat = mx - mn; |
1478 | |
1479 | // Map min channel to 0, max channel to s, and scale the middle proportionally. |
1480 | auto scale = [=](F c) { |
1481 | return if_then_else(sat == 0, 0, (c - mn) * s / sat); |
1482 | }; |
1483 | *r = scale(*r); |
1484 | *g = scale(*g); |
1485 | *b = scale(*b); |
1486 | } |
1487 | SI void set_lum(F* r, F* g, F* b, F l) { |
1488 | F diff = l - lum(*r, *g, *b); |
1489 | *r += diff; |
1490 | *g += diff; |
1491 | *b += diff; |
1492 | } |
1493 | SI void clip_color(F* r, F* g, F* b, F a) { |
1494 | F mn = min(*r, min(*g, *b)), |
1495 | mx = max(*r, max(*g, *b)), |
1496 | l = lum(*r, *g, *b); |
1497 | |
1498 | auto clip = [=](F c) { |
1499 | c = if_then_else(mn >= 0, c, l + (c - l) * ( l) / (l - mn) ); |
1500 | c = if_then_else(mx > a, l + (c - l) * (a - l) / (mx - l), c); |
1501 | c = max(c, 0); // Sometimes without this we may dip just a little negative. |
1502 | return c; |
1503 | }; |
1504 | *r = clip(*r); |
1505 | *g = clip(*g); |
1506 | *b = clip(*b); |
1507 | } |
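
// The four non-separable stages below share one pattern: scale the premul src/dst channels into a
// common range, apply set_sat() and/or set_lum(), clip_color() against a*da, then composite the
// result as src*inv(da) + dst*inv(a) + blended term, finishing with srcover alpha (a + da - a*da).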
1508 | |
1509 | STAGE(hue, Ctx::None) { |
1510 | F R = r*a, |
1511 | G = g*a, |
1512 | B = b*a; |
1513 | |
1514 | set_sat(&R, &G, &B, sat(dr,dg,db)*a); |
1515 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); |
1516 | clip_color(&R,&G,&B, a*da); |
1517 | |
1518 | r = r*inv(da) + dr*inv(a) + R; |
1519 | g = g*inv(da) + dg*inv(a) + G; |
1520 | b = b*inv(da) + db*inv(a) + B; |
1521 | a = a + da - a*da; |
1522 | } |
1523 | STAGE(saturation, Ctx::None) { |
1524 | F R = dr*a, |
1525 | G = dg*a, |
1526 | B = db*a; |
1527 | |
1528 | set_sat(&R, &G, &B, sat( r, g, b)*da); |
1529 | set_lum(&R, &G, &B, lum(dr,dg,db)* a); // (This is not redundant.) |
1530 | clip_color(&R,&G,&B, a*da); |
1531 | |
1532 | r = r*inv(da) + dr*inv(a) + R; |
1533 | g = g*inv(da) + dg*inv(a) + G; |
1534 | b = b*inv(da) + db*inv(a) + B; |
1535 | a = a + da - a*da; |
1536 | } |
1537 | STAGE(color, Ctx::None) { |
1538 | F R = r*da, |
1539 | G = g*da, |
1540 | B = b*da; |
1541 | |
1542 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); |
1543 | clip_color(&R,&G,&B, a*da); |
1544 | |
1545 | r = r*inv(da) + dr*inv(a) + R; |
1546 | g = g*inv(da) + dg*inv(a) + G; |
1547 | b = b*inv(da) + db*inv(a) + B; |
1548 | a = a + da - a*da; |
1549 | } |
1550 | STAGE(luminosity, Ctx::None) { |
1551 | F R = dr*a, |
1552 | G = dg*a, |
1553 | B = db*a; |
1554 | |
1555 | set_lum(&R, &G, &B, lum(r,g,b)*da); |
1556 | clip_color(&R,&G,&B, a*da); |
1557 | |
1558 | r = r*inv(da) + dr*inv(a) + R; |
1559 | g = g*inv(da) + dg*inv(a) + G; |
1560 | b = b*inv(da) + db*inv(a) + B; |
1561 | a = a + da - a*da; |
1562 | } |
1563 | |
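// srcover_rgba_8888 fuses a dst load, srcover, and store into one stage, keeping the loaded dst
// values in [0,255] so they never need to be normalized and re-scaled.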
1564 | STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
1565 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
1566 | |
1567 | U32 dst = load<U32>(ptr, tail); |
1568 | dr = cast((dst ) & 0xff); |
1569 | dg = cast((dst >> 8) & 0xff); |
1570 | db = cast((dst >> 16) & 0xff); |
1571 | da = cast((dst >> 24) ); |
1572 | // {dr,dg,db,da} are in [0,255] |
1573 | // { r, g, b, a} are in [0, 1] (but may be out of gamut) |
1574 | |
1575 | r = mad(dr, inv(a), r*255.0f); |
1576 | g = mad(dg, inv(a), g*255.0f); |
1577 | b = mad(db, inv(a), b*255.0f); |
1578 | a = mad(da, inv(a), a*255.0f); |
1579 | // { r, g, b, a} are now in [0,255] (but may be out of gamut) |
1580 | |
1581 | // to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased. |
1582 | dst = to_unorm(r, 1, 255) |
1583 | | to_unorm(g, 1, 255) << 8 |
1584 | | to_unorm(b, 1, 255) << 16 |
1585 | | to_unorm(a, 1, 255) << 24; |
1586 | store(ptr, dst, tail); |
1587 | } |
1588 | |
1589 | STAGE(clamp_0, Ctx::None) { |
1590 | r = max(r, 0); |
1591 | g = max(g, 0); |
1592 | b = max(b, 0); |
1593 | a = max(a, 0); |
1594 | } |
1595 | |
1596 | STAGE(clamp_1, Ctx::None) { |
1597 | r = min(r, 1.0f); |
1598 | g = min(g, 1.0f); |
1599 | b = min(b, 1.0f); |
1600 | a = min(a, 1.0f); |
1601 | } |
1602 | |
1603 | STAGE(clamp_a, Ctx::None) { |
1604 | a = min(a, 1.0f); |
1605 | r = min(r, a); |
1606 | g = min(g, a); |
1607 | b = min(b, a); |
1608 | } |
1609 | |
1610 | STAGE(clamp_gamut, Ctx::None) { |
1611 | a = min(max(a, 0), 1.0f); |
1612 | r = min(max(r, 0), a); |
1613 | g = min(max(g, 0), a); |
1614 | b = min(max(b, 0), a); |
1615 | } |
1616 | |
1617 | STAGE(set_rgb, const float* rgb) { |
1618 | r = rgb[0]; |
1619 | g = rgb[1]; |
1620 | b = rgb[2]; |
1621 | } |
1622 | STAGE(unbounded_set_rgb, const float* rgb) { |
1623 | r = rgb[0]; |
1624 | g = rgb[1]; |
1625 | b = rgb[2]; |
1626 | } |
1627 | |
1628 | STAGE(swap_rb, Ctx::None) { |
1629 | auto tmp = r; |
1630 | r = b; |
1631 | b = tmp; |
1632 | } |
1633 | STAGE(swap_rb_dst, Ctx::None) { |
1634 | auto tmp = dr; |
1635 | dr = db; |
1636 | db = tmp; |
1637 | } |
1638 | |
1639 | STAGE(move_src_dst, Ctx::None) { |
1640 | dr = r; |
1641 | dg = g; |
1642 | db = b; |
1643 | da = a; |
1644 | } |
1645 | STAGE(move_dst_src, Ctx::None) { |
1646 | r = dr; |
1647 | g = dg; |
1648 | b = db; |
1649 | a = da; |
1650 | } |
1651 | |
1652 | STAGE(premul, Ctx::None) { |
1653 | r = r * a; |
1654 | g = g * a; |
1655 | b = b * a; |
1656 | } |
1657 | STAGE(premul_dst, Ctx::None) { |
1658 | dr = dr * da; |
1659 | dg = dg * da; |
1660 | db = db * da; |
1661 | } |
1662 | STAGE(unpremul, Ctx::None) { |
1663 | float inf = sk_bit_cast<float>(0x7f800000); |
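    // When a == 0, 1.0f/a is +inf, so the test below fails and we scale by 0 instead of producing garbage.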
1664 | auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0); |
1665 | r *= scale; |
1666 | g *= scale; |
1667 | b *= scale; |
1668 | } |
1669 | |
1670 | STAGE(force_opaque , Ctx::None) { a = 1; } |
1671 | STAGE(force_opaque_dst, Ctx::None) { da = 1; } |
1672 | |
1673 | // Clamp x to [0,1], both sides inclusive (think, gradients). |
1674 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. |
1675 | SI F clamp_01(F v) { return min(max(0, v), 1); } |
1676 | |
1677 | STAGE(rgb_to_hsl, Ctx::None) { |
1678 | F mx = max(r, max(g,b)), |
1679 | mn = min(r, min(g,b)), |
1680 | d = mx - mn, |
1681 | d_rcp = 1.0f / d; |
1682 | |
1683 | F h = (1/6.0f) * |
1684 | if_then_else(mx == mn, 0, |
1685 | if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0), |
1686 | if_then_else(mx == g, (b-r)*d_rcp + 2.0f, |
1687 | (r-g)*d_rcp + 4.0f))); |
1688 | |
1689 | F l = (mx + mn) * 0.5f; |
1690 | F s = if_then_else(mx == mn, 0, |
1691 | d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn)); |
1692 | |
1693 | r = h; |
1694 | g = s; |
1695 | b = l; |
1696 | } |
1697 | STAGE(hsl_to_rgb, Ctx::None) { |
1698 | // See GrRGBToHSLFilterEffect.fp |
1699 | |
1700 | F h = r, |
1701 | s = g, |
1702 | l = b, |
1703 | c = (1.0f - abs_(2.0f * l - 1)) * s; |
1704 | |
1705 | auto hue_to_rgb = [&](F hue) { |
1706 | F q = clamp_01(abs_(fract(hue) * 6.0f - 3.0f) - 1.0f); |
1707 | return (q - 0.5f) * c + l; |
1708 | }; |
1709 | |
1710 | r = hue_to_rgb(h + 0.0f/3.0f); |
1711 | g = hue_to_rgb(h + 2.0f/3.0f); |
1712 | b = hue_to_rgb(h + 1.0f/3.0f); |
1713 | } |
1714 | |
1715 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. |
1716 | SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) { |
1717 | return if_then_else(a < da, min(cr, min(cg,cb)) |
1718 | , max(cr, max(cg,cb))); |
1719 | } |
1720 | |
1721 | STAGE(scale_1_float, const float* c) { |
1722 | r = r * *c; |
1723 | g = g * *c; |
1724 | b = b * *c; |
1725 | a = a * *c; |
1726 | } |
1727 | STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
1728 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1729 | |
1730 | auto scales = load<U8>(ptr, tail); |
1731 | auto c = from_byte(scales); |
1732 | |
1733 | r = r * c; |
1734 | g = g * c; |
1735 | b = b * c; |
1736 | a = a * c; |
1737 | } |
1738 | STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1739 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1740 | |
1741 | F cr,cg,cb; |
1742 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); |
1743 | |
1744 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
1745 | |
1746 | r = r * cr; |
1747 | g = g * cg; |
1748 | b = b * cb; |
1749 | a = a * ca; |
1750 | } |
1751 | |
1752 | SI F lerp(F from, F to, F t) { |
1753 | return mad(to-from, t, from); |
1754 | } |
1755 | |
1756 | STAGE(lerp_1_float, const float* c) { |
1757 | r = lerp(dr, r, *c); |
1758 | g = lerp(dg, g, *c); |
1759 | b = lerp(db, b, *c); |
1760 | a = lerp(da, a, *c); |
1761 | } |
1762 | STAGE(scale_native, const float scales[]) { |
1763 | auto c = sk_unaligned_load<F>(scales); |
1764 | r = r * c; |
1765 | g = g * c; |
1766 | b = b * c; |
1767 | a = a * c; |
1768 | } |
1769 | STAGE(lerp_native, const float scales[]) { |
1770 | auto c = sk_unaligned_load<F>(scales); |
1771 | r = lerp(dr, r, c); |
1772 | g = lerp(dg, g, c); |
1773 | b = lerp(db, b, c); |
1774 | a = lerp(da, a, c); |
1775 | } |
1776 | STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
1777 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1778 | |
1779 | auto scales = load<U8>(ptr, tail); |
1780 | auto c = from_byte(scales); |
1781 | |
1782 | r = lerp(dr, r, c); |
1783 | g = lerp(dg, g, c); |
1784 | b = lerp(db, b, c); |
1785 | a = lerp(da, a, c); |
1786 | } |
1787 | STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1788 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1789 | |
1790 | F cr,cg,cb; |
1791 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); |
1792 | |
1793 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
1794 | |
1795 | r = lerp(dr, r, cr); |
1796 | g = lerp(dg, g, cg); |
1797 | b = lerp(db, b, cb); |
1798 | a = lerp(da, a, ca); |
1799 | } |
1800 | |
1801 | STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) { |
1802 | auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), |
1803 | aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy); |
1804 | |
1805 | F mul = from_byte(load<U8>(mptr, tail)), |
1806 | add = from_byte(load<U8>(aptr, tail)); |
1807 | |
1808 | r = mad(r, mul, add); |
1809 | g = mad(g, mul, add); |
1810 | b = mad(b, mul, add); |
1811 | } |
1812 | |
1813 | STAGE(byte_tables, const void* ctx) { // TODO: rename Tables SkRasterPipeline_ByteTablesCtx |
1814 | struct Tables { const uint8_t *r, *g, *b, *a; }; |
1815 | auto tables = (const Tables*)ctx; |
1816 | |
1817 | r = from_byte(gather(tables->r, to_unorm(r, 255))); |
1818 | g = from_byte(gather(tables->g, to_unorm(g, 255))); |
1819 | b = from_byte(gather(tables->b, to_unorm(b, 255))); |
1820 | a = from_byte(gather(tables->a, to_unorm(a, 255))); |
1821 | } |
1822 | |
1823 | SI F strip_sign(F x, U32* sign) { |
1824 | U32 bits = sk_bit_cast<U32>(x); |
1825 | *sign = bits & 0x80000000; |
1826 | return sk_bit_cast<F>(bits ^ *sign); |
1827 | } |
1828 | |
1829 | SI F apply_sign(F x, U32 sign) { |
1830 | return sk_bit_cast<F>(sign | sk_bit_cast<U32>(x)); |
1831 | } |
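
// Note: the transfer-function stages below operate on |v| and re-apply the sign afterwards, so
// negative inputs (e.g. extended-range colors) are mapped symmetrically about zero.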
1832 | |
1833 | STAGE(parametric, const skcms_TransferFunction* ctx) { |
1834 | auto fn = [&](F v) { |
1835 | U32 sign; |
1836 | v = strip_sign(v, &sign); |
1837 | |
1838 | F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f) |
1839 | , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e); |
1840 | return apply_sign(r, sign); |
1841 | }; |
1842 | r = fn(r); |
1843 | g = fn(g); |
1844 | b = fn(b); |
1845 | } |
1846 | |
1847 | STAGE(gamma_, const float* G) { |
1848 | auto fn = [&](F v) { |
1849 | U32 sign; |
1850 | v = strip_sign(v, &sign); |
1851 | return apply_sign(approx_powf(v, *G), sign); |
1852 | }; |
1853 | r = fn(r); |
1854 | g = fn(g); |
1855 | b = fn(b); |
1856 | } |
1857 | |
1858 | STAGE(PQish, const skcms_TransferFunction* ctx) { |
1859 | auto fn = [&](F v) { |
1860 | U32 sign; |
1861 | v = strip_sign(v, &sign); |
1862 | |
1863 | F r = approx_powf(max(mad(ctx->b, approx_powf(v, ctx->c), ctx->a), 0) |
1864 | / (mad(ctx->e, approx_powf(v, ctx->c), ctx->d)), |
1865 | ctx->f); |
1866 | |
1867 | return apply_sign(r, sign); |
1868 | }; |
1869 | r = fn(r); |
1870 | g = fn(g); |
1871 | b = fn(b); |
1872 | } |
1873 | |
1874 | STAGE(HLGish, const skcms_TransferFunction* ctx) { |
1875 | auto fn = [&](F v) { |
1876 | U32 sign; |
1877 | v = strip_sign(v, &sign); |
1878 | |
1879 | const float R = ctx->a, G = ctx->b, |
1880 | a = ctx->c, b = ctx->d, c = ctx->e; |
1881 | |
1882 | F r = if_then_else(v*R <= 1, approx_powf(v*R, G) |
1883 | , approx_exp((v-c)*a) + b); |
1884 | |
1885 | return apply_sign(r, sign); |
1886 | }; |
1887 | r = fn(r); |
1888 | g = fn(g); |
1889 | b = fn(b); |
1890 | } |
1891 | |
1892 | STAGE(HLGinvish, const skcms_TransferFunction* ctx) { |
1893 | auto fn = [&](F v) { |
1894 | U32 sign; |
1895 | v = strip_sign(v, &sign); |
1896 | |
1897 | const float R = ctx->a, G = ctx->b, |
1898 | a = ctx->c, b = ctx->d, c = ctx->e; |
1899 | |
1900 | F r = if_then_else(v <= 1, R * approx_powf(v, G) |
1901 | , a * approx_log(v - b) + c); |
1902 | |
1903 | return apply_sign(r, sign); |
1904 | }; |
1905 | r = fn(r); |
1906 | g = fn(g); |
1907 | b = fn(b); |
1908 | } |
1909 | |
1910 | STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
1911 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1912 | |
1913 | r = g = b = 0.0f; |
1914 | a = from_byte(load<U8>(ptr, tail)); |
1915 | } |
1916 | STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1917 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); |
1918 | |
1919 | dr = dg = db = 0.0f; |
1920 | da = from_byte(load<U8>(ptr, tail)); |
1921 | } |
1922 | STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { |
1923 | const uint8_t* ptr; |
1924 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
1925 | r = g = b = 0.0f; |
1926 | a = from_byte(gather(ptr, ix)); |
1927 | } |
1928 | STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
1929 | auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy); |
1930 | |
1931 | U8 packed = pack(pack(to_unorm(a, 255))); |
1932 | store(ptr, packed, tail); |
1933 | } |
1934 | |
1935 | STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1936 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1937 | |
1938 | from_565(load<U16>(ptr, tail), &r,&g,&b); |
1939 | a = 1.0f; |
1940 | } |
1941 | STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1942 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1943 | |
1944 | from_565(load<U16>(ptr, tail), &dr,&dg,&db); |
1945 | da = 1.0f; |
1946 | } |
1947 | STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) { |
1948 | const uint16_t* ptr; |
1949 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
1950 | from_565(gather(ptr, ix), &r,&g,&b); |
1951 | a = 1.0f; |
1952 | } |
1953 | STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) { |
1954 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
1955 | |
1956 | U16 px = pack( to_unorm(r, 31) << 11 |
1957 | | to_unorm(g, 63) << 5 |
1958 | | to_unorm(b, 31) ); |
1959 | store(ptr, px, tail); |
1960 | } |
1961 | |
1962 | STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
1963 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1964 | from_4444(load<U16>(ptr, tail), &r,&g,&b,&a); |
1965 | } |
1966 | STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1967 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
1968 | from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da); |
1969 | } |
1970 | STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { |
1971 | const uint16_t* ptr; |
1972 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
1973 | from_4444(gather(ptr, ix), &r,&g,&b,&a); |
1974 | } |
1975 | STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
1976 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
1977 | U16 px = pack( to_unorm(r, 15) << 12 |
1978 | | to_unorm(g, 15) << 8 |
1979 | | to_unorm(b, 15) << 4 |
1980 | | to_unorm(a, 15) ); |
1981 | store(ptr, px, tail); |
1982 | } |
1983 | |
1984 | STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
1985 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
1986 | from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); |
1987 | } |
1988 | STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
1989 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
1990 | from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da); |
1991 | } |
1992 | STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { |
1993 | const uint32_t* ptr; |
1994 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
1995 | from_8888(gather(ptr, ix), &r,&g,&b,&a); |
1996 | } |
1997 | STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
1998 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
1999 | |
2000 | U32 px = to_unorm(r, 255) |
2001 | | to_unorm(g, 255) << 8 |
2002 | | to_unorm(b, 255) << 16 |
2003 | | to_unorm(a, 255) << 24; |
2004 | store(ptr, px, tail); |
2005 | } |
2006 | |
2007 | STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
2008 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2009 | from_88(load<U16>(ptr, tail), &r, &g); |
2010 | b = 0; |
2011 | a = 1; |
2012 | } |
2013 | STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2014 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2015 | from_88(load<U16>(ptr, tail), &dr, &dg); |
2016 | db = 0; |
2017 | da = 1; |
2018 | } |
2019 | STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { |
2020 | const uint16_t* ptr; |
2021 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2022 | from_88(gather(ptr, ix), &r, &g); |
2023 | b = 0; |
2024 | a = 1; |
2025 | } |
2026 | STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
2027 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy); |
2028 | U16 px = pack( to_unorm(r, 255) | to_unorm(g, 255) << 8 ); |
2029 | store(ptr, px, tail); |
2030 | } |
2031 | |
2032 | STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) { |
2033 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
2034 | r = g = b = 0; |
2035 | a = from_short(load<U16>(ptr, tail)); |
2036 | } |
2037 | STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2038 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2039 | dr = dg = db = 0.0f; |
2040 | da = from_short(load<U16>(ptr, tail)); |
2041 | } |
2042 | STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) { |
2043 | const uint16_t* ptr; |
2044 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2045 | r = g = b = 0.0f; |
2046 | a = from_short(gather(ptr, ix)); |
2047 | } |
2048 | STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) { |
2049 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
2050 | |
2051 | U16 px = pack(to_unorm(a, 65535)); |
2052 | store(ptr, px, tail); |
2053 | } |
2054 | |
2055 | STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { |
2056 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2057 | b = 0; a = 1; |
2058 | from_1616(load<U32>(ptr, tail), &r,&g); |
2059 | } |
2060 | STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2061 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2062 | from_1616(load<U32>(ptr, tail), &dr, &dg); |
2063 | db = 0; |
2064 | da = 1; |
2065 | } |
2066 | STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) { |
2067 | const uint32_t* ptr; |
2068 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2069 | from_1616(gather(ptr, ix), &r, &g); |
2070 | b = 0; |
2071 | a = 1; |
2072 | } |
2073 | STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { |
2074 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
2075 | |
2076 | U32 px = to_unorm(r, 65535) |
2077 | | to_unorm(g, 65535) << 16; |
2078 | store(ptr, px, tail); |
2079 | } |
2080 | |
2081 | STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) { |
2082 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); |
2083 | from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a); |
2084 | } |
2085 | STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2086 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); |
2087 | from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da); |
2088 | } |
2089 | STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) { |
2090 | const uint64_t* ptr; |
2091 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2092 | from_16161616(gather(ptr, ix), &r, &g, &b, &a); |
2093 | } |
2094 | STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) { |
2095 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy); |
2096 | |
2097 | U16 R = pack(to_unorm(r, 65535)), |
2098 | G = pack(to_unorm(g, 65535)), |
2099 | B = pack(to_unorm(b, 65535)), |
2100 | A = pack(to_unorm(a, 65535)); |
2101 | |
2102 | store4(ptr,tail, R,G,B,A); |
2103 | } |
2104 | |
2105 | |
2106 | STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) { |
2107 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
2108 | from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a); |
2109 | } |
2110 | STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2111 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); |
2112 | from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da); |
2113 | } |
2114 | STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) { |
2115 | const uint32_t* ptr; |
2116 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2117 | from_1010102(gather(ptr, ix), &r,&g,&b,&a); |
2118 | } |
2119 | STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) { |
2120 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
2121 | |
2122 | U32 px = to_unorm(r, 1023) |
2123 | | to_unorm(g, 1023) << 10 |
2124 | | to_unorm(b, 1023) << 20 |
2125 | | to_unorm(a, 3) << 30; |
2126 | store(ptr, px, tail); |
2127 | } |
2128 | |
2129 | STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) { |
2130 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); |
2131 | |
2132 | U16 R,G,B,A; |
2133 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); |
2134 | r = from_half(R); |
2135 | g = from_half(G); |
2136 | b = from_half(B); |
2137 | a = from_half(A); |
2138 | } |
2139 | STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2140 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); |
2141 | |
2142 | U16 R,G,B,A; |
2143 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); |
2144 | dr = from_half(R); |
2145 | dg = from_half(G); |
2146 | db = from_half(B); |
2147 | da = from_half(A); |
2148 | } |
2149 | STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) { |
2150 | const uint64_t* ptr; |
2151 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2152 | auto px = gather(ptr, ix); |
2153 | |
2154 | U16 R,G,B,A; |
2155 | load4((const uint16_t*)&px,0, &R,&G,&B,&A); |
2156 | r = from_half(R); |
2157 | g = from_half(G); |
2158 | b = from_half(B); |
2159 | a = from_half(A); |
2160 | } |
2161 | STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) { |
2162 | auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy); |
2163 | store4((uint16_t*)ptr,tail, to_half(r) |
2164 | , to_half(g) |
2165 | , to_half(b) |
2166 | , to_half(a)); |
2167 | } |
2168 | |
2169 | STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) { |
2170 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy); |
2171 | |
2172 | U16 R = bswap(pack(to_unorm(r, 65535))), |
2173 | G = bswap(pack(to_unorm(g, 65535))), |
2174 | B = bswap(pack(to_unorm(b, 65535))), |
2175 | A = bswap(pack(to_unorm(a, 65535))); |
2176 | |
2177 | store4(ptr,tail, R,G,B,A); |
2178 | } |
2179 | |
2180 | STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) { |
2181 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); |
2182 | |
2183 | U16 A = load<U16>((const uint16_t*)ptr, tail); |
2184 | r = 0; |
2185 | g = 0; |
2186 | b = 0; |
2187 | a = from_half(A); |
2188 | } |
2189 | STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2190 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); |
2191 | |
2192 | U16 A = load<U16>((const uint16_t*)ptr, tail); |
2193 | dr = dg = db = 0.0f; |
2194 | da = from_half(A); |
2195 | } |
2196 | STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) { |
2197 | const uint16_t* ptr; |
2198 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2199 | r = g = b = 0.0f; |
2200 | a = from_half(gather(ptr, ix)); |
2201 | } |
2202 | STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) { |
2203 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); |
2204 | store(ptr, to_half(a), tail); |
2205 | } |
2206 | |
2207 | STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { |
2208 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2209 | |
2210 | U16 R,G; |
2211 | load2((const uint16_t*)ptr, tail, &R, &G); |
2212 | r = from_half(R); |
2213 | g = from_half(G); |
2214 | b = 0; |
2215 | a = 1; |
2216 | } |
2217 | STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2218 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); |
2219 | |
2220 | U16 R,G; |
2221 | load2((const uint16_t*)ptr, tail, &R, &G); |
2222 | dr = from_half(R); |
2223 | dg = from_half(G); |
2224 | db = 0; |
2225 | da = 1; |
2226 | } |
2227 | STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) { |
2228 | const uint32_t* ptr; |
2229 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); |
2230 | auto px = gather(ptr, ix); |
2231 | |
2232 | U16 R,G; |
2233 | load2((const uint16_t*)&px, 0, &R, &G); |
2234 | r = from_half(R); |
2235 | g = from_half(G); |
2236 | b = 0; |
2237 | a = 1; |
2238 | } |
2239 | STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { |
2240 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy); |
2241 | store2((uint16_t*)ptr, tail, to_half(r) |
2242 | , to_half(g)); |
2243 | } |
2244 | |
2245 | STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) { |
2246 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); |
2247 | load4(ptr,tail, &r,&g,&b,&a); |
2248 | } |
2249 | STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
2250 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); |
2251 | load4(ptr,tail, &dr,&dg,&db,&da); |
2252 | } |
2253 | STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) { |
2254 | const float* ptr; |
2255 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); |
2256 | r = gather(ptr, 4*ix + 0); |
2257 | g = gather(ptr, 4*ix + 1); |
2258 | b = gather(ptr, 4*ix + 2); |
2259 | a = gather(ptr, 4*ix + 3); |
2260 | } |
2261 | STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) { |
2262 | auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy); |
2263 | store4(ptr,tail, r,g,b,a); |
2264 | } |
2265 | |
2266 | STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { |
2267 | auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy); |
2268 | load2(ptr, tail, &r, &g); |
2269 | b = 0; |
2270 | a = 1; |
2271 | } |
2272 | STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { |
2273 | auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy); |
2274 | store2(ptr, tail, r, g); |
2275 | } |
2276 | |
2277 | SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) { |
2278 | return v - floor_(v*ctx->invScale)*ctx->scale; |
2279 | } |
2280 | SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) { |
2281 | auto limit = ctx->scale; |
2282 | auto invLimit = ctx->invScale; |
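    // Shift by limit, fold modulo 2*limit, then abs_(): a triangle wave that ping-pongs over [0,limit].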
2283 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
2284 | } |
2285 | // Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images). |
2286 | // The gather stages will hard clamp the output of these stages to [0,limit)... |
2287 | // we just need to do the basic repeat or mirroring. |
2288 | STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); } |
2289 | STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); } |
2290 | STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); } |
2291 | STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); } |
2292 | |
2293 | STAGE( clamp_x_1, Ctx::None) { r = clamp_01(r); } |
2294 | STAGE(repeat_x_1, Ctx::None) { r = clamp_01(r - floor_(r)); } |
2295 | STAGE(mirror_x_1, Ctx::None) { r = clamp_01(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); } |
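// mirror_x_1 is the same triangle-wave fold with limit == 1: e.g. r = 1.25 maps to 0.75 and
// r = 2.25 maps to 0.25, with clamp_01() catching non-finite inputs.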
2296 | |
2297 | // Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain: |
2298 | // mask == 0x00000000 if the coordinate(s) are out of bounds |
2299 | // mask == 0xFFFFFFFF if the coordinate(s) are in bounds |
2300 | // After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0 |
2301 | // if either of the coordinates were out of bounds. |
2302 | |
2303 | STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { |
2304 | auto w = ctx->limit_x; |
2305 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w))); |
2306 | } |
2307 | STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { |
2308 | auto h = ctx->limit_y; |
2309 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h))); |
2310 | } |
2311 | STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { |
2312 | auto w = ctx->limit_x; |
2313 | auto h = ctx->limit_y; |
2314 | sk_unaligned_store(ctx->mask, |
2315 | cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h))); |
2316 | } |
2317 | STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { |
2318 | auto mask = sk_unaligned_load<U32>(ctx->mask); |
2319 | r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask); |
2320 | g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask); |
2321 | b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask); |
2322 | a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask); |
2323 | } |
2324 | |
2325 | STAGE(alpha_to_gray, Ctx::None) { |
2326 | r = g = b = a; |
2327 | a = 1; |
2328 | } |
2329 | STAGE(alpha_to_gray_dst, Ctx::None) { |
2330 | dr = dg = db = da; |
2331 | da = 1; |
2332 | } |
2333 | STAGE(bt709_luminance_or_luma_to_alpha, Ctx::None) { |
2334 | a = r*0.2126f + g*0.7152f + b*0.0722f; |
2335 | r = g = b = 0; |
2336 | } |
2337 | |
2338 | STAGE(matrix_translate, const float* m) { |
2339 | r += m[0]; |
2340 | g += m[1]; |
2341 | } |
2342 | STAGE(matrix_scale_translate, const float* m) { |
2343 | r = mad(r,m[0], m[2]); |
2344 | g = mad(g,m[1], m[3]); |
2345 | } |
2346 | STAGE(matrix_2x3, const float* m) { |
2347 | auto R = mad(r,m[0], mad(g,m[2], m[4])), |
2348 | G = mad(r,m[1], mad(g,m[3], m[5])); |
2349 | r = R; |
2350 | g = G; |
2351 | } |
2352 | STAGE(matrix_3x3, const float* m) { |
2353 | auto R = mad(r,m[0], mad(g,m[3], b*m[6])), |
2354 | G = mad(r,m[1], mad(g,m[4], b*m[7])), |
2355 | B = mad(r,m[2], mad(g,m[5], b*m[8])); |
2356 | r = R; |
2357 | g = G; |
2358 | b = B; |
2359 | } |
2360 | STAGE(matrix_3x4, const float* m) { |
2361 | auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))), |
2362 | G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))), |
2363 | B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11]))); |
2364 | r = R; |
2365 | g = G; |
2366 | b = B; |
2367 | } |
2368 | STAGE(matrix_4x5, const float* m) { |
2369 | auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))), |
2370 | G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))), |
2371 | B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))), |
2372 | A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19])))); |
2373 | r = R; |
2374 | g = G; |
2375 | b = B; |
2376 | a = A; |
2377 | } |
2378 | STAGE(matrix_4x3, const float* m) { |
2379 | auto X = r, |
2380 | Y = g; |
2381 | |
2382 | r = mad(X, m[0], mad(Y, m[4], m[ 8])); |
2383 | g = mad(X, m[1], mad(Y, m[5], m[ 9])); |
2384 | b = mad(X, m[2], mad(Y, m[6], m[10])); |
2385 | a = mad(X, m[3], mad(Y, m[7], m[11])); |
2386 | } |
2387 | STAGE(matrix_perspective, const float* m) { |
2388 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. |
2389 | auto R = mad(r,m[0], mad(g,m[1], m[2])), |
2390 | G = mad(r,m[3], mad(g,m[4], m[5])), |
2391 | Z = mad(r,m[6], mad(g,m[7], m[8])); |
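    // i.e. [R G Z]^T = M [r g 1]^T, followed by the homogeneous divide (r,g) = (R/Z, G/Z),
    // with rcp() standing in for the divide.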
2392 | r = R * rcp(Z); |
2393 | g = G * rcp(Z); |
2394 | } |
2395 | |
2396 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, |
2397 | F* r, F* g, F* b, F* a) { |
2398 | F fr, br, fg, bg, fb, bb, fa, ba; |
2399 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
2400 | if (c->stopCount <=8) { |
2401 | fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx); |
2402 | br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx); |
2403 | fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx); |
2404 | bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx); |
2405 | fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx); |
2406 | bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx); |
2407 | fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx); |
2408 | ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx); |
2409 | } else |
2410 | #endif |
2411 | { |
2412 | fr = gather(c->fs[0], idx); |
2413 | br = gather(c->bs[0], idx); |
2414 | fg = gather(c->fs[1], idx); |
2415 | bg = gather(c->bs[1], idx); |
2416 | fb = gather(c->fs[2], idx); |
2417 | bb = gather(c->bs[2], idx); |
2418 | fa = gather(c->fs[3], idx); |
2419 | ba = gather(c->bs[3], idx); |
2420 | } |
2421 | |
2422 | *r = mad(t, fr, br); |
2423 | *g = mad(t, fg, bg); |
2424 | *b = mad(t, fb, bb); |
2425 | *a = mad(t, fa, ba); |
2426 | } |
2427 | |
2428 | STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { |
2429 | auto t = r; |
2430 | auto idx = trunc_(t * (c->stopCount-1)); |
2431 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
2432 | } |
2433 | |
2434 | STAGE(gradient, const SkRasterPipeline_GradientCtx* c) { |
2435 | auto t = r; |
2436 | U32 idx = 0; |
2437 | |
2438 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
2439 | for (size_t i = 1; i < c->stopCount; i++) { |
2440 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); |
2441 | } |
2442 | |
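    // idx now counts how many of ts[1..stopCount-1] t has passed, i.e. which piecewise-linear
    // segment t falls in; gradient_lookup() evaluates that segment as mad(t, f, b).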
2443 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
2444 | } |
2445 | |
2446 | STAGE(evenly_spaced_2_stop_gradient, const void* ctx) { |
2447 | // TODO: Rename Ctx SkRasterPipeline_EvenlySpaced2StopGradientCtx. |
2448 | struct Ctx { float f[4], b[4]; }; |
2449 | auto c = (const Ctx*)ctx; |
2450 | |
2451 | auto t = r; |
2452 | r = mad(t, c->f[0], c->b[0]); |
2453 | g = mad(t, c->f[1], c->b[1]); |
2454 | b = mad(t, c->f[2], c->b[2]); |
2455 | a = mad(t, c->f[3], c->b[3]); |
2456 | } |
2457 | |
2458 | STAGE(xy_to_unit_angle, Ctx::None) { |
2459 | F X = r, |
2460 | Y = g; |
2461 | F xabs = abs_(X), |
2462 | yabs = abs_(Y); |
2463 | |
2464 | F slope = min(xabs, yabs)/max(xabs, yabs); |
2465 | F s = slope * slope; |
2466 | |
2467 | // Use a 7th degree polynomial to approximate atan. |
2468 | // This was generated using sollya.gforge.inria.fr. |
2469 | // A float optimized polynomial was generated using the following command. |
2470 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); |
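    // The leading coefficient is ~1/(2*pi), so phi approximates atan(slope)/(2*pi) for slope in
    // [0,1]; the if_then_else chain below reflects it into the correct octant and quadrant.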
2471 | F phi = slope |
2472 | * (0.15912117063999176025390625f + s |
2473 | * (-5.185396969318389892578125e-2f + s |
2474 | * (2.476101927459239959716796875e-2f + s |
2475 | * (-7.0547382347285747528076171875e-3f)))); |
2476 | |
2477 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); |
2478 | phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi); |
2479 | phi = if_then_else(Y < 0.0f , 1.0f - phi , phi); |
2480 | phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. |
2481 | r = phi; |
2482 | } |
2483 | |
2484 | STAGE(xy_to_radius, Ctx::None) { |
2485 | F X2 = r * r, |
2486 | Y2 = g * g; |
2487 | r = sqrt_(X2 + Y2); |
2488 | } |
2489 | |
2490 | // Please see https://skia.org/dev/design/conical for how our 2pt conical shader works. |
2491 | |
2492 | STAGE(negate_x, Ctx::None) { r = -r; } |
2493 | |
2494 | STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2495 | F x = r, y = g, &t = r; |
2496 | t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0 |
2497 | } |
2498 | |
2499 | STAGE(xy_to_2pt_conical_focal_on_circle, Ctx::None) { |
2500 | F x = r, y = g, &t = r; |
2501 | t = x + y*y / x; // (x^2 + y^2) / x |
2502 | } |
2503 | |
2504 | STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2505 | F x = r, y = g, &t = r; |
2506 | t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
2507 | } |
2508 | |
2509 | STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2510 | F x = r, y = g, &t = r; |
2511 | t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
2512 | } |
2513 | |
2514 | STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2515 | F x = r, y = g, &t = r; |
2516 | t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 |
2517 | } |
2518 | |
2519 | STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) { |
2520 | F& t = r; |
2521 | t = t + ctx->fP1; // ctx->fP1 = f |
2522 | } |
2523 | |
2524 | STAGE(alter_2pt_conical_unswap, Ctx::None) { |
2525 | F& t = r; |
2526 | t = 1 - t; |
2527 | } |
2528 | |
2529 | STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) { |
2530 | F& t = r; |
2531 | auto is_degenerate = (t != t); // NaN |
2532 | t = if_then_else(is_degenerate, F(0), t); |
2533 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); |
2534 | } |
2535 | |
2536 | STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) { |
2537 | F& t = r; |
2538 | auto is_degenerate = (t <= 0) | (t != t); |
2539 | t = if_then_else(is_degenerate, F(0), t); |
2540 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); |
2541 | } |
2542 | |
2543 | STAGE(apply_vector_mask, const uint32_t* ctx) { |
2544 | const U32 mask = sk_unaligned_load<U32>(ctx); |
2545 | r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask); |
2546 | g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask); |
2547 | b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask); |
2548 | a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask); |
2549 | } |
2550 | |
2551 | STAGE(save_xy, SkRasterPipeline_SamplerCtx* c) { |
2552 | // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy). |
2553 | // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid |
2554 | // surrounding (x,y) at (0.5,0.5) off-center. |
2555 | F fx = fract(r + 0.5f), |
2556 | fy = fract(g + 0.5f); |
2557 | |
2558 | // Samplers will need to load x and fx, or y and fy. |
2559 | sk_unaligned_store(c->x, r); |
2560 | sk_unaligned_store(c->y, g); |
2561 | sk_unaligned_store(c->fx, fx); |
2562 | sk_unaligned_store(c->fy, fy); |
2563 | } |
2564 | |
2565 | STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) { |
2566 | // Bilinear and bicubic filters are both separable, so we produce independent contributions |
2567 | // from x and y, multiplying them together here to get each pixel's total scale factor. |
2568 | auto scale = sk_unaligned_load<F>(c->scalex) |
2569 | * sk_unaligned_load<F>(c->scaley); |
2570 | dr = mad(scale, r, dr); |
2571 | dg = mad(scale, g, dg); |
2572 | db = mad(scale, b, db); |
2573 | da = mad(scale, a, da); |
2574 | } |
2575 | |
2576 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
2577 | // are combined in direct proportion to their area overlapping that logical query pixel. |
2578 | // At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x. |
2579 | // The y-axis is symmetric. |
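// E.g. with fx = 0.25, the +x samples get weight 0.25 and the -x samples get 0.75 (likewise for y),
// so the four bilinear weights always sum to 1.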
2580 | |
2581 | template <int kScale> |
2582 | SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { |
2583 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
2584 | F fx = sk_unaligned_load<F>(ctx->fx); |
2585 | |
2586 | F scalex; |
2587 | if (kScale == -1) { scalex = 1.0f - fx; } |
2588 | if (kScale == +1) { scalex = fx; } |
2589 | sk_unaligned_store(ctx->scalex, scalex); |
2590 | } |
2591 | template <int kScale> |
2592 | SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { |
2593 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
2594 | F fy = sk_unaligned_load<F>(ctx->fy); |
2595 | |
2596 | F scaley; |
2597 | if (kScale == -1) { scaley = 1.0f - fy; } |
2598 | if (kScale == +1) { scaley = fy; } |
2599 | sk_unaligned_store(ctx->scaley, scaley); |
2600 | } |
2601 | |
2602 | STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); } |
2603 | STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); } |
2604 | STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); } |
2605 | STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); } |
2606 | |
2607 | |
// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
2609 | // pixel center are combined with a non-uniform cubic filter, with higher values near the center. |
2610 | // |
2611 | // We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets. |
2612 | // See GrCubicEffect for details of this particular filter. |
2613 | |
2614 | SI F bicubic_near(F t) { |
2615 | // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18 |
2616 | return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f)); |
2617 | } |
2618 | SI F bicubic_far(F t) { |
2619 | // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18) |
2620 | return (t*t)*mad((7/18.0f), t, (-6/18.0f)); |
2621 | } |
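
// Sanity check: for any fractional offset t, far(1-t) + near(1-t) + near(t) + far(t) == 1
// (e.g. at t == 0 the weights are 1/18, 16/18, 1/18, and 0).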
2622 | |
2623 | template <int kScale> |
2624 | SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { |
2625 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); |
2626 | F fx = sk_unaligned_load<F>(ctx->fx); |
2627 | |
2628 | F scalex; |
2629 | if (kScale == -3) { scalex = bicubic_far (1.0f - fx); } |
2630 | if (kScale == -1) { scalex = bicubic_near(1.0f - fx); } |
2631 | if (kScale == +1) { scalex = bicubic_near( fx); } |
2632 | if (kScale == +3) { scalex = bicubic_far ( fx); } |
2633 | sk_unaligned_store(ctx->scalex, scalex); |
2634 | } |
2635 | template <int kScale> |
2636 | SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { |
2637 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); |
2638 | F fy = sk_unaligned_load<F>(ctx->fy); |
2639 | |
2640 | F scaley; |
2641 | if (kScale == -3) { scaley = bicubic_far (1.0f - fy); } |
2642 | if (kScale == -1) { scaley = bicubic_near(1.0f - fy); } |
2643 | if (kScale == +1) { scaley = bicubic_near( fy); } |
2644 | if (kScale == +3) { scaley = bicubic_far ( fy); } |
2645 | sk_unaligned_store(ctx->scaley, scaley); |
2646 | } |
2647 | |
2648 | STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); } |
2649 | STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); } |
2650 | STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); } |
2651 | STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); } |
2652 | |
2653 | STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); } |
2654 | STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); } |
2655 | STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); } |
2656 | STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); } |
2657 | |
2658 | STAGE(callback, SkRasterPipeline_CallbackCtx* c) { |
2659 | store4(c->rgba,0, r,g,b,a); |
2660 | c->fn(c, tail ? tail : N); |
2661 | load4(c->read_from,0, &r,&g,&b,&a); |
2662 | } |
2663 | |
2664 | STAGE(gauss_a_to_rgba, Ctx::None) { |
    // Approximates exp(-(1-a)^2 * 4) - 0.018f with a quartic polynomial in a:
2669 | const float c4 = -2.26661229133605957031f; |
2670 | const float c3 = 2.89795351028442382812f; |
2671 | const float c2 = 0.21345567703247070312f; |
2672 | const float c1 = 0.15489584207534790039f; |
2673 | const float c0 = 0.00030726194381713867f; |
2674 | a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0); |
2675 | r = a; |
2676 | g = a; |
2677 | b = a; |
2678 | } |
2679 | |
2680 | SI F tile(F v, SkTileMode mode, float limit, float invLimit) { |
2681 | // The ix_and_ptr() calls in sample() will clamp tile()'s output, so no need to clamp here. |
2682 | switch (mode) { |
2683 | case SkTileMode::kDecal: // TODO, for now fallthrough to clamp |
2684 | case SkTileMode::kClamp: return v; |
2685 | case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; |
2686 | case SkTileMode::kMirror: |
2687 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
2688 | } |
2689 | SkUNREACHABLE; |
2690 | } |
2691 | |
2692 | SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, |
2693 | F* r, F* g, F* b, F* a) { |
2694 | x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); |
2695 | y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); |
2696 | |
2697 | switch (ctx->ct) { |
2698 | default: *r = *g = *b = *a = 0; // TODO |
2699 | break; |
2700 | |
2701 | case kRGBA_8888_SkColorType: |
2702 | case kBGRA_8888_SkColorType: { |
2703 | const uint32_t* ptr; |
2704 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
2705 | from_8888(gather(ptr, ix), r,g,b,a); |
2706 | if (ctx->ct == kBGRA_8888_SkColorType) { |
2707 | std::swap(*r,*b); |
2708 | } |
2709 | } break; |
2710 | } |
2711 | } |
2712 | |
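// sampler() evaluates a separable DxD filter: for each of the D*D taps around (cx,cy) it fetches a
// pixel with sample() and accumulates it weighted by wx[i]*wy[j].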
2713 | template <int D> |
2714 | SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, |
2715 | F cx, F cy, const F (&wx)[D], const F (&wy)[D], |
2716 | F* r, F* g, F* b, F* a) { |
2717 | |
2718 | float start = -0.5f*(D-1); |
2719 | |
2720 | *r = *g = *b = *a = 0; |
2721 | F y = cy + start; |
2722 | for (int j = 0; j < D; j++, y += 1.0f) { |
2723 | F x = cx + start; |
2724 | for (int i = 0; i < D; i++, x += 1.0f) { |
2725 | F R,G,B,A; |
2726 | sample(ctx, x,y, &R,&G,&B,&A); |
2727 | |
2728 | F w = wx[i] * wy[j]; |
2729 | *r = mad(w,R,*r); |
2730 | *g = mad(w,G,*g); |
2731 | *b = mad(w,B,*b); |
2732 | *a = mad(w,A,*a); |
2733 | } |
2734 | } |
2735 | } |
2736 | |
2737 | STAGE(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { |
2738 | F x = r, fx = fract(x + 0.5f), |
2739 | y = g, fy = fract(y + 0.5f); |
2740 | const F wx[] = {1.0f - fx, fx}; |
2741 | const F wy[] = {1.0f - fy, fy}; |
2742 | |
2743 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
2744 | } |
2745 | STAGE(bicubic, SkRasterPipeline_SamplerCtx2* ctx) { |
2746 | F x = r, fx = fract(x + 0.5f), |
2747 | y = g, fy = fract(y + 0.5f); |
2748 | const F wx[] = { bicubic_far(1-fx), bicubic_near(1-fx), bicubic_near(fx), bicubic_far(fx) }; |
2749 | const F wy[] = { bicubic_far(1-fy), bicubic_near(1-fy), bicubic_near(fy), bicubic_far(fy) }; |
2750 | |
2751 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
2752 | } |
2753 | |
2754 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. |
2755 | STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
2756 | // (cx,cy) are the center of our sample. |
2757 | F cx = r, |
2758 | cy = g; |
2759 | |
2760 | // All sample points are at the same fractional offset (fx,fy). |
2761 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. |
2762 | F fx = fract(cx + 0.5f), |
2763 | fy = fract(cy + 0.5f); |
2764 | |
2765 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. |
2766 | r = g = b = a = 0; |
2767 | |
2768 | for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f) |
2769 | for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) { |
2770 | // (x,y) are the coordinates of this sample point. |
2771 | F x = cx + dx, |
2772 | y = cy + dy; |
2773 | |
2774 | // ix_and_ptr() will clamp to the image's bounds for us. |
2775 | const uint32_t* ptr; |
2776 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
2777 | |
2778 | F sr,sg,sb,sa; |
2779 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); |
2780 | |
2781 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
2782 | // are combined in direct proportion to their area overlapping that logical query pixel. |
2783 | // At positive offsets, the x-axis contribution to that rectangle is fx, |
2784 | // or (1-fx) at negative x. Same deal for y. |
2785 | F sx = (dx > 0) ? fx : 1.0f - fx, |
2786 | sy = (dy > 0) ? fy : 1.0f - fy, |
2787 | area = sx * sy; |
2788 | |
2789 | r += sr * area; |
2790 | g += sg * area; |
2791 | b += sb * area; |
2792 | a += sa * area; |
2793 | } |
2794 | } |
2795 | |
2796 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. |
2797 | STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
2798 | // (cx,cy) are the center of our sample. |
2799 | F cx = r, |
2800 | cy = g; |
2801 | |
    // All sample points are at the same fractional offset (fx,fy).
    // They're the 16 points of a 4x4 grid surrounding (x,y), at +/- 0.5 and +/- 1.5 offsets.
2804 | F fx = fract(cx + 0.5f), |
2805 | fy = fract(cy + 0.5f); |
2806 | |
    // We'll accumulate the color of all sixteen samples into {r,g,b,a} directly.
2808 | r = g = b = a = 0; |
2809 | |
2810 | const F scaley[4] = { |
2811 | bicubic_far (1.0f - fy), bicubic_near(1.0f - fy), |
2812 | bicubic_near( fy), bicubic_far ( fy), |
2813 | }; |
2814 | const F scalex[4] = { |
2815 | bicubic_far (1.0f - fx), bicubic_near(1.0f - fx), |
2816 | bicubic_near( fx), bicubic_far ( fx), |
2817 | }; |
2818 | |
2819 | F sample_y = cy - 1.5f; |
2820 | for (int yy = 0; yy <= 3; ++yy) { |
2821 | F sample_x = cx - 1.5f; |
2822 | for (int xx = 0; xx <= 3; ++xx) { |
2823 | F scale = scalex[xx] * scaley[yy]; |
2824 | |
2825 | // ix_and_ptr() will clamp to the image's bounds for us. |
2826 | const uint32_t* ptr; |
2827 | U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y); |
2828 | |
2829 | F sr,sg,sb,sa; |
2830 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); |
2831 | |
2832 | r = mad(scale, sr, r); |
2833 | g = mad(scale, sg, g); |
2834 | b = mad(scale, sb, b); |
2835 | a = mad(scale, sa, a); |
2836 | |
2837 | sample_x += 1; |
2838 | } |
2839 | sample_y += 1; |
2840 | } |
2841 | } |
2842 | |
2843 | // ~~~~~~ GrSwizzle stage ~~~~~~ // |
2844 | |
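// Note: the four swizzle characters are packed into the bits of the context pointer itself rather
// than into memory it points to, which is why we memcpy from &ctx below instead of dereferencing it.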
2845 | STAGE(swizzle, void* ctx) { |
2846 | auto ir = r, ig = g, ib = b, ia = a; |
2847 | F* o[] = {&r, &g, &b, &a}; |
2848 | char swiz[4]; |
2849 | memcpy(swiz, &ctx, sizeof(swiz)); |
2850 | |
2851 | for (int i = 0; i < 4; ++i) { |
2852 | switch (swiz[i]) { |
2853 | case 'r': *o[i] = ir; break; |
2854 | case 'g': *o[i] = ig; break; |
2855 | case 'b': *o[i] = ib; break; |
2856 | case 'a': *o[i] = ia; break; |
2857 | case '0': *o[i] = F(0); break; |
2858 | case '1': *o[i] = F(1); break; |
2859 | default: break; |
2860 | } |
2861 | } |
2862 | } |
2863 | |
2864 | namespace lowp { |
2865 | #if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE) |
2866 | // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually), |
2867 | // we don't generate lowp stages. All these nullptrs will tell SkJumper.cpp to always use the |
2868 | // highp float pipeline. |
2869 | #define M(st) static void (*st)(void) = nullptr; |
2870 | SK_RASTER_PIPELINE_STAGES(M) |
2871 | #undef M |
2872 | static void (*just_return)(void) = nullptr; |
2873 | |
2874 | static void start_pipeline(size_t,size_t,size_t,size_t, void**) {} |
2875 | |
2876 | #else // We are compiling vector code with Clang... let's make some lowp stages! |
2877 | |
2878 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
2879 | using U8 = uint8_t __attribute__((ext_vector_type(16))); |
2880 | using U16 = uint16_t __attribute__((ext_vector_type(16))); |
2881 | using I16 = int16_t __attribute__((ext_vector_type(16))); |
2882 | using I32 = int32_t __attribute__((ext_vector_type(16))); |
2883 | using U32 = uint32_t __attribute__((ext_vector_type(16))); |
2884 | using F = float __attribute__((ext_vector_type(16))); |
2885 | #else |
2886 | using U8 = uint8_t __attribute__((ext_vector_type(8))); |
2887 | using U16 = uint16_t __attribute__((ext_vector_type(8))); |
2888 | using I16 = int16_t __attribute__((ext_vector_type(8))); |
2889 | using I32 = int32_t __attribute__((ext_vector_type(8))); |
2890 | using U32 = uint32_t __attribute__((ext_vector_type(8))); |
2891 | using F = float __attribute__((ext_vector_type(8))); |
2892 | #endif |
2893 | |
2894 | static const size_t N = sizeof(U16) / sizeof(uint16_t); |
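// N is the number of pixels each lowp stage handles per call: 16 under HSW/SKX, 8 otherwise.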
2895 | |
2896 | // Once again, some platforms benefit from a restricted Stage calling convention, |
2897 | // but others can pass tons and tons of registers and we're happy to exploit that. |
2898 | // It's exactly the same decision and implementation strategy as the F stages above. |
2899 | #if JUMPER_NARROW_STAGES |
2900 | struct Params { |
2901 | size_t dx, dy, tail; |
2902 | U16 dr,dg,db,da; |
2903 | }; |
2904 | using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a); |
2905 | #else |
2906 | // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64. |
2907 | using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy, |
2908 | U16 r, U16 g, U16 b, U16 a, |
2909 | U16 dr, U16 dg, U16 db, U16 da); |
2910 | #endif |
2911 | |
2912 | static void start_pipeline(const size_t x0, const size_t y0, |
2913 | const size_t xlimit, const size_t ylimit, void** program) { |
2914 | auto start = (Stage)load_and_inc(program); |
2915 | for (size_t dy = y0; dy < ylimit; dy++) { |
2916 | #if JUMPER_NARROW_STAGES |
2917 | Params params = { x0,dy,0, 0,0,0,0 }; |
2918 | for (; params.dx + N <= xlimit; params.dx += N) { |
2919 | start(¶ms,program, 0,0,0,0); |
2920 | } |
2921 | if (size_t tail = xlimit - params.dx) { |
2922 | params.tail = tail; |
2923 | start(¶ms,program, 0,0,0,0); |
2924 | } |
2925 | #else |
2926 | size_t dx = x0; |
2927 | for (; dx + N <= xlimit; dx += N) { |
2928 | start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0); |
2929 | } |
2930 | if (size_t tail = xlimit - dx) { |
2931 | start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); |
2932 | } |
2933 | #endif |
2934 | } |
2935 | } |
2936 | |
2937 | #if JUMPER_NARROW_STAGES |
2938 | static void ABI just_return(Params*, void**, U16,U16,U16,U16) {} |
2939 | #else |
2940 | static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {} |
2941 | #endif |
2942 | |
2943 | // All stages use the same function call ABI to chain into each other, but there are three types: |
2944 | // GG: geometry in, geometry out -- think, a matrix |
//   GP: geometry in, pixels out   -- think, a memory gather
//   PP: pixels in, pixels out     -- think, a blend mode
2947 | // |
2948 | // (Some stages ignore their inputs or produce no logical output. That's perfectly fine.) |
2949 | // |
2950 | // These three STAGE_ macros let you define each type of stage, |
2951 | // and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate. |
2952 | |
2953 | #if JUMPER_NARROW_STAGES |
2954 | #define STAGE_GG(name, ...) \ |
2955 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ |
2956 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
2957 | auto x = join<F>(r,g), \ |
2958 | y = join<F>(b,a); \ |
2959 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \ |
2960 | split(x, &r,&g); \ |
2961 | split(y, &b,&a); \ |
2962 | auto next = (Stage)load_and_inc(program); \ |
2963 | next(params,program, r,g,b,a); \ |
2964 | } \ |
2965 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) |
2966 | |
2967 | #define STAGE_GP(name, ...) \ |
2968 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
2969 | U16& r, U16& g, U16& b, U16& a, \ |
2970 | U16& dr, U16& dg, U16& db, U16& da); \ |
2971 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
2972 | auto x = join<F>(r,g), \ |
2973 | y = join<F>(b,a); \ |
2974 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \ |
2975 | params->dr,params->dg,params->db,params->da); \ |
2976 | auto next = (Stage)load_and_inc(program); \ |
2977 | next(params,program, r,g,b,a); \ |
2978 | } \ |
2979 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
2980 | U16& r, U16& g, U16& b, U16& a, \ |
2981 | U16& dr, U16& dg, U16& db, U16& da) |
2982 | |
2983 | #define STAGE_PP(name, ...) \ |
2984 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
2985 | U16& r, U16& g, U16& b, U16& a, \ |
2986 | U16& dr, U16& dg, U16& db, U16& da); \ |
2987 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ |
2988 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \ |
2989 | params->dr,params->dg,params->db,params->da); \ |
2990 | auto next = (Stage)load_and_inc(program); \ |
2991 | next(params,program, r,g,b,a); \ |
2992 | } \ |
2993 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
2994 | U16& r, U16& g, U16& b, U16& a, \ |
2995 | U16& dr, U16& dg, U16& db, U16& da) |
2996 | #else |
2997 | #define STAGE_GG(name, ...) \ |
2998 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ |
2999 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
3000 | U16 r, U16 g, U16 b, U16 a, \ |
3001 | U16 dr, U16 dg, U16 db, U16 da) { \ |
3002 | auto x = join<F>(r,g), \ |
3003 | y = join<F>(b,a); \ |
3004 | name##_k(Ctx{program}, dx,dy,tail, x,y); \ |
3005 | split(x, &r,&g); \ |
3006 | split(y, &b,&a); \ |
3007 | auto next = (Stage)load_and_inc(program); \ |
3008 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
3009 | } \ |
3010 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) |
3011 | |
3012 | #define STAGE_GP(name, ...) \ |
3013 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
3014 | U16& r, U16& g, U16& b, U16& a, \ |
3015 | U16& dr, U16& dg, U16& db, U16& da); \ |
3016 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
3017 | U16 r, U16 g, U16 b, U16 a, \ |
3018 | U16 dr, U16 dg, U16 db, U16 da) { \ |
3019 | auto x = join<F>(r,g), \ |
3020 | y = join<F>(b,a); \ |
3021 | name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \ |
3022 | auto next = (Stage)load_and_inc(program); \ |
3023 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
3024 | } \ |
3025 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ |
3026 | U16& r, U16& g, U16& b, U16& a, \ |
3027 | U16& dr, U16& dg, U16& db, U16& da) |
3028 | |
3029 | #define STAGE_PP(name, ...) \ |
3030 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
3031 | U16& r, U16& g, U16& b, U16& a, \ |
3032 | U16& dr, U16& dg, U16& db, U16& da); \ |
3033 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ |
3034 | U16 r, U16 g, U16 b, U16 a, \ |
3035 | U16 dr, U16 dg, U16 db, U16 da) { \ |
3036 | name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \ |
3037 | auto next = (Stage)load_and_inc(program); \ |
3038 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ |
3039 | } \ |
3040 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ |
3041 | U16& r, U16& g, U16& b, U16& a, \ |
3042 | U16& dr, U16& dg, U16& db, U16& da) |
3043 | #endif |
3044 | |
3045 | // ~~~~~~ Commonly used helper functions ~~~~~~ // |
3046 | |
3047 | SI U16 div255(U16 v) { |
3048 | #if 0 |
3049 | return (v+127)/255; // The ideal rounding divide by 255. |
3050 | #elif 1 && defined(JUMPER_IS_NEON) |
3051 | // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8 |
3052 | // just as fast as we can do the approximation below, so might as well be correct! |
3053 | // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up. |
3054 | return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8); |
3055 | #else |
3056 | return (v+255)/256; // A good approximation of (v+127)/255. |
3057 | #endif |
3058 | } |
3059 | |
3060 | SI U16 inv(U16 v) { return 255-v; } |
3061 | |
3062 | SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); } |
3063 | SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); } |
3064 | |
3065 | SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); } |
3066 | SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); } |
3067 | |
3068 | SI U16 from_float(float f) { return f * 255.0f + 0.5f; } |
3069 | |
3070 | SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); } |
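// e.g. lerp(0, 255, 128) == div255(0*127 + 255*128) == 128, so t acts like an 8-bit alpha.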
3071 | |
3072 | template <typename D, typename S> |
3073 | SI D cast(S src) { |
3074 | return __builtin_convertvector(src, D); |
3075 | } |
3076 | |
3077 | template <typename D, typename S> |
3078 | SI void split(S v, D* lo, D* hi) { |
3079 | static_assert(2*sizeof(D) == sizeof(S), "" ); |
3080 | memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D)); |
3081 | memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D)); |
3082 | } |
3083 | template <typename D, typename S> |
3084 | SI D join(S lo, S hi) { |
3085 | static_assert(sizeof(D) == 2*sizeof(S), "" ); |
3086 | D v; |
3087 | memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S)); |
3088 | memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S)); |
3089 | return v; |
3090 | } |
3091 | |
3092 | SI F if_then_else(I32 c, F t, F e) { |
3093 | return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) | (sk_bit_cast<I32>(e) & ~c) ); |
3094 | } |
3095 | SI F max(F x, F y) { return if_then_else(x < y, y, x); } |
3096 | SI F min(F x, F y) { return if_then_else(x < y, x, y); } |
3097 | |
3098 | SI F mad(F f, F m, F a) { return f*m+a; } |
3099 | SI U32 trunc_(F x) { return (U32)cast<I32>(x); } |
3100 | |
3101 | SI F rcp(F x) { |
3102 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3103 | __m256 lo,hi; |
3104 | split(x, &lo,&hi); |
3105 | return join<F>(_mm256_rcp_ps(lo), _mm256_rcp_ps(hi)); |
3106 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
3107 | __m128 lo,hi; |
3108 | split(x, &lo,&hi); |
3109 | return join<F>(_mm_rcp_ps(lo), _mm_rcp_ps(hi)); |
3110 | #elif defined(JUMPER_IS_NEON) |
3111 | auto rcp = [](float32x4_t v) { |
3112 | auto est = vrecpeq_f32(v); |
3113 | return vrecpsq_f32(v,est)*est; |
3114 | }; |
3115 | float32x4_t lo,hi; |
3116 | split(x, &lo,&hi); |
3117 | return join<F>(rcp(lo), rcp(hi)); |
3118 | #else |
3119 | return 1.0f / x; |
3120 | #endif |
3121 | } |
3122 | SI F sqrt_(F x) { |
3123 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3124 | __m256 lo,hi; |
3125 | split(x, &lo,&hi); |
3126 | return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi)); |
3127 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
3128 | __m128 lo,hi; |
3129 | split(x, &lo,&hi); |
3130 | return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi)); |
3131 | #elif defined(SK_CPU_ARM64) |
3132 | float32x4_t lo,hi; |
3133 | split(x, &lo,&hi); |
3134 | return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi)); |
3135 | #elif defined(JUMPER_IS_NEON) |
3136 | auto sqrt = [](float32x4_t v) { |
3137 | auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v). |
3138 | est *= vrsqrtsq_f32(v,est*est); |
3139 | est *= vrsqrtsq_f32(v,est*est); |
3140 | return v*est; // sqrt(v) == v*rsqrt(v). |
3141 | }; |
3142 | float32x4_t lo,hi; |
3143 | split(x, &lo,&hi); |
3144 | return join<F>(sqrt(lo), sqrt(hi)); |
3145 | #else |
3146 | return F{ |
3147 | sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]), |
3148 | sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]), |
3149 | }; |
3150 | #endif |
3151 | } |
3152 | |
3153 | SI F floor_(F x) { |
3154 | #if defined(SK_CPU_ARM64) |
3155 | float32x4_t lo,hi; |
3156 | split(x, &lo,&hi); |
3157 | return join<F>(vrndmq_f32(lo), vrndmq_f32(hi)); |
3158 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3159 | __m256 lo,hi; |
3160 | split(x, &lo,&hi); |
3161 | return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi)); |
3162 | #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) |
3163 | __m128 lo,hi; |
3164 | split(x, &lo,&hi); |
3165 | return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi)); |
3166 | #else |
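    // Truncate toward zero, then subtract 1 wherever that rounded up instead of down:
    // e.g. x == -1.5 truncates to -1.0, and since -1.0 > -1.5 we return -2.0.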
3167 | F roundtrip = cast<F>(cast<I32>(x)); |
3168 | return roundtrip - if_then_else(roundtrip > x, F(1), F(0)); |
3169 | #endif |
3170 | } |
3171 | SI F fract(F x) { return x - floor_(x); } |
3172 | SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); } |
3173 | |
3174 | // ~~~~~~ Basic / misc. stages ~~~~~~ // |
3175 | |
3176 | STAGE_GG(seed_shader, Ctx::None) { |
3177 | static const float iota[] = { |
3178 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, |
3179 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, |
3180 | }; |
3181 | x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota); |
3182 | y = cast<F>(I32(dy)) + 0.5f; |
3183 | } |
3184 | |
3185 | STAGE_GG(matrix_translate, const float* m) { |
3186 | x += m[0]; |
3187 | y += m[1]; |
3188 | } |
3189 | STAGE_GG(matrix_scale_translate, const float* m) { |
3190 | x = mad(x,m[0], m[2]); |
3191 | y = mad(y,m[1], m[3]); |
3192 | } |
3193 | STAGE_GG(matrix_2x3, const float* m) { |
3194 | auto X = mad(x,m[0], mad(y,m[2], m[4])), |
3195 | Y = mad(x,m[1], mad(y,m[3], m[5])); |
3196 | x = X; |
3197 | y = Y; |
3198 | } |
3199 | STAGE_GG(matrix_perspective, const float* m) { |
3200 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. |
3201 | auto X = mad(x,m[0], mad(y,m[1], m[2])), |
3202 | Y = mad(x,m[3], mad(y,m[4], m[5])), |
3203 | Z = mad(x,m[6], mad(y,m[7], m[8])); |
3204 | x = X * rcp(Z); |
3205 | y = Y * rcp(Z); |
3206 | } |
3207 | |
3208 | STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { |
3209 | r = c->rgba[0]; |
3210 | g = c->rgba[1]; |
3211 | b = c->rgba[2]; |
3212 | a = c->rgba[3]; |
3213 | } |
3214 | STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { |
3215 | dr = c->rgba[0]; |
3216 | dg = c->rgba[1]; |
3217 | db = c->rgba[2]; |
3218 | da = c->rgba[3]; |
3219 | } |
3220 | STAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; } |
3221 | STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; } |
3222 | |
3223 | STAGE_PP(set_rgb, const float rgb[3]) { |
3224 | r = from_float(rgb[0]); |
3225 | g = from_float(rgb[1]); |
3226 | b = from_float(rgb[2]); |
3227 | } |
3228 | |
3229 | STAGE_PP(clamp_0, Ctx::None) { /*definitely a noop*/ } |
3230 | STAGE_PP(clamp_1, Ctx::None) { /*_should_ be a noop*/ } |
3231 | |
3232 | STAGE_PP(clamp_a, Ctx::None) { |
3233 | r = min(r, a); |
3234 | g = min(g, a); |
3235 | b = min(b, a); |
3236 | } |
3237 | |
3238 | STAGE_PP(clamp_gamut, Ctx::None) { |
3239 | // It shouldn't be possible to get out-of-gamut |
3240 | // colors when working in lowp. |
3241 | } |
3242 | |
3243 | STAGE_PP(premul, Ctx::None) { |
3244 | r = div255(r * a); |
3245 | g = div255(g * a); |
3246 | b = div255(b * a); |
3247 | } |
3248 | STAGE_PP(premul_dst, Ctx::None) { |
3249 | dr = div255(dr * da); |
3250 | dg = div255(dg * da); |
3251 | db = div255(db * da); |
3252 | } |
3253 | |
3254 | STAGE_PP(force_opaque , Ctx::None) { a = 255; } |
3255 | STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; } |
3256 | |
3257 | STAGE_PP(swap_rb, Ctx::None) { |
3258 | auto tmp = r; |
3259 | r = b; |
3260 | b = tmp; |
3261 | } |
3262 | STAGE_PP(swap_rb_dst, Ctx::None) { |
3263 | auto tmp = dr; |
3264 | dr = db; |
3265 | db = tmp; |
3266 | } |
3267 | |
3268 | STAGE_PP(move_src_dst, Ctx::None) { |
3269 | dr = r; |
3270 | dg = g; |
3271 | db = b; |
3272 | da = a; |
3273 | } |
3274 | |
3275 | STAGE_PP(move_dst_src, Ctx::None) { |
3276 | r = dr; |
3277 | g = dg; |
3278 | b = db; |
3279 | a = da; |
3280 | } |
3281 | |
3282 | // ~~~~~~ Blend modes ~~~~~~ // |
3283 | |
3284 | // The same logic applied to all 4 channels. |
3285 | #define BLEND_MODE(name) \ |
3286 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
3287 | STAGE_PP(name, Ctx::None) { \ |
3288 | r = name##_channel(r,dr,a,da); \ |
3289 | g = name##_channel(g,dg,a,da); \ |
3290 | b = name##_channel(b,db,a,da); \ |
3291 | a = name##_channel(a,da,a,da); \ |
3292 | } \ |
3293 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
3294 | |
3295 | BLEND_MODE(clear) { return 0; } |
3296 | BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); } |
3297 | BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); } |
3298 | BLEND_MODE(srcin) { return div255( s*da ); } |
3299 | BLEND_MODE(dstin) { return div255( d*sa ); } |
3300 | BLEND_MODE(srcout) { return div255( s*inv(da) ); } |
3301 | BLEND_MODE(dstout) { return div255( d*inv(sa) ); } |
3302 | BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); } |
3303 | BLEND_MODE(dstover) { return d + div255( s*inv(da) ); } |
3304 | BLEND_MODE(modulate) { return div255( s*d ); } |
3305 | BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); } |
3306 | BLEND_MODE(plus_) { return min(s+d, 255); } |
3307 | BLEND_MODE(screen) { return s + d - div255( s*d ); } |
3308 | BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); } |
3309 | #undef BLEND_MODE |
3310 | |
3311 | // The same logic applied to color, and srcover for alpha. |
3312 | #define BLEND_MODE(name) \ |
3313 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ |
3314 | STAGE_PP(name, Ctx::None) { \ |
3315 | r = name##_channel(r,dr,a,da); \ |
3316 | g = name##_channel(g,dg,a,da); \ |
3317 | b = name##_channel(b,db,a,da); \ |
3318 | a = a + div255( da*inv(a) ); \ |
3319 | } \ |
3320 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) |
3321 | |
3322 | BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); } |
3323 | BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); } |
3324 | BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); } |
3325 | BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); } |
3326 | |
3327 | BLEND_MODE(hardlight) { |
3328 | return div255( s*inv(da) + d*inv(sa) + |
3329 | if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
3330 | } |
3331 | BLEND_MODE(overlay) { |
3332 | return div255( s*inv(da) + d*inv(sa) + |
3333 | if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); |
3334 | } |
3335 | #undef BLEND_MODE |
3336 | |
3337 | // ~~~~~~ Helpers for interacting with memory ~~~~~~ // |
3338 | |
3339 | template <typename T> |
3340 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { |
3341 | return (T*)ctx->pixels + dy*ctx->stride + dx; |
3342 | } |
3343 | |
3344 | template <typename T> |
3345 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { |
3346 | // Exclusive -> inclusive. |
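    // Decrementing the bit pattern of a positive float gives the largest float strictly less
    // than it, so clamping x and y to w and h keeps trunc_() strictly below width and height.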
3347 | const F w = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - 1), |
3348 | h = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - 1); |
3349 | |
3350 | x = min(max(0, x), w); |
3351 | y = min(max(0, y), h); |
3352 | |
3353 | *ptr = (const T*)ctx->pixels; |
3354 | return trunc_(y)*ctx->stride + trunc_(x); |
3355 | } |
3356 | |
3357 | template <typename V, typename T> |
3358 | SI V load(const T* ptr, size_t tail) { |
3359 | V v = 0; |
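    // tail == 0 means a full N-wide load; otherwise only the first `tail` lanes are read.
    // e.g. with N == 8 and tail == 3, case 3 copies lane 2 and falls through to case 2's
    // memcpy of lanes 0 and 1, leaving lanes 3..7 zero.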
3360 | switch (tail & (N-1)) { |
3361 | case 0: memcpy(&v, ptr, sizeof(v)); break; |
3362 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3363 | case 15: v[14] = ptr[14]; [[fallthrough]]; |
3364 | case 14: v[13] = ptr[13]; [[fallthrough]]; |
3365 | case 13: v[12] = ptr[12]; [[fallthrough]]; |
3366 | case 12: memcpy(&v, ptr, 12*sizeof(T)); break; |
3367 | case 11: v[10] = ptr[10]; [[fallthrough]]; |
3368 | case 10: v[ 9] = ptr[ 9]; [[fallthrough]]; |
3369 | case 9: v[ 8] = ptr[ 8]; [[fallthrough]]; |
3370 | case 8: memcpy(&v, ptr, 8*sizeof(T)); break; |
3371 | #endif |
3372 | case 7: v[ 6] = ptr[ 6]; [[fallthrough]]; |
3373 | case 6: v[ 5] = ptr[ 5]; [[fallthrough]]; |
3374 | case 5: v[ 4] = ptr[ 4]; [[fallthrough]]; |
3375 | case 4: memcpy(&v, ptr, 4*sizeof(T)); break; |
3376 | case 3: v[ 2] = ptr[ 2]; [[fallthrough]]; |
3377 | case 2: memcpy(&v, ptr, 2*sizeof(T)); break; |
3378 | case 1: v[ 0] = ptr[ 0]; |
3379 | } |
3380 | return v; |
3381 | } |
3382 | template <typename V, typename T> |
3383 | SI void store(T* ptr, size_t tail, V v) { |
3384 | switch (tail & (N-1)) { |
3385 | case 0: memcpy(ptr, &v, sizeof(v)); break; |
3386 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3387 | case 15: ptr[14] = v[14]; [[fallthrough]]; |
3388 | case 14: ptr[13] = v[13]; [[fallthrough]]; |
3389 | case 13: ptr[12] = v[12]; [[fallthrough]]; |
3390 | case 12: memcpy(ptr, &v, 12*sizeof(T)); break; |
3391 | case 11: ptr[10] = v[10]; [[fallthrough]]; |
3392 | case 10: ptr[ 9] = v[ 9]; [[fallthrough]]; |
3393 | case 9: ptr[ 8] = v[ 8]; [[fallthrough]]; |
3394 | case 8: memcpy(ptr, &v, 8*sizeof(T)); break; |
3395 | #endif |
3396 | case 7: ptr[ 6] = v[ 6]; [[fallthrough]]; |
3397 | case 6: ptr[ 5] = v[ 5]; [[fallthrough]]; |
3398 | case 5: ptr[ 4] = v[ 4]; [[fallthrough]]; |
3399 | case 4: memcpy(ptr, &v, 4*sizeof(T)); break; |
3400 | case 3: ptr[ 2] = v[ 2]; [[fallthrough]]; |
3401 | case 2: memcpy(ptr, &v, 2*sizeof(T)); break; |
3402 | case 1: ptr[ 0] = v[ 0]; |
3403 | } |
3404 | } |
3405 | |
3406 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
3407 | template <typename V, typename T> |
3408 | SI V gather(const T* ptr, U32 ix) { |
3409 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], |
3410 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], |
3411 | ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]], |
3412 | ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], }; |
3413 | } |
3414 | |
3415 | template<> |
3416 | F gather(const float* ptr, U32 ix) { |
3417 | __m256i lo, hi; |
3418 | split(ix, &lo, &hi); |
3419 | |
3420 | return join<F>(_mm256_i32gather_ps(ptr, lo, 4), |
3421 | _mm256_i32gather_ps(ptr, hi, 4)); |
3422 | } |
3423 | |
3424 | template<> |
3425 | U32 gather(const uint32_t* ptr, U32 ix) { |
3426 | __m256i lo, hi; |
3427 | split(ix, &lo, &hi); |
3428 | |
3429 | return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4), |
3430 | _mm256_i32gather_epi32(ptr, hi, 4)); |
3431 | } |
3432 | #else |
3433 | template <typename V, typename T> |
3434 | SI V gather(const T* ptr, U32 ix) { |
3435 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], |
3436 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], }; |
3437 | } |
3438 | #endif |
3439 | |
3440 | |
3441 | // ~~~~~~ 32-bit memory loads and stores ~~~~~~ // |
3442 | |
3443 | SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) { |
#if 1 && (defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX))
3445 | // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely. |
3446 | __m256i _01,_23; |
3447 | split(rgba, &_01, &_23); |
3448 | __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20), |
3449 | _13 = _mm256_permute2x128_si256(_01,_23, 0x31); |
3450 | rgba = join<U32>(_02, _13); |
3451 | |
3452 | auto cast_U16 = [](U32 v) -> U16 { |
3453 | __m256i _02,_13; |
3454 | split(v, &_02,&_13); |
3455 | return _mm256_packus_epi32(_02,_13); |
3456 | }; |
3457 | #else |
3458 | auto cast_U16 = [](U32 v) -> U16 { |
3459 | return cast<U16>(v); |
3460 | }; |
3461 | #endif |
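    // Each 32-bit lane is packed as r | g<<8 | b<<16 | a<<24, so the low 16 bits hold r,g
    // and the high 16 bits hold b,a.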
3462 | *r = cast_U16(rgba & 65535) & 255; |
3463 | *g = cast_U16(rgba & 65535) >> 8; |
3464 | *b = cast_U16(rgba >> 16) & 255; |
3465 | *a = cast_U16(rgba >> 16) >> 8; |
3466 | } |
3467 | |
3468 | SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
3469 | #if 1 && defined(JUMPER_IS_NEON) |
3470 | uint8x8x4_t rgba; |
3471 | switch (tail & (N-1)) { |
3472 | case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break; |
3473 | case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6); [[fallthrough]]; |
3474 | case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5); [[fallthrough]]; |
3475 | case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4); [[fallthrough]]; |
3476 | case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3); [[fallthrough]]; |
3477 | case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2); [[fallthrough]]; |
3478 | case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1); [[fallthrough]]; |
3479 | case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0); |
3480 | } |
3481 | *r = cast<U16>(rgba.val[0]); |
3482 | *g = cast<U16>(rgba.val[1]); |
3483 | *b = cast<U16>(rgba.val[2]); |
3484 | *a = cast<U16>(rgba.val[3]); |
3485 | #else |
3486 | from_8888(load<U32>(ptr, tail), r,g,b,a); |
3487 | #endif |
3488 | } |
3489 | SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
3490 | #if 1 && defined(JUMPER_IS_NEON) |
3491 | uint8x8x4_t rgba = {{ |
3492 | cast<U8>(r), |
3493 | cast<U8>(g), |
3494 | cast<U8>(b), |
3495 | cast<U8>(a), |
3496 | }}; |
3497 | switch (tail & (N-1)) { |
3498 | case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break; |
3499 | case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6); [[fallthrough]]; |
3500 | case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5); [[fallthrough]]; |
3501 | case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4); [[fallthrough]]; |
3502 | case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3); [[fallthrough]]; |
3503 | case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2); [[fallthrough]]; |
3504 | case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1); [[fallthrough]]; |
3505 | case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0); |
3506 | } |
3507 | #else |
3508 | store(ptr, tail, cast<U32>(r | (g<<8)) << 0 |
3509 | | cast<U32>(b | (a<<8)) << 16); |
3510 | #endif |
3511 | } |
3512 | |
3513 | STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
3514 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a); |
3515 | } |
3516 | STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3517 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); |
3518 | } |
3519 | STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
3520 | store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a); |
3521 | } |
3522 | STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { |
3523 | const uint32_t* ptr; |
3524 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3525 | from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a); |
3526 | } |
3527 | |
3528 | // ~~~~~~ 16-bit memory loads and stores ~~~~~~ // |
3529 | |
3530 | SI void from_565(U16 rgb, U16* r, U16* g, U16* b) { |
3531 | // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0 |
3532 | U16 R = (rgb >> 11) & 31, |
3533 | G = (rgb >> 5) & 63, |
3534 | B = (rgb >> 0) & 31; |
3535 | |
3536 | // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit. |
3537 | *r = (R << 3) | (R >> 2); |
3538 | *g = (G << 2) | (G >> 4); |
3539 | *b = (B << 3) | (B >> 2); |
3540 | } |
3541 | SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { |
3542 | from_565(load<U16>(ptr, tail), r,g,b); |
3543 | } |
3544 | SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) { |
    // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f (or 63/255.0f for g).
3546 | // (Don't feel like you need to find some fundamental truth in these... |
3547 | // they were brute-force searched.) |
3548 | U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half. |
3549 | G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly. |
3550 | B = (b * 9 + 36) / 74; |
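    // Sanity check: r == 255 gives (255*9 + 36)/74 == 31 and g == 255 gives (255*21 + 42)/85 == 63,
    // so full 8-bit channels land exactly on full 565 channels.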
3551 | // Pack them back into 15|rrrrr gggggg bbbbb|0. |
3552 | store(ptr, tail, R << 11 |
3553 | | G << 5 |
3554 | | B << 0); |
3555 | } |
3556 | |
3557 | STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3558 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b); |
3559 | a = 255; |
3560 | } |
3561 | STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3562 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db); |
3563 | da = 255; |
3564 | } |
3565 | STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3566 | store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b); |
3567 | } |
3568 | STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) { |
3569 | const uint16_t* ptr; |
3570 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3571 | from_565(gather<U16>(ptr, ix), &r, &g, &b); |
3572 | a = 255; |
3573 | } |
3574 | |
3575 | SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) { |
3576 | // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0. |
3577 | U16 R = (rgba >> 12) & 15, |
3578 | G = (rgba >> 8) & 15, |
3579 | B = (rgba >> 4) & 15, |
3580 | A = (rgba >> 0) & 15; |
3581 | |
3582 | // Scale [0,15] to [0,255]. |
3583 | *r = (R << 4) | R; |
3584 | *g = (G << 4) | G; |
3585 | *b = (B << 4) | B; |
3586 | *a = (A << 4) | A; |
3587 | } |
3588 | SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { |
3589 | from_4444(load<U16>(ptr, tail), r,g,b,a); |
3590 | } |
3591 | SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { |
3592 | // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f). |
3593 | U16 R = (r + 8) / 17, |
3594 | G = (g + 8) / 17, |
3595 | B = (b + 8) / 17, |
3596 | A = (a + 8) / 17; |
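    // Sanity check: 17 == 255/15, so (255 + 8)/17 == 15 and (0 + 8)/17 == 0.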
3597 | // Pack them back into 15|rrrr gggg bbbb aaaa|0. |
3598 | store(ptr, tail, R << 12 |
3599 | | G << 8 |
3600 | | B << 4 |
3601 | | A << 0); |
3602 | } |
3603 | |
3604 | STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
3605 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a); |
3606 | } |
3607 | STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3608 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); |
3609 | } |
3610 | STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { |
3611 | store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a); |
3612 | } |
3613 | STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { |
3614 | const uint16_t* ptr; |
3615 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3616 | from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a); |
3617 | } |
3618 | |
3619 | SI void from_88(U16 rg, U16* r, U16* g) { |
3620 | *r = (rg & 0xFF); |
3621 | *g = (rg >> 8); |
3622 | } |
3623 | |
3624 | SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) { |
3625 | #if 1 && defined(JUMPER_IS_NEON) |
3626 | uint8x8x2_t rg; |
3627 | switch (tail & (N-1)) { |
3628 | case 0: rg = vld2_u8 ((const uint8_t*)(ptr+0) ); break; |
3629 | case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6); [[fallthrough]]; |
3630 | case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5); [[fallthrough]]; |
3631 | case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4); [[fallthrough]]; |
3632 | case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3); [[fallthrough]]; |
3633 | case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2); [[fallthrough]]; |
3634 | case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1); [[fallthrough]]; |
3635 | case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0); |
3636 | } |
3637 | *r = cast<U16>(rg.val[0]); |
3638 | *g = cast<U16>(rg.val[1]); |
3639 | #else |
3640 | from_88(load<U16>(ptr, tail), r,g); |
3641 | #endif |
3642 | } |
3643 | |
3644 | SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) { |
3645 | #if 1 && defined(JUMPER_IS_NEON) |
3646 | uint8x8x2_t rg = {{ |
3647 | cast<U8>(r), |
3648 | cast<U8>(g), |
3649 | }}; |
3650 | switch (tail & (N-1)) { |
3651 | case 0: vst2_u8 ((uint8_t*)(ptr+0), rg ); break; |
3652 | case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6); [[fallthrough]]; |
3653 | case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5); [[fallthrough]]; |
3654 | case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4); [[fallthrough]]; |
3655 | case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3); [[fallthrough]]; |
3656 | case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2); [[fallthrough]]; |
3657 | case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1); [[fallthrough]]; |
3658 | case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0); |
3659 | } |
3660 | #else |
3661 | store(ptr, tail, cast<U16>(r | (g<<8)) << 0); |
3662 | #endif |
3663 | } |
3664 | |
3665 | STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
3666 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g); |
3667 | b = 0; |
3668 | a = 255; |
3669 | } |
3670 | STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3671 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg); |
3672 | db = 0; |
3673 | da = 255; |
3674 | } |
3675 | STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { |
3676 | store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g); |
3677 | } |
3678 | STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { |
3679 | const uint16_t* ptr; |
3680 | U32 ix = ix_and_ptr(&ptr, ctx, x, y); |
3681 | from_88(gather<U16>(ptr, ix), &r, &g); |
3682 | b = 0; |
3683 | a = 255; |
3684 | } |
3685 | |
3686 | // ~~~~~~ 8-bit memory loads and stores ~~~~~~ // |
3687 | |
3688 | SI U16 load_8(const uint8_t* ptr, size_t tail) { |
3689 | return cast<U16>(load<U8>(ptr, tail)); |
3690 | } |
3691 | SI void store_8(uint8_t* ptr, size_t tail, U16 v) { |
3692 | store(ptr, tail, cast<U8>(v)); |
3693 | } |
3694 | |
3695 | STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
3696 | r = g = b = 0; |
3697 | a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3698 | } |
3699 | STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { |
3700 | dr = dg = db = 0; |
3701 | da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3702 | } |
3703 | STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { |
3704 | store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a); |
3705 | } |
3706 | STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { |
3707 | const uint8_t* ptr; |
3708 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
3709 | r = g = b = 0; |
3710 | a = cast<U16>(gather<U8>(ptr, ix)); |
3711 | } |
3712 | |
3713 | STAGE_PP(alpha_to_gray, Ctx::None) { |
3714 | r = g = b = a; |
3715 | a = 255; |
3716 | } |
3717 | STAGE_PP(alpha_to_gray_dst, Ctx::None) { |
3718 | dr = dg = db = da; |
3719 | da = 255; |
3720 | } |
3721 | STAGE_PP(bt709_luminance_or_luma_to_alpha, Ctx::None) { |
3722 | a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator. |
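    // (The weights sum to 256, so r == g == b == 255 maps to a == 255 exactly.)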
3723 | r = g = b = 0; |
3724 | } |
3725 | |
// ~~~~~~ Stashing / restoring src and dst registers ~~~~~~ //
3727 | |
3728 | STAGE_PP(load_src, const uint16_t* ptr) { |
3729 | r = sk_unaligned_load<U16>(ptr + 0*N); |
3730 | g = sk_unaligned_load<U16>(ptr + 1*N); |
3731 | b = sk_unaligned_load<U16>(ptr + 2*N); |
3732 | a = sk_unaligned_load<U16>(ptr + 3*N); |
3733 | } |
3734 | STAGE_PP(store_src, uint16_t* ptr) { |
3735 | sk_unaligned_store(ptr + 0*N, r); |
3736 | sk_unaligned_store(ptr + 1*N, g); |
3737 | sk_unaligned_store(ptr + 2*N, b); |
3738 | sk_unaligned_store(ptr + 3*N, a); |
3739 | } |
3740 | STAGE_PP(store_src_a, uint16_t* ptr) { |
3741 | sk_unaligned_store(ptr, a); |
3742 | } |
3743 | STAGE_PP(load_dst, const uint16_t* ptr) { |
3744 | dr = sk_unaligned_load<U16>(ptr + 0*N); |
3745 | dg = sk_unaligned_load<U16>(ptr + 1*N); |
3746 | db = sk_unaligned_load<U16>(ptr + 2*N); |
3747 | da = sk_unaligned_load<U16>(ptr + 3*N); |
3748 | } |
3749 | STAGE_PP(store_dst, uint16_t* ptr) { |
3750 | sk_unaligned_store(ptr + 0*N, dr); |
3751 | sk_unaligned_store(ptr + 1*N, dg); |
3752 | sk_unaligned_store(ptr + 2*N, db); |
3753 | sk_unaligned_store(ptr + 3*N, da); |
3754 | } |
3755 | |
3756 | // ~~~~~~ Coverage scales / lerps ~~~~~~ // |
3757 | |
3758 | STAGE_PP(scale_1_float, const float* f) { |
3759 | U16 c = from_float(*f); |
3760 | r = div255( r * c ); |
3761 | g = div255( g * c ); |
3762 | b = div255( b * c ); |
3763 | a = div255( a * c ); |
3764 | } |
3765 | STAGE_PP(lerp_1_float, const float* f) { |
3766 | U16 c = from_float(*f); |
3767 | r = lerp(dr, r, c); |
3768 | g = lerp(dg, g, c); |
3769 | b = lerp(db, b, c); |
3770 | a = lerp(da, a, c); |
3771 | } |
3772 | STAGE_PP(scale_native, const uint16_t scales[]) { |
3773 | auto c = sk_unaligned_load<U16>(scales); |
3774 | r = div255( r * c ); |
3775 | g = div255( g * c ); |
3776 | b = div255( b * c ); |
3777 | a = div255( a * c ); |
3778 | } |
3779 | |
3780 | STAGE_PP(lerp_native, const uint16_t scales[]) { |
3781 | auto c = sk_unaligned_load<U16>(scales); |
3782 | r = lerp(dr, r, c); |
3783 | g = lerp(dg, g, c); |
3784 | b = lerp(db, b, c); |
3785 | a = lerp(da, a, c); |
3786 | } |
3787 | |
3788 | STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
3789 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3790 | r = div255( r * c ); |
3791 | g = div255( g * c ); |
3792 | b = div255( b * c ); |
3793 | a = div255( a * c ); |
3794 | } |
3795 | STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { |
3796 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); |
3797 | r = lerp(dr, r, c); |
3798 | g = lerp(dg, g, c); |
3799 | b = lerp(db, b, c); |
3800 | a = lerp(da, a, c); |
3801 | } |
3802 | |
3803 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. |
3804 | SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) { |
3805 | return if_then_else(a < da, min(cr, min(cg,cb)) |
3806 | , max(cr, max(cg,cb))); |
3807 | } |
3808 | STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3809 | U16 cr,cg,cb; |
3810 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); |
3811 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
3812 | |
3813 | r = div255( r * cr ); |
3814 | g = div255( g * cg ); |
3815 | b = div255( b * cb ); |
3816 | a = div255( a * ca ); |
3817 | } |
3818 | STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { |
3819 | U16 cr,cg,cb; |
3820 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); |
3821 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); |
3822 | |
3823 | r = lerp(dr, r, cr); |
3824 | g = lerp(dg, g, cg); |
3825 | b = lerp(db, b, cb); |
3826 | a = lerp(da, a, ca); |
3827 | } |
3828 | |
3829 | STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) { |
3830 | U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail), |
3831 | add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail); |
3832 | |
3833 | r = min(div255(r*mul) + add, a); |
3834 | g = min(div255(g*mul) + add, a); |
3835 | b = min(div255(b*mul) + add, a); |
3836 | } |
3837 | |
3838 | |
3839 | // ~~~~~~ Gradient stages ~~~~~~ // |
3840 | |
3841 | // Clamp x to [0,1], both sides inclusive (think, gradients). |
3842 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. |
3843 | SI F clamp_01(F v) { return min(max(0, v), 1); } |
3844 | |
3845 | STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); } |
3846 | STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); } |
3847 | STAGE_GG(mirror_x_1, Ctx::None) { |
3848 | auto two = [](F x){ return x+x; }; |
3849 | x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f )); |
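    // e.g. x == 1.25 -> 0.75 and x == 2.5 -> 0.5: mirror tiling with period 2, clamped to [0,1].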
3850 | } |
3851 | |
3852 | SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); } |
3853 | |
3854 | STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { |
3855 | auto w = ctx->limit_x; |
3856 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w))); |
3857 | } |
3858 | STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { |
3859 | auto h = ctx->limit_y; |
3860 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h))); |
3861 | } |
3862 | STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { |
3863 | auto w = ctx->limit_x; |
3864 | auto h = ctx->limit_y; |
3865 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h))); |
3866 | } |
3867 | STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { |
3868 | auto mask = sk_unaligned_load<U16>(ctx->mask); |
3869 | r = r & mask; |
3870 | g = g & mask; |
3871 | b = b & mask; |
3872 | a = a & mask; |
3873 | } |
3874 | |
3875 | SI void round_F_to_U16(F R, F G, F B, F A, bool interpolatedInPremul, |
3876 | U16* r, U16* g, U16* b, U16* a) { |
3877 | auto round = [](F x) { return cast<U16>(x * 255.0f + 0.5f); }; |
3878 | |
3879 | F limit = interpolatedInPremul ? A |
3880 | : 1; |
3881 | *r = round(min(max(0,R), limit)); |
3882 | *g = round(min(max(0,G), limit)); |
3883 | *b = round(min(max(0,B), limit)); |
3884 | *a = round(A); // we assume alpha is already in [0,1]. |
3885 | } |
3886 | |
3887 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, |
3888 | U16* r, U16* g, U16* b, U16* a) { |
3889 | |
3890 | F fr, fg, fb, fa, br, bg, bb, ba; |
3891 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) |
    if (c->stopCount <= 8) {
3893 | __m256i lo, hi; |
3894 | split(idx, &lo, &hi); |
3895 | |
3896 | fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo), |
3897 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi)); |
3898 | br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo), |
3899 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi)); |
3900 | fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo), |
3901 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi)); |
3902 | bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo), |
3903 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi)); |
3904 | fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo), |
3905 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi)); |
3906 | bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo), |
3907 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi)); |
3908 | fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo), |
3909 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi)); |
3910 | ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo), |
3911 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi)); |
3912 | } else |
3913 | #endif |
3914 | { |
3915 | fr = gather<F>(c->fs[0], idx); |
3916 | fg = gather<F>(c->fs[1], idx); |
3917 | fb = gather<F>(c->fs[2], idx); |
3918 | fa = gather<F>(c->fs[3], idx); |
3919 | br = gather<F>(c->bs[0], idx); |
3920 | bg = gather<F>(c->bs[1], idx); |
3921 | bb = gather<F>(c->bs[2], idx); |
3922 | ba = gather<F>(c->bs[3], idx); |
3923 | } |
3924 | round_F_to_U16(mad(t, fr, br), |
3925 | mad(t, fg, bg), |
3926 | mad(t, fb, bb), |
3927 | mad(t, fa, ba), |
3928 | c->interpolatedInPremul, |
3929 | r,g,b,a); |
3930 | } |
3931 | |
3932 | STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) { |
3933 | auto t = x; |
3934 | U32 idx = 0; |
3935 | |
3936 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
3937 | for (size_t i = 1; i < c->stopCount; i++) { |
3938 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); |
3939 | } |
3940 | |
3941 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
3942 | } |
3943 | |
3944 | STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { |
3945 | auto t = x; |
3946 | auto idx = trunc_(t * (c->stopCount-1)); |
3947 | gradient_lookup(c, idx, t, &r, &g, &b, &a); |
3948 | } |
3949 | |
3950 | STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) { |
3951 | auto t = x; |
3952 | round_F_to_U16(mad(t, c->f[0], c->b[0]), |
3953 | mad(t, c->f[1], c->b[1]), |
3954 | mad(t, c->f[2], c->b[2]), |
3955 | mad(t, c->f[3], c->b[3]), |
3956 | c->interpolatedInPremul, |
3957 | &r,&g,&b,&a); |
3958 | } |
3959 | |
3960 | STAGE_GG(xy_to_unit_angle, Ctx::None) { |
3961 | F xabs = abs_(x), |
3962 | yabs = abs_(y); |
3963 | |
3964 | F slope = min(xabs, yabs)/max(xabs, yabs); |
3965 | F s = slope * slope; |
3966 | |
3967 | // Use a 7th degree polynomial to approximate atan. |
3968 | // This was generated using sollya.gforge.inria.fr. |
3969 | // A float optimized polynomial was generated using the following command. |
3970 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); |
3971 | F phi = slope |
3972 | * (0.15912117063999176025390625f + s |
3973 | * (-5.185396969318389892578125e-2f + s |
3974 | * (2.476101927459239959716796875e-2f + s |
3975 | * (-7.0547382347285747528076171875e-3f)))); |
3976 | |
3977 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); |
3978 | phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi); |
3979 | phi = if_then_else(y < 0.0f , 1.0f - phi , phi); |
3980 | phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. |
3981 | x = phi; |
3982 | } |
3983 | STAGE_GG(xy_to_radius, Ctx::None) { |
3984 | x = sqrt_(x*x + y*y); |
3985 | } |
3986 | |
3987 | // ~~~~~~ Compound stages ~~~~~~ // |
3988 | |
3989 | STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { |
3990 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); |
3991 | |
3992 | load_8888_(ptr, tail, &dr,&dg,&db,&da); |
3993 | r = r + div255( dr*inv(a) ); |
3994 | g = g + div255( dg*inv(a) ); |
3995 | b = b + div255( db*inv(a) ); |
3996 | a = a + div255( da*inv(a) ); |
3997 | store_8888_(ptr, tail, r,g,b,a); |
3998 | } |
3999 | |
4000 | #if defined(SK_DISABLE_LOWP_BILERP_CLAMP_CLAMP_STAGE) |
4001 | static void(*bilerp_clamp_8888)(void) = nullptr; |
4002 | static void(*bilinear)(void) = nullptr; |
4003 | #else |
4004 | STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { |
4005 | // (cx,cy) are the center of our sample. |
4006 | F cx = x, |
4007 | cy = y; |
4008 | |
4009 | // All sample points are at the same fractional offset (fx,fy). |
4010 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. |
4011 | F fx = fract(cx + 0.5f), |
4012 | fy = fract(cy + 0.5f); |
4013 | |
4014 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. |
4015 | r = g = b = a = 0; |
4016 | |
4017 | // The first three sample points will calculate their area using math |
4018 | // just like in the float code above, but the fourth will take up all the rest. |
4019 | // |
4020 | // Logically this is the same as doing the math for the fourth pixel too, |
4021 | // but rounding error makes this a better strategy, keeping opaque opaque, etc. |
4022 | // |
4023 | // We can keep up to 8 bits of fractional precision without overflowing 16-bit, |
4024 | // so our "1.0" area is 256. |
4025 | const uint16_t bias = 256; |
4026 | U16 remaining = bias; |
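    // e.g. fx == fy == 0.25 gives the first three corners areas 144, 48, 48,
    // and the last corner picks up remaining == 16 (0.0625 * 256).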
4027 | |
4028 | for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f) |
4029 | for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) { |
4030 | // (x,y) are the coordinates of this sample point. |
4031 | F x = cx + dx, |
4032 | y = cy + dy; |
4033 | |
4034 | // ix_and_ptr() will clamp to the image's bounds for us. |
4035 | const uint32_t* ptr; |
4036 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
4037 | |
4038 | U16 sr,sg,sb,sa; |
4039 | from_8888(gather<U32>(ptr, ix), &sr,&sg,&sb,&sa); |
4040 | |
4041 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center |
4042 | // are combined in direct proportion to their area overlapping that logical query pixel. |
4043 | // At positive offsets, the x-axis contribution to that rectangle is fx, |
4044 | // or (1-fx) at negative x. Same deal for y. |
4045 | F sx = (dx > 0) ? fx : 1.0f - fx, |
4046 | sy = (dy > 0) ? fy : 1.0f - fy; |
4047 | |
4048 | U16 area = (dy == 0.5f && dx == 0.5f) ? remaining |
4049 | : cast<U16>(sx * sy * bias); |
4050 | for (size_t i = 0; i < N; i++) { |
4051 | SkASSERT(remaining[i] >= area[i]); |
4052 | } |
4053 | remaining -= area; |
4054 | |
4055 | r += sr * area; |
4056 | g += sg * area; |
4057 | b += sb * area; |
4058 | a += sa * area; |
4059 | } |
4060 | |
4061 | r = (r + bias/2) / bias; |
4062 | g = (g + bias/2) / bias; |
4063 | b = (b + bias/2) / bias; |
4064 | a = (a + bias/2) / bias; |
4065 | } |
4066 | |
4067 | // TODO: lowp::tile() is identical to the highp tile()... share? |
4068 | SI F tile(F v, SkTileMode mode, float limit, float invLimit) { |
    // ix_and_ptr() will clamp the output of tile() afterwards, so we need not clamp here.
4070 | switch (mode) { |
4071 | case SkTileMode::kDecal: // TODO, for now fallthrough to clamp |
4072 | case SkTileMode::kClamp: return v; |
4073 | case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; |
4074 | case SkTileMode::kMirror: |
4075 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); |
4076 | } |
4077 | SkUNREACHABLE; |
4078 | } |
4079 | |
4080 | SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, |
4081 | U16* r, U16* g, U16* b, U16* a) { |
4082 | x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); |
4083 | y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); |
4084 | |
4085 | switch (ctx->ct) { |
4086 | default: *r = *g = *b = *a = 0; // TODO |
4087 | break; |
4088 | |
4089 | case kRGBA_8888_SkColorType: |
4090 | case kBGRA_8888_SkColorType: { |
4091 | const uint32_t* ptr; |
4092 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); |
4093 | from_8888(gather<U32>(ptr, ix), r,g,b,a); |
4094 | if (ctx->ct == kBGRA_8888_SkColorType) { |
4095 | std::swap(*r,*b); |
4096 | } |
4097 | } break; |
4098 | } |
4099 | } |
4100 | |
4101 | template <int D> |
4102 | SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, |
4103 | F cx, F cy, const F (&wx)[D], const F (&wy)[D], |
4104 | U16* r, U16* g, U16* b, U16* a) { |
4105 | |
4106 | float start = -0.5f*(D-1); |
4107 | |
4108 | const uint16_t bias = 256; |
4109 | U16 remaining = bias; |
4110 | |
4111 | *r = *g = *b = *a = 0; |
4112 | F y = cy + start; |
4113 | for (int j = 0; j < D; j++, y += 1.0f) { |
4114 | F x = cx + start; |
4115 | for (int i = 0; i < D; i++, x += 1.0f) { |
4116 | U16 R,G,B,A; |
4117 | sample(ctx, x,y, &R,&G,&B,&A); |
4118 | |
4119 | U16 w = (i == D-1 && j == D-1) ? remaining |
4120 | : cast<U16>(wx[i]*wy[j]*bias); |
4121 | remaining -= w; |
4122 | *r += w*R; |
4123 | *g += w*G; |
4124 | *b += w*B; |
4125 | *a += w*A; |
4126 | } |
4127 | } |
4128 | *r = (*r + bias/2) / bias; |
4129 | *g = (*g + bias/2) / bias; |
4130 | *b = (*b + bias/2) / bias; |
4131 | *a = (*a + bias/2) / bias; |
4132 | } |
4133 | |
4134 | STAGE_GP(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { |
4135 | F fx = fract(x + 0.5f), |
4136 | fy = fract(y + 0.5f); |
4137 | const F wx[] = {1.0f - fx, fx}; |
4138 | const F wy[] = {1.0f - fy, fy}; |
4139 | |
4140 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); |
4141 | } |
4142 | #endif |
4143 | |
4144 | // ~~~~~~ GrSwizzle stage ~~~~~~ // |
4145 | |
4146 | STAGE_PP(swizzle, void* ctx) { |
4147 | auto ir = r, ig = g, ib = b, ia = a; |
4148 | U16* o[] = {&r, &g, &b, &a}; |
4149 | char swiz[4]; |
4150 | memcpy(swiz, &ctx, sizeof(swiz)); |
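    // Note the four swizzle characters live in the bytes of the context pointer value itself,
    // which is why we memcpy from &ctx rather than dereference it.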
4151 | |
4152 | for (int i = 0; i < 4; ++i) { |
4153 | switch (swiz[i]) { |
4154 | case 'r': *o[i] = ir; break; |
4155 | case 'g': *o[i] = ig; break; |
4156 | case 'b': *o[i] = ib; break; |
4157 | case 'a': *o[i] = ia; break; |
4158 | case '0': *o[i] = U16(0); break; |
4159 | case '1': *o[i] = U16(255); break; |
4160 | default: break; |
4161 | } |
4162 | } |
4163 | } |
4164 | |
4165 | // Now we'll add null stand-ins for stages we haven't implemented in lowp. |
// If a pipeline uses these stages, it'll be booted out of lowp into highp.
4167 | #define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr; |
4168 | NOT_IMPLEMENTED(callback) |
4169 | NOT_IMPLEMENTED(interpreter) |
4170 | NOT_IMPLEMENTED(unbounded_set_rgb) |
4171 | NOT_IMPLEMENTED(unbounded_uniform_color) |
4172 | NOT_IMPLEMENTED(unpremul) |
4173 | NOT_IMPLEMENTED(dither) // TODO |
4174 | NOT_IMPLEMENTED(load_16161616) |
4175 | NOT_IMPLEMENTED(load_16161616_dst) |
4176 | NOT_IMPLEMENTED(store_16161616) |
4177 | NOT_IMPLEMENTED(gather_16161616) |
4178 | NOT_IMPLEMENTED(load_a16) |
4179 | NOT_IMPLEMENTED(load_a16_dst) |
4180 | NOT_IMPLEMENTED(store_a16) |
4181 | NOT_IMPLEMENTED(gather_a16) |
4182 | NOT_IMPLEMENTED(load_rg1616) |
4183 | NOT_IMPLEMENTED(load_rg1616_dst) |
4184 | NOT_IMPLEMENTED(store_rg1616) |
4185 | NOT_IMPLEMENTED(gather_rg1616) |
4186 | NOT_IMPLEMENTED(load_f16) |
4187 | NOT_IMPLEMENTED(load_f16_dst) |
4188 | NOT_IMPLEMENTED(store_f16) |
4189 | NOT_IMPLEMENTED(gather_f16) |
4190 | NOT_IMPLEMENTED(load_af16) |
4191 | NOT_IMPLEMENTED(load_af16_dst) |
4192 | NOT_IMPLEMENTED(store_af16) |
4193 | NOT_IMPLEMENTED(gather_af16) |
4194 | NOT_IMPLEMENTED(load_rgf16) |
4195 | NOT_IMPLEMENTED(load_rgf16_dst) |
4196 | NOT_IMPLEMENTED(store_rgf16) |
4197 | NOT_IMPLEMENTED(gather_rgf16) |
4198 | NOT_IMPLEMENTED(load_f32) |
4199 | NOT_IMPLEMENTED(load_f32_dst) |
4200 | NOT_IMPLEMENTED(store_f32) |
4201 | NOT_IMPLEMENTED(gather_f32) |
4202 | NOT_IMPLEMENTED(load_rgf32) |
4203 | NOT_IMPLEMENTED(store_rgf32) |
4204 | NOT_IMPLEMENTED(load_1010102) |
4205 | NOT_IMPLEMENTED(load_1010102_dst) |
4206 | NOT_IMPLEMENTED(store_1010102) |
4207 | NOT_IMPLEMENTED(gather_1010102) |
4208 | NOT_IMPLEMENTED(store_u16_be) |
4209 | NOT_IMPLEMENTED(byte_tables) // TODO |
4210 | NOT_IMPLEMENTED(colorburn) |
4211 | NOT_IMPLEMENTED(colordodge) |
4212 | NOT_IMPLEMENTED(softlight) |
4213 | NOT_IMPLEMENTED(hue) |
4214 | NOT_IMPLEMENTED(saturation) |
4215 | NOT_IMPLEMENTED(color) |
4216 | NOT_IMPLEMENTED(luminosity) |
4217 | NOT_IMPLEMENTED(matrix_3x3) |
4218 | NOT_IMPLEMENTED(matrix_3x4) |
4219 | NOT_IMPLEMENTED(matrix_4x5) // TODO |
4220 | NOT_IMPLEMENTED(matrix_4x3) // TODO |
4221 | NOT_IMPLEMENTED(parametric) |
4222 | NOT_IMPLEMENTED(gamma_) |
4223 | NOT_IMPLEMENTED(PQish) |
4224 | NOT_IMPLEMENTED(HLGish) |
4225 | NOT_IMPLEMENTED(HLGinvish) |
4226 | NOT_IMPLEMENTED(rgb_to_hsl) |
4227 | NOT_IMPLEMENTED(hsl_to_rgb) |
4228 | NOT_IMPLEMENTED(gauss_a_to_rgba) // TODO |
4229 | NOT_IMPLEMENTED(mirror_x) // TODO |
4230 | NOT_IMPLEMENTED(repeat_x) // TODO |
4231 | NOT_IMPLEMENTED(mirror_y) // TODO |
4232 | NOT_IMPLEMENTED(repeat_y) // TODO |
4233 | NOT_IMPLEMENTED(negate_x) |
4234 | NOT_IMPLEMENTED(bicubic) // TODO if I can figure out negative weights |
4235 | NOT_IMPLEMENTED(bicubic_clamp_8888) |
4236 | NOT_IMPLEMENTED(bilinear_nx) // TODO |
4237 | NOT_IMPLEMENTED(bilinear_ny) // TODO |
4238 | NOT_IMPLEMENTED(bilinear_px) // TODO |
4239 | NOT_IMPLEMENTED(bilinear_py) // TODO |
4240 | NOT_IMPLEMENTED(bicubic_n3x) // TODO |
4241 | NOT_IMPLEMENTED(bicubic_n1x) // TODO |
4242 | NOT_IMPLEMENTED(bicubic_p1x) // TODO |
4243 | NOT_IMPLEMENTED(bicubic_p3x) // TODO |
4244 | NOT_IMPLEMENTED(bicubic_n3y) // TODO |
4245 | NOT_IMPLEMENTED(bicubic_n1y) // TODO |
4246 | NOT_IMPLEMENTED(bicubic_p1y) // TODO |
4247 | NOT_IMPLEMENTED(bicubic_p3y) // TODO |
4248 | NOT_IMPLEMENTED(save_xy) // TODO |
4249 | NOT_IMPLEMENTED(accumulate) // TODO |
4250 | NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved) |
4251 | NOT_IMPLEMENTED(xy_to_2pt_conical_strip) |
4252 | NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle) |
4253 | NOT_IMPLEMENTED(xy_to_2pt_conical_smaller) |
4254 | NOT_IMPLEMENTED(xy_to_2pt_conical_greater) |
4255 | NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal) |
4256 | NOT_IMPLEMENTED(alter_2pt_conical_unswap) |
4257 | NOT_IMPLEMENTED(mask_2pt_conical_nan) |
4258 | NOT_IMPLEMENTED(mask_2pt_conical_degenerates) |
4259 | NOT_IMPLEMENTED(apply_vector_mask) |
4260 | #undef NOT_IMPLEMENTED |
4261 | |
4262 | #endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages |
4263 | } // namespace lowp |
4264 | |
4265 | } // namespace SK_OPTS_NS |
4266 | |
4267 | #endif//SkRasterPipeline_opts_DEFINED |
4268 | |