/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkRasterPipeline_opts_DEFINED
#define SkRasterPipeline_opts_DEFINED

#include "include/core/SkTypes.h"
#include "src/core/SkUtils.h"  // unaligned_{load,store}
#include "src/sksl/SkSLByteCode.h"

// Every function in this file should be marked static and inline using SI.
#if defined(__clang__)
    #define SI __attribute__((always_inline)) static inline
#else
    #define SI static inline
#endif

template <typename Dst, typename Src>
SI Dst bit_cast(const Src& src) {
    static_assert(sizeof(Dst) == sizeof(Src), "");
    return sk_unaligned_load<Dst>(&src);
}

template <typename Dst, typename Src>
SI Dst widen_cast(const Src& src) {
    static_assert(sizeof(Dst) > sizeof(Src), "");
    Dst dst;
    memcpy(&dst, &src, sizeof(Src));
    return dst;
}
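// A couple of concrete examples (illustrative values only):
//   bit_cast<uint32_t>(1.0f) == 0x3f800000, reinterpreting the bits unchanged;
//   widen_cast<__m128i>() of an 8-byte vector copies those 8 bytes into the low
//   half of the register and leaves the upper bytes unspecified.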

// Our program is an array of void*, either
//   - 1 void* per stage with no context pointer, the next stage;
//   - 2 void* per stage with a context pointer, first the context pointer, then the next stage.
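//
// For example (purely illustrative; real programs are assembled by the pipeline itself),
// a program whose first stage takes a context might be laid out as
//   void* program[] = { (void*)stage_with_ctx, (void*)its_ctx,
//                       (void*)stage_without_ctx,
//                       (void*)just_return };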

// load_and_inc() steps the program forward by 1 void*, returning that pointer.
SI void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
    // If program is in %rsi (we try to make this likely) then this is a single instruction.
    void* rax;
    asm( "lodsq": "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
    return rax;
#else
    // On ARM *program++ compiles into pretty ideal code without any handholding.
    return *program++;
#endif
}

// Lazily resolved on first cast.  Does nothing if cast to Ctx::None.
struct Ctx {
    struct None {};

    void*   ptr;
    void**& program;

    explicit Ctx(void**& p) : ptr(nullptr), program(p) {}

    template <typename T>
    operator T*() {
        if (!ptr) { ptr = load_and_inc(program); }
        return (T*)ptr;
    }
    operator None() { return None{}; }
};
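// The STAGE() machinery below passes Ctx{program} as each stage's context argument:
// a stage declared to take a pointer type (e.g. const float*) triggers operator T*()
// and pulls its context out of the program array, while a stage declared to take
// Ctx::None converts via operator None() and loads nothing.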


#if !defined(__clang__)
    #define JUMPER_IS_SCALAR
#elif defined(SK_ARM_HAS_NEON)
    #define JUMPER_IS_NEON
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX512
    #define JUMPER_IS_AVX512
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    #define JUMPER_IS_HSW
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
    #define JUMPER_IS_AVX
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    #define JUMPER_IS_SSE41
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #define JUMPER_IS_SSE2
#else
    #define JUMPER_IS_SCALAR
#endif

// Older Clangs seem to crash when generating non-optimized NEON code for ARMv7.
#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32)
    // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative.
    #if defined(__apple_build_version__) && __clang_major__ < 9
        #define JUMPER_IS_SCALAR
    #elif __clang_major__ < 5
        #define JUMPER_IS_SCALAR
    #endif

    #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR)
        #undef  JUMPER_IS_NEON
    #endif
#endif

#if defined(JUMPER_IS_SCALAR)
    #include <math.h>
#elif defined(JUMPER_IS_NEON)
    #include <arm_neon.h>
#else
    #include <immintrin.h>
#endif

namespace SK_OPTS_NS {

#if defined(JUMPER_IS_SCALAR)
    // This path should lead to portable scalar code.
    using F   = float   ;
    using I32 =  int32_t;
    using U64 = uint64_t;
    using U32 = uint32_t;
    using U16 = uint16_t;
    using U8  = uint8_t ;

    SI F   mad(F f, F m, F a)   { return f*m+a; }
    SI F   min(F a, F b)        { return fminf(a,b); }
    SI F   max(F a, F b)        { return fmaxf(a,b); }
    SI F   abs_  (F v)          { return fabsf(v); }
    SI F   floor_(F v)          { return floorf(v); }
    SI F   rcp   (F v)          { return 1.0f / v; }
    SI F   rsqrt (F v)          { return 1.0f / sqrtf(v); }
    SI F    sqrt_(F v)          { return sqrtf(v); }
    SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); }
    SI U16 pack(U32 v)          { return (U16)v; }
    SI U8  pack(U16 v)          { return  (U8)v; }

    SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }

    template <typename T>
    SI T gather(const T* p, U32 ix) { return p[ix]; }

    SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
        *r = ptr[0];
        *g = ptr[1];
    }
    SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
        ptr[0] = r;
        ptr[1] = g;
    }
    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
        *a = ptr[3];
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        ptr[0] = r;
        ptr[1] = g;
        ptr[2] = b;
        ptr[3] = a;
    }

    SI void load2(const float* ptr, size_t tail, F* r, F* g) {
        *r = ptr[0];
        *g = ptr[1];
    }
    SI void store2(float* ptr, size_t tail, F r, F g) {
        ptr[0] = r;
        ptr[1] = g;
    }
    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        *r = ptr[0];
        *g = ptr[1];
        *b = ptr[2];
        *a = ptr[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        ptr[0] = r;
        ptr[1] = g;
        ptr[2] = b;
        ptr[3] = a;
    }

#elif defined(JUMPER_IS_NEON)
    // Since we know we're using Clang, we can use its vector extensions.
    template <typename T> using V = T __attribute__((ext_vector_type(4)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
    SI F   min(F a, F b)                         { return vminq_f32(a,b);          }
    SI F   max(F a, F b)                         { return vmaxq_f32(a,b);          }
    SI F   abs_  (F v)                           { return vabsq_f32(v);            }
    SI F   rcp   (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
    SI F   rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
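    // Each vrecpsq/vrsqrtsq application above is one Newton-Raphson refinement step:
    // vrecpsq_f32(v,e) computes 2 - v*e, so e*(2 - v*e) moves the estimate toward 1/v, and
    // vrsqrtsq_f32(v,e*e) computes (3 - v*e*e)/2, refining e toward 1/sqrt(v).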
    SI U16 pack(U32 v)                           { return __builtin_convertvector(v, U16); }
    SI U8  pack(U16 v)                           { return __builtin_convertvector(v,  U8); }

    SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }

    #if defined(SK_CPU_ARM64)
        SI F     mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
        SI F  floor_(F v) { return vrndmq_f32(v); }
        SI F   sqrt_(F v) { return vsqrtq_f32(v); }
        SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
    #else
        SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
        SI F floor_(F v) {
            F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
            return roundtrip - if_then_else(roundtrip > v, 1, 0);
        }

        SI F sqrt_(F v) {
            auto e = vrsqrteq_f32(v);  // Estimate and two refinement steps for e = rsqrt(v).
            e *= vrsqrtsq_f32(v,e*e);
            e *= vrsqrtsq_f32(v,e*e);
            return v*e;                // sqrt(v) == v*rsqrt(v).
        }

        SI U32 round(F v, F scale) {
            return vcvtq_u32_f32(mad(v,scale,0.5f));
        }
    #endif


    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
    }
    SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
        uint16x4x2_t rg;
        if (__builtin_expect(tail,0)) {
            if (  true  ) { rg = vld2_lane_u16(ptr + 0, rg, 0); }
            if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
            if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
        } else {
            rg = vld2_u16(ptr);
        }
        *r = rg.val[0];
        *g = rg.val[1];
    }
    SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
        if (__builtin_expect(tail,0)) {
            if (  true  ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); }
            if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); }
            if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); }
        } else {
            vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
        }
    }
    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        uint16x4x3_t rgb;
        if (__builtin_expect(tail,0)) {
            if (  true  ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); }
            if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
            if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
        } else {
            rgb = vld3_u16(ptr);
        }
        *r = rgb.val[0];
        *g = rgb.val[1];
        *b = rgb.val[2];
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        uint16x4x4_t rgba;
        if (__builtin_expect(tail,0)) {
            if (  true  ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); }
            if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
            if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
        } else {
            rgba = vld4_u16(ptr);
        }
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }

    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        if (__builtin_expect(tail,0)) {
            if (  true  ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); }
            if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
            if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
        } else {
            vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
        }
    }
    SI void load2(const float* ptr, size_t tail, F* r, F* g) {
        float32x4x2_t rg;
        if (__builtin_expect(tail,0)) {
            if (  true  ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); }
            if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); }
            if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); }
        } else {
            rg = vld2q_f32(ptr);
        }
        *r = rg.val[0];
        *g = rg.val[1];
    }
    SI void store2(float* ptr, size_t tail, F r, F g) {
        if (__builtin_expect(tail,0)) {
            if (  true  ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); }
            if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); }
            if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); }
        } else {
            vst2q_f32(ptr, (float32x4x2_t{{r,g}}));
        }
    }
    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        float32x4x4_t rgba;
        if (__builtin_expect(tail,0)) {
            if (  true  ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); }
            if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
            if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
        } else {
            rgba = vld4q_f32(ptr);
        }
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        if (__builtin_expect(tail,0)) {
            if (  true  ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); }
            if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
            if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
        } else {
            vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
        }
    }

#elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
    // These are __m256 and __m256i, but friendlier and strongly-typed.
    template <typename T> using V = T __attribute__((ext_vector_type(8)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F mad(F f, F m, F a)  {
    #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
        return _mm256_fmadd_ps(f,m,a);
    #else
        return f*m+a;
    #endif
    }

    SI F   min(F a, F b)        { return _mm256_min_ps(a,b);    }
    SI F   max(F a, F b)        { return _mm256_max_ps(a,b);    }
    SI F   abs_  (F v)          { return _mm256_and_ps(v, 0-v); }
    SI F   floor_(F v)          { return _mm256_floor_ps(v);    }
    SI F   rcp   (F v)          { return _mm256_rcp_ps  (v);    }
    SI F   rsqrt (F v)          { return _mm256_rsqrt_ps(v);    }
    SI F    sqrt_(F v)          { return _mm256_sqrt_ps (v);    }
    SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }

    SI U16 pack(U32 v) {
        return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
                                _mm256_extractf128_si256(v, 1));
    }
    SI U8 pack(U16 v) {
        auto r = _mm_packus_epi16(v,v);
        return sk_unaligned_load<U8>(&r);
    }

    SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
                 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
    }
    #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
        SI F   gather(const float*    p, U32 ix) { return _mm256_i32gather_ps   (p, ix, 4); }
        SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
        SI U64 gather(const uint64_t* p, U32 ix) {
            __m256i parts[] = {
                _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8),
                _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8),
            };
            return bit_cast<U64>(parts);
        }
    #endif

    SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
        U16 _0123, _4567;
        if (__builtin_expect(tail,0)) {
            _0123 = _4567 = _mm_setzero_si128();
            auto* d = &_0123;
            if (tail > 3) {
                *d = _mm_loadu_si128(((__m128i*)ptr) + 0);
                tail -= 4;
                ptr += 8;
                d = &_4567;
            }
            bool high = false;
            if (tail > 1) {
                *d = _mm_loadu_si64(ptr);
                tail -= 2;
                ptr += 4;
                high = true;
            }
            if (tail > 0) {
                (*d)[high ? 4 : 0] = *(ptr + 0);
                (*d)[high ? 5 : 1] = *(ptr + 1);
            }
        } else {
            _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0);
            _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1);
        }
        *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16),
                             _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16));
        *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16),
                             _mm_srai_epi32(_4567, 16));
    }
    SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
        auto _0123 = _mm_unpacklo_epi16(r, g),
             _4567 = _mm_unpackhi_epi16(r, g);
        if (__builtin_expect(tail,0)) {
            const auto* s = &_0123;
            if (tail > 3) {
                _mm_storeu_si128((__m128i*)ptr, *s);
                s = &_4567;
                tail -= 4;
                ptr += 8;
            }
            bool high = false;
            if (tail > 1) {
                _mm_storel_epi64((__m128i*)ptr, *s);
                ptr += 4;
                tail -= 2;
                high = true;
            }
            if (tail > 0) {
                if (high) {
                    *(int32_t*)ptr = _mm_extract_epi32(*s, 2);
                } else {
                    *(int32_t*)ptr = _mm_cvtsi128_si32(*s);
                }
            }
        } else {
            _mm_storeu_si128((__m128i*)ptr + 0, _0123);
            _mm_storeu_si128((__m128i*)ptr + 1, _4567);
        }
    }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        __m128i _0,_1,_2,_3,_4,_5,_6,_7;
        if (__builtin_expect(tail,0)) {
            auto load_rgb = [](const uint16_t* src) {
                auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
                return _mm_insert_epi16(v, src[2], 2);
            };
            _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128();
            if (  true  ) { _0 = load_rgb(ptr +  0); }
            if (tail > 1) { _1 = load_rgb(ptr +  3); }
            if (tail > 2) { _2 = load_rgb(ptr +  6); }
            if (tail > 3) { _3 = load_rgb(ptr +  9); }
            if (tail > 4) { _4 = load_rgb(ptr + 12); }
            if (tail > 5) { _5 = load_rgb(ptr + 15); }
            if (tail > 6) { _6 = load_rgb(ptr + 18); }
        } else {
            // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over.
            auto _01 =                _mm_loadu_si128((const __m128i*)(ptr +  0))    ;
            auto _23 =                _mm_loadu_si128((const __m128i*)(ptr +  6))    ;
            auto _45 =                _mm_loadu_si128((const __m128i*)(ptr + 12))    ;
            auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4);
            _0 = _01; _1 = _mm_srli_si128(_01, 6);
            _2 = _23; _3 = _mm_srli_si128(_23, 6);
            _4 = _45; _5 = _mm_srli_si128(_45, 6);
            _6 = _67; _7 = _mm_srli_si128(_67, 6);
        }

        auto _02 = _mm_unpacklo_epi16(_0, _2),  // r0 r2 g0 g2 b0 b2 xx xx
             _13 = _mm_unpacklo_epi16(_1, _3),
             _46 = _mm_unpacklo_epi16(_4, _6),
             _57 = _mm_unpacklo_epi16(_5, _7);

        auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             bx0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 xx xx xx xx
             rg4567 = _mm_unpacklo_epi16(_46, _57),
             bx4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(bx0123, bx4567);
    }
    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        __m128i _01, _23, _45, _67;
        if (__builtin_expect(tail,0)) {
            auto src = (const double*)ptr;
            _01 = _23 = _45 = _67 = _mm_setzero_si128();
            if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
            if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
            if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
            if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
            if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
            if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
            if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
        } else {
            _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
            _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
            _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
            _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
        }

        auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
             _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
             _46 = _mm_unpacklo_epi16(_45, _67),
             _57 = _mm_unpackhi_epi16(_45, _67);

        auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
             rg4567 = _mm_unpacklo_epi16(_46, _57),
             ba4567 = _mm_unpackhi_epi16(_46, _57);

        *r = _mm_unpacklo_epi64(rg0123, rg4567);
        *g = _mm_unpackhi_epi64(rg0123, rg4567);
        *b = _mm_unpacklo_epi64(ba0123, ba4567);
        *a = _mm_unpackhi_epi64(ba0123, ba4567);
    }
    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        auto rg0123 = _mm_unpacklo_epi16(r, g),  // r0 g0 r1 g1 r2 g2 r3 g3
             rg4567 = _mm_unpackhi_epi16(r, g),  // r4 g4 r5 g5 r6 g6 r7 g7
             ba0123 = _mm_unpacklo_epi16(b, a),
             ba4567 = _mm_unpackhi_epi16(b, a);

        auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
             _23 = _mm_unpackhi_epi32(rg0123, ba0123),
             _45 = _mm_unpacklo_epi32(rg4567, ba4567),
             _67 = _mm_unpackhi_epi32(rg4567, ba4567);

        if (__builtin_expect(tail,0)) {
            auto dst = (double*)ptr;
            if (tail > 0) { _mm_storel_pd(dst+0, _01); }
            if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
            if (tail > 2) { _mm_storel_pd(dst+2, _23); }
            if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
            if (tail > 4) { _mm_storel_pd(dst+4, _45); }
            if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
            if (tail > 6) { _mm_storel_pd(dst+6, _67); }
        } else {
            _mm_storeu_si128((__m128i*)ptr + 0, _01);
            _mm_storeu_si128((__m128i*)ptr + 1, _23);
            _mm_storeu_si128((__m128i*)ptr + 2, _45);
            _mm_storeu_si128((__m128i*)ptr + 3, _67);
        }
    }

    SI void load2(const float* ptr, size_t tail, F* r, F* g) {
        F _0123, _4567;
        if (__builtin_expect(tail, 0)) {
            _0123 = _4567 = _mm256_setzero_ps();
            F* d = &_0123;
            if (tail > 3) {
                *d = _mm256_loadu_ps(ptr);
                ptr += 8;
                tail -= 4;
                d = &_4567;
            }
            bool high = false;
            if (tail > 1) {
                *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr));
                ptr += 4;
                tail -= 2;
                high = true;
            }
            if (tail > 0) {
                *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1)
                          : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0);
            }
        } else {
            _0123 = _mm256_loadu_ps(ptr + 0);
            _4567 = _mm256_loadu_ps(ptr + 8);
        }

        F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20),
          _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31);

        *r = _mm256_shuffle_ps(_0145, _2367, 0x88);
        *g = _mm256_shuffle_ps(_0145, _2367, 0xDD);
    }
    SI void store2(float* ptr, size_t tail, F r, F g) {
        F _0145 = _mm256_unpacklo_ps(r, g),
          _2367 = _mm256_unpackhi_ps(r, g);
        F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20),
          _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31);

        if (__builtin_expect(tail, 0)) {
            const __m256* s = &_0123;
            if (tail > 3) {
                _mm256_storeu_ps(ptr, *s);
                s = &_4567;
                tail -= 4;
                ptr += 8;
            }
            bool high = false;
            if (tail > 1) {
                _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0));
                ptr += 4;
                tail -= 2;
                high = true;
            }
            if (tail > 0) {
                *(ptr + 0) = (*s)[ high ? 4 : 0];
                *(ptr + 1) = (*s)[ high ? 5 : 1];
            }
        } else {
            _mm256_storeu_ps(ptr + 0, _0123);
            _mm256_storeu_ps(ptr + 8, _4567);
        }
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        F _04, _15, _26, _37;
        _04 = _15 = _26 = _37 = 0;
        switch (tail) {
            case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
            case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
            case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
            case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
            case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0);
            case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0);
            case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0);
            case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
        }

        F rg0145 = _mm256_unpacklo_ps(_04,_15),  // r0 r1 g0 g1 | r4 r5 g4 g5
          ba0145 = _mm256_unpackhi_ps(_04,_15),
          rg2367 = _mm256_unpacklo_ps(_26,_37),
          ba2367 = _mm256_unpackhi_ps(_26,_37);

        *r = _mm256_unpacklo_pd(rg0145, rg2367);
        *g = _mm256_unpackhi_pd(rg0145, rg2367);
        *b = _mm256_unpacklo_pd(ba0145, ba2367);
        *a = _mm256_unpackhi_pd(ba0145, ba2367);
    }
    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
          rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
          ba0145 = _mm256_unpacklo_ps(b, a),  // b0 a0 b1 a1 | b4 a4 b5 a5
          ba2367 = _mm256_unpackhi_ps(b, a);  // b2 ...      | b6 ...

        F _04 = _mm256_unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
          _15 = _mm256_unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
          _26 = _mm256_unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
          _37 = _mm256_unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...

        if (__builtin_expect(tail, 0)) {
            if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
            if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
            if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
            if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
            if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
            if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
            if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
        } else {
            F _01 = _mm256_permute2f128_ps(_04, _15, 32),  // 32 == 0010 0000 == lo, lo
              _23 = _mm256_permute2f128_ps(_26, _37, 32),
              _45 = _mm256_permute2f128_ps(_04, _15, 49),  // 49 == 0011 0001 == hi, hi
              _67 = _mm256_permute2f128_ps(_26, _37, 49);
            _mm256_storeu_ps(ptr+ 0, _01);
            _mm256_storeu_ps(ptr+ 8, _23);
            _mm256_storeu_ps(ptr+16, _45);
            _mm256_storeu_ps(ptr+24, _67);
        }
    }

#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
    template <typename T> using V = T __attribute__((ext_vector_type(4)));
    using F   = V<float   >;
    using I32 = V< int32_t>;
    using U64 = V<uint64_t>;
    using U32 = V<uint32_t>;
    using U16 = V<uint16_t>;
    using U8  = V<uint8_t >;

    SI F   mad(F f, F m, F a)  { return f*m+a;              }
    SI F   min(F a, F b)       { return _mm_min_ps(a,b);    }
    SI F   max(F a, F b)       { return _mm_max_ps(a,b);    }
    SI F   abs_(F v)           { return _mm_and_ps(v, 0-v); }
    SI F   rcp   (F v)         { return _mm_rcp_ps  (v);    }
    SI F   rsqrt (F v)         { return _mm_rsqrt_ps(v);    }
    SI F    sqrt_(F v)         { return _mm_sqrt_ps (v);    }
    SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }

    SI U16 pack(U32 v) {
    #if defined(JUMPER_IS_SSE41)
        auto p = _mm_packus_epi32(v,v);
    #else
        // Sign extend so that _mm_packs_epi32() does the pack we want.
        auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
        p = _mm_packs_epi32(p,p);
    #endif
        return sk_unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
    }
    SI U8 pack(U16 v) {
        auto r = widen_cast<__m128i>(v);
        r = _mm_packus_epi16(r,r);
        return sk_unaligned_load<U8>(&r);
    }

    SI F if_then_else(I32 c, F t, F e) {
        return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
    }

    SI F floor_(F v) {
    #if defined(JUMPER_IS_SSE41)
        return _mm_floor_ps(v);
    #else
        F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
        return roundtrip - if_then_else(roundtrip > v, 1, 0);
    #endif
    }

    template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
        return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
    }

    // TODO: these loads and stores are incredibly difficult to follow.

    SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
        __m128i _01;
        if (__builtin_expect(tail,0)) {
            _01 = _mm_setzero_si128();
            if (tail > 1) {
                _01 = _mm_loadl_pd(_01, (const double*)ptr);            // r0 g0 r1 g1 00 00 00 00
                if (tail > 2) {
                    _01 = _mm_insert_epi16(_01, *(ptr+4), 4);           // r0 g0 r1 g1 r2 00 00 00
                    _01 = _mm_insert_epi16(_01, *(ptr+5), 5);           // r0 g0 r1 g1 r2 g2 00 00
                }
            } else {
                _01 = _mm_cvtsi32_si128(*(const uint32_t*)ptr);         // r0 g0 00 00 00 00 00 00
            }
        } else {
            _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);  // r0 g0 r1 g1 r2 g2 r3 g3
        }
        auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8);      // r0 r1 g0 g1 r2 g2 r3 g3
        auto rg      = _mm_shufflehi_epi16(rg01_23, 0xD8);  // r0 r1 g0 g1 r2 r3 g2 g3

        auto R = _mm_shuffle_epi32(rg, 0x88);  // r0 r1 r2 r3 r0 r1 r2 r3
        auto G = _mm_shuffle_epi32(rg, 0xDD);  // g0 g1 g2 g3 g0 g1 g2 g3
        *r = sk_unaligned_load<U16>(&R);
        *g = sk_unaligned_load<U16>(&G);
    }
    SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
        U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
        if (__builtin_expect(tail, 0)) {
            if (tail > 1) {
                _mm_storel_epi64((__m128i*)ptr, rg);
                if (tail > 2) {
                    int32_t rgpair = rg[2];
                    memcpy(ptr + 4, &rgpair, sizeof(rgpair));
                }
            } else {
                int32_t rgpair = rg[0];
                memcpy(ptr, &rgpair, sizeof(rgpair));
            }
        } else {
            _mm_storeu_si128((__m128i*)ptr + 0, rg);
        }
    }

    SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
        __m128i _0, _1, _2, _3;
        if (__builtin_expect(tail,0)) {
            _1 = _2 = _3 = _mm_setzero_si128();
            auto load_rgb = [](const uint16_t* src) {
                auto v = _mm_cvtsi32_si128(*(const uint32_t*)src);
                return _mm_insert_epi16(v, src[2], 2);
            };
            if (  true  ) { _0 = load_rgb(ptr + 0); }
            if (tail > 1) { _1 = load_rgb(ptr + 3); }
            if (tail > 2) { _2 = load_rgb(ptr + 6); }
        } else {
            // Load slightly weirdly to make sure we don't load past the end of 4x48 bits.
            auto _01 =                _mm_loadu_si128((const __m128i*)(ptr + 0))    ,
                 _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);

            // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored).
            _0 = _01;
            _1 = _mm_srli_si128(_01, 6);
            _2 = _23;
            _3 = _mm_srli_si128(_23, 6);
        }

        // De-interlace to R,G,B.
        auto _02 = _mm_unpacklo_epi16(_0, _2),  // r0 r2 g0 g2 b0 b2 xx xx
             _13 = _mm_unpacklo_epi16(_1, _3);  // r1 r3 g1 g3 b1 b3 xx xx

        auto R = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             G = _mm_srli_si128(R, 8),
             B = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 xx xx xx xx

        *r = sk_unaligned_load<U16>(&R);
        *g = sk_unaligned_load<U16>(&G);
        *b = sk_unaligned_load<U16>(&B);
    }

    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
        __m128i _01, _23;
        if (__builtin_expect(tail,0)) {
            _01 = _23 = _mm_setzero_si128();
            auto src = (const double*)ptr;
            if (  true  ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00
            if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
            if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
        } else {
            _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
            _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
        }

        auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
             _13 = _mm_unpackhi_epi16(_01, _23);  // r1 r3 g1 g3 b1 b3 a1 a3

        auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
             ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3

        *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
        *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
        *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
        *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
    }

    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
        auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
             ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));

        if (__builtin_expect(tail, 0)) {
            auto dst = (double*)ptr;
            if (  true  ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); }
            if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
            if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
        } else {
            _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
            _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
        }
    }

    SI void load2(const float* ptr, size_t tail, F* r, F* g) {
        F _01, _23;
        if (__builtin_expect(tail, 0)) {
            _01 = _23 = _mm_setzero_si128();
            if (  true  ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); }
            if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); }
            if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); }
        } else {
            _01 = _mm_loadu_ps(ptr + 0);
            _23 = _mm_loadu_ps(ptr + 4);
        }
        *r = _mm_shuffle_ps(_01, _23, 0x88);
        *g = _mm_shuffle_ps(_01, _23, 0xDD);
    }
    SI void store2(float* ptr, size_t tail, F r, F g) {
        F _01 = _mm_unpacklo_ps(r, g),
          _23 = _mm_unpackhi_ps(r, g);
        if (__builtin_expect(tail, 0)) {
            if (  true  ) { _mm_storel_pi((__m64*)(ptr + 0), _01); }
            if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); }
            if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); }
        } else {
            _mm_storeu_ps(ptr + 0, _01);
            _mm_storeu_ps(ptr + 4, _23);
        }
    }

    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
        F _0, _1, _2, _3;
        if (__builtin_expect(tail, 0)) {
            _1 = _2 = _3 = _mm_setzero_si128();
            if (  true  ) { _0 = _mm_loadu_ps(ptr + 0); }
            if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
            if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
        } else {
            _0 = _mm_loadu_ps(ptr + 0);
            _1 = _mm_loadu_ps(ptr + 4);
            _2 = _mm_loadu_ps(ptr + 8);
            _3 = _mm_loadu_ps(ptr +12);
        }
        _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
        *r = _0;
        *g = _1;
        *b = _2;
        *a = _3;
    }

    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
        _MM_TRANSPOSE4_PS(r,g,b,a);
        if (__builtin_expect(tail, 0)) {
            if (  true  ) { _mm_storeu_ps(ptr + 0, r); }
            if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
            if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
        } else {
            _mm_storeu_ps(ptr + 0, r);
            _mm_storeu_ps(ptr + 4, g);
            _mm_storeu_ps(ptr + 8, b);
            _mm_storeu_ps(ptr +12, a);
        }
    }
#endif

// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
#if defined(JUMPER_IS_SCALAR)
    SI F   cast  (U32 v) { return   (F)v; }
    SI F   cast64(U64 v) { return   (F)v; }
    SI U32 trunc_(F   v) { return (U32)v; }
    SI U32 expand(U16 v) { return (U32)v; }
    SI U32 expand(U8  v) { return (U32)v; }
#else
    SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
    SI F   cast64(U64 v) { return      __builtin_convertvector(     v,   F); }
    SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
#endif

template <typename V>
SI V if_then_else(I32 c, V t, V e) {
    return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
}

SI U16 bswap(U16 x) {
#if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
    // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
    // when generating code for SSE2 and SSE4.1.  We'll do it manually...
    auto v = widen_cast<__m128i>(x);
    v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8);
    return sk_unaligned_load<U16>(&v);
#else
    return (x<<8) | (x>>8);
#endif
}

SI F fract(F v) { return v - floor_(v); }

// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
SI F approx_log2(F x) {
    // e - 127 is a fair approximation of log2(x) in its own right...
    F e = cast(bit_cast<U32>(x)) * (1.0f / (1<<23));

    // ... but using the mantissa to refine its error is _much_ better.
    F m = bit_cast<F>((bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
    return e
         - 124.225514990f
         -   1.498030302f * m
         -   1.725879990f / (0.3520887068f + m);
}
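// A quick sanity check of the approximation: for x == 1.0f the exponent term e is 127 and
// the mantissa term m is 0.5f, so the expression sums to roughly 0; for x == 4.0f, e is 129
// and the result is roughly 2.  (The values are approximate by construction.)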

SI F approx_log(F x) {
    const float ln2 = 0.69314718f;
    return ln2 * approx_log2(x);
}

SI F approx_pow2(F x) {
    F f = fract(x);
    return bit_cast<F>(round(1.0f * (1<<23),
                             x + 121.274057500f
                               -   1.490129070f * f
                               +  27.728023300f / (4.84252568f - f)));
}
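// Sanity check: for approx_pow2(0.0f) the bracketed sum is roughly 127.0, so round()
// produces roughly 0x3f800000, which bit_casts back to roughly 1.0f == 2^0.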

SI F approx_exp(F x) {
    const float log2_e = 1.4426950408889634074f;
    return approx_pow2(log2_e * x);
}

SI F approx_powf(F x, F y) {
#if defined(SK_LEGACY_APPROX_POWF_SPECIALCASE)
    return if_then_else((x == 0)         , 0
#else
    return if_then_else((x == 0)|(x == 1), x
#endif
                                         , approx_pow2(approx_log2(x) * y));
}

SI F from_half(U16 h) {
#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
    && !defined(SK_BUILD_FOR_GOOGLE3)  // Temporary workaround for some Google3 builds.
    return vcvt_f32_f16(h);

#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
    return _mm256_cvtph_ps(h);

#else
    // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
    U32 sem = expand(h),
        s   = sem & 0x8000,
         em = sem ^ s;

    // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x0400;      // I32 comparison is often quicker, and always safe here.
    return if_then_else(denorm, F(0)
                              , bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) ));
#endif
}
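// For example, the half 0x3C00 (1.0h) has s == 0 and em == 0x3C00, so the non-denorm branch
// computes (0x3C00<<13) + (112<<23) == 0x3f800000, i.e. 1.0f.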

SI U16 to_half(F f) {
#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \
    && !defined(SK_BUILD_FOR_GOOGLE3)  // Temporary workaround for some Google3 builds.
    return vcvt_f16_f32(f);

#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)
    return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

#else
    // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
    U32 sem = bit_cast<U32>(f),
        s   = sem & 0x80000000,
         em = sem ^ s;

    // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x38800000;  // I32 comparison is often quicker, and always safe here.
    return pack(if_then_else(denorm, U32(0)
                                   , (s>>16) + (em>>13) - ((127-15)<<10)));
#endif
}
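// Round-tripping the example above: 1.0f has em == 0x3f800000, and
// (em>>13) - (112<<10) == 0x1fc00 - 0x1c000 == 0x3c00, the half encoding of 1.0.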

// Our fundamental vector depth is our pixel stride.
static const size_t N = sizeof(F) / sizeof(float);

// We're finally going to get to what a Stage function looks like!
//    tail == 0 ~~> work on a full N pixels
//    tail != 0 ~~> work on only the first tail pixels
// tail is always < N.
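//
// For example, in the AVX paths above N == 8, so a 21-pixel row is run as two full
// batches of 8 followed by one call with tail == 5.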

// Any custom ABI to use for all (non-externally-facing) stage functions?
// Also decide here whether to use narrow (compromise) or wide (ideal) stages.
#if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON)
    // This lets us pass vectors more efficiently on 32-bit ARM.
    // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
    #define ABI __attribute__((pcs("aapcs-vfp")))
    #define JUMPER_NARROW_STAGES 1
#elif 0 && defined(_MSC_VER) && defined(__clang__) && defined(__x86_64__)
    // SysV ABI makes it very sensible to use wide stages with clang-cl.
    // TODO: crashes during compilation  :(
    #define ABI __attribute__((sysv_abi))
    #define JUMPER_NARROW_STAGES 0
#elif defined(_MSC_VER)
    // Even if not vectorized, this lets us pass {r,g,b,a} as registers,
    // instead of {b,a} on the stack.  Narrow stages work best for __vectorcall.
    #define ABI __vectorcall
    #define JUMPER_NARROW_STAGES 1
#elif defined(__x86_64__) || defined(SK_CPU_ARM64)
    // These platforms are ideal for wider stages, and their default ABI is ideal.
    #define ABI
    #define JUMPER_NARROW_STAGES 0
#else
    // 32-bit or unknown... shunt them down the narrow path.
    // Odds are these have few registers and are better off there.
    #define ABI
    #define JUMPER_NARROW_STAGES 1
#endif

#if JUMPER_NARROW_STAGES
    struct Params {
        size_t dx, dy, tail;
        F dr,dg,db,da;
    };
    using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a);
#else
    // We keep program the second argument, so that it's passed in rsi for load_and_inc().
    using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
#endif


static void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) {
    auto start = (Stage)load_and_inc(program);
    const size_t x0 = dx;
    for (; dy < ylimit; dy++) {
    #if JUMPER_NARROW_STAGES
        Params params = { x0,dy,0, 0,0,0,0 };
        while (params.dx + N <= xlimit) {
            start(&params,program, 0,0,0,0);
            params.dx += N;
        }
        if (size_t tail = xlimit - params.dx) {
            params.tail = tail;
            start(&params,program, 0,0,0,0);
        }
    #else
        dx = x0;
        while (dx + N <= xlimit) {
            start(0,program,dx,dy,    0,0,0,0, 0,0,0,0);
            dx += N;
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
    #endif
    }
}

#if JUMPER_NARROW_STAGES
    #define STAGE(name, ...)                                                    \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,        \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);   \
        static void ABI name(Params* params, void** program,                    \
                             F r, F g, F b, F a) {                              \
            name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a,  \
                     params->dr, params->dg, params->db, params->da);           \
            auto next = (Stage)load_and_inc(program);                           \
            next(params,program, r,g,b,a);                                      \
        }                                                                       \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,        \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
    #define STAGE(name, ...)                                                         \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,             \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);        \
        static void ABI name(size_t tail, void** program, size_t dx, size_t dy,      \
                             F r, F g, F b, F a, F dr, F dg, F db, F da) {           \
            name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da);                 \
            auto next = (Stage)load_and_inc(program);                                \
            next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                          \
        }                                                                            \
        SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,             \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
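//
// A stage is defined by giving STAGE() a name, a context parameter type, and a body that
// reads or writes r,g,b,a (and dr,dg,db,da).  Purely as an illustration -- not a stage that
// exists in this file -- a stage scaling alpha by a context float could look like
//
//     STAGE(scale_alpha_by_ctx, const float* k) { a = a * *k; }
//
// The macro expands into the ABI wrapper above plus this name##_k body, and the wrapper
// chains to the next stage via load_and_inc().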


// just_return() is a simple no-op stage that only exists to end the chain,
// returning back up to start_pipeline(), and from there to the caller.
#if JUMPER_NARROW_STAGES
    static void ABI just_return(Params*, void**, F,F,F,F) {}
#else
    static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
#endif


// We could start defining normal Stages now.  But first, some helper functions.

// These load() and store() methods are tail-aware,
// but focus mainly on keeping the at-stride tail==0 case fast.

template <typename V, typename T>
SI V load(const T* src, size_t tail) {
#if !defined(JUMPER_IS_SCALAR)
    __builtin_assume(tail < N);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
        switch (tail) {
            case 7: v[6] = src[6];
            case 6: v[5] = src[5];
            case 5: v[4] = src[4];
            case 4: memcpy(&v, src, 4*sizeof(T)); break;
            case 3: v[2] = src[2];
            case 2: memcpy(&v, src, 2*sizeof(T)); break;
            case 1: memcpy(&v, src, 1*sizeof(T)); break;
        }
        return v;
    }
#endif
    return sk_unaligned_load<V>(src);
}

template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
#if !defined(JUMPER_IS_SCALAR)
    __builtin_assume(tail < N);
    if (__builtin_expect(tail, 0)) {
        switch (tail) {
            case 7: dst[6] = v[6];
            case 6: dst[5] = v[5];
            case 5: dst[4] = v[4];
            case 4: memcpy(dst, &v, 4*sizeof(T)); break;
            case 3: dst[2] = v[2];
            case 2: memcpy(dst, &v, 2*sizeof(T)); break;
            case 1: memcpy(dst, &v, 1*sizeof(T)); break;
        }
        return;
    }
#endif
    sk_unaligned_store(dst, v);
}
|---|
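| | // Worked example (illustration only): with N == 8 and tail == 3, load<U32>(src, 3) |
|---|
| | // takes the slow path: case 3 copies src[2] into v[2], then falls through to case 2, |
|---|
| | // which memcpy's src[0..1] into v[0..1]; lanes 3..7 stay zero from the V v{} init. |
|---|
| | // With tail == 0 (a full stride of N pixels) both functions skip the switch and do a |
|---|
| | // single unaligned vector load/store, which is the fast path they're tuned for. |
|---|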
| 1187 |  | 
|---|
| 1188 | SI F from_byte(U8 b) { | 
|---|
| 1189 | return cast(expand(b)) * (1/255.0f); | 
|---|
| 1190 | } | 
|---|
| 1191 | SI F from_short(U16 s) { | 
|---|
| 1192 | return cast(expand(s)) * (1/65535.0f); | 
|---|
| 1193 | } | 
|---|
| 1194 | SI void from_565(U16 _565, F* r, F* g, F* b) { | 
|---|
| 1195 | U32 wide = expand(_565); | 
|---|
| 1196 | *r = cast(wide & (31<<11)) * (1.0f / (31<<11)); | 
|---|
| 1197 | *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5)); | 
|---|
| 1198 | *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0)); | 
|---|
| 1199 | } | 
|---|
| 1200 | SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) { | 
|---|
| 1201 | U32 wide = expand(_4444); | 
|---|
| 1202 | *r = cast(wide & (15<<12)) * (1.0f / (15<<12)); | 
|---|
| 1203 | *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8)); | 
|---|
| 1204 | *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4)); | 
|---|
| 1205 | *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0)); | 
|---|
| 1206 | } | 
|---|
| 1207 | SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) { | 
|---|
| 1208 | *r = cast((_8888      ) & 0xff) * (1/255.0f); | 
|---|
| 1209 | *g = cast((_8888 >>  8) & 0xff) * (1/255.0f); | 
|---|
| 1210 | *b = cast((_8888 >> 16) & 0xff) * (1/255.0f); | 
|---|
| 1211 | *a = cast((_8888 >> 24)       ) * (1/255.0f); | 
|---|
| 1212 | } | 
|---|
| 1213 | SI void from_88(U16 _88, F* r, F* g) { | 
|---|
| 1214 | U32 wide = expand(_88); | 
|---|
| 1215 | *r = cast((wide      ) & 0xff) * (1/255.0f); | 
|---|
| 1216 | *g = cast((wide >>  8) & 0xff) * (1/255.0f); | 
|---|
| 1217 | } | 
|---|
| 1218 | SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) { | 
|---|
| 1219 | *r = cast((rgba      ) & 0x3ff) * (1/1023.0f); | 
|---|
| 1220 | *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f); | 
|---|
| 1221 | *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f); | 
|---|
| 1222 | *a = cast((rgba >> 30)        ) * (1/   3.0f); | 
|---|
| 1223 | } | 
|---|
| 1224 | SI void from_1616(U32 _1616, F* r, F* g) { | 
|---|
| 1225 | *r = cast((_1616      ) & 0xffff) * (1/65535.0f); | 
|---|
| 1226 | *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f); | 
|---|
| 1227 | } | 
|---|
| 1228 | SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) { | 
|---|
| 1229 | *r = cast64((_16161616      ) & 0xffff) * (1/65535.0f); | 
|---|
| 1230 | *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f); | 
|---|
| 1231 | *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f); | 
|---|
| 1232 | *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f); | 
|---|
| 1233 | } | 
|---|
| 1234 |  | 
|---|
| 1235 | // Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory. | 
|---|
| 1236 | template <typename T> | 
|---|
| 1237 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { | 
|---|
| 1238 | return (T*)ctx->pixels + dy*ctx->stride + dx; | 
|---|
| 1239 | } | 
|---|
| 1240 |  | 
|---|
| 1241 | // clamp v to [0,limit). | 
|---|
| 1242 | SI F clamp(F v, F limit) { | 
|---|
| 1243 | F inclusive = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive. | 
|---|
| 1244 | return min(max(0, v), inclusive); | 
|---|
| 1245 | } | 
|---|
| 1246 |  | 
|---|
| 1247 | // Used by gather_ stages to calculate the base pointer and a vector of indices to load. | 
|---|
| 1248 | template <typename T> | 
|---|
| 1249 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { | 
|---|
| 1250 | x = clamp(x, ctx->width); | 
|---|
| 1251 | y = clamp(y, ctx->height); | 
|---|
| 1252 |  | 
|---|
| 1253 | *ptr = (const T*)ctx->pixels; | 
|---|
| 1254 | return trunc_(y)*ctx->stride + trunc_(x); | 
|---|
| 1255 | } | 
|---|
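| | // Worked example (illustration only): clamp(v, 100.0f) decrements the bit pattern of |
|---|
| | // 100.0f, giving the largest float strictly below 100, so an out-of-range coordinate |
|---|
| | // like x = 102.7f clamps to ~99.999992 and trunc_() yields 99, a valid column.  For a |
|---|
| | // hypothetical gather context with width = 100, height = 50, stride = 100, the |
|---|
| | // coordinate (x,y) = (102.7f, -3.2f) therefore maps to index 0*100 + 99 == 99. |
|---|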
| 1256 |  | 
|---|
| 1257 | // We often have a nominally [0,1] float value we need to scale and convert to an integer, | 
|---|
| 1258 | // whether for a table lookup or to pack back down into bytes for storage. | 
|---|
| 1259 | // | 
|---|
| 1260 | // In practice, especially when dealing with interesting color spaces, that notionally | 
|---|
| 1261 | // [0,1] float may be out of [0,1] range.  Unorms cannot represent that, so we must clamp. | 
|---|
| 1262 | // | 
|---|
| 1263 | // You can adjust the expected input to [0,bias] by tweaking that parameter. | 
|---|
| 1264 | SI U32 to_unorm(F v, F scale, F bias = 1.0f) { | 
|---|
| 1265 | // TODO: platform-specific implementations of to_unorm(), removing round() entirely? |
|---|
| 1266 | // Any time we use round() we probably want to use to_unorm(). | 
|---|
| 1267 | return round(min(max(0, v), bias), scale); | 
|---|
| 1268 | } | 
|---|
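| | // Worked example (illustration only): to_unorm(v, 255) clamps v to [0,1] and rounds |
|---|
| | // v*255, so v = -0.1f -> 0, v = 0.5f -> 128, v = 1.2f -> 255.  The bias parameter only |
|---|
| | // moves the clamp's upper bound: srcover_rgba_8888 below passes scale = 1, bias = 255 |
|---|
| | // because its channels are already 255-biased at that point. |
|---|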
| 1269 |  | 
|---|
| 1270 | SI I32 cond_to_mask(I32 cond) { return if_then_else(cond, I32(~0), I32(0)); } | 
|---|
| 1271 |  | 
|---|
| 1272 | // Now finally, normal Stages! | 
|---|
| 1273 |  | 
|---|
| 1274 | STAGE(seed_shader, Ctx::None) { | 
|---|
| 1275 | static const float iota[] = { | 
|---|
| 1276 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, | 
|---|
| 1277 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, | 
|---|
| 1278 | }; | 
|---|
| 1279 | // It's important for speed to explicitly cast(dx) and cast(dy), | 
|---|
| 1280 | // which has the effect of splatting them to vectors before converting to floats. | 
|---|
| 1281 | // On Intel this breaks a data dependency on previous loop iterations' registers. | 
|---|
| 1282 | r = cast(dx) + sk_unaligned_load<F>(iota); | 
|---|
| 1283 | g = cast(dy) + 0.5f; | 
|---|
| 1284 | b = 1.0f; | 
|---|
| 1285 | a = 0; | 
|---|
| 1286 | dr = dg = db = da = 0; | 
|---|
| 1287 | } | 
|---|
| 1288 |  | 
|---|
| 1289 | STAGE(dither, const float* rate) { | 
|---|
| 1290 | // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors. | 
|---|
| 1291 | uint32_t iota[] = {0,1,2,3,4,5,6,7}; | 
|---|
| 1292 | U32 X = dx + sk_unaligned_load<U32>(iota), | 
|---|
| 1293 | Y = dy; | 
|---|
| 1294 |  | 
|---|
| 1295 | // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering. | 
|---|
| 1296 | // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ]. | 
|---|
| 1297 |  | 
|---|
| 1298 | // We only need X and X^Y from here on, so it's easier to just think of that as "Y". | 
|---|
| 1299 | Y ^= X; | 
|---|
| 1300 |  | 
|---|
| 1301 | // We'll mix the bottom 3 bits of each of X and Y to make 6 bits, | 
|---|
| 1302 | // for 2^6 == 64 == 8x8 matrix values.  If X=abc and Y=def, we make fcebda. | 
|---|
| 1303 | U32 M = (Y & 1) << 5 | (X & 1) << 4 | 
|---|
| 1304 | | (Y & 2) << 2 | (X & 2) << 1 | 
|---|
| 1305 | | (Y & 4) >> 1 | (X & 4) >> 2; | 
|---|
| 1306 |  | 
|---|
| 1307 | // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon. | 
|---|
| 1308 | // We want to make sure our dither is less than 0.5 in either direction to keep exact values | 
|---|
| 1309 | // like 0 and 1 unchanged after rounding. | 
|---|
| 1310 | F dither = cast(M) * (2/128.0f) - (63/128.0f); | 
|---|
| 1311 |  | 
|---|
| 1312 | r += *rate*dither; | 
|---|
| 1313 | g += *rate*dither; | 
|---|
| 1314 | b += *rate*dither; | 
|---|
| 1315 |  | 
|---|
| 1316 | r = max(0, min(r, a)); | 
|---|
| 1317 | g = max(0, min(g, a)); | 
|---|
| 1318 | b = max(0, min(b, a)); | 
|---|
| 1319 | } | 
|---|
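| | // Worked example (illustration only): take low bits X = 0b011 (a=0,b=1,c=1) and, after |
|---|
| | // the XOR above, Y = 0b101 (d=1,e=0,f=1).  Packing fcebda gives M = 0b110110 = 54, so |
|---|
| | // dither = 54*(2/128.0f) - 63/128.0f = 45/128 ~= +0.352 before scaling by *rate. |
|---|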
| 1320 |  | 
|---|
| 1321 | // load 4 floats from memory, and splat them into r,g,b,a | 
|---|
| 1322 | STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { | 
|---|
| 1323 | r = c->r; | 
|---|
| 1324 | g = c->g; | 
|---|
| 1325 | b = c->b; | 
|---|
| 1326 | a = c->a; | 
|---|
| 1327 | } | 
|---|
| 1328 | STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) { | 
|---|
| 1329 | r = c->r; | 
|---|
| 1330 | g = c->g; | 
|---|
| 1331 | b = c->b; | 
|---|
| 1332 | a = c->a; | 
|---|
| 1333 | } | 
|---|
| 1334 | // load 4 floats from memory, and splat them into dr,dg,db,da | 
|---|
| 1335 | STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { | 
|---|
| 1336 | dr = c->r; | 
|---|
| 1337 | dg = c->g; | 
|---|
| 1338 | db = c->b; | 
|---|
| 1339 | da = c->a; | 
|---|
| 1340 | } | 
|---|
| 1341 |  | 
|---|
| 1342 | // splats opaque-black into r,g,b,a | 
|---|
| 1343 | STAGE(black_color, Ctx::None) { | 
|---|
| 1344 | r = g = b = 0.0f; | 
|---|
| 1345 | a = 1.0f; | 
|---|
| 1346 | } | 
|---|
| 1347 |  | 
|---|
| 1348 | STAGE(white_color, Ctx::None) { | 
|---|
| 1349 | r = g = b = a = 1.0f; | 
|---|
| 1350 | } | 
|---|
| 1351 |  | 
|---|
| 1352 | // load registers r,g,b,a from context (mirrors store_src) |
|---|
| 1353 | STAGE(load_src, const float* ptr) { | 
|---|
| 1354 | r = sk_unaligned_load<F>(ptr + 0*N); | 
|---|
| 1355 | g = sk_unaligned_load<F>(ptr + 1*N); | 
|---|
| 1356 | b = sk_unaligned_load<F>(ptr + 2*N); | 
|---|
| 1357 | a = sk_unaligned_load<F>(ptr + 3*N); | 
|---|
| 1358 | } | 
|---|
| 1359 |  | 
|---|
| 1360 | // store registers r,g,b,a into context (mirrors load_src) |
|---|
| 1361 | STAGE(store_src, float* ptr) { | 
|---|
| 1362 | sk_unaligned_store(ptr + 0*N, r); | 
|---|
| 1363 | sk_unaligned_store(ptr + 1*N, g); | 
|---|
| 1364 | sk_unaligned_store(ptr + 2*N, b); | 
|---|
| 1365 | sk_unaligned_store(ptr + 3*N, a); | 
|---|
| 1366 | } | 
|---|
| 1367 | STAGE(store_src_a, float* ptr) { | 
|---|
| 1368 | sk_unaligned_store(ptr, a); | 
|---|
| 1369 | } | 
|---|
| 1370 |  | 
|---|
| 1371 | // load registers dr,dg,db,da from context (mirrors store_dst) | 
|---|
| 1372 | STAGE(load_dst, const float* ptr) { | 
|---|
| 1373 | dr = sk_unaligned_load<F>(ptr + 0*N); | 
|---|
| 1374 | dg = sk_unaligned_load<F>(ptr + 1*N); | 
|---|
| 1375 | db = sk_unaligned_load<F>(ptr + 2*N); | 
|---|
| 1376 | da = sk_unaligned_load<F>(ptr + 3*N); | 
|---|
| 1377 | } | 
|---|
| 1378 |  | 
|---|
| 1379 | // store registers dr,dg,db,da into context (mirrors load_dst) | 
|---|
| 1380 | STAGE(store_dst, float* ptr) { | 
|---|
| 1381 | sk_unaligned_store(ptr + 0*N, dr); | 
|---|
| 1382 | sk_unaligned_store(ptr + 1*N, dg); | 
|---|
| 1383 | sk_unaligned_store(ptr + 2*N, db); | 
|---|
| 1384 | sk_unaligned_store(ptr + 3*N, da); | 
|---|
| 1385 | } | 
|---|
| 1386 |  | 
|---|
| 1387 | // Most blend modes apply the same logic to each channel. | 
|---|
| 1388 | #define BLEND_MODE(name)                       \ | 
|---|
| 1389 | SI F name##_channel(F s, F d, F sa, F da); \ | 
|---|
| 1390 | STAGE(name, Ctx::None) {                   \ | 
|---|
| 1391 | r = name##_channel(r,dr,a,da);         \ | 
|---|
| 1392 | g = name##_channel(g,dg,a,da);         \ | 
|---|
| 1393 | b = name##_channel(b,db,a,da);         \ | 
|---|
| 1394 | a = name##_channel(a,da,a,da);         \ | 
|---|
| 1395 | }                                          \ | 
|---|
| 1396 | SI F name##_channel(F s, F d, F sa, F da) | 
|---|
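| | // Illustrative expansion (not part of the original source): a use such as |
|---|
| | //   BLEND_MODE(srcin) { return s * da; } |
|---|
| | // below first declares srcin_channel(), then defines STAGE(srcin, Ctx::None) to apply |
|---|
| | // it to each of r,g,b,a against dr,dg,db,da, and finally re-opens srcin_channel() so |
|---|
| | // the trailing { return s * da; } body becomes the per-channel formula. |
|---|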
| 1397 |  | 
|---|
| 1398 | SI F inv(F x) { return 1.0f - x; } | 
|---|
| 1399 | SI F two(F x) { return x + x; } | 
|---|
| 1400 |  | 
|---|
| 1401 |  | 
|---|
| 1402 | BLEND_MODE(clear)    { return 0; } | 
|---|
| 1403 | BLEND_MODE(srcatop)  { return s*da + d*inv(sa); } | 
|---|
| 1404 | BLEND_MODE(dstatop)  { return d*sa + s*inv(da); } | 
|---|
| 1405 | BLEND_MODE(srcin)    { return s * da; } | 
|---|
| 1406 | BLEND_MODE(dstin)    { return d * sa; } | 
|---|
| 1407 | BLEND_MODE(srcout)   { return s * inv(da); } | 
|---|
| 1408 | BLEND_MODE(dstout)   { return d * inv(sa); } | 
|---|
| 1409 | BLEND_MODE(srcover)  { return mad(d, inv(sa), s); } | 
|---|
| 1410 | BLEND_MODE(dstover)  { return mad(s, inv(da), d); } | 
|---|
| 1411 |  | 
|---|
| 1412 | BLEND_MODE(modulate) { return s*d; } | 
|---|
| 1413 | BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; } | 
|---|
| 1414 | BLEND_MODE(plus_)    { return min(s + d, 1.0f); }  // We can clamp to either 1 or sa. | 
|---|
| 1415 | BLEND_MODE(screen)   { return s + d - s*d; } | 
|---|
| 1416 | BLEND_MODE(xor_)     { return s*inv(da) + d*inv(sa); } | 
|---|
| 1417 | #undef BLEND_MODE | 
|---|
| 1418 |  | 
|---|
| 1419 | // Most other blend modes apply the same logic to colors, and srcover to alpha. | 
|---|
| 1420 | #define BLEND_MODE(name)                       \ | 
|---|
| 1421 | SI F name##_channel(F s, F d, F sa, F da); \ | 
|---|
| 1422 | STAGE(name, Ctx::None) {                   \ | 
|---|
| 1423 | r = name##_channel(r,dr,a,da);         \ | 
|---|
| 1424 | g = name##_channel(g,dg,a,da);         \ | 
|---|
| 1425 | b = name##_channel(b,db,a,da);         \ | 
|---|
| 1426 | a = mad(da, inv(a), a);                \ | 
|---|
| 1427 | }                                          \ | 
|---|
| 1428 | SI F name##_channel(F s, F d, F sa, F da) | 
|---|
| 1429 |  | 
|---|
| 1430 | BLEND_MODE(darken)     { return s + d -     max(s*da, d*sa) ; } | 
|---|
| 1431 | BLEND_MODE(lighten)    { return s + d -     min(s*da, d*sa) ; } | 
|---|
| 1432 | BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); } | 
|---|
| 1433 | BLEND_MODE(exclusion)  { return s + d - two(s*d); } | 
|---|
| 1434 |  | 
|---|
| 1435 | BLEND_MODE(colorburn) { | 
|---|
| 1436 | return if_then_else(d == da,    d +    s*inv(da), | 
|---|
| 1437 | if_then_else(s ==  0, /* s + */ d*inv(sa), | 
|---|
| 1438 | sa*(da - min(da, (da-d)*sa*rcp(s))) + s*inv(da) + d*inv(sa))); | 
|---|
| 1439 | } | 
|---|
| 1440 | BLEND_MODE(colordodge) { | 
|---|
| 1441 | return if_then_else(d ==  0, /* d + */ s*inv(da), | 
|---|
| 1442 | if_then_else(s == sa,    s +    d*inv(sa), | 
|---|
| 1443 | sa*min(da, (d*sa)*rcp(sa - s)) + s*inv(da) + d*inv(sa))); | 
|---|
| 1444 | } | 
|---|
| 1445 | BLEND_MODE(hardlight) { | 
|---|
| 1446 | return s*inv(da) + d*inv(sa) | 
|---|
| 1447 | + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s))); | 
|---|
| 1448 | } | 
|---|
| 1449 | BLEND_MODE(overlay) { | 
|---|
| 1450 | return s*inv(da) + d*inv(sa) | 
|---|
| 1451 | + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s))); | 
|---|
| 1452 | } | 
|---|
| 1453 |  | 
|---|
| 1454 | BLEND_MODE(softlight) { | 
|---|
| 1455 | F m  = if_then_else(da > 0, d / da, 0), | 
|---|
| 1456 | s2 = two(s), | 
|---|
| 1457 | m4 = two(two(m)); | 
|---|
| 1458 |  | 
|---|
| 1459 | // The logic forks three ways: | 
|---|
| 1460 | //    1. dark src? | 
|---|
| 1461 | //    2. light src, dark dst? | 
|---|
| 1462 | //    3. light src, light dst? | 
|---|
| 1463 | F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)),     // Used in case 1. | 
|---|
| 1464 | darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,  // Used in case 2. | 
|---|
| 1465 | liteDst = rcp(rsqrt(m)) - m,                 // Used in case 3. | 
|---|
| 1466 | liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3? | 
|---|
| 1467 | return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc);      // 1 or (2 or 3)? | 
|---|
| 1468 | } | 
|---|
| 1469 | #undef BLEND_MODE | 
|---|
| 1470 |  | 
|---|
| 1471 | // We're basing our implementation of non-separable blend modes on |
|---|
| 1472 | //   https://www.w3.org/TR/compositing-1/#blendingnonseparable. | 
|---|
| 1473 | // and | 
|---|
| 1474 | //   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf | 
|---|
| 1475 | // They're equivalent, but ES' math has been better simplified. | 
|---|
| 1476 | // | 
|---|
| 1477 | // Anything extra we add beyond that is to make the math work with premul inputs. | 
|---|
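| | // For reference (per the spec linked above), the four stages below compute, in |
|---|
| | // unpremultiplied terms: |
|---|
| | //   hue        = SetLum(SetSat(Cs, Sat(Cb)), Lum(Cb)) |
|---|
| | //   saturation = SetLum(SetSat(Cb, Sat(Cs)), Lum(Cb)) |
|---|
| | //   color      = SetLum(Cs, Lum(Cb)) |
|---|
| | //   luminosity = SetLum(Cb, Lum(Cs)) |
|---|
| | // where Cs is the source color and Cb the backdrop; the inv(a)/inv(da) terms in each |
|---|
| | // stage are the premul bookkeeping mentioned above. |
|---|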
| 1478 |  | 
|---|
| 1479 | SI F sat(F r, F g, F b) { return max(r, max(g,b)) - min(r, min(g,b)); } | 
|---|
| 1480 | SI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; } | 
|---|
| 1481 |  | 
|---|
| 1482 | SI void set_sat(F* r, F* g, F* b, F s) { | 
|---|
| 1483 | F mn  = min(*r, min(*g,*b)), | 
|---|
| 1484 | mx  = max(*r, max(*g,*b)), | 
|---|
| 1485 | sat = mx - mn; | 
|---|
| 1486 |  | 
|---|
| 1487 | // Map min channel to 0, max channel to s, and scale the middle proportionally. | 
|---|
| 1488 | auto scale = [=](F c) { | 
|---|
| 1489 | return if_then_else(sat == 0, 0, (c - mn) * s / sat); | 
|---|
| 1490 | }; | 
|---|
| 1491 | *r = scale(*r); | 
|---|
| 1492 | *g = scale(*g); | 
|---|
| 1493 | *b = scale(*b); | 
|---|
| 1494 | } | 
|---|
| 1495 | SI void set_lum(F* r, F* g, F* b, F l) { | 
|---|
| 1496 | F diff = l - lum(*r, *g, *b); | 
|---|
| 1497 | *r += diff; | 
|---|
| 1498 | *g += diff; | 
|---|
| 1499 | *b += diff; | 
|---|
| 1500 | } | 
|---|
| 1501 | SI void clip_color(F* r, F* g, F* b, F a) { | 
|---|
| 1502 | F mn = min(*r, min(*g, *b)), | 
|---|
| 1503 | mx = max(*r, max(*g, *b)), | 
|---|
| 1504 | l  = lum(*r, *g, *b); | 
|---|
| 1505 |  | 
|---|
| 1506 | auto clip = [=](F c) { | 
|---|
| 1507 | c = if_then_else(mn >= 0, c, l + (c - l) * (    l) / (l - mn)   ); | 
|---|
| 1508 | c = if_then_else(mx >  a,    l + (c - l) * (a - l) / (mx - l), c); | 
|---|
| 1509 | c = max(c, 0);  // Sometimes without this we may dip just a little negative. | 
|---|
| 1510 | return c; | 
|---|
| 1511 | }; | 
|---|
| 1512 | *r = clip(*r); | 
|---|
| 1513 | *g = clip(*g); | 
|---|
| 1514 | *b = clip(*b); | 
|---|
| 1515 | } | 
|---|
| 1516 |  | 
|---|
| 1517 | STAGE(hue, Ctx::None) { | 
|---|
| 1518 | F R = r*a, | 
|---|
| 1519 | G = g*a, | 
|---|
| 1520 | B = b*a; | 
|---|
| 1521 |  | 
|---|
| 1522 | set_sat(&R, &G, &B, sat(dr,dg,db)*a); | 
|---|
| 1523 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); | 
|---|
| 1524 | clip_color(&R,&G,&B, a*da); | 
|---|
| 1525 |  | 
|---|
| 1526 | r = r*inv(da) + dr*inv(a) + R; | 
|---|
| 1527 | g = g*inv(da) + dg*inv(a) + G; | 
|---|
| 1528 | b = b*inv(da) + db*inv(a) + B; | 
|---|
| 1529 | a = a + da - a*da; | 
|---|
| 1530 | } | 
|---|
| 1531 | STAGE(saturation, Ctx::None) { | 
|---|
| 1532 | F R = dr*a, | 
|---|
| 1533 | G = dg*a, | 
|---|
| 1534 | B = db*a; | 
|---|
| 1535 |  | 
|---|
| 1536 | set_sat(&R, &G, &B, sat( r, g, b)*da); | 
|---|
| 1537 | set_lum(&R, &G, &B, lum(dr,dg,db)* a);  // (This is not redundant.) | 
|---|
| 1538 | clip_color(&R,&G,&B, a*da); | 
|---|
| 1539 |  | 
|---|
| 1540 | r = r*inv(da) + dr*inv(a) + R; | 
|---|
| 1541 | g = g*inv(da) + dg*inv(a) + G; | 
|---|
| 1542 | b = b*inv(da) + db*inv(a) + B; | 
|---|
| 1543 | a = a + da - a*da; | 
|---|
| 1544 | } | 
|---|
| 1545 | STAGE(color, Ctx::None) { | 
|---|
| 1546 | F R = r*da, | 
|---|
| 1547 | G = g*da, | 
|---|
| 1548 | B = b*da; | 
|---|
| 1549 |  | 
|---|
| 1550 | set_lum(&R, &G, &B, lum(dr,dg,db)*a); | 
|---|
| 1551 | clip_color(&R,&G,&B, a*da); | 
|---|
| 1552 |  | 
|---|
| 1553 | r = r*inv(da) + dr*inv(a) + R; | 
|---|
| 1554 | g = g*inv(da) + dg*inv(a) + G; | 
|---|
| 1555 | b = b*inv(da) + db*inv(a) + B; | 
|---|
| 1556 | a = a + da - a*da; | 
|---|
| 1557 | } | 
|---|
| 1558 | STAGE(luminosity, Ctx::None) { | 
|---|
| 1559 | F R = dr*a, | 
|---|
| 1560 | G = dg*a, | 
|---|
| 1561 | B = db*a; | 
|---|
| 1562 |  | 
|---|
| 1563 | set_lum(&R, &G, &B, lum(r,g,b)*da); | 
|---|
| 1564 | clip_color(&R,&G,&B, a*da); | 
|---|
| 1565 |  | 
|---|
| 1566 | r = r*inv(da) + dr*inv(a) + R; | 
|---|
| 1567 | g = g*inv(da) + dg*inv(a) + G; | 
|---|
| 1568 | b = b*inv(da) + db*inv(a) + B; | 
|---|
| 1569 | a = a + da - a*da; | 
|---|
| 1570 | } | 
|---|
| 1571 |  | 
|---|
| 1572 | STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1573 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); | 
|---|
| 1574 |  | 
|---|
| 1575 | U32 dst = load<U32>(ptr, tail); | 
|---|
| 1576 | dr = cast((dst      ) & 0xff); | 
|---|
| 1577 | dg = cast((dst >>  8) & 0xff); | 
|---|
| 1578 | db = cast((dst >> 16) & 0xff); | 
|---|
| 1579 | da = cast((dst >> 24)       ); | 
|---|
| 1580 | // {dr,dg,db,da} are in [0,255] | 
|---|
| 1581 | // { r, g, b, a} are in [0,  1] (but may be out of gamut) | 
|---|
| 1582 |  | 
|---|
| 1583 | r = mad(dr, inv(a), r*255.0f); | 
|---|
| 1584 | g = mad(dg, inv(a), g*255.0f); | 
|---|
| 1585 | b = mad(db, inv(a), b*255.0f); | 
|---|
| 1586 | a = mad(da, inv(a), a*255.0f); | 
|---|
| 1587 | // { r, g, b, a} are now in [0,255]  (but may be out of gamut) | 
|---|
| 1588 |  | 
|---|
| 1589 | // to_unorm() clamps back to gamut.  Scaling by 1 since we're already 255-biased. | 
|---|
| 1590 | dst = to_unorm(r, 1, 255) | 
|---|
| 1591 | | to_unorm(g, 1, 255) <<  8 | 
|---|
| 1592 | | to_unorm(b, 1, 255) << 16 | 
|---|
| 1593 | | to_unorm(a, 1, 255) << 24; | 
|---|
| 1594 | store(ptr, dst, tail); | 
|---|
| 1595 | } | 
|---|
| 1596 |  | 
|---|
| 1597 | STAGE(clamp_0, Ctx::None) { | 
|---|
| 1598 | r = max(r, 0); | 
|---|
| 1599 | g = max(g, 0); | 
|---|
| 1600 | b = max(b, 0); | 
|---|
| 1601 | a = max(a, 0); | 
|---|
| 1602 | } | 
|---|
| 1603 |  | 
|---|
| 1604 | STAGE(clamp_1, Ctx::None) { | 
|---|
| 1605 | r = min(r, 1.0f); | 
|---|
| 1606 | g = min(g, 1.0f); | 
|---|
| 1607 | b = min(b, 1.0f); | 
|---|
| 1608 | a = min(a, 1.0f); | 
|---|
| 1609 | } | 
|---|
| 1610 |  | 
|---|
| 1611 | STAGE(clamp_a, Ctx::None) { | 
|---|
| 1612 | a = min(a, 1.0f); | 
|---|
| 1613 | r = min(r, a); | 
|---|
| 1614 | g = min(g, a); | 
|---|
| 1615 | b = min(b, a); | 
|---|
| 1616 | } | 
|---|
| 1617 |  | 
|---|
| 1618 | STAGE(clamp_gamut, Ctx::None) { | 
|---|
| 1619 | a = min(max(a, 0), 1.0f); | 
|---|
| 1620 | r = min(max(r, 0), a); | 
|---|
| 1621 | g = min(max(g, 0), a); | 
|---|
| 1622 | b = min(max(b, 0), a); | 
|---|
| 1623 | } | 
|---|
| 1624 |  | 
|---|
| 1625 | STAGE(set_rgb, const float* rgb) { | 
|---|
| 1626 | r = rgb[0]; | 
|---|
| 1627 | g = rgb[1]; | 
|---|
| 1628 | b = rgb[2]; | 
|---|
| 1629 | } | 
|---|
| 1630 | STAGE(unbounded_set_rgb, const float* rgb) { | 
|---|
| 1631 | r = rgb[0]; | 
|---|
| 1632 | g = rgb[1]; | 
|---|
| 1633 | b = rgb[2]; | 
|---|
| 1634 | } | 
|---|
| 1635 |  | 
|---|
| 1636 | STAGE(swap_rb, Ctx::None) { | 
|---|
| 1637 | auto tmp = r; | 
|---|
| 1638 | r = b; | 
|---|
| 1639 | b = tmp; | 
|---|
| 1640 | } | 
|---|
| 1641 | STAGE(swap_rb_dst, Ctx::None) { | 
|---|
| 1642 | auto tmp = dr; | 
|---|
| 1643 | dr = db; | 
|---|
| 1644 | db = tmp; | 
|---|
| 1645 | } | 
|---|
| 1646 |  | 
|---|
| 1647 | STAGE(move_src_dst, Ctx::None) { | 
|---|
| 1648 | dr = r; | 
|---|
| 1649 | dg = g; | 
|---|
| 1650 | db = b; | 
|---|
| 1651 | da = a; | 
|---|
| 1652 | } | 
|---|
| 1653 | STAGE(move_dst_src, Ctx::None) { | 
|---|
| 1654 | r = dr; | 
|---|
| 1655 | g = dg; | 
|---|
| 1656 | b = db; | 
|---|
| 1657 | a = da; | 
|---|
| 1658 | } | 
|---|
| 1659 |  | 
|---|
| 1660 | STAGE(premul, Ctx::None) { | 
|---|
| 1661 | r = r * a; | 
|---|
| 1662 | g = g * a; | 
|---|
| 1663 | b = b * a; | 
|---|
| 1664 | } | 
|---|
| 1665 | STAGE(premul_dst, Ctx::None) { | 
|---|
| 1666 | dr = dr * da; | 
|---|
| 1667 | dg = dg * da; | 
|---|
| 1668 | db = db * da; | 
|---|
| 1669 | } | 
|---|
| 1670 | STAGE(unpremul, Ctx::None) { | 
|---|
| 1671 | float inf = bit_cast<float>(0x7f800000); | 
|---|
| 1672 | auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0); | 
|---|
| 1673 | r *= scale; | 
|---|
| 1674 | g *= scale; | 
|---|
| 1675 | b *= scale; | 
|---|
| 1676 | } | 
|---|
| 1677 |  | 
|---|
| 1678 | STAGE(force_opaque    , Ctx::None) {  a = 1; } | 
|---|
| 1679 | STAGE(force_opaque_dst, Ctx::None) { da = 1; } | 
|---|
| 1680 |  | 
|---|
| 1681 | // Clamp x to [0,1], both sides inclusive (think, gradients). | 
|---|
| 1682 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. | 
|---|
| 1683 | SI F clamp_01(F v) { return min(max(0, v), 1); } | 
|---|
| 1684 |  | 
|---|
| 1685 | STAGE(rgb_to_hsl, Ctx::None) { | 
|---|
| 1686 | F mx = max(r, max(g,b)), | 
|---|
| 1687 | mn = min(r, min(g,b)), | 
|---|
| 1688 | d = mx - mn, | 
|---|
| 1689 | d_rcp = 1.0f / d; | 
|---|
| 1690 |  | 
|---|
| 1691 | F h = (1/6.0f) * | 
|---|
| 1692 | if_then_else(mx == mn, 0, | 
|---|
| 1693 | if_then_else(mx ==  r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0), | 
|---|
| 1694 | if_then_else(mx ==  g, (b-r)*d_rcp + 2.0f, | 
|---|
| 1695 | (r-g)*d_rcp + 4.0f))); | 
|---|
| 1696 |  | 
|---|
| 1697 | F l = (mx + mn) * 0.5f; | 
|---|
| 1698 | F s = if_then_else(mx == mn, 0, | 
|---|
| 1699 | d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn)); | 
|---|
| 1700 |  | 
|---|
| 1701 | r = h; | 
|---|
| 1702 | g = s; | 
|---|
| 1703 | b = l; | 
|---|
| 1704 | } | 
|---|
| 1705 | STAGE(hsl_to_rgb, Ctx::None) { | 
|---|
| 1706 | // See GrHSLToRGBFilterEffect.fp |
|---|
| 1707 |  | 
|---|
| 1708 | F h = r, | 
|---|
| 1709 | s = g, | 
|---|
| 1710 | l = b, | 
|---|
| 1711 | c = (1.0f - abs_(2.0f * l - 1)) * s; | 
|---|
| 1712 |  | 
|---|
| 1713 | auto hue_to_rgb = [&](F hue) { | 
|---|
| 1714 | F q = clamp_01(abs_(fract(hue) * 6.0f - 3.0f) - 1.0f); | 
|---|
| 1715 | return (q - 0.5f) * c + l; | 
|---|
| 1716 | }; | 
|---|
| 1717 |  | 
|---|
| 1718 | r = hue_to_rgb(h + 0.0f/3.0f); | 
|---|
| 1719 | g = hue_to_rgb(h + 2.0f/3.0f); | 
|---|
| 1720 | b = hue_to_rgb(h + 1.0f/3.0f); | 
|---|
| 1721 | } | 
|---|
| 1722 |  | 
|---|
| 1723 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. | 
|---|
| 1724 | SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) { | 
|---|
| 1725 | return if_then_else(a < da, min(cr, min(cg,cb)) | 
|---|
| 1726 | , max(cr, max(cg,cb))); | 
|---|
| 1727 | } | 
|---|
| 1728 |  | 
|---|
| 1729 | STAGE(scale_1_float, const float* c) { | 
|---|
| 1730 | r = r * *c; | 
|---|
| 1731 | g = g * *c; | 
|---|
| 1732 | b = b * *c; | 
|---|
| 1733 | a = a * *c; | 
|---|
| 1734 | } | 
|---|
| 1735 | STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1736 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); | 
|---|
| 1737 |  | 
|---|
| 1738 | auto scales = load<U8>(ptr, tail); | 
|---|
| 1739 | auto c = from_byte(scales); | 
|---|
| 1740 |  | 
|---|
| 1741 | r = r * c; | 
|---|
| 1742 | g = g * c; | 
|---|
| 1743 | b = b * c; | 
|---|
| 1744 | a = a * c; | 
|---|
| 1745 | } | 
|---|
| 1746 | STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1747 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); | 
|---|
| 1748 |  | 
|---|
| 1749 | F cr,cg,cb; | 
|---|
| 1750 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); | 
|---|
| 1751 |  | 
|---|
| 1752 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); | 
|---|
| 1753 |  | 
|---|
| 1754 | r = r * cr; | 
|---|
| 1755 | g = g * cg; | 
|---|
| 1756 | b = b * cb; | 
|---|
| 1757 | a = a * ca; | 
|---|
| 1758 | } | 
|---|
| 1759 |  | 
|---|
| 1760 | SI F lerp(F from, F to, F t) { | 
|---|
| 1761 | return mad(to-from, t, from); | 
|---|
| 1762 | } | 
|---|
| 1763 |  | 
|---|
| 1764 | STAGE(lerp_1_float, const float* c) { | 
|---|
| 1765 | r = lerp(dr, r, *c); | 
|---|
| 1766 | g = lerp(dg, g, *c); | 
|---|
| 1767 | b = lerp(db, b, *c); | 
|---|
| 1768 | a = lerp(da, a, *c); | 
|---|
| 1769 | } | 
|---|
| 1770 | STAGE(scale_native, const float scales[]) { | 
|---|
| 1771 | auto c = sk_unaligned_load<F>(scales); | 
|---|
| 1772 | r = r * c; | 
|---|
| 1773 | g = g * c; | 
|---|
| 1774 | b = b * c; | 
|---|
| 1775 | a = a * c; | 
|---|
| 1776 | } | 
|---|
| 1777 | STAGE(lerp_native, const float scales[]) { | 
|---|
| 1778 | auto c = sk_unaligned_load<F>(scales); | 
|---|
| 1779 | r = lerp(dr, r, c); | 
|---|
| 1780 | g = lerp(dg, g, c); | 
|---|
| 1781 | b = lerp(db, b, c); | 
|---|
| 1782 | a = lerp(da, a, c); | 
|---|
| 1783 | } | 
|---|
| 1784 | STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1785 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); | 
|---|
| 1786 |  | 
|---|
| 1787 | auto scales = load<U8>(ptr, tail); | 
|---|
| 1788 | auto c = from_byte(scales); | 
|---|
| 1789 |  | 
|---|
| 1790 | r = lerp(dr, r, c); | 
|---|
| 1791 | g = lerp(dg, g, c); | 
|---|
| 1792 | b = lerp(db, b, c); | 
|---|
| 1793 | a = lerp(da, a, c); | 
|---|
| 1794 | } | 
|---|
| 1795 | STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1796 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); | 
|---|
| 1797 |  | 
|---|
| 1798 | F cr,cg,cb; | 
|---|
| 1799 | from_565(load<U16>(ptr, tail), &cr, &cg, &cb); | 
|---|
| 1800 |  | 
|---|
| 1801 | F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); | 
|---|
| 1802 |  | 
|---|
| 1803 | r = lerp(dr, r, cr); | 
|---|
| 1804 | g = lerp(dg, g, cg); | 
|---|
| 1805 | b = lerp(db, b, cb); | 
|---|
| 1806 | a = lerp(da, a, ca); | 
|---|
| 1807 | } | 
|---|
| 1808 |  | 
|---|
| 1809 | STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) { | 
|---|
| 1810 | auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), | 
|---|
| 1811 | aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy); | 
|---|
| 1812 |  | 
|---|
| 1813 | F mul = from_byte(load<U8>(mptr, tail)), | 
|---|
| 1814 | add = from_byte(load<U8>(aptr, tail)); | 
|---|
| 1815 |  | 
|---|
| 1816 | r = mad(r, mul, add); | 
|---|
| 1817 | g = mad(g, mul, add); | 
|---|
| 1818 | b = mad(b, mul, add); | 
|---|
| 1819 | } | 
|---|
| 1820 |  | 
|---|
| 1821 | STAGE(byte_tables, const void* ctx) {  // TODO: rename Tables to SkRasterPipeline_ByteTablesCtx |
|---|
| 1822 | struct Tables { const uint8_t *r, *g, *b, *a; }; | 
|---|
| 1823 | auto tables = (const Tables*)ctx; | 
|---|
| 1824 |  | 
|---|
| 1825 | r = from_byte(gather(tables->r, to_unorm(r, 255))); | 
|---|
| 1826 | g = from_byte(gather(tables->g, to_unorm(g, 255))); | 
|---|
| 1827 | b = from_byte(gather(tables->b, to_unorm(b, 255))); | 
|---|
| 1828 | a = from_byte(gather(tables->a, to_unorm(a, 255))); | 
|---|
| 1829 | } | 
|---|
| 1830 |  | 
|---|
| 1831 | SI F strip_sign(F x, U32* sign) { | 
|---|
| 1832 | U32 bits = bit_cast<U32>(x); | 
|---|
| 1833 | *sign = bits & 0x80000000; | 
|---|
| 1834 | return bit_cast<F>(bits ^ *sign); | 
|---|
| 1835 | } | 
|---|
| 1836 |  | 
|---|
| 1837 | SI F apply_sign(F x, U32 sign) { | 
|---|
| 1838 | return bit_cast<F>(sign | bit_cast<U32>(x)); | 
|---|
| 1839 | } | 
|---|
| 1840 |  | 
|---|
| 1841 | STAGE(parametric, const skcms_TransferFunction* ctx) { | 
|---|
| 1842 | auto fn = [&](F v) { | 
|---|
| 1843 | U32 sign; | 
|---|
| 1844 | v = strip_sign(v, &sign); | 
|---|
| 1845 |  | 
|---|
| 1846 | F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f) | 
|---|
| 1847 | , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e); | 
|---|
| 1848 | return apply_sign(r, sign); | 
|---|
| 1849 | }; | 
|---|
| 1850 | r = fn(r); | 
|---|
| 1851 | g = fn(g); | 
|---|
| 1852 | b = fn(b); | 
|---|
| 1853 | } | 
|---|
| 1854 |  | 
|---|
| 1855 | STAGE(gamma_, const float* G) { | 
|---|
| 1856 | auto fn = [&](F v) { | 
|---|
| 1857 | U32 sign; | 
|---|
| 1858 | v = strip_sign(v, &sign); | 
|---|
| 1859 | return apply_sign(approx_powf(v, *G), sign); | 
|---|
| 1860 | }; | 
|---|
| 1861 | r = fn(r); | 
|---|
| 1862 | g = fn(g); | 
|---|
| 1863 | b = fn(b); | 
|---|
| 1864 | } | 
|---|
| 1865 |  | 
|---|
| 1866 | STAGE(PQish, const skcms_TransferFunction* ctx) { | 
|---|
| 1867 | auto fn = [&](F v) { | 
|---|
| 1868 | U32 sign; | 
|---|
| 1869 | v = strip_sign(v, &sign); | 
|---|
| 1870 |  | 
|---|
| 1871 | F r = approx_powf(max(mad(ctx->b, approx_powf(v, ctx->c), ctx->a), 0) | 
|---|
| 1872 | / (mad(ctx->e, approx_powf(v, ctx->c), ctx->d)), | 
|---|
| 1873 | ctx->f); | 
|---|
| 1874 |  | 
|---|
| 1875 | return apply_sign(r, sign); | 
|---|
| 1876 | }; | 
|---|
| 1877 | r = fn(r); | 
|---|
| 1878 | g = fn(g); | 
|---|
| 1879 | b = fn(b); | 
|---|
| 1880 | } | 
|---|
| 1881 |  | 
|---|
| 1882 | STAGE(HLGish, const skcms_TransferFunction* ctx) { | 
|---|
| 1883 | auto fn = [&](F v) { | 
|---|
| 1884 | U32 sign; | 
|---|
| 1885 | v = strip_sign(v, &sign); | 
|---|
| 1886 |  | 
|---|
| 1887 | const float R = ctx->a, G = ctx->b, | 
|---|
| 1888 | a = ctx->c, b = ctx->d, c = ctx->e; | 
|---|
| 1889 |  | 
|---|
| 1890 | F r = if_then_else(v*R <= 1, approx_powf(v*R, G) | 
|---|
| 1891 | , approx_exp((v-c)*a) + b); | 
|---|
| 1892 |  | 
|---|
| 1893 | return apply_sign(r, sign); | 
|---|
| 1894 | }; | 
|---|
| 1895 | r = fn(r); | 
|---|
| 1896 | g = fn(g); | 
|---|
| 1897 | b = fn(b); | 
|---|
| 1898 | } | 
|---|
| 1899 |  | 
|---|
| 1900 | STAGE(HLGinvish, const skcms_TransferFunction* ctx) { | 
|---|
| 1901 | auto fn = [&](F v) { | 
|---|
| 1902 | U32 sign; | 
|---|
| 1903 | v = strip_sign(v, &sign); | 
|---|
| 1904 |  | 
|---|
| 1905 | const float R = ctx->a, G = ctx->b, | 
|---|
| 1906 | a = ctx->c, b = ctx->d, c = ctx->e; | 
|---|
| 1907 |  | 
|---|
| 1908 | F r = if_then_else(v <= 1, R * approx_powf(v, G) | 
|---|
| 1909 | , a * approx_log(v - b) + c); | 
|---|
| 1910 |  | 
|---|
| 1911 | return apply_sign(r, sign); | 
|---|
| 1912 | }; | 
|---|
| 1913 | r = fn(r); | 
|---|
| 1914 | g = fn(g); | 
|---|
| 1915 | b = fn(b); | 
|---|
| 1916 | } | 
|---|
| 1917 |  | 
|---|
| 1918 | STAGE(from_srgb, Ctx::None) { | 
|---|
| 1919 | auto fn = [](F s) { | 
|---|
| 1920 | U32 sign; | 
|---|
| 1921 | s = strip_sign(s, &sign); | 
|---|
| 1922 | auto lo = s * (1/12.92f); | 
|---|
| 1923 | auto hi = mad(s*s, mad(s, 0.3000f, 0.6975f), 0.0025f); | 
|---|
| 1924 | return apply_sign(if_then_else(s < 0.055f, lo, hi), sign); | 
|---|
| 1925 | }; | 
|---|
| 1926 | r = fn(r); | 
|---|
| 1927 | g = fn(g); | 
|---|
| 1928 | b = fn(b); | 
|---|
| 1929 | } | 
|---|
| 1930 | STAGE(to_srgb, Ctx::None) { | 
|---|
| 1931 | auto fn = [](F l) { | 
|---|
| 1932 | U32 sign; | 
|---|
| 1933 | l = strip_sign(l, &sign); | 
|---|
| 1934 | // We tweak c and d for each instruction set to make sure fn(1) is exactly 1. | 
|---|
| 1935 | #if defined(JUMPER_IS_AVX512) | 
|---|
| 1936 | const float c = 1.130026340485f, | 
|---|
| 1937 | d = 0.141387879848f; | 
|---|
| 1938 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || \ | 
|---|
| 1939 | defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_HSW ) | 
|---|
| 1940 | const float c = 1.130048394203f, | 
|---|
| 1941 | d = 0.141357362270f; | 
|---|
| 1942 | #elif defined(JUMPER_IS_NEON) | 
|---|
| 1943 | const float c = 1.129999995232f, | 
|---|
| 1944 | d = 0.141381442547f; | 
|---|
| 1945 | #else | 
|---|
| 1946 | const float c = 1.129999995232f, | 
|---|
| 1947 | d = 0.141377761960f; | 
|---|
| 1948 | #endif | 
|---|
| 1949 | F t = rsqrt(l); | 
|---|
| 1950 | auto lo = l * 12.92f; | 
|---|
| 1951 | auto hi = mad(t, mad(t, -0.0024542345f, 0.013832027f), c) | 
|---|
| 1952 | * rcp(d + t); | 
|---|
| 1953 | return apply_sign(if_then_else(l < 0.00465985f, lo, hi), sign); | 
|---|
| 1954 | }; | 
|---|
| 1955 | r = fn(r); | 
|---|
| 1956 | g = fn(g); | 
|---|
| 1957 | b = fn(b); | 
|---|
| 1958 | } | 
|---|
| 1959 |  | 
|---|
| 1960 | STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1961 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); | 
|---|
| 1962 |  | 
|---|
| 1963 | r = g = b = 0.0f; | 
|---|
| 1964 | a = from_byte(load<U8>(ptr, tail)); | 
|---|
| 1965 | } | 
|---|
| 1966 | STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1967 | auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); | 
|---|
| 1968 |  | 
|---|
| 1969 | dr = dg = db = 0.0f; | 
|---|
| 1970 | da = from_byte(load<U8>(ptr, tail)); | 
|---|
| 1971 | } | 
|---|
| 1972 | STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 1973 | const uint8_t* ptr; | 
|---|
| 1974 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); | 
|---|
| 1975 | r = g = b = 0.0f; | 
|---|
| 1976 | a = from_byte(gather(ptr, ix)); | 
|---|
| 1977 | } | 
|---|
| 1978 | STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1979 | auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy); | 
|---|
| 1980 |  | 
|---|
| 1981 | U8 packed = pack(pack(to_unorm(a, 255))); | 
|---|
| 1982 | store(ptr, packed, tail); | 
|---|
| 1983 | } | 
|---|
| 1984 |  | 
|---|
| 1985 | STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1986 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); | 
|---|
| 1987 |  | 
|---|
| 1988 | from_565(load<U16>(ptr, tail), &r,&g,&b); | 
|---|
| 1989 | a = 1.0f; | 
|---|
| 1990 | } | 
|---|
| 1991 | STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 1992 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); | 
|---|
| 1993 |  | 
|---|
| 1994 | from_565(load<U16>(ptr, tail), &dr,&dg,&db); | 
|---|
| 1995 | da = 1.0f; | 
|---|
| 1996 | } | 
|---|
| 1997 | STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 1998 | const uint16_t* ptr; | 
|---|
| 1999 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); | 
|---|
| 2000 | from_565(gather(ptr, ix), &r,&g,&b); | 
|---|
| 2001 | a = 1.0f; | 
|---|
| 2002 | } | 
|---|
| 2003 | STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2004 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); | 
|---|
| 2005 |  | 
|---|
| 2006 | U16 px = pack( to_unorm(r, 31) << 11 | 
|---|
| 2007 | | to_unorm(g, 63) <<  5 | 
|---|
| 2008 | | to_unorm(b, 31)      ); | 
|---|
| 2009 | store(ptr, px, tail); | 
|---|
| 2010 | } | 
|---|
| 2011 |  | 
|---|
| 2012 | STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2013 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); | 
|---|
| 2014 | from_4444(load<U16>(ptr, tail), &r,&g,&b,&a); | 
|---|
| 2015 | } | 
|---|
| 2016 | STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2017 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); | 
|---|
| 2018 | from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da); | 
|---|
| 2019 | } | 
|---|
| 2020 | STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2021 | const uint16_t* ptr; | 
|---|
| 2022 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); | 
|---|
| 2023 | from_4444(gather(ptr, ix), &r,&g,&b,&a); | 
|---|
| 2024 | } | 
|---|
| 2025 | STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2026 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); | 
|---|
| 2027 | U16 px = pack( to_unorm(r, 15) << 12 | 
|---|
| 2028 | | to_unorm(g, 15) <<  8 | 
|---|
| 2029 | | to_unorm(b, 15) <<  4 | 
|---|
| 2030 | | to_unorm(a, 15)      ); | 
|---|
| 2031 | store(ptr, px, tail); | 
|---|
| 2032 | } | 
|---|
| 2033 |  | 
|---|
| 2034 | STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2035 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); | 
|---|
| 2036 | from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); | 
|---|
| 2037 | } | 
|---|
| 2038 | STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2039 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); | 
|---|
| 2040 | from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da); | 
|---|
| 2041 | } | 
|---|
| 2042 | STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2043 | const uint32_t* ptr; | 
|---|
| 2044 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); | 
|---|
| 2045 | from_8888(gather(ptr, ix), &r,&g,&b,&a); | 
|---|
| 2046 | } | 
|---|
| 2047 | STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2048 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); | 
|---|
| 2049 |  | 
|---|
| 2050 | U32 px = to_unorm(r, 255) | 
|---|
| 2051 | | to_unorm(g, 255) <<  8 | 
|---|
| 2052 | | to_unorm(b, 255) << 16 | 
|---|
| 2053 | | to_unorm(a, 255) << 24; | 
|---|
| 2054 | store(ptr, px, tail); | 
|---|
| 2055 | } | 
|---|
| 2056 |  | 
|---|
| 2057 | STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2058 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); | 
|---|
| 2059 | from_88(load<U16>(ptr, tail), &r, &g); | 
|---|
| 2060 | b = 0; | 
|---|
| 2061 | a = 1; | 
|---|
| 2062 | } | 
|---|
| 2063 | STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2064 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); | 
|---|
| 2065 | from_88(load<U16>(ptr, tail), &dr, &dg); | 
|---|
| 2066 | db = 0; | 
|---|
| 2067 | da = 1; | 
|---|
| 2068 | } | 
|---|
| 2069 | STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2070 | const uint16_t* ptr; | 
|---|
| 2071 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); | 
|---|
| 2072 | from_88(gather(ptr, ix), &r, &g); | 
|---|
| 2073 | b = 0; | 
|---|
| 2074 | a = 1; | 
|---|
| 2075 | } | 
|---|
| 2076 | STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2077 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy); | 
|---|
| 2078 | U16 px = pack( to_unorm(r, 255) | to_unorm(g, 255) <<  8 ); | 
|---|
| 2079 | store(ptr, px, tail); | 
|---|
| 2080 | } | 
|---|
| 2081 |  | 
|---|
| 2082 | STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2083 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); | 
|---|
| 2084 | r = g = b = 0; | 
|---|
| 2085 | a = from_short(load<U16>(ptr, tail)); | 
|---|
| 2086 | } | 
|---|
| 2087 | STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2088 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); | 
|---|
| 2089 | dr = dg = db = 0.0f; | 
|---|
| 2090 | da = from_short(load<U16>(ptr, tail)); | 
|---|
| 2091 | } | 
|---|
| 2092 | STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2093 | const uint16_t* ptr; | 
|---|
| 2094 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); | 
|---|
| 2095 | r = g = b = 0.0f; | 
|---|
| 2096 | a = from_short(gather(ptr, ix)); | 
|---|
| 2097 | } | 
|---|
| 2098 | STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2099 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); | 
|---|
| 2100 |  | 
|---|
| 2101 | U16 px = pack(to_unorm(a, 65535)); | 
|---|
| 2102 | store(ptr, px, tail); | 
|---|
| 2103 | } | 
|---|
| 2104 |  | 
|---|
| 2105 | STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2106 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); | 
|---|
| 2107 | b = 0; a = 1; | 
|---|
| 2108 | from_1616(load<U32>(ptr, tail), &r,&g); | 
|---|
| 2109 | } | 
|---|
| 2110 | STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2111 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); | 
|---|
| 2112 | from_1616(load<U32>(ptr, tail), &dr, &dg); | 
|---|
| 2113 | db = 0; | 
|---|
| 2114 | da = 1; | 
|---|
| 2115 | } | 
|---|
| 2116 | STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2117 | const uint32_t* ptr; | 
|---|
| 2118 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); | 
|---|
| 2119 | from_1616(gather(ptr, ix), &r, &g); | 
|---|
| 2120 | b = 0; | 
|---|
| 2121 | a = 1; | 
|---|
| 2122 | } | 
|---|
| 2123 | STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2124 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); | 
|---|
| 2125 |  | 
|---|
| 2126 | U32 px = to_unorm(r, 65535) | 
|---|
| 2127 | | to_unorm(g, 65535) <<  16; | 
|---|
| 2128 | store(ptr, px, tail); | 
|---|
| 2129 | } | 
|---|
| 2130 |  | 
|---|
| 2131 | STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2132 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); | 
|---|
| 2133 | from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a); | 
|---|
| 2134 | } | 
|---|
| 2135 | STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2136 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); | 
|---|
| 2137 | from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da); | 
|---|
| 2138 | } | 
|---|
| 2139 | STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2140 | const uint64_t* ptr; | 
|---|
| 2141 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); | 
|---|
| 2142 | from_16161616(gather(ptr, ix), &r, &g, &b, &a); | 
|---|
| 2143 | } | 
|---|
| 2144 | STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2145 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy); | 
|---|
| 2146 |  | 
|---|
| 2147 | U16 R = pack(to_unorm(r, 65535)), | 
|---|
| 2148 | G = pack(to_unorm(g, 65535)), | 
|---|
| 2149 | B = pack(to_unorm(b, 65535)), | 
|---|
| 2150 | A = pack(to_unorm(a, 65535)); | 
|---|
| 2151 |  | 
|---|
| 2152 | store4(ptr,tail, R,G,B,A); | 
|---|
| 2153 | } | 
|---|
| 2154 |  | 
|---|
| 2155 |  | 
|---|
| 2156 | STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2157 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); | 
|---|
| 2158 | from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a); | 
|---|
| 2159 | } | 
|---|
| 2160 | STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2161 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); | 
|---|
| 2162 | from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da); | 
|---|
| 2163 | } | 
|---|
| 2164 | STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2165 | const uint32_t* ptr; | 
|---|
| 2166 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); | 
|---|
| 2167 | from_1010102(gather(ptr, ix), &r,&g,&b,&a); | 
|---|
| 2168 | } | 
|---|
| 2169 | STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2170 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); | 
|---|
| 2171 |  | 
|---|
| 2172 | U32 px = to_unorm(r, 1023) | 
|---|
| 2173 | | to_unorm(g, 1023) << 10 | 
|---|
| 2174 | | to_unorm(b, 1023) << 20 | 
|---|
| 2175 | | to_unorm(a,    3) << 30; | 
|---|
| 2176 | store(ptr, px, tail); | 
|---|
| 2177 | } | 
|---|
| 2178 |  | 
|---|
| 2179 | STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2180 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); | 
|---|
| 2181 |  | 
|---|
| 2182 | U16 R,G,B,A; | 
|---|
| 2183 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); | 
|---|
| 2184 | r = from_half(R); | 
|---|
| 2185 | g = from_half(G); | 
|---|
| 2186 | b = from_half(B); | 
|---|
| 2187 | a = from_half(A); | 
|---|
| 2188 | } | 
|---|
| 2189 | STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2190 | auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); | 
|---|
| 2191 |  | 
|---|
| 2192 | U16 R,G,B,A; | 
|---|
| 2193 | load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); | 
|---|
| 2194 | dr = from_half(R); | 
|---|
| 2195 | dg = from_half(G); | 
|---|
| 2196 | db = from_half(B); | 
|---|
| 2197 | da = from_half(A); | 
|---|
| 2198 | } | 
|---|
| 2199 | STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2200 | const uint64_t* ptr; | 
|---|
| 2201 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); | 
|---|
| 2202 | auto px = gather(ptr, ix); | 
|---|
| 2203 |  | 
|---|
| 2204 | U16 R,G,B,A; | 
|---|
| 2205 | load4((const uint16_t*)&px,0, &R,&G,&B,&A); | 
|---|
| 2206 | r = from_half(R); | 
|---|
| 2207 | g = from_half(G); | 
|---|
| 2208 | b = from_half(B); | 
|---|
| 2209 | a = from_half(A); | 
|---|
| 2210 | } | 
|---|
| 2211 | STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2212 | auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy); | 
|---|
| 2213 | store4((uint16_t*)ptr,tail, to_half(r) | 
|---|
| 2214 | , to_half(g) | 
|---|
| 2215 | , to_half(b) | 
|---|
| 2216 | , to_half(a)); | 
|---|
| 2217 | } | 
|---|
| 2218 |  | 
|---|
| 2219 | STAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2220 | auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy); | 
|---|
| 2221 |  | 
|---|
| 2222 | U16 R = bswap(pack(to_unorm(r, 65535))), | 
|---|
| 2223 | G = bswap(pack(to_unorm(g, 65535))), | 
|---|
| 2224 | B = bswap(pack(to_unorm(b, 65535))), | 
|---|
| 2225 | A = bswap(pack(to_unorm(a, 65535))); | 
|---|
| 2226 |  | 
|---|
| 2227 | store4(ptr,tail, R,G,B,A); | 
|---|
| 2228 | } | 
|---|
| 2229 |  | 
|---|
| 2230 | STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2231 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); | 
|---|
| 2232 |  | 
|---|
| 2233 | U16 A = load<U16>((const uint16_t*)ptr, tail); | 
|---|
| 2234 | r = 0; | 
|---|
| 2235 | g = 0; | 
|---|
| 2236 | b = 0; | 
|---|
| 2237 | a = from_half(A); | 
|---|
| 2238 | } | 
|---|
| 2239 | STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2240 | auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); | 
|---|
| 2241 |  | 
|---|
| 2242 | U16 A = load<U16>((const uint16_t*)ptr, tail); | 
|---|
| 2243 | dr = dg = db = 0.0f; | 
|---|
| 2244 | da = from_half(A); | 
|---|
| 2245 | } | 
|---|
| 2246 | STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2247 | const uint16_t* ptr; | 
|---|
| 2248 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); | 
|---|
| 2249 | r = g = b = 0.0f; | 
|---|
| 2250 | a = from_half(gather(ptr, ix)); | 
|---|
| 2251 | } | 
|---|
| 2252 | STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2253 | auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); | 
|---|
| 2254 | store(ptr, to_half(a), tail); | 
|---|
| 2255 | } | 
|---|
| 2256 |  | 
|---|
| 2257 | STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2258 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); | 
|---|
| 2259 |  | 
|---|
| 2260 | U16 R,G; | 
|---|
| 2261 | load2((const uint16_t*)ptr, tail, &R, &G); | 
|---|
| 2262 | r = from_half(R); | 
|---|
| 2263 | g = from_half(G); | 
|---|
| 2264 | b = 0; | 
|---|
| 2265 | a = 1; | 
|---|
| 2266 | } | 
|---|
| 2267 | STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2268 | auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); | 
|---|
| 2269 |  | 
|---|
| 2270 | U16 R,G; | 
|---|
| 2271 | load2((const uint16_t*)ptr, tail, &R, &G); | 
|---|
| 2272 | dr = from_half(R); | 
|---|
| 2273 | dg = from_half(G); | 
|---|
| 2274 | db = 0; | 
|---|
| 2275 | da = 1; | 
|---|
| 2276 | } | 
|---|
| 2277 | STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2278 | const uint32_t* ptr; | 
|---|
| 2279 | U32 ix = ix_and_ptr(&ptr, ctx, r, g); | 
|---|
| 2280 | auto px = gather(ptr, ix); | 
|---|
| 2281 |  | 
|---|
| 2282 | U16 R,G; | 
|---|
| 2283 | load2((const uint16_t*)&px, 0, &R, &G); | 
|---|
| 2284 | r = from_half(R); | 
|---|
| 2285 | g = from_half(G); | 
|---|
| 2286 | b = 0; | 
|---|
| 2287 | a = 1; | 
|---|
| 2288 | } | 
|---|
| 2289 | STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2290 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy); | 
|---|
| 2291 | store2((uint16_t*)ptr, tail, to_half(r) | 
|---|
| 2292 | , to_half(g)); | 
|---|
| 2293 | } | 
|---|
| 2294 |  | 
|---|
| 2295 | STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2296 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); | 
|---|
| 2297 | load4(ptr,tail, &r,&g,&b,&a); | 
|---|
| 2298 | } | 
|---|
| 2299 | STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2300 | auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); | 
|---|
| 2301 | load4(ptr,tail, &dr,&dg,&db,&da); | 
|---|
| 2302 | } | 
|---|
| 2303 | STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2304 | const float* ptr; | 
|---|
| 2305 | U32 ix = ix_and_ptr(&ptr, ctx, r,g); | 
|---|
| 2306 | r = gather(ptr, 4*ix + 0); | 
|---|
| 2307 | g = gather(ptr, 4*ix + 1); | 
|---|
| 2308 | b = gather(ptr, 4*ix + 2); | 
|---|
| 2309 | a = gather(ptr, 4*ix + 3); | 
|---|
| 2310 | } | 
|---|
| 2311 | STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2312 | auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy); | 
|---|
| 2313 | store4(ptr,tail, r,g,b,a); | 
|---|
| 2314 | } | 
|---|
| 2315 |  | 
|---|
| 2316 | STAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2317 | auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy); | 
|---|
| 2318 | load2(ptr, tail, &r, &g); | 
|---|
| 2319 | b = 0; | 
|---|
| 2320 | a = 1; | 
|---|
| 2321 | } | 
|---|
| 2322 | STAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 2323 | auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy); | 
|---|
| 2324 | store2(ptr, tail, r, g); | 
|---|
| 2325 | } | 
|---|
| 2326 |  | 
|---|
| 2327 | SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) { | 
|---|
| 2328 | return v - floor_(v*ctx->invScale)*ctx->scale; | 
|---|
| 2329 | } | 
|---|
| 2330 | SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) { | 
|---|
| 2331 | auto limit = ctx->scale; | 
|---|
| 2332 | auto invLimit = ctx->invScale; | 
|---|
| 2333 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); | 
|---|
| 2334 | } | 
|---|
| 2335 | // Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images). | 
|---|
| 2336 | // The gather stages will hard clamp the output of these stages to [0,limit)... | 
|---|
| 2337 | // we just need to do the basic repeat or mirroring. | 
|---|
| 2338 | STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); } | 
|---|
| 2339 | STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); } | 
|---|
| 2340 | STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); } | 
|---|
| 2341 | STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); } | 
|---|
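| | // Worked example (illustration only): with scale = limit = 10 and invScale = 0.1, |
|---|
| | // exclusive_repeat(13) = 13 - floor(1.3)*10 = 3, and exclusive_mirror(13) = |
|---|
| | // abs((13-10) - 20*floor(3*0.05) - 10) = abs(3 - 0 - 10) = 7, i.e. 13 reflects to 7. |
|---|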
| 2342 |  | 
|---|
| 2343 | STAGE( clamp_x_1, Ctx::None) { r = clamp_01(r); } | 
|---|
| 2344 | STAGE(repeat_x_1, Ctx::None) { r = clamp_01(r - floor_(r)); } | 
|---|
| 2345 | STAGE(mirror_x_1, Ctx::None) { r = clamp_01(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); } | 
|---|
| 2346 |  | 
|---|
| 2347 | // Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain: | 
|---|
| 2348 | //      mask == 0x00000000 if the coordinate(s) are out of bounds | 
|---|
| 2349 | //      mask == 0xFFFFFFFF if the coordinate(s) are in bounds | 
|---|
| 2350 | // After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0 | 
|---|
| 2351 | // if either of the coordinates were out of bounds. | 
|---|
| 2352 |  | 
|---|
| 2353 | STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { | 
|---|
| 2354 | auto w = ctx->limit_x; | 
|---|
| 2355 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w))); | 
|---|
| 2356 | } | 
|---|
| 2357 | STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { | 
|---|
| 2358 | auto h = ctx->limit_y; | 
|---|
| 2359 | sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h))); | 
|---|
| 2360 | } | 
|---|
| 2361 | STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { | 
|---|
| 2362 | auto w = ctx->limit_x; | 
|---|
| 2363 | auto h = ctx->limit_y; | 
|---|
| 2364 | sk_unaligned_store(ctx->mask, | 
|---|
| 2365 | cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h))); | 
|---|
| 2366 | } | 
|---|
| 2367 | STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { | 
|---|
| 2368 | auto mask = sk_unaligned_load<U32>(ctx->mask); | 
|---|
| 2369 | r = bit_cast<F>( bit_cast<U32>(r) & mask ); | 
|---|
| 2370 | g = bit_cast<F>( bit_cast<U32>(g) & mask ); | 
|---|
| 2371 | b = bit_cast<F>( bit_cast<U32>(b) & mask ); | 
|---|
| 2372 | a = bit_cast<F>( bit_cast<U32>(a) & mask ); | 
|---|
| 2373 | } | 
|---|
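| | // Illustrative ordering (not a literal pipeline from this file): a decal-tiled sample |
|---|
| | // would typically run decal_x_and_y -> gather_8888 -> check_decal_mask, so coordinates |
|---|
| | // are tested before ix_and_ptr() clamps them inside the gather, and the fetched color |
|---|
| | // is then zeroed wherever the saved mask is 0x00000000. |
|---|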
| 2374 |  | 
|---|
| 2375 | STAGE(alpha_to_gray, Ctx::None) { | 
|---|
| 2376 | r = g = b = a; | 
|---|
| 2377 | a = 1; | 
|---|
| 2378 | } | 
|---|
| 2379 | STAGE(alpha_to_gray_dst, Ctx::None) { | 
|---|
| 2380 | dr = dg = db = da; | 
|---|
| 2381 | da = 1; | 
|---|
| 2382 | } | 
|---|
| 2383 | STAGE(bt709_luminance_or_luma_to_alpha, Ctx::None) { | 
|---|
| 2384 | a = r*0.2126f + g*0.7152f + b*0.0722f; | 
|---|
| 2385 | r = g = b = 0; | 
|---|
| 2386 | } | 
|---|
| 2387 |  | 
|---|
| 2388 | STAGE(matrix_translate, const float* m) { | 
|---|
| 2389 | r += m[0]; | 
|---|
| 2390 | g += m[1]; | 
|---|
| 2391 | } | 
|---|
| 2392 | STAGE(matrix_scale_translate, const float* m) { | 
|---|
| 2393 | r = mad(r,m[0], m[2]); | 
|---|
| 2394 | g = mad(g,m[1], m[3]); | 
|---|
| 2395 | } | 
|---|
| 2396 | STAGE(matrix_2x3, const float* m) { | 
|---|
| 2397 | auto R = mad(r,m[0], mad(g,m[2], m[4])), | 
|---|
| 2398 | G = mad(r,m[1], mad(g,m[3], m[5])); | 
|---|
| 2399 | r = R; | 
|---|
| 2400 | g = G; | 
|---|
| 2401 | } | 
|---|
| 2402 | STAGE(matrix_3x3, const float* m) { | 
|---|
| 2403 | auto R = mad(r,m[0], mad(g,m[3], b*m[6])), | 
|---|
| 2404 | G = mad(r,m[1], mad(g,m[4], b*m[7])), | 
|---|
| 2405 | B = mad(r,m[2], mad(g,m[5], b*m[8])); | 
|---|
| 2406 | r = R; | 
|---|
| 2407 | g = G; | 
|---|
| 2408 | b = B; | 
|---|
| 2409 | } | 
|---|
| 2410 | STAGE(matrix_3x4, const float* m) { | 
|---|
| 2411 | auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))), | 
|---|
| 2412 | G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))), | 
|---|
| 2413 | B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11]))); | 
|---|
| 2414 | r = R; | 
|---|
| 2415 | g = G; | 
|---|
| 2416 | b = B; | 
|---|
| 2417 | } | 
|---|
| 2418 | STAGE(matrix_4x5, const float* m) { | 
|---|
| 2419 | auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))), | 
|---|
| 2420 | G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))), | 
|---|
| 2421 | B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))), | 
|---|
| 2422 | A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19])))); | 
|---|
| 2423 | r = R; | 
|---|
| 2424 | g = G; | 
|---|
| 2425 | b = B; | 
|---|
| 2426 | a = A; | 
|---|
| 2427 | } | 
|---|
| 2428 | STAGE(matrix_4x3, const float* m) { | 
|---|
| 2429 | auto X = r, | 
|---|
| 2430 | Y = g; | 
|---|
| 2431 |  | 
|---|
| 2432 | r = mad(X, m[0], mad(Y, m[4], m[ 8])); | 
|---|
| 2433 | g = mad(X, m[1], mad(Y, m[5], m[ 9])); | 
|---|
| 2434 | b = mad(X, m[2], mad(Y, m[6], m[10])); | 
|---|
| 2435 | a = mad(X, m[3], mad(Y, m[7], m[11])); | 
|---|
| 2436 | } | 
|---|
| 2437 | STAGE(matrix_perspective, const float* m) { | 
|---|
| 2438 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. | 
|---|
| 2439 | auto R = mad(r,m[0], mad(g,m[1], m[2])), | 
|---|
| 2440 | G = mad(r,m[3], mad(g,m[4], m[5])), | 
|---|
| 2441 | Z = mad(r,m[6], mad(g,m[7], m[8])); | 
|---|
| 2442 | r = R * rcp(Z); | 
|---|
| 2443 | g = G * rcp(Z); | 
|---|
| 2444 | } | 
|---|
| 2445 |  | 
|---|
| 2446 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, | 
|---|
| 2447 | F* r, F* g, F* b, F* a) { | 
|---|
| 2448 | F fr, br, fg, bg, fb, bb, fa, ba; | 
|---|
| 2449 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) | 
|---|
| 2450 | if (c->stopCount <=8) { | 
|---|
| 2451 | fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx); | 
|---|
| 2452 | br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx); | 
|---|
| 2453 | fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx); | 
|---|
| 2454 | bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx); | 
|---|
| 2455 | fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx); | 
|---|
| 2456 | bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx); | 
|---|
| 2457 | fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx); | 
|---|
| 2458 | ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx); | 
|---|
| 2459 | } else | 
|---|
| 2460 | #endif | 
|---|
| 2461 | { | 
|---|
| 2462 | fr = gather(c->fs[0], idx); | 
|---|
| 2463 | br = gather(c->bs[0], idx); | 
|---|
| 2464 | fg = gather(c->fs[1], idx); | 
|---|
| 2465 | bg = gather(c->bs[1], idx); | 
|---|
| 2466 | fb = gather(c->fs[2], idx); | 
|---|
| 2467 | bb = gather(c->bs[2], idx); | 
|---|
| 2468 | fa = gather(c->fs[3], idx); | 
|---|
| 2469 | ba = gather(c->bs[3], idx); | 
|---|
| 2470 | } | 
|---|
| 2471 |  | 
|---|
| 2472 | *r = mad(t, fr, br); | 
|---|
| 2473 | *g = mad(t, fg, bg); | 
|---|
| 2474 | *b = mad(t, fb, bb); | 
|---|
| 2475 | *a = mad(t, fa, ba); | 
|---|
| 2476 | } | 
|---|
| 2477 |  | 
|---|
| 2478 | STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { | 
|---|
| 2479 | auto t = r; | 
|---|
| 2480 | auto idx = trunc_(t * (c->stopCount-1)); | 
|---|
| 2481 | gradient_lookup(c, idx, t, &r, &g, &b, &a); | 
|---|
| 2482 | } | 
|---|
| 2483 |  | 
|---|
| 2484 | STAGE(gradient, const SkRasterPipeline_GradientCtx* c) { | 
|---|
| 2485 | auto t = r; | 
|---|
| 2486 | U32 idx = 0; | 
|---|
| 2487 |  | 
|---|
| 2488 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. | 
|---|
| 2489 | for (size_t i = 1; i < c->stopCount; i++) { | 
|---|
| 2490 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); | 
|---|
| 2491 | } | 
|---|
| 2492 |  | 
|---|
| 2493 | gradient_lookup(c, idx, t, &r, &g, &b, &a); | 
|---|
| 2494 | } | 
|---|
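|  | // Worked example (hypothetical stop positions): with c->ts = {0, 0.25, 0.75, 1} and t = 0.5, only | 
|---|
|  | // ts[1] = 0.25 satisfies t >= ts[i] in the loop, so idx = 1 and gradient_lookup() evaluates | 
|---|
|  | // t*f + b with that span's coefficients. | 
|---|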
| 2495 |  | 
|---|
| 2496 | STAGE(evenly_spaced_2_stop_gradient, const void* ctx) { | 
|---|
| 2497 | // TODO: Rename Ctx SkRasterPipeline_EvenlySpaced2StopGradientCtx. | 
|---|
| 2498 | struct Ctx { float f[4], b[4]; }; | 
|---|
| 2499 | auto c = (const Ctx*)ctx; | 
|---|
| 2500 |  | 
|---|
| 2501 | auto t = r; | 
|---|
| 2502 | r = mad(t, c->f[0], c->b[0]); | 
|---|
| 2503 | g = mad(t, c->f[1], c->b[1]); | 
|---|
| 2504 | b = mad(t, c->f[2], c->b[2]); | 
|---|
| 2505 | a = mad(t, c->f[3], c->b[3]); | 
|---|
| 2506 | } | 
|---|
| 2507 |  | 
|---|
| 2508 | STAGE(xy_to_unit_angle, Ctx::None) { | 
|---|
| 2509 | F X = r, | 
|---|
| 2510 | Y = g; | 
|---|
| 2511 | F xabs = abs_(X), | 
|---|
| 2512 | yabs = abs_(Y); | 
|---|
| 2513 |  | 
|---|
| 2514 | F slope = min(xabs, yabs)/max(xabs, yabs); | 
|---|
| 2515 | F s = slope * slope; | 
|---|
| 2516 |  | 
|---|
| 2517 | // Use a 7th degree polynomial to approximate atan. | 
|---|
| 2518 | // This was generated using sollya.gforge.inria.fr. | 
|---|
| 2519 | // A float optimized polynomial was generated using the following command. | 
|---|
| 2520 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); | 
|---|
| 2521 | F phi = slope | 
|---|
| 2522 | * (0.15912117063999176025390625f     + s | 
|---|
| 2523 | * (-5.185396969318389892578125e-2f   + s | 
|---|
| 2524 | * (2.476101927459239959716796875e-2f + s | 
|---|
| 2525 | * (-7.0547382347285747528076171875e-3f)))); | 
|---|
| 2526 |  | 
|---|
| 2527 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); | 
|---|
| 2528 | phi = if_then_else(X < 0.0f   , 1.0f/2.0f - phi, phi); | 
|---|
| 2529 | phi = if_then_else(Y < 0.0f   , 1.0f - phi     , phi); | 
|---|
| 2530 | phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN. | 
|---|
| 2531 | r = phi; | 
|---|
| 2532 | } | 
|---|
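|  | // Sanity check of the approximation (worked by hand): at (X,Y) = (1,1), slope = 1 and the polynomial | 
|---|
|  | // evaluates to roughly 0.159121 - 0.051854 + 0.024761 - 0.007055 ~= 0.12497, very nearly 1/8 of a | 
|---|
|  | // turn (45 degrees); none of the quadrant fix-ups change it since xabs == yabs and X,Y > 0. | 
|---|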
| 2533 |  | 
|---|
| 2534 | STAGE(xy_to_radius, Ctx::None) { | 
|---|
| 2535 | F X2 = r * r, | 
|---|
| 2536 | Y2 = g * g; | 
|---|
| 2537 | r = sqrt_(X2 + Y2); | 
|---|
| 2538 | } | 
|---|
| 2539 |  | 
|---|
| 2540 | // Please see https://skia.org/dev/design/conical for how our 2pt conical shader works. | 
|---|
| 2541 |  | 
|---|
| 2542 | STAGE(negate_x, Ctx::None) { r = -r; } | 
|---|
| 2543 |  | 
|---|
| 2544 | STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) { | 
|---|
| 2545 | F x = r, y = g, &t = r; | 
|---|
| 2546 | t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0 | 
|---|
| 2547 | } | 
|---|
| 2548 |  | 
|---|
| 2549 | STAGE(xy_to_2pt_conical_focal_on_circle, Ctx::None) { | 
|---|
| 2550 | F x = r, y = g, &t = r; | 
|---|
| 2551 | t = x + y*y / x; // (x^2 + y^2) / x | 
|---|
| 2552 | } | 
|---|
| 2553 |  | 
|---|
| 2554 | STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) { | 
|---|
| 2555 | F x = r, y = g, &t = r; | 
|---|
| 2556 | t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 | 
|---|
| 2557 | } | 
|---|
| 2558 |  | 
|---|
| 2559 | STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) { | 
|---|
| 2560 | F x = r, y = g, &t = r; | 
|---|
| 2561 | t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 | 
|---|
| 2562 | } | 
|---|
| 2563 |  | 
|---|
| 2564 | STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) { | 
|---|
| 2565 | F x = r, y = g, &t = r; | 
|---|
| 2566 | t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 | 
|---|
| 2567 | } | 
|---|
| 2568 |  | 
|---|
| 2569 | STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) { | 
|---|
| 2570 | F& t = r; | 
|---|
| 2571 | t = t + ctx->fP1; // ctx->fP1 = f | 
|---|
| 2572 | } | 
|---|
| 2573 |  | 
|---|
| 2574 | STAGE(alter_2pt_conical_unswap, Ctx::None) { | 
|---|
| 2575 | F& t = r; | 
|---|
| 2576 | t = 1 - t; | 
|---|
| 2577 | } | 
|---|
| 2578 |  | 
|---|
| 2579 | STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) { | 
|---|
| 2580 | F& t = r; | 
|---|
| 2581 | auto is_degenerate = (t != t); // NaN | 
|---|
| 2582 | t = if_then_else(is_degenerate, F(0), t); | 
|---|
| 2583 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); | 
|---|
| 2584 | } | 
|---|
| 2585 |  | 
|---|
| 2586 | STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) { | 
|---|
| 2587 | F& t = r; | 
|---|
| 2588 | auto is_degenerate = (t <= 0) | (t != t); | 
|---|
| 2589 | t = if_then_else(is_degenerate, F(0), t); | 
|---|
| 2590 | sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); | 
|---|
| 2591 | } | 
|---|
| 2592 |  | 
|---|
| 2593 | STAGE(apply_vector_mask, const uint32_t* ctx) { | 
|---|
| 2594 | const U32 mask = sk_unaligned_load<U32>(ctx); | 
|---|
| 2595 | r = bit_cast<F>(bit_cast<U32>(r) & mask); | 
|---|
| 2596 | g = bit_cast<F>(bit_cast<U32>(g) & mask); | 
|---|
| 2597 | b = bit_cast<F>(bit_cast<U32>(b) & mask); | 
|---|
| 2598 | a = bit_cast<F>(bit_cast<U32>(a) & mask); | 
|---|
| 2599 | } | 
|---|
| 2600 |  | 
|---|
| 2601 | STAGE(save_xy, SkRasterPipeline_SamplerCtx* c) { | 
|---|
| 2602 | // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy). | 
|---|
| 2603 | // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid | 
|---|
| 2604 | // surrounding (x,y) at (0.5,0.5) off-center. | 
|---|
| 2605 | F fx = fract(r + 0.5f), | 
|---|
| 2606 | fy = fract(g + 0.5f); | 
|---|
| 2607 |  | 
|---|
| 2608 | // Samplers will need to load x and fx, or y and fy. | 
|---|
| 2609 | sk_unaligned_store(c->x,  r); | 
|---|
| 2610 | sk_unaligned_store(c->y,  g); | 
|---|
| 2611 | sk_unaligned_store(c->fx, fx); | 
|---|
| 2612 | sk_unaligned_store(c->fy, fy); | 
|---|
| 2613 | } | 
|---|
| 2614 |  | 
|---|
| 2615 | STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) { | 
|---|
| 2616 | // Bilinear and bicubic filters are both separable, so we produce independent contributions | 
|---|
| 2617 | // from x and y, multiplying them together here to get each pixel's total scale factor. | 
|---|
| 2618 | auto scale = sk_unaligned_load<F>(c->scalex) | 
|---|
| 2619 | * sk_unaligned_load<F>(c->scaley); | 
|---|
| 2620 | dr = mad(scale, r, dr); | 
|---|
| 2621 | dg = mad(scale, g, dg); | 
|---|
| 2622 | db = mad(scale, b, db); | 
|---|
| 2623 | da = mad(scale, a, da); | 
|---|
| 2624 | } | 
|---|
| 2625 |  | 
|---|
| 2626 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center | 
|---|
| 2627 | // are combined in direct proportion to their area overlapping that logical query pixel. | 
|---|
| 2628 | // At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x. | 
|---|
| 2629 | // The y-axis is symmetric. | 
|---|
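|  | // Concretely (hypothetical offsets): with fx = 0.25 and fy = 0.5, the four samples end up weighted | 
|---|
|  | // 0.75*0.5, 0.25*0.5, 0.75*0.5 and 0.25*0.5 -- the per-axis factors stored by bilinear_x/bilinear_y | 
|---|
|  | // and multiplied together in accumulate() -- and those weights sum to 1 as expected. | 
|---|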
| 2630 |  | 
|---|
| 2631 | template <int kScale> | 
|---|
| 2632 | SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { | 
|---|
| 2633 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); | 
|---|
| 2634 | F fx = sk_unaligned_load<F>(ctx->fx); | 
|---|
| 2635 |  | 
|---|
| 2636 | F scalex; | 
|---|
| 2637 | if (kScale == -1) { scalex = 1.0f - fx; } | 
|---|
| 2638 | if (kScale == +1) { scalex =        fx; } | 
|---|
| 2639 | sk_unaligned_store(ctx->scalex, scalex); | 
|---|
| 2640 | } | 
|---|
| 2641 | template <int kScale> | 
|---|
| 2642 | SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { | 
|---|
| 2643 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); | 
|---|
| 2644 | F fy = sk_unaligned_load<F>(ctx->fy); | 
|---|
| 2645 |  | 
|---|
| 2646 | F scaley; | 
|---|
| 2647 | if (kScale == -1) { scaley = 1.0f - fy; } | 
|---|
| 2648 | if (kScale == +1) { scaley =        fy; } | 
|---|
| 2649 | sk_unaligned_store(ctx->scaley, scaley); | 
|---|
| 2650 | } | 
|---|
| 2651 |  | 
|---|
| 2652 | STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); } | 
|---|
| 2653 | STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); } | 
|---|
| 2654 | STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); } | 
|---|
| 2655 | STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); } | 
|---|
| 2656 |  | 
|---|
| 2657 |  | 
|---|
| 2658 | // In bicubic interpolation, the 16 pixels and +/- 0.5 and +/- 1.5 offsets from the sample | 
|---|
| 2659 | // pixel center are combined with a non-uniform cubic filter, with higher values near the center. | 
|---|
| 2660 | // | 
|---|
| 2661 | // We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets. | 
|---|
| 2662 | // See GrCubicEffect for details of this particular filter. | 
|---|
| 2663 |  | 
|---|
| 2664 | SI F bicubic_near(F t) { | 
|---|
| 2665 | // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18 | 
|---|
| 2666 | return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f)); | 
|---|
| 2667 | } | 
|---|
| 2668 | SI F bicubic_far(F t) { | 
|---|
| 2669 | // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18) | 
|---|
| 2670 | return (t*t)*mad((7/18.0f), t, (-6/18.0f)); | 
|---|
| 2671 | } | 
|---|
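|  | // Quick consistency check (worked by hand): for any t in [0,1], bicubic_near(t) + bicubic_far(t) + | 
|---|
|  | // bicubic_near(1-t) + bicubic_far(1-t) == 18/18 == 1 (each near/far pair contributes 9/18 at t = 0.5), | 
|---|
|  | // so the per-axis 4-tap filter preserves overall brightness. | 
|---|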
| 2672 |  | 
|---|
| 2673 | template <int kScale> | 
|---|
| 2674 | SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { | 
|---|
| 2675 | *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); | 
|---|
| 2676 | F fx = sk_unaligned_load<F>(ctx->fx); | 
|---|
| 2677 |  | 
|---|
| 2678 | F scalex; | 
|---|
| 2679 | if (kScale == -3) { scalex = bicubic_far (1.0f - fx); } | 
|---|
| 2680 | if (kScale == -1) { scalex = bicubic_near(1.0f - fx); } | 
|---|
| 2681 | if (kScale == +1) { scalex = bicubic_near(       fx); } | 
|---|
| 2682 | if (kScale == +3) { scalex = bicubic_far (       fx); } | 
|---|
| 2683 | sk_unaligned_store(ctx->scalex, scalex); | 
|---|
| 2684 | } | 
|---|
| 2685 | template <int kScale> | 
|---|
| 2686 | SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { | 
|---|
| 2687 | *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); | 
|---|
| 2688 | F fy = sk_unaligned_load<F>(ctx->fy); | 
|---|
| 2689 |  | 
|---|
| 2690 | F scaley; | 
|---|
| 2691 | if (kScale == -3) { scaley = bicubic_far (1.0f - fy); } | 
|---|
| 2692 | if (kScale == -1) { scaley = bicubic_near(1.0f - fy); } | 
|---|
| 2693 | if (kScale == +1) { scaley = bicubic_near(       fy); } | 
|---|
| 2694 | if (kScale == +3) { scaley = bicubic_far (       fy); } | 
|---|
| 2695 | sk_unaligned_store(ctx->scaley, scaley); | 
|---|
| 2696 | } | 
|---|
| 2697 |  | 
|---|
| 2698 | STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); } | 
|---|
| 2699 | STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); } | 
|---|
| 2700 | STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); } | 
|---|
| 2701 | STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); } | 
|---|
| 2702 |  | 
|---|
| 2703 | STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); } | 
|---|
| 2704 | STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); } | 
|---|
| 2705 | STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); } | 
|---|
| 2706 | STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); } | 
|---|
| 2707 |  | 
|---|
| 2708 | STAGE(callback, SkRasterPipeline_CallbackCtx* c) { | 
|---|
| 2709 | store4(c->rgba,0, r,g,b,a); | 
|---|
| 2710 | c->fn(c, tail ? tail : N); | 
|---|
| 2711 | load4(c->read_from,0, &r,&g,&b,&a); | 
|---|
| 2712 | } | 
|---|
| 2713 |  | 
|---|
| 2714 | // shader:      void main(float2 p, inout half4 color) | 
|---|
| 2715 | // colorfilter: void main(inout half4 color) | 
|---|
| 2716 | STAGE(interpreter, SkRasterPipeline_InterpreterCtx* c) { | 
|---|
| 2717 | // If N is less than the interpreter's VecWidth, then we are doing more work than necessary in | 
|---|
| 2718 | // the interpreter. This is a known issue, and will be addressed at some point. | 
|---|
| 2719 | float xx[N], yy[N], | 
|---|
| 2720 | rr[N], gg[N], bb[N], aa[N]; | 
|---|
| 2721 |  | 
|---|
| 2722 | float*  args[]  = { xx, yy, rr, gg, bb, aa }; | 
|---|
| 2723 | float** in_args = args; | 
|---|
| 2724 | int     in_count = 6; | 
|---|
| 2725 |  | 
|---|
| 2726 | if (c->shaderConvention) { | 
|---|
| 2727 | // our caller must have called seed_shader to set these | 
|---|
| 2728 | sk_unaligned_store(xx, r); | 
|---|
| 2729 | sk_unaligned_store(yy, g); | 
|---|
| 2730 | sk_unaligned_store(rr, F(c->paintColor.fR)); | 
|---|
| 2731 | sk_unaligned_store(gg, F(c->paintColor.fG)); | 
|---|
| 2732 | sk_unaligned_store(bb, F(c->paintColor.fB)); | 
|---|
| 2733 | sk_unaligned_store(aa, F(c->paintColor.fA)); | 
|---|
| 2734 | } else { | 
|---|
| 2735 | in_args += 2;   // skip x,y | 
|---|
| 2736 | in_count = 4; | 
|---|
| 2737 | sk_unaligned_store(rr, r); | 
|---|
| 2738 | sk_unaligned_store(gg, g); | 
|---|
| 2739 | sk_unaligned_store(bb, b); | 
|---|
| 2740 | sk_unaligned_store(aa, a); | 
|---|
| 2741 | } | 
|---|
| 2742 |  | 
|---|
| 2743 | SkAssertResult(c->byteCode->runStriped(c->fn, tail ? tail : N, in_args, in_count, | 
|---|
| 2744 | nullptr, 0, (const float*)c->inputs, c->ninputs)); | 
|---|
| 2745 |  | 
|---|
| 2746 | r = sk_unaligned_load<F>(rr); | 
|---|
| 2747 | g = sk_unaligned_load<F>(gg); | 
|---|
| 2748 | b = sk_unaligned_load<F>(bb); | 
|---|
| 2749 | a = sk_unaligned_load<F>(aa); | 
|---|
| 2750 | } | 
|---|
| 2751 |  | 
|---|
| 2752 | STAGE(gauss_a_to_rgba, Ctx::None) { | 
|---|
| 2753 | // x = 1 - x; | 
|---|
| 2754 | // exp(-x * x * 4) - 0.018f; | 
|---|
| 2755 | // ... now approximate with quartic | 
|---|
| 2756 | // | 
|---|
| 2757 | const float c4 = -2.26661229133605957031f; | 
|---|
| 2758 | const float c3 = 2.89795351028442382812f; | 
|---|
| 2759 | const float c2 = 0.21345567703247070312f; | 
|---|
| 2760 | const float c1 = 0.15489584207534790039f; | 
|---|
| 2761 | const float c0 = 0.00030726194381713867f; | 
|---|
| 2762 | a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0); | 
|---|
| 2763 | r = a; | 
|---|
| 2764 | g = a; | 
|---|
| 2765 | b = a; | 
|---|
| 2766 | } | 
|---|
| 2767 |  | 
|---|
| 2768 | SI F tile(F v, SkTileMode mode, float limit, float invLimit) { | 
|---|
| 2769 | // The ix_and_ptr() calls in sample() will clamp tile()'s output, so no need to clamp here. | 
|---|
| 2770 | switch (mode) { | 
|---|
| 2771 | case SkTileMode::kDecal:  // TODO, for now fallthrough to clamp | 
|---|
| 2772 | case SkTileMode::kClamp:  return v; | 
|---|
| 2773 | case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; | 
|---|
| 2774 | case SkTileMode::kMirror: | 
|---|
| 2775 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); | 
|---|
| 2776 | } | 
|---|
| 2777 | SkUNREACHABLE; | 
|---|
| 2778 | } | 
|---|
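|  | // Worked example (hypothetical limit, not from a real ctx): with limit = 4 and invLimit = 0.25, | 
|---|
|  | // kRepeat maps v = 5 to 5 - floor(5*0.25)*4 = 1, while kMirror maps it to | 
|---|
|  | // abs((5-4) - 8*floor((5-4)*0.125) - 4) = 3, i.e. one texel reflected back from the edge. | 
|---|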
| 2779 |  | 
|---|
| 2780 | SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, | 
|---|
| 2781 | F* r, F* g, F* b, F* a) { | 
|---|
| 2782 | x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); | 
|---|
| 2783 | y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); | 
|---|
| 2784 |  | 
|---|
| 2785 | switch (ctx->ct) { | 
|---|
| 2786 | default: *r = *g = *b = *a = 0;  // TODO | 
|---|
| 2787 | break; | 
|---|
| 2788 |  | 
|---|
| 2789 | case kRGBA_8888_SkColorType: | 
|---|
| 2790 | case kBGRA_8888_SkColorType: { | 
|---|
| 2791 | const uint32_t* ptr; | 
|---|
| 2792 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); | 
|---|
| 2793 | from_8888(gather(ptr, ix), r,g,b,a); | 
|---|
| 2794 | if (ctx->ct == kBGRA_8888_SkColorType) { | 
|---|
| 2795 | std::swap(*r,*b); | 
|---|
| 2796 | } | 
|---|
| 2797 | } break; | 
|---|
| 2798 | } | 
|---|
| 2799 | } | 
|---|
| 2800 |  | 
|---|
| 2801 | template <int D> | 
|---|
| 2802 | SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, | 
|---|
| 2803 | F cx, F cy, const F (&wx)[D], const F (&wy)[D], | 
|---|
| 2804 | F* r, F* g, F* b, F* a) { | 
|---|
| 2805 |  | 
|---|
| 2806 | float start = -0.5f*(D-1); | 
|---|
| 2807 |  | 
|---|
| 2808 | *r = *g = *b = *a = 0; | 
|---|
| 2809 | F y = cy + start; | 
|---|
| 2810 | for (int j = 0; j < D; j++, y += 1.0f) { | 
|---|
| 2811 | F x = cx + start; | 
|---|
| 2812 | for (int i = 0; i < D; i++, x += 1.0f) { | 
|---|
| 2813 | F R,G,B,A; | 
|---|
| 2814 | sample(ctx, x,y, &R,&G,&B,&A); | 
|---|
| 2815 |  | 
|---|
| 2816 | F w = wx[i] * wy[j]; | 
|---|
| 2817 | *r = mad(w,R,*r); | 
|---|
| 2818 | *g = mad(w,G,*g); | 
|---|
| 2819 | *b = mad(w,B,*b); | 
|---|
| 2820 | *a = mad(w,A,*a); | 
|---|
| 2821 | } | 
|---|
| 2822 | } | 
|---|
| 2823 | } | 
|---|
| 2824 |  | 
|---|
| 2825 | STAGE(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { | 
|---|
| 2826 | F x = r, fx = fract(x + 0.5f), | 
|---|
| 2827 | y = g, fy = fract(y + 0.5f); | 
|---|
| 2828 | const F wx[] = {1.0f - fx, fx}; | 
|---|
| 2829 | const F wy[] = {1.0f - fy, fy}; | 
|---|
| 2830 |  | 
|---|
| 2831 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); | 
|---|
| 2832 | } | 
|---|
| 2833 | STAGE(bicubic, SkRasterPipeline_SamplerCtx2* ctx) { | 
|---|
| 2834 | F x = r, fx = fract(x + 0.5f), | 
|---|
| 2835 | y = g, fy = fract(y + 0.5f); | 
|---|
| 2836 | const F wx[] = { bicubic_far(1-fx), bicubic_near(1-fx), bicubic_near(fx), bicubic_far(fx) }; | 
|---|
| 2837 | const F wy[] = { bicubic_far(1-fy), bicubic_near(1-fy), bicubic_near(fy), bicubic_far(fy) }; | 
|---|
| 2838 |  | 
|---|
| 2839 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); | 
|---|
| 2840 | } | 
|---|
| 2841 |  | 
|---|
| 2842 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. | 
|---|
| 2843 | STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2844 | // (cx,cy) are the center of our sample. | 
|---|
| 2845 | F cx = r, | 
|---|
| 2846 | cy = g; | 
|---|
| 2847 |  | 
|---|
| 2848 | // All sample points are at the same fractional offset (fx,fy). | 
|---|
| 2849 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. | 
|---|
| 2850 | F fx = fract(cx + 0.5f), | 
|---|
| 2851 | fy = fract(cy + 0.5f); | 
|---|
| 2852 |  | 
|---|
| 2853 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. | 
|---|
| 2854 | r = g = b = a = 0; | 
|---|
| 2855 |  | 
|---|
| 2856 | for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f) | 
|---|
| 2857 | for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) { | 
|---|
| 2858 | // (x,y) are the coordinates of this sample point. | 
|---|
| 2859 | F x = cx + dx, | 
|---|
| 2860 | y = cy + dy; | 
|---|
| 2861 |  | 
|---|
| 2862 | // ix_and_ptr() will clamp to the image's bounds for us. | 
|---|
| 2863 | const uint32_t* ptr; | 
|---|
| 2864 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); | 
|---|
| 2865 |  | 
|---|
| 2866 | F sr,sg,sb,sa; | 
|---|
| 2867 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); | 
|---|
| 2868 |  | 
|---|
| 2869 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center | 
|---|
| 2870 | // are combined in direct proportion to their area overlapping that logical query pixel. | 
|---|
| 2871 | // At positive offsets, the x-axis contribution to that rectangle is fx, | 
|---|
| 2872 | // or (1-fx) at negative x.  Same deal for y. | 
|---|
| 2873 | F sx = (dx > 0) ? fx : 1.0f - fx, | 
|---|
| 2874 | sy = (dy > 0) ? fy : 1.0f - fy, | 
|---|
| 2875 | area = sx * sy; | 
|---|
| 2876 |  | 
|---|
| 2877 | r += sr * area; | 
|---|
| 2878 | g += sg * area; | 
|---|
| 2879 | b += sb * area; | 
|---|
| 2880 | a += sa * area; | 
|---|
| 2881 | } | 
|---|
| 2882 | } | 
|---|
| 2883 |  | 
|---|
| 2884 | // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. | 
|---|
| 2885 | STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 2886 | // (cx,cy) are the center of our sample. | 
|---|
| 2887 | F cx = r, | 
|---|
| 2888 | cy = g; | 
|---|
| 2889 |  | 
|---|
| 2890 | // All sample points are at the same fractional offset (fx,fy). | 
|---|
| 2891 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. | 
|---|
| 2892 | F fx = fract(cx + 0.5f), | 
|---|
| 2893 | fy = fract(cy + 0.5f); | 
|---|
| 2894 |  | 
|---|
| 2895 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. | 
|---|
| 2896 | r = g = b = a = 0; | 
|---|
| 2897 |  | 
|---|
| 2898 | const F scaley[4] = { | 
|---|
| 2899 | bicubic_far (1.0f - fy), bicubic_near(1.0f - fy), | 
|---|
| 2900 | bicubic_near(       fy), bicubic_far (       fy), | 
|---|
| 2901 | }; | 
|---|
| 2902 | const F scalex[4] = { | 
|---|
| 2903 | bicubic_far (1.0f - fx), bicubic_near(1.0f - fx), | 
|---|
| 2904 | bicubic_near(       fx), bicubic_far (       fx), | 
|---|
| 2905 | }; | 
|---|
| 2906 |  | 
|---|
| 2907 | F sample_y = cy - 1.5f; | 
|---|
| 2908 | for (int yy = 0; yy <= 3; ++yy) { | 
|---|
| 2909 | F sample_x = cx - 1.5f; | 
|---|
| 2910 | for (int xx = 0; xx <= 3; ++xx) { | 
|---|
| 2911 | F scale = scalex[xx] * scaley[yy]; | 
|---|
| 2912 |  | 
|---|
| 2913 | // ix_and_ptr() will clamp to the image's bounds for us. | 
|---|
| 2914 | const uint32_t* ptr; | 
|---|
| 2915 | U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y); | 
|---|
| 2916 |  | 
|---|
| 2917 | F sr,sg,sb,sa; | 
|---|
| 2918 | from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); | 
|---|
| 2919 |  | 
|---|
| 2920 | r = mad(scale, sr, r); | 
|---|
| 2921 | g = mad(scale, sg, g); | 
|---|
| 2922 | b = mad(scale, sb, b); | 
|---|
| 2923 | a = mad(scale, sa, a); | 
|---|
| 2924 |  | 
|---|
| 2925 | sample_x += 1; | 
|---|
| 2926 | } | 
|---|
| 2927 | sample_y += 1; | 
|---|
| 2928 | } | 
|---|
| 2929 | } | 
|---|
| 2930 |  | 
|---|
| 2931 | // ~~~~~~ GrSwizzle stage ~~~~~~ // | 
|---|
| 2932 |  | 
|---|
| 2933 | STAGE(swizzle, void* ctx) { | 
|---|
| 2934 | auto ir = r, ig = g, ib = b, ia = a; | 
|---|
| 2935 | F* o[] = {&r, &g, &b, &a}; | 
|---|
| 2936 | char swiz[4]; | 
|---|
| 2937 | memcpy(swiz, &ctx, sizeof(swiz)); | 
|---|
| 2938 |  | 
|---|
| 2939 | for (int i = 0; i < 4; ++i) { | 
|---|
| 2940 | switch (swiz[i]) { | 
|---|
| 2941 | case 'r': *o[i] = ir;   break; | 
|---|
| 2942 | case 'g': *o[i] = ig;   break; | 
|---|
| 2943 | case 'b': *o[i] = ib;   break; | 
|---|
| 2944 | case 'a': *o[i] = ia;   break; | 
|---|
| 2945 | case '0': *o[i] = F(0); break; | 
|---|
| 2946 | case '1': *o[i] = F(1); break; | 
|---|
| 2947 | default:                break; | 
|---|
| 2948 | } | 
|---|
| 2949 | } | 
|---|
| 2950 | } | 
|---|
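|  | // E.g. a context whose packed bytes are 'b','g','r','a' (hypothetical) swaps the red and blue | 
|---|
|  | // channels, while '0' and '1' let a swizzle force a channel to a constant 0 or 1. | 
|---|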
| 2951 |  | 
|---|
| 2952 | namespace lowp { | 
|---|
| 2953 | #if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE) | 
|---|
| 2954 | // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually), | 
|---|
| 2955 | // we don't generate lowp stages.  All these nullptrs will tell SkJumper.cpp to always use the | 
|---|
| 2956 | // highp float pipeline. | 
|---|
| 2957 | #define M(st) static void (*st)(void) = nullptr; | 
|---|
| 2958 | SK_RASTER_PIPELINE_STAGES(M) | 
|---|
| 2959 | #undef M | 
|---|
| 2960 | static void (*just_return)(void) = nullptr; | 
|---|
| 2961 |  | 
|---|
| 2962 | static void start_pipeline(size_t,size_t,size_t,size_t, void**) {} | 
|---|
| 2963 |  | 
|---|
| 2964 | #else  // We are compiling vector code with Clang... let's make some lowp stages! | 
|---|
| 2965 |  | 
|---|
| 2966 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) | 
|---|
| 2967 | using U8  = uint8_t  __attribute__((ext_vector_type(16))); | 
|---|
| 2968 | using U16 = uint16_t __attribute__((ext_vector_type(16))); | 
|---|
| 2969 | using I16 =  int16_t __attribute__((ext_vector_type(16))); | 
|---|
| 2970 | using I32 =  int32_t __attribute__((ext_vector_type(16))); | 
|---|
| 2971 | using U32 = uint32_t __attribute__((ext_vector_type(16))); | 
|---|
| 2972 | using F   = float    __attribute__((ext_vector_type(16))); | 
|---|
| 2973 | #else | 
|---|
| 2974 | using U8  = uint8_t  __attribute__((ext_vector_type(8))); | 
|---|
| 2975 | using U16 = uint16_t __attribute__((ext_vector_type(8))); | 
|---|
| 2976 | using I16 =  int16_t __attribute__((ext_vector_type(8))); | 
|---|
| 2977 | using I32 =  int32_t __attribute__((ext_vector_type(8))); | 
|---|
| 2978 | using U32 = uint32_t __attribute__((ext_vector_type(8))); | 
|---|
| 2979 | using F   = float    __attribute__((ext_vector_type(8))); | 
|---|
| 2980 | #endif | 
|---|
| 2981 |  | 
|---|
| 2982 | static const size_t N = sizeof(U16) / sizeof(uint16_t); | 
|---|
| 2983 |  | 
|---|
| 2984 | // Once again, some platforms benefit from a restricted Stage calling convention, | 
|---|
| 2985 | // but others can pass tons and tons of registers and we're happy to exploit that. | 
|---|
| 2986 | // It's exactly the same decision and implementation strategy as the F stages above. | 
|---|
| 2987 | #if JUMPER_NARROW_STAGES | 
|---|
| 2988 | struct Params { | 
|---|
| 2989 | size_t dx, dy, tail; | 
|---|
| 2990 | U16 dr,dg,db,da; | 
|---|
| 2991 | }; | 
|---|
| 2992 | using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a); | 
|---|
| 2993 | #else | 
|---|
| 2994 | // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64. | 
|---|
| 2995 | using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy, | 
|---|
| 2996 | U16  r, U16  g, U16  b, U16  a, | 
|---|
| 2997 | U16 dr, U16 dg, U16 db, U16 da); | 
|---|
| 2998 | #endif | 
|---|
| 2999 |  | 
|---|
| 3000 | static void start_pipeline(const size_t x0,     const size_t y0, | 
|---|
| 3001 | const size_t xlimit, const size_t ylimit, void** program) { | 
|---|
| 3002 | auto start = (Stage)load_and_inc(program); | 
|---|
| 3003 | for (size_t dy = y0; dy < ylimit; dy++) { | 
|---|
| 3004 | #if JUMPER_NARROW_STAGES | 
|---|
| 3005 | Params params = { x0,dy,0, 0,0,0,0 }; | 
|---|
| 3006 | for (; params.dx + N <= xlimit; params.dx += N) { | 
|---|
| 3007 | start(&params,program, 0,0,0,0); | 
|---|
| 3008 | } | 
|---|
| 3009 | if (size_t tail = xlimit - params.dx) { | 
|---|
| 3010 | params.tail = tail; | 
|---|
| 3011 | start(&params,program, 0,0,0,0); | 
|---|
| 3012 | } | 
|---|
| 3013 | #else | 
|---|
| 3014 | size_t dx = x0; | 
|---|
| 3015 | for (; dx + N <= xlimit; dx += N) { | 
|---|
| 3016 | start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0); | 
|---|
| 3017 | } | 
|---|
| 3018 | if (size_t tail = xlimit - dx) { | 
|---|
| 3019 | start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); | 
|---|
| 3020 | } | 
|---|
| 3021 | #endif | 
|---|
| 3022 | } | 
|---|
| 3023 | } | 
|---|
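|  | // E.g. (hypothetical sizes) with x0 = 0, xlimit = 20, and N = 8, each row makes two full-width | 
|---|
|  | // calls with tail == 0 and then one final call with tail == 4 covering the leftover pixels. | 
|---|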
| 3024 |  | 
|---|
| 3025 | #if JUMPER_NARROW_STAGES | 
|---|
| 3026 | static void ABI just_return(Params*, void**, U16,U16,U16,U16) {} | 
|---|
| 3027 | #else | 
|---|
| 3028 | static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {} | 
|---|
| 3029 | #endif | 
|---|
| 3030 |  | 
|---|
| 3031 | // All stages use the same function call ABI to chain into each other, but there are three types: | 
|---|
| 3032 | //   GG: geometry in, geometry out  -- think, a matrix | 
|---|
| 3033 | //   GP: geometry in, pixels out.   -- think, a memory gather | 
|---|
| 3034 | //   PP: pixels in, pixels out.     -- think, a blend mode | 
|---|
| 3035 | // | 
|---|
| 3036 | // (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.) | 
|---|
| 3037 | // | 
|---|
| 3038 | // These three STAGE_ macros let you define each type of stage, | 
|---|
| 3039 | // and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate. | 
|---|
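|  | // For instance, seed_shader and the matrix_ stages below are GG stages, uniform_color and the | 
|---|
|  | // blend modes are PP stages, and GP covers the gathering image stages that turn (x,y) into pixels. | 
|---|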
| 3040 |  | 
|---|
| 3041 | #if JUMPER_NARROW_STAGES | 
|---|
| 3042 | #define STAGE_GG(name, ...)                                                                \ | 
|---|
| 3043 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);          \ | 
|---|
| 3044 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) {     \ | 
|---|
| 3045 | auto x = join<F>(r,g),                                                             \ | 
|---|
| 3046 | y = join<F>(b,a);                                                             \ | 
|---|
| 3047 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y);                   \ | 
|---|
| 3048 | split(x, &r,&g);                                                                   \ | 
|---|
| 3049 | split(y, &b,&a);                                                                   \ | 
|---|
| 3050 | auto next = (Stage)load_and_inc(program);                                          \ | 
|---|
| 3051 | next(params,program, r,g,b,a);                                                     \ | 
|---|
| 3052 | }                                                                                      \ | 
|---|
| 3053 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) | 
|---|
| 3054 |  | 
|---|
| 3055 | #define STAGE_GP(name, ...)                                                            \ | 
|---|
| 3056 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \ | 
|---|
| 3057 | U16&  r, U16&  g, U16&  b, U16&  a,                               \ | 
|---|
| 3058 | U16& dr, U16& dg, U16& db, U16& da);                              \ | 
|---|
| 3059 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ | 
|---|
| 3060 | auto x = join<F>(r,g),                                                         \ | 
|---|
| 3061 | y = join<F>(b,a);                                                         \ | 
|---|
| 3062 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a,       \ | 
|---|
| 3063 | params->dr,params->dg,params->db,params->da);                         \ | 
|---|
| 3064 | auto next = (Stage)load_and_inc(program);                                      \ | 
|---|
| 3065 | next(params,program, r,g,b,a);                                                 \ | 
|---|
| 3066 | }                                                                                  \ | 
|---|
| 3067 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \ | 
|---|
| 3068 | U16&  r, U16&  g, U16&  b, U16&  a,                               \ | 
|---|
| 3069 | U16& dr, U16& dg, U16& db, U16& da) | 
|---|
| 3070 |  | 
|---|
| 3071 | #define STAGE_PP(name, ...)                                                            \ | 
|---|
| 3072 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \ | 
|---|
| 3073 | U16&  r, U16&  g, U16&  b, U16&  a,                               \ | 
|---|
| 3074 | U16& dr, U16& dg, U16& db, U16& da);                              \ | 
|---|
| 3075 | static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ | 
|---|
| 3076 | name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a,            \ | 
|---|
| 3077 | params->dr,params->dg,params->db,params->da);                         \ | 
|---|
| 3078 | auto next = (Stage)load_and_inc(program);                                      \ | 
|---|
| 3079 | next(params,program, r,g,b,a);                                                 \ | 
|---|
| 3080 | }                                                                                  \ | 
|---|
| 3081 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \ | 
|---|
| 3082 | U16&  r, U16&  g, U16&  b, U16&  a,                               \ | 
|---|
| 3083 | U16& dr, U16& dg, U16& db, U16& da) | 
|---|
| 3084 | #else | 
|---|
| 3085 | #define STAGE_GG(name, ...)                                                            \ | 
|---|
| 3086 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \ | 
|---|
| 3087 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \ | 
|---|
| 3088 | U16  r, U16  g, U16  b, U16  a,                               \ | 
|---|
| 3089 | U16 dr, U16 dg, U16 db, U16 da) {                             \ | 
|---|
| 3090 | auto x = join<F>(r,g),                                                         \ | 
|---|
| 3091 | y = join<F>(b,a);                                                         \ | 
|---|
| 3092 | name##_k(Ctx{program}, dx,dy,tail, x,y);                                       \ | 
|---|
| 3093 | split(x, &r,&g);                                                               \ | 
|---|
| 3094 | split(y, &b,&a);                                                               \ | 
|---|
| 3095 | auto next = (Stage)load_and_inc(program);                                      \ | 
|---|
| 3096 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \ | 
|---|
| 3097 | }                                                                                  \ | 
|---|
| 3098 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) | 
|---|
| 3099 |  | 
|---|
| 3100 | #define STAGE_GP(name, ...)                                                            \ | 
|---|
| 3101 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \ | 
|---|
| 3102 | U16&  r, U16&  g, U16&  b, U16&  a,                               \ | 
|---|
| 3103 | U16& dr, U16& dg, U16& db, U16& da);                              \ | 
|---|
| 3104 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \ | 
|---|
| 3105 | U16  r, U16  g, U16  b, U16  a,                               \ | 
|---|
| 3106 | U16 dr, U16 dg, U16 db, U16 da) {                             \ | 
|---|
| 3107 | auto x = join<F>(r,g),                                                         \ | 
|---|
| 3108 | y = join<F>(b,a);                                                         \ | 
|---|
| 3109 | name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);                 \ | 
|---|
| 3110 | auto next = (Stage)load_and_inc(program);                                      \ | 
|---|
| 3111 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \ | 
|---|
| 3112 | }                                                                                  \ | 
|---|
| 3113 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \ | 
|---|
| 3114 | U16&  r, U16&  g, U16&  b, U16&  a,                               \ | 
|---|
| 3115 | U16& dr, U16& dg, U16& db, U16& da) | 
|---|
| 3116 |  | 
|---|
| 3117 | #define STAGE_PP(name, ...)                                                            \ | 
|---|
| 3118 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \ | 
|---|
| 3119 | U16&  r, U16&  g, U16&  b, U16&  a,                               \ | 
|---|
| 3120 | U16& dr, U16& dg, U16& db, U16& da);                              \ | 
|---|
| 3121 | static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \ | 
|---|
| 3122 | U16  r, U16  g, U16  b, U16  a,                               \ | 
|---|
| 3123 | U16 dr, U16 dg, U16 db, U16 da) {                             \ | 
|---|
| 3124 | name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \ | 
|---|
| 3125 | auto next = (Stage)load_and_inc(program);                                      \ | 
|---|
| 3126 | next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \ | 
|---|
| 3127 | }                                                                                  \ | 
|---|
| 3128 | SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \ | 
|---|
| 3129 | U16&  r, U16&  g, U16&  b, U16&  a,                               \ | 
|---|
| 3130 | U16& dr, U16& dg, U16& db, U16& da) | 
|---|
| 3131 | #endif | 
|---|
| 3132 |  | 
|---|
| 3133 | // ~~~~~~ Commonly used helper functions ~~~~~~ // | 
|---|
| 3134 |  | 
|---|
| 3135 | SI U16 div255(U16 v) { | 
|---|
| 3136 | #if 0 | 
|---|
| 3137 | return (v+127)/255;  // The ideal rounding divide by 255. | 
|---|
| 3138 | #elif 1 && defined(JUMPER_IS_NEON) | 
|---|
| 3139 | // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8 | 
|---|
| 3140 | // just as fast as we can do the approximation below, so might as well be correct! | 
|---|
| 3141 | // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up. | 
|---|
| 3142 | return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8); | 
|---|
| 3143 | #else | 
|---|
| 3144 | return (v+255)/256;  // A good approximation of (v+127)/255. | 
|---|
| 3145 | #endif | 
|---|
| 3146 | } | 
|---|
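|  | // Worked by hand as a sanity check: for v = 255*255 (the largest product of two channel values), | 
|---|
|  | // the exact form gives (65025+127)/255 = 255 and the approximation gives (65025+255)/256 = 255; | 
|---|
|  | // the two can differ by 1 for some inputs (e.g. v = 127 yields 0 exactly vs 1 approximated). | 
|---|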
| 3147 |  | 
|---|
| 3148 | SI U16 inv(U16 v) { return 255-v; } | 
|---|
| 3149 |  | 
|---|
| 3150 | SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); } | 
|---|
| 3151 | SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); } | 
|---|
| 3152 |  | 
|---|
| 3153 | SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); } | 
|---|
| 3154 | SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); } | 
|---|
| 3155 |  | 
|---|
| 3156 | SI U16 from_float(float f) { return f * 255.0f + 0.5f; } | 
|---|
| 3157 |  | 
|---|
| 3158 | SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); } | 
|---|
| 3159 |  | 
|---|
| 3160 | template <typename D, typename S> | 
|---|
| 3161 | SI D cast(S src) { | 
|---|
| 3162 | return __builtin_convertvector(src, D); | 
|---|
| 3163 | } | 
|---|
| 3164 |  | 
|---|
| 3165 | template <typename D, typename S> | 
|---|
| 3166 | SI void split(S v, D* lo, D* hi) { | 
|---|
| 3167 | static_assert(2*sizeof(D) == sizeof(S), ""); | 
|---|
| 3168 | memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D)); | 
|---|
| 3169 | memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D)); | 
|---|
| 3170 | } | 
|---|
| 3171 | template <typename D, typename S> | 
|---|
| 3172 | SI D join(S lo, S hi) { | 
|---|
| 3173 | static_assert(sizeof(D) == 2*sizeof(S), ""); | 
|---|
| 3174 | D v; | 
|---|
| 3175 | memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S)); | 
|---|
| 3176 | memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S)); | 
|---|
| 3177 | return v; | 
|---|
| 3178 | } | 
|---|
| 3179 |  | 
|---|
| 3180 | SI F if_then_else(I32 c, F t, F e) { | 
|---|
| 3181 | return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) ); | 
|---|
| 3182 | } | 
|---|
| 3183 | SI F max(F x, F y) { return if_then_else(x < y, y, x); } | 
|---|
| 3184 | SI F min(F x, F y) { return if_then_else(x < y, x, y); } | 
|---|
| 3185 |  | 
|---|
| 3186 | SI F mad(F f, F m, F a) { return f*m+a; } | 
|---|
| 3187 | SI U32 trunc_(F x) { return (U32)cast<I32>(x); } | 
|---|
| 3188 |  | 
|---|
| 3189 | SI F rcp(F x) { | 
|---|
| 3190 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) | 
|---|
| 3191 | __m256 lo,hi; | 
|---|
| 3192 | split(x, &lo,&hi); | 
|---|
| 3193 | return join<F>(_mm256_rcp_ps(lo), _mm256_rcp_ps(hi)); | 
|---|
| 3194 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) | 
|---|
| 3195 | __m128 lo,hi; | 
|---|
| 3196 | split(x, &lo,&hi); | 
|---|
| 3197 | return join<F>(_mm_rcp_ps(lo), _mm_rcp_ps(hi)); | 
|---|
| 3198 | #elif defined(JUMPER_IS_NEON) | 
|---|
| 3199 | auto rcp = [](float32x4_t v) { | 
|---|
| 3200 | auto est = vrecpeq_f32(v); | 
|---|
| 3201 | return vrecpsq_f32(v,est)*est; | 
|---|
| 3202 | }; | 
|---|
| 3203 | float32x4_t lo,hi; | 
|---|
| 3204 | split(x, &lo,&hi); | 
|---|
| 3205 | return join<F>(rcp(lo), rcp(hi)); | 
|---|
| 3206 | #else | 
|---|
| 3207 | return 1.0f / x; | 
|---|
| 3208 | #endif | 
|---|
| 3209 | } | 
|---|
| 3210 | SI F sqrt_(F x) { | 
|---|
| 3211 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) | 
|---|
| 3212 | __m256 lo,hi; | 
|---|
| 3213 | split(x, &lo,&hi); | 
|---|
| 3214 | return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi)); | 
|---|
| 3215 | #elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) | 
|---|
| 3216 | __m128 lo,hi; | 
|---|
| 3217 | split(x, &lo,&hi); | 
|---|
| 3218 | return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi)); | 
|---|
| 3219 | #elif defined(SK_CPU_ARM64) | 
|---|
| 3220 | float32x4_t lo,hi; | 
|---|
| 3221 | split(x, &lo,&hi); | 
|---|
| 3222 | return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi)); | 
|---|
| 3223 | #elif defined(JUMPER_IS_NEON) | 
|---|
| 3224 | auto sqrt = [](float32x4_t v) { | 
|---|
| 3225 | auto est = vrsqrteq_f32(v);  // Estimate and two refinement steps for est = rsqrt(v). | 
|---|
| 3226 | est *= vrsqrtsq_f32(v,est*est); | 
|---|
| 3227 | est *= vrsqrtsq_f32(v,est*est); | 
|---|
| 3228 | return v*est;                // sqrt(v) == v*rsqrt(v). | 
|---|
| 3229 | }; | 
|---|
| 3230 | float32x4_t lo,hi; | 
|---|
| 3231 | split(x, &lo,&hi); | 
|---|
| 3232 | return join<F>(sqrt(lo), sqrt(hi)); | 
|---|
| 3233 | #else | 
|---|
| 3234 | return F{ | 
|---|
| 3235 | sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]), | 
|---|
| 3236 | sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]), | 
|---|
| 3237 | }; | 
|---|
| 3238 | #endif | 
|---|
| 3239 | } | 
|---|
| 3240 |  | 
|---|
| 3241 | SI F floor_(F x) { | 
|---|
| 3242 | #if defined(SK_CPU_ARM64) | 
|---|
| 3243 | float32x4_t lo,hi; | 
|---|
| 3244 | split(x, &lo,&hi); | 
|---|
| 3245 | return join<F>(vrndmq_f32(lo), vrndmq_f32(hi)); | 
|---|
| 3246 | #elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) | 
|---|
| 3247 | __m256 lo,hi; | 
|---|
| 3248 | split(x, &lo,&hi); | 
|---|
| 3249 | return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi)); | 
|---|
| 3250 | #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) | 
|---|
| 3251 | __m128 lo,hi; | 
|---|
| 3252 | split(x, &lo,&hi); | 
|---|
| 3253 | return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi)); | 
|---|
| 3254 | #else | 
|---|
| 3255 | F roundtrip = cast<F>(cast<I32>(x)); | 
|---|
| 3256 | return roundtrip - if_then_else(roundtrip > x, F(1), F(0)); | 
|---|
| 3257 | #endif | 
|---|
| 3258 | } | 
|---|
| 3259 | SI F fract(F x) { return x - floor_(x); } | 
|---|
| 3260 | SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); } | 
|---|
| 3261 |  | 
|---|
| 3262 | // ~~~~~~ Basic / misc. stages ~~~~~~ // | 
|---|
| 3263 |  | 
|---|
| 3264 | STAGE_GG(seed_shader, Ctx::None) { | 
|---|
| 3265 | static const float iota[] = { | 
|---|
| 3266 | 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, | 
|---|
| 3267 | 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, | 
|---|
| 3268 | }; | 
|---|
| 3269 | x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota); | 
|---|
| 3270 | y = cast<F>(I32(dy)) + 0.5f; | 
|---|
| 3271 | } | 
|---|
| 3272 |  | 
|---|
| 3273 | STAGE_GG(matrix_translate, const float* m) { | 
|---|
| 3274 | x += m[0]; | 
|---|
| 3275 | y += m[1]; | 
|---|
| 3276 | } | 
|---|
| 3277 | STAGE_GG(matrix_scale_translate, const float* m) { | 
|---|
| 3278 | x = mad(x,m[0], m[2]); | 
|---|
| 3279 | y = mad(y,m[1], m[3]); | 
|---|
| 3280 | } | 
|---|
| 3281 | STAGE_GG(matrix_2x3, const float* m) { | 
|---|
| 3282 | auto X = mad(x,m[0], mad(y,m[2], m[4])), | 
|---|
| 3283 | Y = mad(x,m[1], mad(y,m[3], m[5])); | 
|---|
| 3284 | x = X; | 
|---|
| 3285 | y = Y; | 
|---|
| 3286 | } | 
|---|
| 3287 | STAGE_GG(matrix_perspective, const float* m) { | 
|---|
| 3288 | // N.B. Unlike the other matrix_ stages, this matrix is row-major. | 
|---|
| 3289 | auto X = mad(x,m[0], mad(y,m[1], m[2])), | 
|---|
| 3290 | Y = mad(x,m[3], mad(y,m[4], m[5])), | 
|---|
| 3291 | Z = mad(x,m[6], mad(y,m[7], m[8])); | 
|---|
| 3292 | x = X * rcp(Z); | 
|---|
| 3293 | y = Y * rcp(Z); | 
|---|
| 3294 | } | 
|---|
| 3295 |  | 
|---|
| 3296 | STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { | 
|---|
| 3297 | r = c->rgba[0]; | 
|---|
| 3298 | g = c->rgba[1]; | 
|---|
| 3299 | b = c->rgba[2]; | 
|---|
| 3300 | a = c->rgba[3]; | 
|---|
| 3301 | } | 
|---|
| 3302 | STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { | 
|---|
| 3303 | dr = c->rgba[0]; | 
|---|
| 3304 | dg = c->rgba[1]; | 
|---|
| 3305 | db = c->rgba[2]; | 
|---|
| 3306 | da = c->rgba[3]; | 
|---|
| 3307 | } | 
|---|
| 3308 | STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; } | 
|---|
| 3309 | STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; } | 
|---|
| 3310 |  | 
|---|
| 3311 | STAGE_PP(set_rgb, const float rgb[3]) { | 
|---|
| 3312 | r = from_float(rgb[0]); | 
|---|
| 3313 | g = from_float(rgb[1]); | 
|---|
| 3314 | b = from_float(rgb[2]); | 
|---|
| 3315 | } | 
|---|
| 3316 |  | 
|---|
| 3317 | STAGE_PP(clamp_0, Ctx::None) { /*definitely a noop*/ } | 
|---|
| 3318 | STAGE_PP(clamp_1, Ctx::None) { /*_should_ be a noop*/ } | 
|---|
| 3319 |  | 
|---|
| 3320 | STAGE_PP(clamp_a, Ctx::None) { | 
|---|
| 3321 | r = min(r, a); | 
|---|
| 3322 | g = min(g, a); | 
|---|
| 3323 | b = min(b, a); | 
|---|
| 3324 | } | 
|---|
| 3325 |  | 
|---|
| 3326 | STAGE_PP(clamp_gamut, Ctx::None) { | 
|---|
| 3327 | // It shouldn't be possible to get out-of-gamut | 
|---|
| 3328 | // colors when working in lowp. | 
|---|
| 3329 | } | 
|---|
| 3330 |  | 
|---|
| 3331 | STAGE_PP(premul, Ctx::None) { | 
|---|
| 3332 | r = div255(r * a); | 
|---|
| 3333 | g = div255(g * a); | 
|---|
| 3334 | b = div255(b * a); | 
|---|
| 3335 | } | 
|---|
| 3336 | STAGE_PP(premul_dst, Ctx::None) { | 
|---|
| 3337 | dr = div255(dr * da); | 
|---|
| 3338 | dg = div255(dg * da); | 
|---|
| 3339 | db = div255(db * da); | 
|---|
| 3340 | } | 
|---|
| 3341 |  | 
|---|
| 3342 | STAGE_PP(force_opaque    , Ctx::None) {  a = 255; } | 
|---|
| 3343 | STAGE_PP(force_opaque_dst, Ctx::None) { da = 255; } | 
|---|
| 3344 |  | 
|---|
| 3345 | STAGE_PP(swap_rb, Ctx::None) { | 
|---|
| 3346 | auto tmp = r; | 
|---|
| 3347 | r = b; | 
|---|
| 3348 | b = tmp; | 
|---|
| 3349 | } | 
|---|
| 3350 | STAGE_PP(swap_rb_dst, Ctx::None) { | 
|---|
| 3351 | auto tmp = dr; | 
|---|
| 3352 | dr = db; | 
|---|
| 3353 | db = tmp; | 
|---|
| 3354 | } | 
|---|
| 3355 |  | 
|---|
| 3356 | STAGE_PP(move_src_dst, Ctx::None) { | 
|---|
| 3357 | dr = r; | 
|---|
| 3358 | dg = g; | 
|---|
| 3359 | db = b; | 
|---|
| 3360 | da = a; | 
|---|
| 3361 | } | 
|---|
| 3362 |  | 
|---|
| 3363 | STAGE_PP(move_dst_src, Ctx::None) { | 
|---|
| 3364 | r = dr; | 
|---|
| 3365 | g = dg; | 
|---|
| 3366 | b = db; | 
|---|
| 3367 | a = da; | 
|---|
| 3368 | } | 
|---|
| 3369 |  | 
|---|
| 3370 | // ~~~~~~ Blend modes ~~~~~~ // | 
|---|
| 3371 |  | 
|---|
| 3372 | // The same logic applied to all 4 channels. | 
|---|
| 3373 | #define BLEND_MODE(name)                                 \ | 
|---|
| 3374 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ | 
|---|
| 3375 | STAGE_PP(name, Ctx::None) {                          \ | 
|---|
| 3376 | r = name##_channel(r,dr,a,da);                   \ | 
|---|
| 3377 | g = name##_channel(g,dg,a,da);                   \ | 
|---|
| 3378 | b = name##_channel(b,db,a,da);                   \ | 
|---|
| 3379 | a = name##_channel(a,da,a,da);                   \ | 
|---|
| 3380 | }                                                    \ | 
|---|
| 3381 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) | 
|---|
| 3382 |  | 
|---|
| 3383 | BLEND_MODE(clear)    { return 0; } | 
|---|
| 3384 | BLEND_MODE(srcatop)  { return div255( s*da + d*inv(sa) ); } | 
|---|
| 3385 | BLEND_MODE(dstatop)  { return div255( d*sa + s*inv(da) ); } | 
|---|
| 3386 | BLEND_MODE(srcin)    { return div255( s*da ); } | 
|---|
| 3387 | BLEND_MODE(dstin)    { return div255( d*sa ); } | 
|---|
| 3388 | BLEND_MODE(srcout)   { return div255( s*inv(da) ); } | 
|---|
| 3389 | BLEND_MODE(dstout)   { return div255( d*inv(sa) ); } | 
|---|
| 3390 | BLEND_MODE(srcover)  { return s + div255( d*inv(sa) ); } | 
|---|
| 3391 | BLEND_MODE(dstover)  { return d + div255( s*inv(da) ); } | 
|---|
| 3392 | BLEND_MODE(modulate) { return div255( s*d ); } | 
|---|
| 3393 | BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); } | 
|---|
| 3394 | BLEND_MODE(plus_)    { return min(s+d, 255); } | 
|---|
| 3395 | BLEND_MODE(screen)   { return s + d - div255( s*d ); } | 
|---|
| 3396 | BLEND_MODE(xor_)     { return div255( s*inv(da) + d*inv(sa) ); } | 
|---|
| 3397 | #undef BLEND_MODE | 
|---|
| 3398 |  | 
|---|
| 3399 | // The same logic applied to color, and srcover for alpha. | 
|---|
| 3400 | #define BLEND_MODE(name)                                 \ | 
|---|
| 3401 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ | 
|---|
| 3402 | STAGE_PP(name, Ctx::None) {                          \ | 
|---|
| 3403 | r = name##_channel(r,dr,a,da);                   \ | 
|---|
| 3404 | g = name##_channel(g,dg,a,da);                   \ | 
|---|
| 3405 | b = name##_channel(b,db,a,da);                   \ | 
|---|
| 3406 | a = a + div255( da*inv(a) );                     \ | 
|---|
| 3407 | }                                                    \ | 
|---|
| 3408 | SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) | 
|---|
| 3409 |  | 
|---|
| 3410 | BLEND_MODE(darken)     { return s + d -   div255( max(s*da, d*sa) ); } | 
|---|
| 3411 | BLEND_MODE(lighten)    { return s + d -   div255( min(s*da, d*sa) ); } | 
|---|
| 3412 | BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); } | 
|---|
| 3413 | BLEND_MODE(exclusion)  { return s + d - 2*div255( s*d ); } | 
|---|
| 3414 |  | 
|---|
| 3415 | BLEND_MODE(hardlight) { | 
|---|
| 3416 | return div255( s*inv(da) + d*inv(sa) + | 
|---|
| 3417 | if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); | 
|---|
| 3418 | } | 
|---|
| 3419 | BLEND_MODE(overlay) { | 
|---|
| 3420 | return div255( s*inv(da) + d*inv(sa) + | 
|---|
| 3421 | if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); | 
|---|
| 3422 | } | 
|---|
| 3423 | #undef BLEND_MODE | 
|---|
| 3424 |  | 
|---|
| 3425 | // ~~~~~~ Helpers for interacting with memory ~~~~~~ // | 
|---|
| 3426 |  | 
|---|
| 3427 | template <typename T> | 
|---|
| 3428 | SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { | 
|---|
| 3429 | return (T*)ctx->pixels + dy*ctx->stride + dx; | 
|---|
| 3430 | } | 
|---|
| 3431 |  | 
|---|
| 3432 | template <typename T> | 
|---|
| 3433 | SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { | 
|---|
| 3434 | auto clamp = [](F v, F limit) { | 
|---|
| 3435 | limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive. | 
|---|
| 3436 | return min(max(0, v), limit); | 
|---|
| 3437 | }; | 
|---|
| 3438 | x = clamp(x, ctx->width); | 
|---|
| 3439 | y = clamp(y, ctx->height); | 
|---|
| 3440 |  | 
|---|
| 3441 | *ptr = (const T*)ctx->pixels; | 
|---|
| 3442 | return trunc_(y)*ctx->stride + trunc_(x); | 
|---|
| 3443 | } | 
|---|
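|  | // A note on the clamp's bit trick (a general IEEE-754 property, not Skia-specific): subtracting 1 | 
|---|
|  | // from the bit pattern of a positive finite float gives the next smaller representable value, e.g. | 
|---|
|  | // 8.0f becomes ~7.9999995f, so after clamping x and y stay strictly below width and height. | 
|---|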
| 3444 |  | 
|---|
| 3445 | template <typename V, typename T> | 
|---|
| 3446 | SI V load(const T* ptr, size_t tail) { | 
|---|
| 3447 | V v = 0; | 
|---|
| 3448 | switch (tail & (N-1)) { | 
|---|
| 3449 | case  0: memcpy(&v, ptr, sizeof(v)); break; | 
|---|
| 3450 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) | 
|---|
| 3451 | case 15: v[14] = ptr[14]; | 
|---|
| 3452 | case 14: v[13] = ptr[13]; | 
|---|
| 3453 | case 13: v[12] = ptr[12]; | 
|---|
| 3454 | case 12: memcpy(&v, ptr, 12*sizeof(T)); break; | 
|---|
| 3455 | case 11: v[10] = ptr[10]; | 
|---|
| 3456 | case 10: v[ 9] = ptr[ 9]; | 
|---|
| 3457 | case  9: v[ 8] = ptr[ 8]; | 
|---|
| 3458 | case  8: memcpy(&v, ptr,  8*sizeof(T)); break; | 
|---|
| 3459 | #endif | 
|---|
| 3460 | case  7: v[ 6] = ptr[ 6]; | 
|---|
| 3461 | case  6: v[ 5] = ptr[ 5]; | 
|---|
| 3462 | case  5: v[ 4] = ptr[ 4]; | 
|---|
| 3463 | case  4: memcpy(&v, ptr,  4*sizeof(T)); break; | 
|---|
| 3464 | case  3: v[ 2] = ptr[ 2]; | 
|---|
| 3465 | case  2: memcpy(&v, ptr,  2*sizeof(T)); break; | 
|---|
| 3466 | case  1: v[ 0] = ptr[ 0]; | 
|---|
| 3467 | } | 
|---|
| 3468 | return v; | 
|---|
| 3469 | } | 
|---|
| 3470 | template <typename V, typename T> | 
|---|
| 3471 | SI void store(T* ptr, size_t tail, V v) { | 
|---|
| 3472 | switch (tail & (N-1)) { | 
|---|
| 3473 | case  0: memcpy(ptr, &v, sizeof(v)); break; | 
|---|
| 3474 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) | 
|---|
| 3475 | case 15: ptr[14] = v[14]; | 
|---|
| 3476 | case 14: ptr[13] = v[13]; | 
|---|
| 3477 | case 13: ptr[12] = v[12]; | 
|---|
| 3478 | case 12: memcpy(ptr, &v, 12*sizeof(T)); break; | 
|---|
| 3479 | case 11: ptr[10] = v[10]; | 
|---|
| 3480 | case 10: ptr[ 9] = v[ 9]; | 
|---|
| 3481 | case  9: ptr[ 8] = v[ 8]; | 
|---|
| 3482 | case  8: memcpy(ptr, &v,  8*sizeof(T)); break; | 
|---|
| 3483 | #endif | 
|---|
| 3484 | case  7: ptr[ 6] = v[ 6]; | 
|---|
| 3485 | case  6: ptr[ 5] = v[ 5]; | 
|---|
| 3486 | case  5: ptr[ 4] = v[ 4]; | 
|---|
| 3487 | case  4: memcpy(ptr, &v,  4*sizeof(T)); break; | 
|---|
| 3488 | case  3: ptr[ 2] = v[ 2]; | 
|---|
| 3489 | case  2: memcpy(ptr, &v,  2*sizeof(T)); break; | 
|---|
| 3490 | case  1: ptr[ 0] = v[ 0]; | 
|---|
| 3491 | } | 
|---|
| 3492 | } | 
|---|
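|  | // Tail-handling sketch (illustrative): with N == 8 and tail == 3, load<V>() enters at case 3, | 
|---|
|  | // copies element 2 by lane, then falls through to case 2's memcpy of elements 0 and 1 and breaks, | 
|---|
|  | // leaving lanes 3..7 zero.  store() mirrors this, so partial rows never touch memory past the tail. | 
|---|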
| 3493 |  | 
|---|
| 3494 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) | 
|---|
| 3495 | template <typename V, typename T> | 
|---|
| 3496 | SI V gather(const T* ptr, U32 ix) { | 
|---|
| 3497 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], | 
|---|
| 3498 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], | 
|---|
| 3499 | ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]], | 
|---|
| 3500 | ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], }; | 
|---|
| 3501 | } | 
|---|
| 3502 |  | 
|---|
| 3503 | template<> | 
|---|
| 3504 | F gather(const float* ptr, U32 ix) { | 
|---|
| 3505 | __m256i lo, hi; | 
|---|
| 3506 | split(ix, &lo, &hi); | 
|---|
| 3507 |  | 
|---|
| 3508 | return join<F>(_mm256_i32gather_ps(ptr, lo, 4), | 
|---|
| 3509 | _mm256_i32gather_ps(ptr, hi, 4)); | 
|---|
| 3510 | } | 
|---|
| 3511 |  | 
|---|
| 3512 | template<> | 
|---|
| 3513 | U32 gather(const uint32_t* ptr, U32 ix) { | 
|---|
| 3514 | __m256i lo, hi; | 
|---|
| 3515 | split(ix, &lo, &hi); | 
|---|
| 3516 |  | 
|---|
| 3517 | return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4), | 
|---|
| 3518 | _mm256_i32gather_epi32(ptr, hi, 4)); | 
|---|
| 3519 | } | 
|---|
| 3520 | #else | 
|---|
| 3521 | template <typename V, typename T> | 
|---|
| 3522 | SI V gather(const T* ptr, U32 ix) { | 
|---|
| 3523 | return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], | 
|---|
| 3524 | ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], }; | 
|---|
| 3525 | } | 
|---|
| 3526 | #endif | 
|---|
| 3527 |  | 
|---|
| 3528 |  | 
|---|
| 3529 | // ~~~~~~ 32-bit memory loads and stores ~~~~~~ // | 
|---|
| 3530 |  | 
|---|
| 3531 | SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) { | 
|---|
| 3532 | #if 1 && (defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512)) | 
|---|
| 3533 | // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely. | 
|---|
| 3534 | __m256i _01,_23; | 
|---|
| 3535 | split(rgba, &_01, &_23); | 
|---|
| 3536 | __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20), | 
|---|
| 3537 | _13 = _mm256_permute2x128_si256(_01,_23, 0x31); | 
|---|
| 3538 | rgba = join<U32>(_02, _13); | 
|---|
| 3539 |  | 
|---|
| 3540 | auto cast_U16 = [](U32 v) -> U16 { | 
|---|
| 3541 | __m256i _02,_13; | 
|---|
| 3542 | split(v, &_02,&_13); | 
|---|
| 3543 | return _mm256_packus_epi32(_02,_13); | 
|---|
| 3544 | }; | 
|---|
| 3545 | #else | 
|---|
| 3546 | auto cast_U16 = [](U32 v) -> U16 { | 
|---|
| 3547 | return cast<U16>(v); | 
|---|
| 3548 | }; | 
|---|
| 3549 | #endif | 
|---|
| 3550 | *r = cast_U16(rgba & 65535) & 255; | 
|---|
| 3551 | *g = cast_U16(rgba & 65535) >>  8; | 
|---|
| 3552 | *b = cast_U16(rgba >>   16) & 255; | 
|---|
| 3553 | *a = cast_U16(rgba >>   16) >>  8; | 
|---|
| 3554 | } | 
|---|
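|  | // Unpacking sketch (illustrative): bytes R,G,B,A in memory read back (little-endian) as the | 
|---|
|  | // value (a<<24 | b<<16 | g<<8 | r).  For rgba == 0x80FF40C0: | 
|---|
|  | //   rgba & 65535 == 0x40C0  ->  & 255 gives r == 0xC0,  >> 8 gives g == 0x40; | 
|---|
|  | //   rgba >>   16 == 0x80FF  ->  & 255 gives b == 0xFF,  >> 8 gives a == 0x80. | 
|---|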
| 3555 |  | 
|---|
| 3556 | SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { | 
|---|
| 3557 | #if 1 && defined(JUMPER_IS_NEON) | 
|---|
| 3558 | uint8x8x4_t rgba; | 
|---|
| 3559 | switch (tail & (N-1)) { | 
|---|
| 3560 | case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break; | 
|---|
| 3561 | case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6); | 
|---|
| 3562 | case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5); | 
|---|
| 3563 | case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4); | 
|---|
| 3564 | case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3); | 
|---|
| 3565 | case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2); | 
|---|
| 3566 | case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1); | 
|---|
| 3567 | case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0); | 
|---|
| 3568 | } | 
|---|
| 3569 | *r = cast<U16>(rgba.val[0]); | 
|---|
| 3570 | *g = cast<U16>(rgba.val[1]); | 
|---|
| 3571 | *b = cast<U16>(rgba.val[2]); | 
|---|
| 3572 | *a = cast<U16>(rgba.val[3]); | 
|---|
| 3573 | #else | 
|---|
| 3574 | from_8888(load<U32>(ptr, tail), r,g,b,a); | 
|---|
| 3575 | #endif | 
|---|
| 3576 | } | 
|---|
| 3577 | SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { | 
|---|
| 3578 | #if 1 && defined(JUMPER_IS_NEON) | 
|---|
| 3579 | uint8x8x4_t rgba = {{ | 
|---|
| 3580 | cast<U8>(r), | 
|---|
| 3581 | cast<U8>(g), | 
|---|
| 3582 | cast<U8>(b), | 
|---|
| 3583 | cast<U8>(a), | 
|---|
| 3584 | }}; | 
|---|
| 3585 | switch (tail & (N-1)) { | 
|---|
| 3586 | case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break; | 
|---|
| 3587 | case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6); | 
|---|
| 3588 | case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5); | 
|---|
| 3589 | case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4); | 
|---|
| 3590 | case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3); | 
|---|
| 3591 | case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2); | 
|---|
| 3592 | case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1); | 
|---|
| 3593 | case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0); | 
|---|
| 3594 | } | 
|---|
| 3595 | #else | 
|---|
| 3596 | store(ptr, tail, cast<U32>(r | (g<<8)) <<  0 | 
|---|
| 3597 | | cast<U32>(b | (a<<8)) << 16); | 
|---|
| 3598 | #endif | 
|---|
| 3599 | } | 
|---|
| 3600 |  | 
|---|
| 3601 | STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3602 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a); | 
|---|
| 3603 | } | 
|---|
| 3604 | STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3605 | load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); | 
|---|
| 3606 | } | 
|---|
| 3607 | STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3608 | store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a); | 
|---|
| 3609 | } | 
|---|
| 3610 | STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 3611 | const uint32_t* ptr; | 
|---|
| 3612 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); | 
|---|
| 3613 | from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a); | 
|---|
| 3614 | } | 
|---|
| 3615 |  | 
|---|
| 3616 | // ~~~~~~ 16-bit memory loads and stores ~~~~~~ // | 
|---|
| 3617 |  | 
|---|
| 3618 | SI void from_565(U16 rgb, U16* r, U16* g, U16* b) { | 
|---|
| 3619 | // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0 | 
|---|
| 3620 | U16 R = (rgb >> 11) & 31, | 
|---|
| 3621 | G = (rgb >>  5) & 63, | 
|---|
| 3622 | B = (rgb >>  0) & 31; | 
|---|
| 3623 |  | 
|---|
| 3624 | // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit. | 
|---|
| 3625 | *r = (R << 3) | (R >> 2); | 
|---|
| 3626 | *g = (G << 2) | (G >> 4); | 
|---|
| 3627 | *b = (B << 3) | (B >> 2); | 
|---|
| 3628 | } | 
|---|
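|  | // Worked example of the bit replication (illustrative): | 
|---|
|  | //   R == 22 (5-bit): (22<<3)|(22>>2) == 176|5 == 181, and 22*255/31 ~= 180.97; | 
|---|
|  | //   G == 45 (6-bit): (45<<2)|(45>>4) == 180|2 == 182, and 45*255/63 ~= 182.14. | 
|---|
|  | // Replicating the top bits into the bottom is a cheap, near-exact stand-in for the divide. | 
|---|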
| 3629 | SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { | 
|---|
| 3630 | from_565(load<U16>(ptr, tail), r,g,b); | 
|---|
| 3631 | } | 
|---|
| 3632 | SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) { | 
|---|
| 3633 | // Round from [0,255] to [0,31] for r,b (or [0,63] for g), as if x*(31/255.0f) + 0.5f. | 
|---|
| 3634 | // (Don't feel like you need to find some fundamental truth in these... | 
|---|
| 3635 | // they were brute-force searched.) | 
|---|
| 3636 | U16 R = (r *  9 + 36) / 74,   //  9/74 ≈ 31/255, plus 36/74, about half. | 
|---|
| 3637 | G = (g * 21 + 42) / 85,   // 21/85 = 63/255 exactly. | 
|---|
| 3638 | B = (b *  9 + 36) / 74; | 
|---|
| 3639 | // Pack them back into 15|rrrrr gggggg bbbbb|0. | 
|---|
| 3640 | store(ptr, tail, R << 11 | 
|---|
| 3641 | | G <<  5 | 
|---|
| 3642 | | B <<  0); | 
|---|
| 3643 | } | 
|---|
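|  | // Endpoint check of those brute-forced constants (illustrative): | 
|---|
|  | //   r == 255: (255*9 + 36)/74 == 2331/74 == 31,  and r == 0 -> 36/74 == 0; | 
|---|
|  | //   g == 255: (255*21 + 42)/85 == 5397/85 == 63, and g == 0 -> 42/85 == 0. | 
|---|
|  | // Both channels hit 0 and full scale exactly, which is the property that matters most here. | 
|---|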
| 3644 |  | 
|---|
| 3645 | STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3646 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b); | 
|---|
| 3647 | a = 255; | 
|---|
| 3648 | } | 
|---|
| 3649 | STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3650 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db); | 
|---|
| 3651 | da = 255; | 
|---|
| 3652 | } | 
|---|
| 3653 | STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3654 | store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b); | 
|---|
| 3655 | } | 
|---|
| 3656 | STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 3657 | const uint16_t* ptr; | 
|---|
| 3658 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); | 
|---|
| 3659 | from_565(gather<U16>(ptr, ix), &r, &g, &b); | 
|---|
| 3660 | a = 255; | 
|---|
| 3661 | } | 
|---|
| 3662 |  | 
|---|
| 3663 | SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) { | 
|---|
| 3664 | // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0. | 
|---|
| 3665 | U16 R = (rgba >> 12) & 15, | 
|---|
| 3666 | G = (rgba >>  8) & 15, | 
|---|
| 3667 | B = (rgba >>  4) & 15, | 
|---|
| 3668 | A = (rgba >>  0) & 15; | 
|---|
| 3669 |  | 
|---|
| 3670 | // Scale [0,15] to [0,255]. | 
|---|
| 3671 | *r = (R << 4) | R; | 
|---|
| 3672 | *g = (G << 4) | G; | 
|---|
| 3673 | *b = (B << 4) | B; | 
|---|
| 3674 | *a = (A << 4) | A; | 
|---|
| 3675 | } | 
|---|
| 3676 | SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { | 
|---|
| 3677 | from_4444(load<U16>(ptr, tail), r,g,b,a); | 
|---|
| 3678 | } | 
|---|
| 3679 | SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { | 
|---|
| 3680 | // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f). | 
|---|
| 3681 | U16 R = (r + 8) / 17, | 
|---|
| 3682 | G = (g + 8) / 17, | 
|---|
| 3683 | B = (b + 8) / 17, | 
|---|
| 3684 | A = (a + 8) / 17; | 
|---|
| 3685 | // Pack them back into 15|rrrr gggg bbbb aaaa|0. | 
|---|
| 3686 | store(ptr, tail, R << 12 | 
|---|
| 3687 | | G <<  8 | 
|---|
| 3688 | | B <<  4 | 
|---|
| 3689 | | A <<  0); | 
|---|
| 3690 | } | 
|---|
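|  | // Worked example (illustrative): 255/15 == 17, so (x + 8)/17 is simply round(x/17): | 
|---|
|  | //   x == 255 -> 263/17 == 15,  x == 128 -> 136/17 == 8,  x == 0 -> 8/17 == 0. | 
|---|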
| 3691 |  | 
|---|
| 3692 | STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3693 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a); | 
|---|
| 3694 | } | 
|---|
| 3695 | STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3696 | load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); | 
|---|
| 3697 | } | 
|---|
| 3698 | STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3699 | store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a); | 
|---|
| 3700 | } | 
|---|
| 3701 | STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 3702 | const uint16_t* ptr; | 
|---|
| 3703 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); | 
|---|
| 3704 | from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a); | 
|---|
| 3705 | } | 
|---|
| 3706 |  | 
|---|
| 3707 | SI void from_88(U16 rg, U16* r, U16* g) { | 
|---|
| 3708 | *r = (rg & 0xFF); | 
|---|
| 3709 | *g = (rg >> 8); | 
|---|
| 3710 | } | 
|---|
| 3711 |  | 
|---|
| 3712 | SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) { | 
|---|
| 3713 | #if 1 && defined(JUMPER_IS_NEON) | 
|---|
| 3714 | uint8x8x2_t rg; | 
|---|
| 3715 | switch (tail & (N-1)) { | 
|---|
| 3716 | case 0: rg = vld2_u8     ((const uint8_t*)(ptr+0)         ); break; | 
|---|
| 3717 | case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6); | 
|---|
| 3718 | case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5); | 
|---|
| 3719 | case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4); | 
|---|
| 3720 | case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3); | 
|---|
| 3721 | case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2); | 
|---|
| 3722 | case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1); | 
|---|
| 3723 | case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0); | 
|---|
| 3724 | } | 
|---|
| 3725 | *r = cast<U16>(rg.val[0]); | 
|---|
| 3726 | *g = cast<U16>(rg.val[1]); | 
|---|
| 3727 | #else | 
|---|
| 3728 | from_88(load<U16>(ptr, tail), r,g); | 
|---|
| 3729 | #endif | 
|---|
| 3730 | } | 
|---|
| 3731 |  | 
|---|
| 3732 | SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) { | 
|---|
| 3733 | #if 1 && defined(JUMPER_IS_NEON) | 
|---|
| 3734 | uint8x8x2_t rg = {{ | 
|---|
| 3735 | cast<U8>(r), | 
|---|
| 3736 | cast<U8>(g), | 
|---|
| 3737 | }}; | 
|---|
| 3738 | switch (tail & (N-1)) { | 
|---|
| 3739 | case 0: vst2_u8     ((uint8_t*)(ptr+0), rg   ); break; | 
|---|
| 3740 | case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6); | 
|---|
| 3741 | case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5); | 
|---|
| 3742 | case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4); | 
|---|
| 3743 | case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3); | 
|---|
| 3744 | case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2); | 
|---|
| 3745 | case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1); | 
|---|
| 3746 | case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0); | 
|---|
| 3747 | } | 
|---|
| 3748 | #else | 
|---|
| 3749 | store(ptr, tail, cast<U16>(r | (g<<8)) <<  0); | 
|---|
| 3750 | #endif | 
|---|
| 3751 | } | 
|---|
| 3752 |  | 
|---|
| 3753 | STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3754 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g); | 
|---|
| 3755 | b = 0; | 
|---|
| 3756 | a = 255; | 
|---|
| 3757 | } | 
|---|
| 3758 | STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3759 | load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg); | 
|---|
| 3760 | db = 0; | 
|---|
| 3761 | da = 255; | 
|---|
| 3762 | } | 
|---|
| 3763 | STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3764 | store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g); | 
|---|
| 3765 | } | 
|---|
| 3766 | STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 3767 | const uint16_t* ptr; | 
|---|
| 3768 | U32 ix = ix_and_ptr(&ptr, ctx, x, y); | 
|---|
| 3769 | from_88(gather<U16>(ptr, ix), &r, &g); | 
|---|
| 3770 | b = 0; | 
|---|
| 3771 | a = 255; | 
|---|
| 3772 | } | 
|---|
| 3773 |  | 
|---|
| 3774 | // ~~~~~~ 8-bit memory loads and stores ~~~~~~ // | 
|---|
| 3775 |  | 
|---|
| 3776 | SI U16 load_8(const uint8_t* ptr, size_t tail) { | 
|---|
| 3777 | return cast<U16>(load<U8>(ptr, tail)); | 
|---|
| 3778 | } | 
|---|
| 3779 | SI void store_8(uint8_t* ptr, size_t tail, U16 v) { | 
|---|
| 3780 | store(ptr, tail, cast<U8>(v)); | 
|---|
| 3781 | } | 
|---|
| 3782 |  | 
|---|
| 3783 | STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3784 | r = g = b = 0; | 
|---|
| 3785 | a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); | 
|---|
| 3786 | } | 
|---|
| 3787 | STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3788 | dr = dg = db = 0; | 
|---|
| 3789 | da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); | 
|---|
| 3790 | } | 
|---|
| 3791 | STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3792 | store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a); | 
|---|
| 3793 | } | 
|---|
| 3794 | STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 3795 | const uint8_t* ptr; | 
|---|
| 3796 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); | 
|---|
| 3797 | r = g = b = 0; | 
|---|
| 3798 | a = cast<U16>(gather<U8>(ptr, ix)); | 
|---|
| 3799 | } | 
|---|
| 3800 |  | 
|---|
| 3801 | STAGE_PP(alpha_to_gray, Ctx::None) { | 
|---|
| 3802 | r = g = b = a; | 
|---|
| 3803 | a = 255; | 
|---|
| 3804 | } | 
|---|
| 3805 | STAGE_PP(alpha_to_gray_dst, Ctx::None) { | 
|---|
| 3806 | dr = dg = db = da; | 
|---|
| 3807 | da = 255; | 
|---|
| 3808 | } | 
|---|
| 3809 | STAGE_PP(bt709_luminance_or_luma_to_alpha, Ctx::None) { | 
|---|
| 3810 | a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator. | 
|---|
| 3811 | r = g = b = 0; | 
|---|
| 3812 | } | 
|---|
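|  | // Coefficient check (illustrative): 54 + 183 + 19 == 256, so r == g == b == 255 gives | 
|---|
|  | // a == 255 exactly -- full white maps to full alpha with no rounding loss. | 
|---|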
| 3813 |  | 
|---|
| 3814 | // ~~~~~~ src/dst register save/restore ~~~~~~ // | 
|---|
| 3815 |  | 
|---|
| 3816 | STAGE_PP(load_src, const uint16_t* ptr) { | 
|---|
| 3817 | r = sk_unaligned_load<U16>(ptr + 0*N); | 
|---|
| 3818 | g = sk_unaligned_load<U16>(ptr + 1*N); | 
|---|
| 3819 | b = sk_unaligned_load<U16>(ptr + 2*N); | 
|---|
| 3820 | a = sk_unaligned_load<U16>(ptr + 3*N); | 
|---|
| 3821 | } | 
|---|
| 3822 | STAGE_PP(store_src, uint16_t* ptr) { | 
|---|
| 3823 | sk_unaligned_store(ptr + 0*N, r); | 
|---|
| 3824 | sk_unaligned_store(ptr + 1*N, g); | 
|---|
| 3825 | sk_unaligned_store(ptr + 2*N, b); | 
|---|
| 3826 | sk_unaligned_store(ptr + 3*N, a); | 
|---|
| 3827 | } | 
|---|
| 3828 | STAGE_PP(store_src_a, uint16_t* ptr) { | 
|---|
| 3829 | sk_unaligned_store(ptr, a); | 
|---|
| 3830 | } | 
|---|
| 3831 | STAGE_PP(load_dst, const uint16_t* ptr) { | 
|---|
| 3832 | dr = sk_unaligned_load<U16>(ptr + 0*N); | 
|---|
| 3833 | dg = sk_unaligned_load<U16>(ptr + 1*N); | 
|---|
| 3834 | db = sk_unaligned_load<U16>(ptr + 2*N); | 
|---|
| 3835 | da = sk_unaligned_load<U16>(ptr + 3*N); | 
|---|
| 3836 | } | 
|---|
| 3837 | STAGE_PP(store_dst, uint16_t* ptr) { | 
|---|
| 3838 | sk_unaligned_store(ptr + 0*N, dr); | 
|---|
| 3839 | sk_unaligned_store(ptr + 1*N, dg); | 
|---|
| 3840 | sk_unaligned_store(ptr + 2*N, db); | 
|---|
| 3841 | sk_unaligned_store(ptr + 3*N, da); | 
|---|
| 3842 | } | 
|---|
| 3843 |  | 
|---|
| 3844 | // ~~~~~~ Coverage scales / lerps ~~~~~~ // | 
|---|
| 3845 |  | 
|---|
| 3846 | STAGE_PP(scale_1_float, const float* f) { | 
|---|
| 3847 | U16 c = from_float(*f); | 
|---|
| 3848 | r = div255( r * c ); | 
|---|
| 3849 | g = div255( g * c ); | 
|---|
| 3850 | b = div255( b * c ); | 
|---|
| 3851 | a = div255( a * c ); | 
|---|
| 3852 | } | 
|---|
| 3853 | STAGE_PP(lerp_1_float, const float* f) { | 
|---|
| 3854 | U16 c = from_float(*f); | 
|---|
| 3855 | r = lerp(dr, r, c); | 
|---|
| 3856 | g = lerp(dg, g, c); | 
|---|
| 3857 | b = lerp(db, b, c); | 
|---|
| 3858 | a = lerp(da, a, c); | 
|---|
| 3859 | } | 
|---|
| 3860 | STAGE_PP(scale_native, const uint16_t scales[]) { | 
|---|
| 3861 | auto c = sk_unaligned_load<U16>(scales); | 
|---|
| 3862 | r = div255( r * c ); | 
|---|
| 3863 | g = div255( g * c ); | 
|---|
| 3864 | b = div255( b * c ); | 
|---|
| 3865 | a = div255( a * c ); | 
|---|
| 3866 | } | 
|---|
| 3867 |  | 
|---|
| 3868 | STAGE_PP(lerp_native, const uint16_t scales[]) { | 
|---|
| 3869 | auto c = sk_unaligned_load<U16>(scales); | 
|---|
| 3870 | r = lerp(dr, r, c); | 
|---|
| 3871 | g = lerp(dg, g, c); | 
|---|
| 3872 | b = lerp(db, b, c); | 
|---|
| 3873 | a = lerp(da, a, c); | 
|---|
| 3874 | } | 
|---|
| 3875 |  | 
|---|
| 3876 | STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3877 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); | 
|---|
| 3878 | r = div255( r * c ); | 
|---|
| 3879 | g = div255( g * c ); | 
|---|
| 3880 | b = div255( b * c ); | 
|---|
| 3881 | a = div255( a * c ); | 
|---|
| 3882 | } | 
|---|
| 3883 | STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3884 | U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); | 
|---|
| 3885 | r = lerp(dr, r, c); | 
|---|
| 3886 | g = lerp(dg, g, c); | 
|---|
| 3887 | b = lerp(db, b, c); | 
|---|
| 3888 | a = lerp(da, a, c); | 
|---|
| 3889 | } | 
|---|
| 3890 |  | 
|---|
| 3891 | // Derive alpha's coverage from rgb coverage and the values of src and dst alpha. | 
|---|
| 3892 | SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) { | 
|---|
| 3893 | return if_then_else(a < da, min(cr, min(cg,cb)) | 
|---|
| 3894 | , max(cr, max(cg,cb))); | 
|---|
| 3895 | } | 
|---|
| 3896 | STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3897 | U16 cr,cg,cb; | 
|---|
| 3898 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); | 
|---|
| 3899 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); | 
|---|
| 3900 |  | 
|---|
| 3901 | r = div255( r * cr ); | 
|---|
| 3902 | g = div255( g * cg ); | 
|---|
| 3903 | b = div255( b * cb ); | 
|---|
| 3904 | a = div255( a * ca ); | 
|---|
| 3905 | } | 
|---|
| 3906 | STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 3907 | U16 cr,cg,cb; | 
|---|
| 3908 | load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); | 
|---|
| 3909 | U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); | 
|---|
| 3910 |  | 
|---|
| 3911 | r = lerp(dr, r, cr); | 
|---|
| 3912 | g = lerp(dg, g, cg); | 
|---|
| 3913 | b = lerp(db, b, cb); | 
|---|
| 3914 | a = lerp(da, a, ca); | 
|---|
| 3915 | } | 
|---|
| 3916 |  | 
|---|
| 3917 | STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) { | 
|---|
| 3918 | U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail), | 
|---|
| 3919 | add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail); | 
|---|
| 3920 |  | 
|---|
| 3921 | r = min(div255(r*mul) + add, a); | 
|---|
| 3922 | g = min(div255(g*mul) + add, a); | 
|---|
| 3923 | b = min(div255(b*mul) + add, a); | 
|---|
| 3924 | } | 
|---|
| 3925 |  | 
|---|
| 3926 |  | 
|---|
| 3927 | // ~~~~~~ Gradient stages ~~~~~~ // | 
|---|
| 3928 |  | 
|---|
| 3929 | // Clamp x to [0,1], both sides inclusive (think, gradients). | 
|---|
| 3930 | // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. | 
|---|
| 3931 | SI F clamp_01(F v) { return min(max(0, v), 1); } | 
|---|
| 3932 |  | 
|---|
| 3933 | STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); } | 
|---|
| 3934 | STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); } | 
|---|
| 3935 | STAGE_GG(mirror_x_1, Ctx::None) { | 
|---|
| 3936 | auto two = [](F x){ return x+x; }; | 
|---|
| 3937 | x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f )); | 
|---|
| 3938 | } | 
|---|
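|  | // Worked example of mirror_x_1 (illustrative): for x == 1.25, | 
|---|
|  | //   (x-1) == 0.25, floor(0.25*0.5) == 0, so abs(0.25 - 0 - 1) == 0.75, | 
|---|
|  | // i.e. 1.25 reflects back to 0.75; the outer clamp_01() only matters for NaN/Inf inputs. | 
|---|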
| 3939 |  | 
|---|
| 3940 | SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }  // Lanes are 0 or ~0, so narrowing keeps them masks. | 
|---|
| 3941 |  | 
|---|
| 3942 | STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { | 
|---|
| 3943 | auto w = ctx->limit_x; | 
|---|
| 3944 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w))); | 
|---|
| 3945 | } | 
|---|
| 3946 | STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { | 
|---|
| 3947 | auto h = ctx->limit_y; | 
|---|
| 3948 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h))); | 
|---|
| 3949 | } | 
|---|
| 3950 | STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { | 
|---|
| 3951 | auto w = ctx->limit_x; | 
|---|
| 3952 | auto h = ctx->limit_y; | 
|---|
| 3953 | sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h))); | 
|---|
| 3954 | } | 
|---|
| 3955 | STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { | 
|---|
| 3956 | auto mask = sk_unaligned_load<U16>(ctx->mask); | 
|---|
| 3957 | r = r & mask; | 
|---|
| 3958 | g = g & mask; | 
|---|
| 3959 | b = b & mask; | 
|---|
| 3960 | a = a & mask; | 
|---|
| 3961 | } | 
|---|
| 3962 |  | 
|---|
| 3963 | SI void round_F_to_U16(F    R, F    G, F    B, F    A, bool interpolatedInPremul, | 
|---|
| 3964 | U16* r, U16* g, U16* b, U16* a) { | 
|---|
| 3965 | auto round = [](F x) { return cast<U16>(x * 255.0f + 0.5f); }; | 
|---|
| 3966 |  | 
|---|
| 3967 | F limit = interpolatedInPremul ? A | 
|---|
| 3968 | : 1; | 
|---|
| 3969 | *r = round(min(max(0,R), limit)); | 
|---|
| 3970 | *g = round(min(max(0,G), limit)); | 
|---|
| 3971 | *b = round(min(max(0,B), limit)); | 
|---|
| 3972 | *a = round(A);  // we assume alpha is already in [0,1]. | 
|---|
| 3973 | } | 
|---|
| 3974 |  | 
|---|
| 3975 | SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, | 
|---|
| 3976 | U16* r, U16* g, U16* b, U16* a) { | 
|---|
| 3977 |  | 
|---|
| 3978 | F fr, fg, fb, fa, br, bg, bb, ba; | 
|---|
| 3979 | #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_AVX512) | 
|---|
| 3980 | if (c->stopCount <= 8) { | 
|---|
| 3981 | __m256i lo, hi; | 
|---|
| 3982 | split(idx, &lo, &hi); | 
|---|
| 3983 |  | 
|---|
| 3984 | fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo), | 
|---|
| 3985 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi)); | 
|---|
| 3986 | br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo), | 
|---|
| 3987 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi)); | 
|---|
| 3988 | fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo), | 
|---|
| 3989 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi)); | 
|---|
| 3990 | bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo), | 
|---|
| 3991 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi)); | 
|---|
| 3992 | fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo), | 
|---|
| 3993 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi)); | 
|---|
| 3994 | bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo), | 
|---|
| 3995 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi)); | 
|---|
| 3996 | fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo), | 
|---|
| 3997 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi)); | 
|---|
| 3998 | ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo), | 
|---|
| 3999 | _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi)); | 
|---|
| 4000 | } else | 
|---|
| 4001 | #endif | 
|---|
| 4002 | { | 
|---|
| 4003 | fr = gather<F>(c->fs[0], idx); | 
|---|
| 4004 | fg = gather<F>(c->fs[1], idx); | 
|---|
| 4005 | fb = gather<F>(c->fs[2], idx); | 
|---|
| 4006 | fa = gather<F>(c->fs[3], idx); | 
|---|
| 4007 | br = gather<F>(c->bs[0], idx); | 
|---|
| 4008 | bg = gather<F>(c->bs[1], idx); | 
|---|
| 4009 | bb = gather<F>(c->bs[2], idx); | 
|---|
| 4010 | ba = gather<F>(c->bs[3], idx); | 
|---|
| 4011 | } | 
|---|
| 4012 | round_F_to_U16(mad(t, fr, br), | 
|---|
| 4013 | mad(t, fg, bg), | 
|---|
| 4014 | mad(t, fb, bb), | 
|---|
| 4015 | mad(t, fa, ba), | 
|---|
| 4016 | c->interpolatedInPremul, | 
|---|
| 4017 | r,g,b,a); | 
|---|
| 4018 | } | 
|---|
| 4019 |  | 
|---|
| 4020 | STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) { | 
|---|
| 4021 | auto t = x; | 
|---|
| 4022 | U32 idx = 0; | 
|---|
| 4023 |  | 
|---|
| 4024 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. | 
|---|
| 4025 | for (size_t i = 1; i < c->stopCount; i++) { | 
|---|
| 4026 | idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); | 
|---|
| 4027 | } | 
|---|
| 4028 |  | 
|---|
| 4029 | gradient_lookup(c, idx, t, &r, &g, &b, &a); | 
|---|
| 4030 | } | 
|---|
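|  | // Index sketch (illustrative): with ts == {0, 0.25, 0.5, 1} and t == 0.3, only i == 1 | 
|---|
|  | // satisfies t >= ts[i], so idx == 1 selects the piecewise-linear segment whose fs/bs | 
|---|
|  | // coefficients cover [0.25, 0.5). | 
|---|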
| 4031 |  | 
|---|
| 4032 | STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { | 
|---|
| 4033 | auto t = x; | 
|---|
| 4034 | auto idx = trunc_(t * (c->stopCount-1)); | 
|---|
| 4035 | gradient_lookup(c, idx, t, &r, &g, &b, &a); | 
|---|
| 4036 | } | 
|---|
| 4037 |  | 
|---|
| 4038 | STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) { | 
|---|
| 4039 | auto t = x; | 
|---|
| 4040 | round_F_to_U16(mad(t, c->f[0], c->b[0]), | 
|---|
| 4041 | mad(t, c->f[1], c->b[1]), | 
|---|
| 4042 | mad(t, c->f[2], c->b[2]), | 
|---|
| 4043 | mad(t, c->f[3], c->b[3]), | 
|---|
| 4044 | c->interpolatedInPremul, | 
|---|
| 4045 | &r,&g,&b,&a); | 
|---|
| 4046 | } | 
|---|
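|  | // Evaluation sketch (illustrative): each channel is the line mad(t, f, b) == f*t + b; | 
|---|
|  | // with f == c1 - c0 and b == c0 (how the two-stop context is presumably packed), this is | 
|---|
|  | // the usual lerp c0 + t*(c1 - c0), computed in float and only rounded to U16 at the end. | 
|---|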
| 4047 |  | 
|---|
| 4048 | STAGE_GG(xy_to_unit_angle, Ctx::None) { | 
|---|
| 4049 | F xabs = abs_(x), | 
|---|
| 4050 | yabs = abs_(y); | 
|---|
| 4051 |  | 
|---|
| 4052 | F slope = min(xabs, yabs)/max(xabs, yabs); | 
|---|
| 4053 | F s = slope * slope; | 
|---|
| 4054 |  | 
|---|
| 4055 | // Use a 7th degree polynomial to approximate atan. | 
|---|
| 4056 | // This was generated using sollya.gforge.inria.fr. | 
|---|
| 4057 | // A float optimized polynomial was generated using the following command. | 
|---|
| 4058 | // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); | 
|---|
| 4059 | F phi = slope | 
|---|
| 4060 | * (0.15912117063999176025390625f     + s | 
|---|
| 4061 | * (-5.185396969318389892578125e-2f   + s | 
|---|
| 4062 | * (2.476101927459239959716796875e-2f + s | 
|---|
| 4063 | * (-7.0547382347285747528076171875e-3f)))); | 
|---|
| 4064 |  | 
|---|
| 4065 | phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); | 
|---|
| 4066 | phi = if_then_else(x < 0.0f   , 1.0f/2.0f - phi, phi); | 
|---|
| 4067 | phi = if_then_else(y < 0.0f   , 1.0f - phi     , phi); | 
|---|
| 4068 | phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN. | 
|---|
| 4069 | x = phi; | 
|---|
| 4070 | } | 
|---|
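|  | // Spot check of the polynomial (illustrative): at (x,y) == (1,1), slope == 1 and s == 1, so | 
|---|
|  | //   phi ~= 0.159121 - 0.051854 + 0.024761 - 0.007055 ~= 0.125, | 
|---|
|  | // i.e. 45 degrees expressed as a fraction of a full turn, as expected. | 
|---|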
| 4071 | STAGE_GG(xy_to_radius, Ctx::None) { | 
|---|
| 4072 | x = sqrt_(x*x + y*y); | 
|---|
| 4073 | } | 
|---|
| 4074 |  | 
|---|
| 4075 | // ~~~~~~ Compound stages ~~~~~~ // | 
|---|
| 4076 |  | 
|---|
| 4077 | STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { | 
|---|
| 4078 | auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); | 
|---|
| 4079 |  | 
|---|
| 4080 | load_8888_(ptr, tail, &dr,&dg,&db,&da); | 
|---|
| 4081 | r = r + div255( dr*inv(a) ); | 
|---|
| 4082 | g = g + div255( dg*inv(a) ); | 
|---|
| 4083 | b = b + div255( db*inv(a) ); | 
|---|
| 4084 | a = a + div255( da*inv(a) ); | 
|---|
| 4085 | store_8888_(ptr, tail, r,g,b,a); | 
|---|
| 4086 | } | 
|---|
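|  | // Blend sketch (illustrative): each channel computes the fixed-point src-over | 
|---|
|  | //   out = s + d*(255 - a)/255. | 
|---|
|  | // An opaque source (a == 255) zeroes the second term and simply overwrites the destination; | 
|---|
|  | // a fully transparent source (r,g,b,a == 0) leaves the destination unchanged. | 
|---|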
| 4087 |  | 
|---|
| 4088 | #if defined(SK_DISABLE_LOWP_BILERP_CLAMP_CLAMP_STAGE) | 
|---|
| 4089 | static void(*bilerp_clamp_8888)(void) = nullptr; | 
|---|
| 4090 | static void(*bilinear)(void) = nullptr; | 
|---|
| 4091 | #else | 
|---|
| 4092 | STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { | 
|---|
| 4093 | // (cx,cy) are the center of our sample. | 
|---|
| 4094 | F cx = x, | 
|---|
| 4095 | cy = y; | 
|---|
| 4096 |  | 
|---|
| 4097 | // All sample points are at the same fractional offset (fx,fy). | 
|---|
| 4098 | // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. | 
|---|
| 4099 | F fx = fract(cx + 0.5f), | 
|---|
| 4100 | fy = fract(cy + 0.5f); | 
|---|
| 4101 |  | 
|---|
| 4102 | // We'll accumulate the color of all four samples into {r,g,b,a} directly. | 
|---|
| 4103 | r = g = b = a = 0; | 
|---|
| 4104 |  | 
|---|
| 4105 | // The first three sample points will calculate their area using math | 
|---|
| 4106 | // just like in the float code above, but the fourth will take up all the rest. | 
|---|
| 4107 | // | 
|---|
| 4108 | // Logically this is the same as doing the math for the fourth pixel too, | 
|---|
| 4109 | // but rounding error makes this a better strategy, keeping opaque opaque, etc. | 
|---|
| 4110 | // | 
|---|
| 4111 | // We can keep up to 8 bits of fractional precision without overflowing 16-bit, | 
|---|
| 4112 | // so our "1.0" area is 256. | 
|---|
| 4113 | const uint16_t bias = 256; | 
|---|
| 4114 | U16 remaining = bias; | 
|---|
| 4115 |  | 
|---|
| 4116 | for (float dy = -0.5f; dy <= +0.5f; dy += 1.0f) | 
|---|
| 4117 | for (float dx = -0.5f; dx <= +0.5f; dx += 1.0f) { | 
|---|
| 4118 | // (x,y) are the coordinates of this sample point. | 
|---|
| 4119 | F x = cx + dx, | 
|---|
| 4120 | y = cy + dy; | 
|---|
| 4121 |  | 
|---|
| 4122 | // ix_and_ptr() will clamp to the image's bounds for us. | 
|---|
| 4123 | const uint32_t* ptr; | 
|---|
| 4124 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); | 
|---|
| 4125 |  | 
|---|
| 4126 | U16 sr,sg,sb,sa; | 
|---|
| 4127 | from_8888(gather<U32>(ptr, ix), &sr,&sg,&sb,&sa); | 
|---|
| 4128 |  | 
|---|
| 4129 | // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center | 
|---|
| 4130 | // are combined in direct proportion to their area overlapping that logical query pixel. | 
|---|
| 4131 | // At positive offsets, the x-axis contribution to that rectangle is fx, | 
|---|
| 4132 | // or (1-fx) at negative x.  Same deal for y. | 
|---|
| 4133 | F sx = (dx > 0) ? fx : 1.0f - fx, | 
|---|
| 4134 | sy = (dy > 0) ? fy : 1.0f - fy; | 
|---|
| 4135 |  | 
|---|
| 4136 | U16 area = (dy == 0.5f && dx == 0.5f) ? remaining | 
|---|
| 4137 | : cast<U16>(sx * sy * bias); | 
|---|
| 4138 | for (size_t i = 0; i < N; i++) { | 
|---|
| 4139 | SkASSERT(remaining[i] >= area[i]); | 
|---|
| 4140 | } | 
|---|
| 4141 | remaining -= area; | 
|---|
| 4142 |  | 
|---|
| 4143 | r += sr * area; | 
|---|
| 4144 | g += sg * area; | 
|---|
| 4145 | b += sb * area; | 
|---|
| 4146 | a += sa * area; | 
|---|
| 4147 | } | 
|---|
| 4148 |  | 
|---|
| 4149 | r = (r + bias/2) / bias; | 
|---|
| 4150 | g = (g + bias/2) / bias; | 
|---|
| 4151 | b = (b + bias/2) / bias; | 
|---|
| 4152 | a = (a + bias/2) / bias; | 
|---|
| 4153 | } | 
|---|
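|  | // Area sketch (illustrative): with fx == fy == 0.25, the first three samples get | 
|---|
|  | // 0.75*0.75*256 == 144, 0.25*0.75*256 == 48, and 0.75*0.25*256 == 48, so the last sample | 
|---|
|  | // takes remaining == 16; the weights always sum to exactly bias == 256, which is what | 
|---|
|  | // keeps opaque sources exactly opaque after the final (x + bias/2)/bias rounding. | 
|---|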
| 4154 |  | 
|---|
| 4155 | // TODO: lowp::tile() is identical to the highp tile()... share? | 
|---|
| 4156 | SI F tile(F v, SkTileMode mode, float limit, float invLimit) { | 
|---|
| 4157 | // ix_and_ptr() will clamp the output of tile() afterwards, so we need not clamp here. | 
|---|
| 4158 | switch (mode) { | 
|---|
| 4159 | case SkTileMode::kDecal:  // TODO, for now fallthrough to clamp | 
|---|
| 4160 | case SkTileMode::kClamp:  return v; | 
|---|
| 4161 | case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; | 
|---|
| 4162 | case SkTileMode::kMirror: | 
|---|
| 4163 | return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); | 
|---|
| 4164 | } | 
|---|
| 4165 | SkUNREACHABLE; | 
|---|
| 4166 | } | 
|---|
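|  | // Worked example (illustrative), limit == 8: | 
|---|
|  | //   kRepeat: v == 9 -> 9 - floor(9/8)*8 == 1; | 
|---|
|  | //   kMirror: v == 9 -> abs((9-8) - 16*floor(1/16) - 8) == abs(1 - 0 - 8) == 7, | 
|---|
|  | //            i.e. just past the right edge reflects back toward the last column. | 
|---|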
| 4167 |  | 
|---|
| 4168 | SI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, | 
|---|
| 4169 | U16* r, U16* g, U16* b, U16* a) { | 
|---|
| 4170 | x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); | 
|---|
| 4171 | y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); | 
|---|
| 4172 |  | 
|---|
| 4173 | switch (ctx->ct) { | 
|---|
| 4174 | default: *r = *g = *b = *a = 0;  // TODO | 
|---|
| 4175 | break; | 
|---|
| 4176 |  | 
|---|
| 4177 | case kRGBA_8888_SkColorType: | 
|---|
| 4178 | case kBGRA_8888_SkColorType: { | 
|---|
| 4179 | const uint32_t* ptr; | 
|---|
| 4180 | U32 ix = ix_and_ptr(&ptr, ctx, x,y); | 
|---|
| 4181 | from_8888(gather<U32>(ptr, ix), r,g,b,a); | 
|---|
| 4182 | if (ctx->ct == kBGRA_8888_SkColorType) { | 
|---|
| 4183 | std::swap(*r,*b); | 
|---|
| 4184 | } | 
|---|
| 4185 | } break; | 
|---|
| 4186 | } | 
|---|
| 4187 | } | 
|---|
| 4188 |  | 
|---|
| 4189 | template <int D> | 
|---|
| 4190 | SI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, | 
|---|
| 4191 | F cx, F cy, const F (&wx)[D], const F (&wy)[D], | 
|---|
| 4192 | U16* r, U16* g, U16* b, U16* a) { | 
|---|
| 4193 |  | 
|---|
| 4194 | float start = -0.5f*(D-1); | 
|---|
| 4195 |  | 
|---|
| 4196 | const uint16_t bias = 256; | 
|---|
| 4197 | U16 remaining = bias; | 
|---|
| 4198 |  | 
|---|
| 4199 | *r = *g = *b = *a = 0; | 
|---|
| 4200 | F y = cy + start; | 
|---|
| 4201 | for (int j = 0; j < D; j++, y += 1.0f) { | 
|---|
| 4202 | F x = cx + start; | 
|---|
| 4203 | for (int i = 0; i < D; i++, x += 1.0f) { | 
|---|
| 4204 | U16 R,G,B,A; | 
|---|
| 4205 | sample(ctx, x,y, &R,&G,&B,&A); | 
|---|
| 4206 |  | 
|---|
| 4207 | U16 w = (i == D-1 && j == D-1) ? remaining | 
|---|
| 4208 | : cast<U16>(wx[i]*wy[j]*bias); | 
|---|
| 4209 | remaining -= w; | 
|---|
| 4210 | *r += w*R; | 
|---|
| 4211 | *g += w*G; | 
|---|
| 4212 | *b += w*B; | 
|---|
| 4213 | *a += w*A; | 
|---|
| 4214 | } | 
|---|
| 4215 | } | 
|---|
| 4216 | *r = (*r + bias/2) / bias; | 
|---|
| 4217 | *g = (*g + bias/2) / bias; | 
|---|
| 4218 | *b = (*b + bias/2) / bias; | 
|---|
| 4219 | *a = (*a + bias/2) / bias; | 
|---|
| 4220 | } | 
|---|
| 4221 |  | 
|---|
| 4222 | STAGE_GP(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { | 
|---|
| 4223 | F fx = fract(x + 0.5f), | 
|---|
| 4224 | fy = fract(y + 0.5f); | 
|---|
| 4225 | const F wx[] = {1.0f - fx, fx}; | 
|---|
| 4226 | const F wy[] = {1.0f - fy, fy}; | 
|---|
| 4227 |  | 
|---|
| 4228 | sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); | 
|---|
| 4229 | } | 
|---|
| 4230 | #endif | 
|---|
| 4231 |  | 
|---|
| 4232 | // ~~~~~~ GrSwizzle stage ~~~~~~ // | 
|---|
| 4233 |  | 
|---|
| 4234 | STAGE_PP(swizzle, void* ctx) { | 
|---|
| 4235 | auto ir = r, ig = g, ib = b, ia = a; | 
|---|
| 4236 | U16* o[] = {&r, &g, &b, &a}; | 
|---|
| 4237 | char swiz[4]; | 
|---|
| 4238 | memcpy(swiz, &ctx, sizeof(swiz)); | 
|---|
| 4239 |  | 
|---|
| 4240 | for (int i = 0; i < 4; ++i) { | 
|---|
| 4241 | switch (swiz[i]) { | 
|---|
| 4242 | case 'r': *o[i] = ir;       break; | 
|---|
| 4243 | case 'g': *o[i] = ig;       break; | 
|---|
| 4244 | case 'b': *o[i] = ib;       break; | 
|---|
| 4245 | case 'a': *o[i] = ia;       break; | 
|---|
| 4246 | case '0': *o[i] = U16(0);   break; | 
|---|
| 4247 | case '1': *o[i] = U16(255); break; | 
|---|
| 4248 | default:                    break; | 
|---|
| 4249 | } | 
|---|
| 4250 | } | 
|---|
| 4251 | } | 
|---|
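|  | // Usage sketch (illustrative): the caller packs four selector characters into the ctx pointer | 
|---|
|  | // itself.  Bytes "bgra" would swap the red and blue channels, while "rgb1" would leave color | 
|---|
|  | // alone and force alpha to 255 via the '1' case above. | 
|---|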
| 4252 |  | 
|---|
| 4253 | // Now we'll add null stand-ins for stages we haven't implemented in lowp. | 
|---|
| 4254 | // If a pipeline uses any of these stages, it gets booted out of lowp into highp. | 
|---|
| 4255 | #define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr; | 
|---|
| 4256 | NOT_IMPLEMENTED(callback) | 
|---|
| 4257 | NOT_IMPLEMENTED(interpreter) | 
|---|
| 4258 | NOT_IMPLEMENTED(unbounded_set_rgb) | 
|---|
| 4259 | NOT_IMPLEMENTED(unbounded_uniform_color) | 
|---|
| 4260 | NOT_IMPLEMENTED(unpremul) | 
|---|
| 4261 | NOT_IMPLEMENTED(dither)  // TODO | 
|---|
| 4262 | NOT_IMPLEMENTED(from_srgb) | 
|---|
| 4263 | NOT_IMPLEMENTED(to_srgb) | 
|---|
| 4264 | NOT_IMPLEMENTED(load_16161616) | 
|---|
| 4265 | NOT_IMPLEMENTED(load_16161616_dst) | 
|---|
| 4266 | NOT_IMPLEMENTED(store_16161616) | 
|---|
| 4267 | NOT_IMPLEMENTED(gather_16161616) | 
|---|
| 4268 | NOT_IMPLEMENTED(load_a16) | 
|---|
| 4269 | NOT_IMPLEMENTED(load_a16_dst) | 
|---|
| 4270 | NOT_IMPLEMENTED(store_a16) | 
|---|
| 4271 | NOT_IMPLEMENTED(gather_a16) | 
|---|
| 4272 | NOT_IMPLEMENTED(load_rg1616) | 
|---|
| 4273 | NOT_IMPLEMENTED(load_rg1616_dst) | 
|---|
| 4274 | NOT_IMPLEMENTED(store_rg1616) | 
|---|
| 4275 | NOT_IMPLEMENTED(gather_rg1616) | 
|---|
| 4276 | NOT_IMPLEMENTED(load_f16) | 
|---|
| 4277 | NOT_IMPLEMENTED(load_f16_dst) | 
|---|
| 4278 | NOT_IMPLEMENTED(store_f16) | 
|---|
| 4279 | NOT_IMPLEMENTED(gather_f16) | 
|---|
| 4280 | NOT_IMPLEMENTED(load_af16) | 
|---|
| 4281 | NOT_IMPLEMENTED(load_af16_dst) | 
|---|
| 4282 | NOT_IMPLEMENTED(store_af16) | 
|---|
| 4283 | NOT_IMPLEMENTED(gather_af16) | 
|---|
| 4284 | NOT_IMPLEMENTED(load_rgf16) | 
|---|
| 4285 | NOT_IMPLEMENTED(load_rgf16_dst) | 
|---|
| 4286 | NOT_IMPLEMENTED(store_rgf16) | 
|---|
| 4287 | NOT_IMPLEMENTED(gather_rgf16) | 
|---|
| 4288 | NOT_IMPLEMENTED(load_f32) | 
|---|
| 4289 | NOT_IMPLEMENTED(load_f32_dst) | 
|---|
| 4290 | NOT_IMPLEMENTED(store_f32) | 
|---|
| 4291 | NOT_IMPLEMENTED(gather_f32) | 
|---|
| 4292 | NOT_IMPLEMENTED(load_rgf32) | 
|---|
| 4293 | NOT_IMPLEMENTED(store_rgf32) | 
|---|
| 4294 | NOT_IMPLEMENTED(load_1010102) | 
|---|
| 4295 | NOT_IMPLEMENTED(load_1010102_dst) | 
|---|
| 4296 | NOT_IMPLEMENTED(store_1010102) | 
|---|
| 4297 | NOT_IMPLEMENTED(gather_1010102) | 
|---|
| 4298 | NOT_IMPLEMENTED(store_u16_be) | 
|---|
| 4299 | NOT_IMPLEMENTED(byte_tables)  // TODO | 
|---|
| 4300 | NOT_IMPLEMENTED(colorburn) | 
|---|
| 4301 | NOT_IMPLEMENTED(colordodge) | 
|---|
| 4302 | NOT_IMPLEMENTED(softlight) | 
|---|
| 4303 | NOT_IMPLEMENTED(hue) | 
|---|
| 4304 | NOT_IMPLEMENTED(saturation) | 
|---|
| 4305 | NOT_IMPLEMENTED(color) | 
|---|
| 4306 | NOT_IMPLEMENTED(luminosity) | 
|---|
| 4307 | NOT_IMPLEMENTED(matrix_3x3) | 
|---|
| 4308 | NOT_IMPLEMENTED(matrix_3x4) | 
|---|
| 4309 | NOT_IMPLEMENTED(matrix_4x5)  // TODO | 
|---|
| 4310 | NOT_IMPLEMENTED(matrix_4x3)  // TODO | 
|---|
| 4311 | NOT_IMPLEMENTED(parametric) | 
|---|
| 4312 | NOT_IMPLEMENTED(gamma_) | 
|---|
| 4313 | NOT_IMPLEMENTED(PQish) | 
|---|
| 4314 | NOT_IMPLEMENTED(HLGish) | 
|---|
| 4315 | NOT_IMPLEMENTED(HLGinvish) | 
|---|
| 4316 | NOT_IMPLEMENTED(rgb_to_hsl) | 
|---|
| 4317 | NOT_IMPLEMENTED(hsl_to_rgb) | 
|---|
| 4318 | NOT_IMPLEMENTED(gauss_a_to_rgba)  // TODO | 
|---|
| 4319 | NOT_IMPLEMENTED(mirror_x)         // TODO | 
|---|
| 4320 | NOT_IMPLEMENTED(repeat_x)         // TODO | 
|---|
| 4321 | NOT_IMPLEMENTED(mirror_y)         // TODO | 
|---|
| 4322 | NOT_IMPLEMENTED(repeat_y)         // TODO | 
|---|
| 4323 | NOT_IMPLEMENTED(negate_x) | 
|---|
| 4324 | NOT_IMPLEMENTED(bicubic)  // TODO if I can figure out negative weights | 
|---|
| 4325 | NOT_IMPLEMENTED(bicubic_clamp_8888) | 
|---|
| 4326 | NOT_IMPLEMENTED(bilinear_nx)      // TODO | 
|---|
| 4327 | NOT_IMPLEMENTED(bilinear_ny)      // TODO | 
|---|
| 4328 | NOT_IMPLEMENTED(bilinear_px)      // TODO | 
|---|
| 4329 | NOT_IMPLEMENTED(bilinear_py)      // TODO | 
|---|
| 4330 | NOT_IMPLEMENTED(bicubic_n3x)      // TODO | 
|---|
| 4331 | NOT_IMPLEMENTED(bicubic_n1x)      // TODO | 
|---|
| 4332 | NOT_IMPLEMENTED(bicubic_p1x)      // TODO | 
|---|
| 4333 | NOT_IMPLEMENTED(bicubic_p3x)      // TODO | 
|---|
| 4334 | NOT_IMPLEMENTED(bicubic_n3y)      // TODO | 
|---|
| 4335 | NOT_IMPLEMENTED(bicubic_n1y)      // TODO | 
|---|
| 4336 | NOT_IMPLEMENTED(bicubic_p1y)      // TODO | 
|---|
| 4337 | NOT_IMPLEMENTED(bicubic_p3y)      // TODO | 
|---|
| 4338 | NOT_IMPLEMENTED(save_xy)          // TODO | 
|---|
| 4339 | NOT_IMPLEMENTED(accumulate)       // TODO | 
|---|
| 4340 | NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved) | 
|---|
| 4341 | NOT_IMPLEMENTED(xy_to_2pt_conical_strip) | 
|---|
| 4342 | NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle) | 
|---|
| 4343 | NOT_IMPLEMENTED(xy_to_2pt_conical_smaller) | 
|---|
| 4344 | NOT_IMPLEMENTED(xy_to_2pt_conical_greater) | 
|---|
| 4345 | NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal) | 
|---|
| 4346 | NOT_IMPLEMENTED(alter_2pt_conical_unswap) | 
|---|
| 4347 | NOT_IMPLEMENTED(mask_2pt_conical_nan) | 
|---|
| 4348 | NOT_IMPLEMENTED(mask_2pt_conical_degenerates) | 
|---|
| 4349 | NOT_IMPLEMENTED(apply_vector_mask) | 
|---|
| 4350 | #undef NOT_IMPLEMENTED | 
|---|
| 4351 |  | 
|---|
| 4352 | #endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages | 
|---|
| 4353 | }  // namespace lowp | 
|---|
| 4354 |  | 
|---|
| 4355 | }  // namespace SK_OPTS_NS | 
|---|
| 4356 |  | 
|---|
| 4357 | #endif//SkRasterPipeline_opts_DEFINED | 
|---|
| 4358 |  | 
|---|