/*
 * Copyright 2019 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "include/core/SkStream.h"
#include "include/core/SkString.h"
#include "include/private/SkChecksum.h"
#include "include/private/SkHalf.h"
#include "include/private/SkSpinlock.h"
#include "include/private/SkTFitsIn.h"
#include "include/private/SkThreadID.h"
#include "include/private/SkVx.h"
#include "src/core/SkColorSpaceXformSteps.h"
#include "src/core/SkCpu.h"
#include "src/core/SkEnumerate.h"
#include "src/core/SkOpts.h"
#include "src/core/SkVM.h"
#include <algorithm>
#include <atomic>
#include <cmath>      // std::sqrt, ceilf, floorf, lrintf
#include <cstring>    // memcpy, memset
#include <queue>

#if defined(SKVM_LLVM)
    #include <future>
    #include <llvm/Bitcode/BitcodeWriter.h>
    #include <llvm/ExecutionEngine/ExecutionEngine.h>
    #include <llvm/IR/IRBuilder.h>
    #include <llvm/IR/Verifier.h>
    #include <llvm/Support/TargetSelect.h>

    // Platform-specific intrinsics got their own files in LLVM 10.
    #if __has_include(<llvm/IR/IntrinsicsX86.h>)
        #include <llvm/IR/IntrinsicsX86.h>
    #endif
#endif

bool gSkVMAllowJIT{false};
bool gSkVMJITViaDylib{false};

#if defined(SKVM_JIT)
    #if defined(SK_BUILD_FOR_WIN)
        #include "src/core/SkLeanWindows.h"
        #include <memoryapi.h>

        static void* alloc_jit_buffer(size_t* len) {
            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
        }
        static void unmap_jit_buffer(void* ptr, size_t len) {
            VirtualFree(ptr, 0, MEM_RELEASE);
        }
        static void remap_as_executable(void* ptr, size_t len) {
            DWORD old;
            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
            SkASSERT(old == PAGE_READWRITE);
        }
        static void close_dylib(void* dylib) {
            SkASSERT(false);  // TODO?  For now just assert we never make one.
        }
    #else
        #include <dlfcn.h>
        #include <sys/mman.h>
        #include <unistd.h>   // sysconf

        static void* alloc_jit_buffer(size_t* len) {
            // While mprotect and VirtualAlloc both work at page granularity,
            // mprotect doesn't round up for you, and instead requires that *len
            // be a multiple of the page size.
            const size_t page = sysconf(_SC_PAGESIZE);
            *len = ((*len + page - 1) / page) * page;
            return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
        }
        static void unmap_jit_buffer(void* ptr, size_t len) {
            munmap(ptr, len);
        }
        static void remap_as_executable(void* ptr, size_t len) {
            mprotect(ptr, len, PROT_READ|PROT_EXEC);
            __builtin___clear_cache((char*)ptr,
                                    (char*)ptr + len);
        }
        static void close_dylib(void* dylib) {
            dlclose(dylib);
        }
    #endif

    #if defined(SKVM_JIT_VTUNE)
        #include <jitprofiling.h>
        static void notify_vtune(const char* name, void* addr, size_t len) {
            if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) {
                iJIT_Method_Load event;
                memset(&event, 0, sizeof(event));
                event.method_id           = iJIT_GetNewMethodID();
                event.method_name         = const_cast<char*>(name);
                event.method_load_address = addr;
                event.method_size         = len;
                iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event);
            }
        }
    #else
        static void notify_vtune(const char* name, void* addr, size_t len) {}
    #endif
#endif

// JIT code isn't MSAN-instrumented, so we won't see when it uses
// uninitialized memory, and we'll not see the writes it makes as properly
// initializing memory.  Instead force the interpreter, which should let
// MSAN see everything our programs do properly.
//
// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
#if defined(__has_feature)
    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
        #define SKVM_JIT_BUT_IGNORE_IT
    #endif
#endif



namespace skvm {

    struct Program::Impl {
        std::vector<InterpreterInstruction> instructions;
        int regs = 0;
        int loop = 0;
        std::vector<int> strides;

        std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
        size_t jit_size = 0;
        void*  dylib    = nullptr;

        #if defined(SKVM_LLVM)
            std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
            std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
            std::future<void>                      llvm_compiling;
        #endif
    };

    // Debugging tools, mostly for printing various data structures out to a stream.

    namespace {
        class SkDebugfStream final : public SkWStream {
            size_t fBytesWritten = 0;

            bool write(const void* buffer, size_t size) override {
                // %.*s takes an int precision, and %s wants char*, so cast both.
                SkDebugf("%.*s", (int)size, (const char*)buffer);
                fBytesWritten += size;
                return true;
            }

            size_t bytesWritten() const override {
                return fBytesWritten;
            }
        };

        struct V { Val id; };
        struct R { Reg id; };
        struct Shift { int bits; };
        struct Splat { int bits; };
        struct Hex   { int bits; };
        struct Attr  { const char* label; int v; };

        static void write(SkWStream* o, const char* s) {
            o->writeText(s);
        }

        static const char* name(Op op) {
            switch (op) {
            #define M(x) case Op::x: return #x;
                SKVM_OPS(M)
            #undef M
            }
            return "unknown op";
        }

        static void write(SkWStream* o, Op op) {
            o->writeText(name(op));
        }
        static void write(SkWStream* o, Arg a) {
            write(o, "arg(");
            o->writeDecAsText(a.ix);
            write(o, ")");
        }
        static void write(SkWStream* o, V v) {
            write(o, "v");
            o->writeDecAsText(v.id);
        }
        static void write(SkWStream* o, R r) {
            write(o, "r");
            o->writeDecAsText(r.id);
        }
        static void write(SkWStream* o, Shift s) {
            o->writeDecAsText(s.bits);
        }
        static void write(SkWStream* o, Splat s) {
            float f;
            memcpy(&f, &s.bits, 4);
            o->writeHexAsText(s.bits);
            write(o, " (");
            o->writeScalarAsText(f);
            write(o, ")");
        }
        static void write(SkWStream* o, Hex h) {
            o->writeHexAsText(h.bits);
        }
        [[maybe_unused]] static void write(SkWStream* o, Attr a) {
            write(o, a.label);
            write(o, " ");
            o->writeDecAsText(a.v);
        }

        template <typename T, typename... Ts>
        static void write(SkWStream* o, T first, Ts... rest) {
            write(o, first);
            write(o, " ");
            write(o, rest...);
        }
    }  // namespace

    void Builder::dot(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        std::vector<OptimizedInstruction> optimized = this->optimize();

        o->writeText("digraph {\n");
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            auto [op, x,y,z, immy,immz, death,can_hoist] = optimized[id];

            switch (op) {
                default:
                    write(o, "\t", V{id}, " [label = \"", V{id}, op);
                    // Not a perfect heuristic; sometimes y/z == NA and there is no immy/z.
                    // On the other hand, sometimes immy/z=0 is meaningful and should be printed.
                    if (y == NA) { write(o, "", Hex{immy}); }
                    if (z == NA) { write(o, "", Hex{immz}); }
                    write(o, "\"]\n");

                    write(o, "\t", V{id}, " -> {");
                    // In contrast to the heuristic imm labels, these dependences are exact.
                    if (x != NA) { write(o, "", V{x}); }
                    if (y != NA) { write(o, "", V{y}); }
                    if (z != NA) { write(o, "", V{z}); }
                    write(o, " }\n");

                    break;

                // That default: impl works pretty well for most instructions,
                // but some are nicer to see with a specialized label.

                case Op::splat:
                    write(o, "\t", V{id}, " [label = \"", V{id}, op, Splat{immy}, "\"]\n");
                    break;
            }
        }
        o->writeText("}\n");
    }

    template <typename I, typename... Fs>
    static void write_one_instruction(Val id, const I& inst, SkWStream* o, Fs... fs) {
        Op  op = inst.op;
        Val  x = inst.x,
             y = inst.y,
             z = inst.z;
        int immy = inst.immy,
            immz = inst.immz;
        switch (op) {
            case Op::assert_true: write(o, op, V{x}, V{y}, fs(id)...); break;

            case Op::store8:   write(o, op, Arg{immy}   , V{x}, fs(id)...); break;
            case Op::store16:  write(o, op, Arg{immy}   , V{x}, fs(id)...); break;
            case Op::store32:  write(o, op, Arg{immy}   , V{x}, fs(id)...); break;
            case Op::store64:  write(o, op, Arg{immz}   , V{x},V{y}, fs(id)...); break;
            case Op::store128: write(o, op, Arg{immz>>1}, V{x},V{y},Hex{immz&1}, fs(id)...); break;

            case Op::index: write(o, V{id}, "=", op, fs(id)...); break;

            case Op::load8:   write(o, V{id}, "=", op, Arg{immy}, fs(id)...); break;
            case Op::load16:  write(o, V{id}, "=", op, Arg{immy}, fs(id)...); break;
            case Op::load32:  write(o, V{id}, "=", op, Arg{immy}, fs(id)...); break;
            case Op::load64:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;
            case Op::load128: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;

            case Op::gather8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}, fs(id)...); break;
            case Op::gather16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}, fs(id)...); break;
            case Op::gather32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}, fs(id)...); break;

            case Op::uniform8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;
            case Op::uniform16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;
            case Op::uniform32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;

            case Op::splat: write(o, V{id}, "=", op, Splat{immy}, fs(id)...); break;

            case Op::add_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::sub_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::mul_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::div_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::min_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::max_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::fma_f32:  write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;
            case Op::fms_f32:  write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;
            case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;

            case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}, fs(id)...); break;

            case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;

            case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;

            case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}, fs(id)...); break;
            case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}, fs(id)...); break;
            case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}, fs(id)...); break;

            case Op:: eq_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op:: gt_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;

            case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;

            case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;
            case Op::pack:   write(o, V{id}, "=", op, V{x}, V{y}, Shift{immz}, fs(id)...); break;

            case Op::ceil:      write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::floor:     write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::to_f32:    write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::to_half:   write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::from_half: write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::trunc:     write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::round:     write(o, V{id}, "=", op, V{x}, fs(id)...); break;
        }

        write(o, "\n");
    }

    void Builder::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        std::vector<OptimizedInstruction> optimized = this->optimize();
        o->writeDecAsText(optimized.size());
        o->writeText(" values (originally ");
        o->writeDecAsText(fProgram.size());
        o->writeText("):\n");
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            const OptimizedInstruction& inst = optimized[id];
            write(o, inst.can_hoist ? "↑ " : "  ");
            write_one_instruction(id, inst, o);
        }
    }

    template <typename... Fs>
    void dump_instructions(const std::vector<Instruction>& instructions, SkWStream* o, Fs... fs) {
        SkDebugfStream debug;
        if (o == nullptr) {
            o = &debug;
        }
        write(o, Attr{"Instruction count:", (int)instructions.size()});
        write(o, "\n");   // write(Attr) doesn't end the line itself.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            write_one_instruction(id, instructions[id], o, std::forward<Fs>(fs)...);
        }
    }

    void Program::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        o->writeDecAsText(fImpl->regs);
        o->writeText(" registers, ");
        o->writeDecAsText(fImpl->instructions.size());
        o->writeText(" instructions:\n");
        for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
            if (i == fImpl->loop) { write(o, "loop:\n"); }
            o->writeDecAsText(i);
            o->writeText("\t");
            if (i >= fImpl->loop) { write(o, "    "); }
            const InterpreterInstruction& inst = fImpl->instructions[i];
            Op  op = inst.op;
            Reg  d = inst.d,
                 x = inst.x,
                 y = inst.y,
                 z = inst.z;
            int immy = inst.immy,
                immz = inst.immz;
            switch (op) {
                case Op::assert_true: write(o, op, R{x}, R{y}); break;

                case Op::store8:   write(o, op, Arg{immy}   , R{x}                  ); break;
                case Op::store16:  write(o, op, Arg{immy}   , R{x}                  ); break;
                case Op::store32:  write(o, op, Arg{immy}   , R{x}                  ); break;
                case Op::store64:  write(o, op, Arg{immz}   , R{x}, R{y}            ); break;
                case Op::store128: write(o, op, Arg{immz>>1}, R{x}, R{y}, Hex{immz&1}); break;

                case Op::index: write(o, R{d}, "=", op); break;

                case Op::load8:   write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load16:  write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load32:  write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load64:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::load128: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;

                case Op::gather8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                case Op::gather16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                case Op::gather32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;

                case Op::uniform8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;

                case Op::splat: write(o, R{d}, "=", op, Splat{immy}); break;

                case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::fma_f32:  write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fms_f32:  write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;

                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;

                case Op:: eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::pack:   write(o, R{d}, "=", op, R{x}, R{y}, Shift{immz}); break;

                case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
                case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
                case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
                case Op::to_half:   write(o, R{d}, "=", op, R{x}); break;
                case Op::from_half: write(o, R{d}, "=", op, R{x}); break;
                case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
                case Op::round:     write(o, R{d}, "=", op, R{x}); break;
            }
            write(o, "\n");
        }
    }

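    // A quick illustration of the pass below (a sketch, not part of the build;
    // it only uses the public Builder API defined later in this file):
    //
    //     skvm::Builder b;
    //     skvm::Arg ptr = b.arg(4);
    //     skvm::I32 x = b.load32(ptr),
    //               y = b.add(x, b.splat(1)),   // stored below, so live
    //               z = b.mul(x, b.splat(2));   // never used, so dead
    //     b.store32(ptr, y);
    //
    // Only the store has a side effect, so liveness is traced back from it:
    // store32 -> y -> {x, splat(1)}.  z and splat(2) are never marked live,
    // so they're removed and the surviving IDs are renumbered densely.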
    std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) {
        // Determine which Instructions are live by working back from side effects.
        std::vector<bool> live(program.size(), false);
        auto mark_live = [&](Val id, auto& recurse) -> void {
            if (live[id] == false) {
                live[id] = true;
                Instruction inst = program[id];
                for (Val arg : {inst.x, inst.y, inst.z}) {
                    if (arg != NA) { recurse(arg, recurse); }
                }
            }
        };
        for (Val id = 0; id < (Val)program.size(); id++) {
            if (has_side_effect(program[id].op)) {
                mark_live(id, mark_live);
            }
        }

        // Rewrite the program with only live Instructions:
        //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
        //   - then actually remove the dead Instructions.
        std::vector<Val> new_id(program.size(), NA);
        for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
            if (live[id]) {
                Instruction& inst = program[id];
                for (Val* arg : {&inst.x, &inst.y, &inst.z}) {
                    if (*arg != NA) {
                        *arg = new_id[*arg];
                        SkASSERT(*arg != NA);
                    }
                }
                new_id[id] = next++;
            }
        }
        auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
            Val id = (Val)(&inst - program.data());
            return !live[id];
        });
        program.erase(it, program.end());

        return program;
    }

    // Impose a deterministic scheduling of Instructions based on data flow alone,
    // eliminating any influence from original program order.  We'll schedule back-to-front,
    // starting at the end of the program with Instructions that have side effects and
    // recursing through arguments to Instructions that issue earlier in the program.
    // We schedule each argument once all its users have been scheduled, which means it
    // issues just before its first use.  We arbitrarily schedule x, then y, then z, and so
    // issue z, then y, then x.
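    //
    // A small worked example of that ordering (a sketch, not a unit test):
    // given the program
    //
    //     v0 = splat(1)
    //     v1 = splat(2)
    //     v2 = add v0 v1          (x = v0, y = v1)
    //     store32 arg(0) v2
    //
    // we start from the store (the only side effect) and assign slots
    // back-to-front: the store, then v2, then v2's x argument v0, then its
    // y argument v1.  The scheduled program therefore issues v1, v0, v2,
    // store -- each value lands immediately before its first use, no matter
    // what order the Builder originally recorded the splats in.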
    std::vector<Instruction> schedule(std::vector<Instruction> program) {

        std::vector<int> uses(program.size());
        for (const Instruction& inst : program) {
            for (Val arg : {inst.x, inst.y, inst.z}) {
                if (arg != NA) { uses[arg]++; }
            }
        }

        std::vector<Val> new_id(program.size(), NA);
        Val next = (Val)program.size();
        auto reorder = [&](Val id, auto& recurse) -> void {
            new_id[id] = --next;
            const Instruction& inst = program[id];
            for (Val arg : {inst.x, inst.y, inst.z}) {
                if (arg != NA && --uses[arg] == 0) {
                    recurse(arg, recurse);
                }
            }
        };

        for (Val id = 0; id < (Val)program.size(); id++) {
            if (has_side_effect(program[id].op)) {
                reorder(id, reorder);
            }
        }

        // Remap each Instruction's arguments to their new IDs.
        for (Instruction& inst : program) {
            for (Val* arg : {&inst.x, &inst.y, &inst.z}) {
                if (*arg != NA) {
                    *arg = new_id[*arg];
                    SkASSERT(*arg != NA);
                }
            }
        }

        // Finally, reorder the Instructions themselves according to the new schedule.
        // This is O(N): every swap drops at least one Instruction into its final slot
        // (program[new_id[id]] is final after the swap), so we make at most N swaps,
        // walking each cycle of the permutation exactly once.
        for (Val id = 0; id < (Val)program.size(); id++) {
            while (id != new_id[id]) {
                std::swap(program[id], program[new_id[id]]);
                std::swap( new_id[id],  new_id[new_id[id]]);
            }
        }

        return program;
    }

    std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) {
        std::vector<OptimizedInstruction> optimized(program.size());
        for (Val id = 0; id < (Val)program.size(); id++) {
            Instruction inst = program[id];
            optimized[id] = {inst.op, inst.x,inst.y,inst.z, inst.immy,inst.immz,
                             /*death=*/id, /*can_hoist=*/true};
        }

        // Each Instruction's inputs need to live at least until that Instruction issues.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];
            for (Val arg : {inst.x, inst.y, inst.z}) {
                // (We're walking in order, so this is the same as max()ing with the existing Val.)
                if (arg != NA) { optimized[arg].death = id; }
            }
        }

        // Mark which values don't depend on the loop and can be hoisted.
        for (OptimizedInstruction& inst : optimized) {
            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
            if (is_always_varying(inst.op)) {
                inst.can_hoist = false;
            }

            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
            if (inst.can_hoist) {
                for (Val arg : {inst.x, inst.y, inst.z}) {
                    if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
                }
            }
        }

        // Extend the lifetime of any hoisted value that's used in the loop to infinity.
        for (OptimizedInstruction& inst : optimized) {
            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
                for (Val arg : {inst.x, inst.y, inst.z}) {
                    if (arg != NA && optimized[arg].can_hoist) {
                        optimized[arg].death = (Val)program.size();
                    }
                }
            }
        }

        return optimized;
    }

    std::vector<OptimizedInstruction> Builder::optimize() const {
        std::vector<Instruction> program = this->program();
        program = eliminate_dead_code(std::move(program));
        program = schedule           (std::move(program));
        return    finalize           (std::move(program));
    }

    Program Builder::done(const char* debug_name) const {
        char buf[64] = "skvm-jit-";
        if (!debug_name) {
            *SkStrAppendU32(buf+9, this->hash()) = '\0';
            debug_name = buf;
        }

        return {this->optimize(), fStrides, debug_name};
    }

    uint64_t Builder::hash() const {
        uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
                 hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
        return (uint64_t)lo | (uint64_t)hi << 32;
    }

    bool operator==(const Instruction& a, const Instruction& b) {
        return a.op   == b.op
            && a.x    == b.x
            && a.y    == b.y
            && a.z    == b.z
            && a.immy == b.immy
            && a.immz == b.immz;
    }

    uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
        return SkOpts::hash(&inst, sizeof(inst), seed);
    }


    // Most instructions produce a value and return it by ID,
    // the value-producing instruction's own index in the program vector.
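    //
    // Because push() de-duplicates, building the same expression twice is free.
    // For example (an illustrative sketch, not part of the build):
    //
    //     skvm::Builder b;
    //     skvm::I32 x = b.splat(42),
    //               y = b.splat(42);   // hits fIndex and returns x's Val
    //     SkASSERT(x.id == y.id);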
    Val Builder::push(Instruction inst) {
        // Basic common subexpression elimination:
        // if we've already seen this exact Instruction, use it instead of creating a new one.
        if (Val* id = fIndex.find(inst)) {
            return *id;
        }
        Val id = static_cast<Val>(fProgram.size());
        fProgram.push_back(inst);
        fIndex.set(inst, id);
        return id;
    }

    bool Builder::allImm() const { return true; }

    template <typename T, typename... Rest>
    bool Builder::allImm(Val id, T* imm, Rest... rest) const {
        if (fProgram[id].op == Op::splat) {
            static_assert(sizeof(T) == 4);
            memcpy(imm, &fProgram[id].immy, 4);
            return this->allImm(rest...);
        }
        return false;
    }

    Arg Builder::arg(int stride) {
        int ix = (int)fStrides.size();
        fStrides.push_back(stride);
        return {ix};
    }

    void Builder::assert_true(I32 cond, I32 debug) {
    #ifdef SK_DEBUG
        int imm;
        if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
        (void)push(Op::assert_true, cond.id,debug.id,NA);
    #endif
    }

    void Builder::store8 (Arg ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA, ptr.ix); }
    void Builder::store16(Arg ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA, ptr.ix); }
    void Builder::store32(Arg ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA, ptr.ix); }
    void Builder::store64(Arg ptr, I32 lo, I32 hi) {
        (void)push(Op::store64, lo.id,hi.id,NA, NA,ptr.ix);
    }
    void Builder::store128(Arg ptr, I32 lo, I32 hi, int lane) {
        (void)push(Op::store128, lo.id,hi.id,NA, NA,(ptr.ix<<1)|(lane&1));
    }

    I32 Builder::index() { return {this, push(Op::index , NA,NA,NA,0) }; }

    I32 Builder::load8 (Arg ptr) { return {this, push(Op::load8 , NA,NA,NA, ptr.ix) }; }
    I32 Builder::load16(Arg ptr) { return {this, push(Op::load16, NA,NA,NA, ptr.ix) }; }
    I32 Builder::load32(Arg ptr) { return {this, push(Op::load32, NA,NA,NA, ptr.ix) }; }
    I32 Builder::load64(Arg ptr, int lane) {
        return {this, push(Op::load64 , NA,NA,NA, ptr.ix,lane) };
    }
    I32 Builder::load128(Arg ptr, int lane) {
        return {this, push(Op::load128, NA,NA,NA, ptr.ix,lane) };
    }

    I32 Builder::gather8 (Arg ptr, int offset, I32 index) {
        return {this, push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather16(Arg ptr, int offset, I32 index) {
        return {this, push(Op::gather16, index.id,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather32(Arg ptr, int offset, I32 index) {
        return {this, push(Op::gather32, index.id,NA,NA, ptr.ix,offset)};
    }

    I32 Builder::uniform8(Arg ptr, int offset) {
        return {this, push(Op::uniform8, NA,NA,NA, ptr.ix, offset)};
    }
    I32 Builder::uniform16(Arg ptr, int offset) {
        return {this, push(Op::uniform16, NA,NA,NA, ptr.ix, offset)};
    }
    I32 Builder::uniform32(Arg ptr, int offset) {
        return {this, push(Op::uniform32, NA,NA,NA, ptr.ix, offset)};
    }

    // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern.
    I32 Builder::splat(int   n) { return {this, push(Op::splat, NA,NA,NA, n) }; }
    F32 Builder::splat(float f) {
        int bits;
        memcpy(&bits, &f, 4);
        return {this, push(Op::splat, NA,NA,NA, bits)};
    }
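    // So, for instance, splat(1.0f) pushes Op::splat with immy = 0x3f800000
    // (the bit pattern of 1.0f), exactly as splat(0x3f800000) would; only the
    // F32 vs. I32 type of the returned value differs.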

    bool fma_supported() {
        static const bool supported =
        #if defined(SK_CPU_X86)
            SkCpu::Supports(SkCpu::HSW);
        #elif defined(SK_CPU_ARM64)
            true;
        #else
            false;
        #endif
        return supported;
    }

    // Be careful peepholing float math!  Transformations you might expect to
    // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
    // Float peepholes must pass this equivalence test for all ~4B floats:
    //
    //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
    //
    //     unsigned bits = 0;
    //     do {
    //         float f;
    //         memcpy(&f, &bits, 4);
    //         if (!equiv(f, ...)) {
    //             abort();
    //         }
    //     } while (++bits != 0);

    F32 Builder::add(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
        if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y

        if (fma_supported()) {
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, this->push(Op::add_f32, x.id, y.id)};
    }

    F32 Builder::sub(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
        if (fma_supported()) {
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, this->push(Op::sub_f32, x.id, y.id)};
    }

    F32 Builder::mul(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(y.id, 1.0f)) { return x; }   // x*1 == x
        if (this->isImm(x.id, 1.0f)) { return y; }   // 1*y == y
        return {this, this->push(Op::mul_f32, x.id, y.id)};
    }

    F32 Builder::div(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X/Y); }
        if (this->isImm(y.id, 1.0f)) { return x; }   // x/1 == x
        return {this, this->push(Op::div_f32, x.id, y.id)};
    }

    F32 Builder::sqrt(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
        return {this, this->push(Op::sqrt_f32, x.id,NA,NA)};
    }

    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
    F32 Builder::approx_log2(F32 x) {
        // e - 127 is a fair approximation of log2(x) in its own right...
        F32 e = mul(to_f32(bit_cast(x)), splat(1.0f / (1<<23)));

        // ... but using the mantissa to refine its error is _much_ better.
        F32 m = bit_cast(bit_or(bit_and(bit_cast(x), 0x007fffff),
                                0x3f000000));
        F32 approx = sub(e, 124.225514990f);
        approx = sub(approx, mul(1.498030302f, m));
        approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));

        return approx;
    }
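    // A hand-worked sanity check of those constants (so take the last digits
    // with a grain of salt): for x = 1.0f, bits = 0x3f800000, so e = 127.0
    // exactly and m = 0.5.  Then
    //
    //     approx = 127 - 124.225514990
    //                  - 1.498030302 * 0.5
    //                  - 1.725879990 / (0.3520887068 + 0.5)
    //            ~ 0.0000002,
    //
    // i.e. log2(1) == 0 to within float noise.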
    F32 Builder::approx_pow2(F32 x) {
        F32 f = fract(x);
        F32 approx = add(x, 121.274057500f);
        approx = sub(approx, mul( 1.490129070f, f));
        approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));

        return bit_cast(round(mul(1.0f * (1<<23), approx)));
    }

    F32 Builder::approx_powf(F32 x, F32 y) {
        // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
        x = max(0.0f, x);

        auto is_x = bit_or(eq(x, 0.0f),
                           eq(x, 1.0f));
        return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
    }

    // Bhaskara I's sine approximation:
    //     16x(pi - x) / (5*pi^2 - 4x(pi - x))
    // ... divide numerator and denominator by 4:
    //     4x(pi - x) / (5*pi^2/4 - x(pi - x))
    //
    // (Sanity check: at x = pi/2 the numerator is pi^2 and the denominator is
    // 5*pi^2/4 - pi^2/4 = pi^2, so the formula gives sin(pi/2) = 1 as it should.)
    //
    // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
    // radians into that range first.
    //
    F32 Builder::approx_sin(F32 radians) {
        constexpr float Pi = SK_ScalarPI;
        // x = radians mod 2pi
        F32 x = fract(radians * (0.5f/Pi)) * (2*Pi);
        I32 neg = x > Pi;   // pi < x < 2pi?  Then we'll need to negate the result.
        x = select(neg, x - Pi, x);

        F32 pair = x * (Pi - x);
        x = 4.0f * pair / ((5*Pi*Pi/4) - pair);
        x = select(neg, -x, x);
        return x;
    }

    /*  "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
        https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf

        approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9

        Some simplifications:
        1. tan(x) is periodic with period PI, so it's enough to handle -PI/2 < x < PI/2.
        2. tan(x) is odd, so tan(-x) = -tan(x).
        3. Our polynomial approximation is best near zero, so we use the following identity:

                            tan(x) + tan(y)
            tan(x + y) = -----------------
                          1 - tan(x)*tan(y)

            with tan(PI/4) = 1.  So for x > PI/8, we do the following refactor:

            x' = x - PI/4

                       1 + tan(x')
            tan(x) = -------------
                       1 - tan(x')
    */
    F32 Builder::approx_tan(F32 x) {
        constexpr float Pi = SK_ScalarPI;
        // periodic between -pi/2 ... pi/2
        // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
        x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2);

        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);

        // minimize total error by shifting if x > pi/8
        I32 use_quotient = (x > (Pi/8));
        x = select(use_quotient, x - (Pi/4), x);

        // 9th order poly = 4th order(x^2) * x
        x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x;
        x = select(use_quotient, (1+x)/(1-x), x);
        x = select(neg, -x, x);
        return x;
    }

    // http://mathforum.org/library/drmath/view/54137.html
    // referencing Handbook of Mathematical Functions,
    // by Milton Abramowitz and Irene Stegun
    F32 Builder::approx_asin(F32 x) {
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);
        x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f);
        x = select(neg, -x, x);
        return x;
    }

    /*  Use a 4th-order polynomial approximation from https://arachnoid.com/polysolve/
     *  fit to 129 values of x,atan(x) for x:[0...1].
     *  This only works for 0 <= x <= 1.
     */
    static F32 approx_atan_unit(F32 x) {
        // for now we might be given NaN... let that through
        x->assert_true((x != x) | ((x >= 0) & (x <= 1)));
        return poly(x,  0.14130025741326729f,
                       -0.34312835980675116f,
                       -0.016172900528248768f,
                        1.0037696976200385f,
                       -0.00014758242182738969f);
    }

    /*  Use the identity atan(x) = pi/2 - atan(1/x) for x > 1.
     */
    F32 Builder::approx_atan(F32 x) {
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);
        I32 flip = (x > 1.0f);
        x = select(flip, 1/x, x);
        x = approx_atan_unit(x);
        x = select(flip, SK_ScalarPI/2 - x, x);
        x = select(neg, -x, x);
        return x;
    }

    /*  Use the identity atan(x) = pi/2 - atan(1/x) for x > 1.
     *  By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit(),
     *  which avoids a 2nd divide instruction if we had instead called atan().
     */
    F32 Builder::approx_atan2(F32 y0, F32 x0) {

        I32 flip = (abs(y0) > abs(x0));
        F32 y = select(flip, x0, y0);
        F32 x = select(flip, y0, x0);
        F32 arg = y/x;

        I32 neg = (arg < 0.0f);
        arg = select(neg, -arg, arg);

        F32 r = approx_atan_unit(arg);
        r = select(flip, SK_ScalarPI/2 - r, r);
        r = select(neg, -r, r);

        // handle quadrant distinctions
        r = select((y0 >= 0) & (x0  < 0), r + SK_ScalarPI, r);
        r = select((y0  < 0) & (x0 <= 0), r - SK_ScalarPI, r);
        // Note: we don't try to handle 0,0 or infinities (yet)
        return r;
    }

    F32 Builder::min(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
        return {this, this->push(Op::min_f32, x.id, y.id)};
    }
    F32 Builder::max(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
        return {this, this->push(Op::max_f32, x.id, y.id)};
    }

    I32 Builder::add(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(x.id, 0)) { return y; }
        if (this->isImm(y.id, 0)) { return x; }
        return {this, this->push(Op::add_i32, x.id, y.id)};
    }
    I32 Builder::sub(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0)) { return x; }
        return {this, this->push(Op::sub_i32, x.id, y.id)};
    }
    I32 Builder::mul(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(x.id, 0)) { return splat(0); }
        if (this->isImm(y.id, 0)) { return splat(0); }
        if (this->isImm(x.id, 1)) { return y; }
        if (this->isImm(y.id, 1)) { return x; }
        return {this, this->push(Op::mul_i32, x.id, y.id)};
    }

    I32 Builder::shl(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
        return {this, this->push(Op::shl_i32, x.id,NA,NA, bits)};
    }
    I32 Builder::shr(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
        return {this, this->push(Op::shr_i32, x.id,NA,NA, bits)};
    }
    I32 Builder::sra(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
        return {this, this->push(Op::sra_i32, x.id,NA,NA, bits)};
    }

    I32 Builder:: eq(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
        return {this, this->push(Op::eq_f32, x.id, y.id)};
    }
    I32 Builder::neq(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
        return {this, this->push(Op::neq_f32, x.id, y.id)};
    }
    I32 Builder::lt(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
        return {this, this->push(Op::gt_f32, y.id, x.id)};
    }
    I32 Builder::lte(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
        return {this, this->push(Op::gte_f32, y.id, x.id)};
    }
    I32 Builder::gt(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
        return {this, this->push(Op::gt_f32, x.id, y.id)};
    }
    I32 Builder::gte(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
        return {this, this->push(Op::gte_f32, x.id, y.id)};
    }

    I32 Builder:: eq(I32 x, I32 y) {
        if (x.id == y.id) { return splat(~0); }
        return {this, this->push(Op:: eq_i32, x.id, y.id)};
    }
    I32 Builder::neq(I32 x, I32 y) {
        return ~(x == y);
    }
    I32 Builder:: gt(I32 x, I32 y) {
        return {this, this->push(Op:: gt_i32, x.id, y.id)};
    }
    I32 Builder::gte(I32 x, I32 y) {
        if (x.id == y.id) { return splat(~0); }
        return ~(x < y);
    }
    I32 Builder:: lt(I32 x, I32 y) { return y>x; }
    I32 Builder::lte(I32 x, I32 y) { return y>=x; }

    I32 Builder::bit_and(I32 x, I32 y) {
        if (x.id == y.id) { return x; }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
        if (this->isImm(y.id, 0)) { return splat(0); }   // (x & false) == false
        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & y) == false
        if (this->isImm(y.id,~0)) { return x; }          // (x & true) == x
        if (this->isImm(x.id,~0)) { return y; }          // (true & y) == y
        return {this, this->push(Op::bit_and, x.id, y.id)};
    }
    I32 Builder::bit_or(I32 x, I32 y) {
        if (x.id == y.id) { return x; }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
        if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
        if (this->isImm(x.id, 0)) { return y; }           // (false | y) == y
        if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
        if (this->isImm(x.id,~0)) { return splat(~0); }   // (true | y) == true
        return {this, this->push(Op::bit_or, x.id, y.id)};
    }
    I32 Builder::bit_xor(I32 x, I32 y) {
        if (x.id == y.id) { return splat(0); }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
        if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
        if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
        return {this, this->push(Op::bit_xor, x.id, y.id)};
    }

    I32 Builder::bit_clear(I32 x, I32 y) {
        if (x.id == y.id) { return splat(0); }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
        if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
        if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
        return {this, this->push(Op::bit_clear, x.id, y.id)};
    }

    I32 Builder::select(I32 x, I32 y, I32 z) {
        if (y.id == z.id) { return y; }
        if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
        if (this->isImm(x.id,~0)) { return y; }                // true  ? y : z == y
        if (this->isImm(x.id, 0)) { return z; }                // false ? y : z == z
        if (this->isImm(y.id, 0)) { return bit_clear(z,x); }   // x ? 0 : z == ~x&z
        if (this->isImm(z.id, 0)) { return bit_and (y,x); }    // x ? y : 0 ==  x&y
        return {this, this->push(Op::select, x.id, y.id, z.id)};
    }

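    // extract() pulls a bitfield out of x.  For example (illustrative only,
    // mirroring how unpack() below uses it): with RGBA 8888 packing, where
    // green sits at shift 8, extract(px, 8, 0xff) isolates the green byte
    // as an integer in [0,255].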
    I32 Builder::extract(I32 x, int bits, I32 z) {
        if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
        return bit_and(z, shr(x, bits));
    }

    I32 Builder::pack(I32 x, I32 y, int bits) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|(Y<<bits)); }
        return {this, this->push(Op::pack, x.id,y.id,NA, 0,bits)};
    }

    F32 Builder::ceil(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
        return {this, this->push(Op::ceil, x.id)};
    }
    F32 Builder::floor(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
        return {this, this->push(Op::floor, x.id)};
    }
    F32 Builder::to_f32(I32 x) {
        if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
        return {this, this->push(Op::to_f32, x.id)};
    }
    I32 Builder::trunc(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
        return {this, this->push(Op::trunc, x.id)};
    }
    I32 Builder::round(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
        return {this, this->push(Op::round, x.id)};
    }

    I32 Builder::to_half(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
        return {this, this->push(Op::to_half, x.id)};
    }
    F32 Builder::from_half(I32 x) {
        if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
        return {this, this->push(Op::from_half, x.id)};
    }

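    // from_unorm()/to_unorm() map between bits-wide unsigned-normalized
    // integers and floats in [0,1].  A quick worked example for bits = 8:
    // from_unorm(8, 255) = 255 * (1/255.0f) = 1.0f, and
    // to_unorm(8, 1.0f) = round(1.0f * 255.0f) = 255.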
    F32 Builder::from_unorm(int bits, I32 x) {
        F32 limit = splat(1 / ((1<<bits)-1.0f));
        return mul(to_f32(x), limit);
    }
    I32 Builder::to_unorm(int bits, F32 x) {
        F32 limit = splat((1<<bits)-1.0f);
        return round(mul(x, limit));
    }

    bool SkColorType_to_PixelFormat(SkColorType ct, PixelFormat* f) {
        auto UNORM = PixelFormat::UNORM,
             FLOAT = PixelFormat::FLOAT;
        switch (ct) {
            case kUnknown_SkColorType: SkASSERT(false); return false;

            case kRGBA_F32_SkColorType: *f = {FLOAT,32,32,32,32, 0,32,64,96}; return true;

            case kRGBA_F16Norm_SkColorType:       *f = {FLOAT,16,16,16,16, 0,16,32,48}; return true;
            case kRGBA_F16_SkColorType:           *f = {FLOAT,16,16,16,16, 0,16,32,48}; return true;
            case kR16G16B16A16_unorm_SkColorType: *f = {UNORM,16,16,16,16, 0,16,32,48}; return true;

            case kA16_float_SkColorType:    *f = {FLOAT,  0, 0,0,16, 0, 0,0,0}; return true;
            case kR16G16_float_SkColorType: *f = {FLOAT, 16,16,0, 0, 0,16,0,0}; return true;

            case kAlpha_8_SkColorType: *f = {UNORM, 0,0,0,8, 0,0,0,0}; return true;
            case kGray_8_SkColorType:  *f = {UNORM, 8,8,8,0, 0,0,0,0}; return true;  // Subtle.

            case kRGB_565_SkColorType:   *f = {UNORM, 5,6,5,0, 11,5,0,0}; return true;  // (BGR)
            case kARGB_4444_SkColorType: *f = {UNORM, 4,4,4,4, 12,8,4,0}; return true;  // (ABGR)

            case kRGBA_8888_SkColorType: *f = {UNORM, 8,8,8,8,  0,8,16,24}; return true;
            case kRGB_888x_SkColorType:  *f = {UNORM, 8,8,8,0,  0,8,16,32}; return true;  // 32-bit
            case kBGRA_8888_SkColorType: *f = {UNORM, 8,8,8,8, 16,8, 0,24}; return true;

            case kRGBA_1010102_SkColorType: *f = {UNORM, 10,10,10,2,  0,10,20,30}; return true;
            case kBGRA_1010102_SkColorType: *f = {UNORM, 10,10,10,2, 20,10, 0,30}; return true;
            case kRGB_101010x_SkColorType:  *f = {UNORM, 10,10,10,0,  0,10,20, 0}; return true;
            case kBGR_101010x_SkColorType:  *f = {UNORM, 10,10,10,0, 20,10, 0, 0}; return true;

            case kR8G8_unorm_SkColorType:   *f = {UNORM,  8, 8,0, 0, 0, 8,0,0}; return true;
            case kR16G16_unorm_SkColorType: *f = {UNORM, 16,16,0, 0, 0,16,0,0}; return true;
            case kA16_unorm_SkColorType:    *f = {UNORM,  0, 0,0,16, 0, 0,0,0}; return true;
        }
        return false;
    }

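    // For example, kRGB_565 above reads bits up through 11+5 = 16, so its
    // byte_size() is 2; kRGBA_8888 reads through 24+8 = 32, so its is 4.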
    static int byte_size(PixelFormat f) {
        // What's the highest bit we read?
        int bits = std::max(f.r_bits + f.r_shift,
                   std::max(f.g_bits + f.g_shift,
                   std::max(f.b_bits + f.b_shift,
                            f.a_bits + f.a_shift)));
        // Round up to bytes.
        return (bits + 7) / 8;
    }

    static Color unpack(PixelFormat f, I32 x) {
        SkASSERT(byte_size(f) <= 4);
        auto unpack_channel = [=](int bits, int shift) {
            I32 channel = extract(x, shift, (1<<bits)-1);
            switch (f.encoding) {
                case PixelFormat::UNORM: return from_unorm(bits, channel);
                case PixelFormat::FLOAT: return from_half (      channel);
            }
            SkUNREACHABLE;
        };
        return {
            f.r_bits ? unpack_channel(f.r_bits, f.r_shift) : x->splat(0.0f),
            f.g_bits ? unpack_channel(f.g_bits, f.g_shift) : x->splat(0.0f),
            f.b_bits ? unpack_channel(f.b_bits, f.b_shift) : x->splat(0.0f),
            f.a_bits ? unpack_channel(f.a_bits, f.a_shift) : x->splat(1.0f),
        };
    }

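    // For example, kR16G16B16A16_unorm is {UNORM, 16,16,16,16, 0,16,32,48}:
    // R and G live in the low 32 bits and B and A in the high 32, so lo keeps
    // only R,G and hi keeps only B,A (with hi's shifts rebased down by 32).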
    static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
        SkASSERT(byte_size(f) == 8);
        // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
        // The assert on byte_size(lo) will trigger if this assumption is violated.
        *lo = f;
        if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
        if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
        if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
        if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
        SkASSERT(byte_size(*lo) == 4);

        *hi = f;
        if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
        if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
        if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
        if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
        SkASSERT(byte_size(*hi) == 4);
    }

    // The only 16-byte format we support today is RGBA F32,
    // though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
    static void assert_16byte_is_rgba_f32(PixelFormat f) {
    #if defined(SK_DEBUG)
        SkASSERT(byte_size(f) == 16);
        PixelFormat rgba_f32;
        SkAssertResult(SkColorType_to_PixelFormat(kRGBA_F32_SkColorType, &rgba_f32));

        SkASSERT(f.encoding == rgba_f32.encoding);

        SkASSERT(f.r_bits == rgba_f32.r_bits);
        SkASSERT(f.g_bits == rgba_f32.g_bits);
        SkASSERT(f.b_bits == rgba_f32.b_bits);
        SkASSERT(f.a_bits == rgba_f32.a_bits);

        SkASSERT(f.r_shift == rgba_f32.r_shift);
        SkASSERT(f.g_shift == rgba_f32.g_shift);
        SkASSERT(f.b_shift == rgba_f32.b_shift);
        SkASSERT(f.a_shift == rgba_f32.a_shift);
    #endif
    }

    Color Builder::load(PixelFormat f, Arg ptr) {
        switch (byte_size(f)) {
            case 1: return unpack(f, load8 (ptr));
            case 2: return unpack(f, load16(ptr));
            case 4: return unpack(f, load32(ptr));
            case 8: {
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, load64(ptr, 0)),
                      h = unpack(hi, load64(ptr, 1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                assert_16byte_is_rgba_f32(f);
                return {
                    bit_cast(load128(ptr, 0)),
                    bit_cast(load128(ptr, 1)),
                    bit_cast(load128(ptr, 2)),
                    bit_cast(load128(ptr, 3)),
                };
            }
            default: SkUNREACHABLE;
        }
        return {};
    }

    Color Builder::gather(PixelFormat f, Arg ptr, int offset, I32 index) {
        switch (byte_size(f)) {
            case 1: return unpack(f, gather8 (ptr, offset, index));
            case 2: return unpack(f, gather16(ptr, offset, index));
            case 4: return unpack(f, gather32(ptr, offset, index));
            case 8: {
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
                      h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                assert_16byte_is_rgba_f32(f);
                return {
                    gatherF(ptr, offset, (index<<2)+0),
                    gatherF(ptr, offset, (index<<2)+1),
                    gatherF(ptr, offset, (index<<2)+2),
                    gatherF(ptr, offset, (index<<2)+3),
                };
            }
            default: SkUNREACHABLE;
        }
        return {};
    }

    static I32 pack32(PixelFormat f, Color c) {
        SkASSERT(byte_size(f) <= 4);
        I32 packed = c->splat(0);
        auto pack_channel = [&](F32 channel, int bits, int shift) {
            I32 encoded;
            switch (f.encoding) {
                case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
                case PixelFormat::FLOAT: encoded = to_half (      channel); break;
            }
            packed = pack(packed, encoded, shift);
        };
        if (f.r_bits) { pack_channel(c.r, f.r_bits, f.r_shift); }
        if (f.g_bits) { pack_channel(c.g, f.g_bits, f.g_shift); }
        if (f.b_bits) { pack_channel(c.b, f.b_bits, f.b_shift); }
        if (f.a_bits) { pack_channel(c.a, f.a_bits, f.a_shift); }
        return packed;
    }

    bool Builder::store(PixelFormat f, Arg ptr, Color c) {
        // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
        if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
            f.r_shift == f.g_shift && f.g_shift == f.b_shift) {

            // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
            c.r = c.r * 0.2126f
                + c.g * 0.7152f
                + c.b * 0.0722f;
            f.g_bits = f.b_bits = 0;
        }

        switch (byte_size(f)) {
            case 1: store8 (ptr, pack32(f,c)); return true;
            case 2: store16(ptr, pack32(f,c)); return true;
            case 4: store32(ptr, pack32(f,c)); return true;
            case 8: {
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                store64(ptr, pack32(lo,c)
                           , pack32(hi,c));
                return true;
            }
            case 16: {
                assert_16byte_is_rgba_f32(f);
                store128(ptr, bit_cast(c.r), bit_cast(c.g), 0);
                store128(ptr, bit_cast(c.b), bit_cast(c.a), 1);
                return true;
            }
            default: SkUNREACHABLE;
        }
        return false;
    }

    void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
        skvm::F32 invA = 1.0f / a,
                  inf  = bit_cast(splat(0x7f800000));
        // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
        invA = select(invA < inf, invA
                                , 0.0f);
        *r *= invA;
        *g *= invA;
        *b *= invA;
    }

    void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
        *r *= a;
        *g *= a;
        *b *= a;
    }

    Color Builder::uniformPremul(SkColor4f color,    SkColorSpace* src,
                                 Uniforms* uniforms, SkColorSpace* dst) {
        SkColorSpaceXformSteps(src, kUnpremul_SkAlphaType,
                               dst,   kPremul_SkAlphaType).apply(color.vec());
        return {
            uniformF(uniforms->pushF(color.fR)),
            uniformF(uniforms->pushF(color.fG)),
            uniformF(uniforms->pushF(color.fB)),
            uniformF(uniforms->pushF(color.fA)),
        };
    }

    F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
        if (this->isImm(t.id, 0.0f)) { return lo; }
        if (this->isImm(t.id, 1.0f)) { return hi; }
        return mad(sub(hi, lo), t, lo);
    }

    Color Builder::lerp(Color lo, Color hi, F32 t) {
        return {
            lerp(lo.r, hi.r, t),
            lerp(lo.g, hi.g, t),
            lerp(lo.b, hi.b, t),
            lerp(lo.a, hi.a, t),
        };
    }

    HSLA Builder::to_hsla(Color c) {
        F32 mx = max(max(c.r,c.g),c.b),
            mn = min(min(c.r,c.g),c.b),
             d = mx - mn,
          invd = 1.0f / d,
        g_lt_b = select(c.g < c.b, splat(6.0f)
                                 , splat(0.0f));

        F32 h = (1/6.0f) * select(mx == mn,  0.0f,
                           select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
                           select(mx == c.g, invd * (c.b - c.r) + 2.0f
                                           , invd * (c.r - c.g) + 4.0f)));

        F32 sum = mx + mn,
              l = sum * 0.5f,
              s = select(mx == mn, 0.0f
                                 , d / select(l > 0.5f, 2.0f - sum
                                                      , sum));
        return {h, s, l, c.a};
    }

    Color Builder::to_rgba(HSLA c) {
        // See GrRGBToHSLFilterEffect.fp

        auto [h,s,l,a] = c;
        F32 x = s * (1.0f - abs(l + l - 1.0f));

        // (l is init-captured by value: C++17 lambdas can't capture a structured binding.)
        auto hue_to_rgb = [&,l=l](auto hue) {
            auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
            return x * (clamp01(q) - 0.5f) + l;
        };

        return {
            hue_to_rgb(h + 0/3.0f),
            hue_to_rgb(h + 2/3.0f),
            hue_to_rgb(h + 1/3.0f),
            c.a,
        };
    }

    // We're basing our implementation of non-separable blend modes on
    // https://www.w3.org/TR/compositing-1/#blendingnonseparable
    // and
    // https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
    // They're equivalent, but ES' math has been better simplified.
    //
    // Anything extra we add beyond that is to make the math work with premul inputs.

    static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
        return max(r, max(g, b))
             - min(r, min(g, b));
    }

    static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
        return r*0.30f + g*0.59f + b*0.11f;
    }

    static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
        F32 mn  = min(*r, min(*g, *b)),
            mx  = max(*r, max(*g, *b)),
            sat = mx - mn;

        // Map min channel to 0, max channel to s, and scale the middle proportionally.
        auto scale = [&](skvm::F32 c) {
            auto scaled = ((c - mn) * s) / sat;
            return select(is_finite(scaled), scaled, 0.0f);
        };
        *r = scale(*r);
        *g = scale(*g);
        *b = scale(*b);
    }
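    // A quick worked example of set_sat(): with (*r,*g,*b) = (0.2, 0.5, 0.8)
    // and s = 0.3, we get mn = 0.2 and sat = 0.6, so the channels scale to
    // (0.0, 0.15, 0.3): the min channel maps to 0, the max to s, and the
    // middle lands proportionally in between.  When sat == 0 the division
    // produces non-finite values, which is why scale() selects 0.0f for those.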
1488
1489 static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1490 auto diff = lu - luminance(*r, *g, *b);
1491 *r += diff;
1492 *g += diff;
1493 *b += diff;
1494 }
1495
1496 static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
1497 F32 mn = min(*r, min(*g, *b)),
1498 mx = max(*r, max(*g, *b)),
1499 lu = luminance(*r, *g, *b);
1500
1501 auto clip = [&](auto c) {
1502 c = select(mn >= 0, c
1503 , lu + ((c-lu)*( lu)) / (lu-mn));
1504 c = select(mx > a, lu + ((c-lu)*(a-lu)) / (mx-lu)
1505 , c);
1506 return clamp01(c); // May be a little negative, or worse, NaN.
1507 };
1508 *r = clip(*r);
1509 *g = clip(*g);
1510 *b = clip(*b);
1511 }
1512
1513 Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
1514 auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
1515 return x*y + z*w;
1516 };
1517
1518 auto two = [](skvm::F32 x) { return x+x; };
1519
1520 auto apply_rgba = [&](auto fn) {
1521 return Color {
1522 fn(src.r, dst.r),
1523 fn(src.g, dst.g),
1524 fn(src.b, dst.b),
1525 fn(src.a, dst.a),
1526 };
1527 };
1528
1529 auto apply_rgb_srcover_a = [&](auto fn) {
1530 return Color {
1531 fn(src.r, dst.r),
1532 fn(src.g, dst.g),
1533 fn(src.b, dst.b),
1534 mad(dst.a, 1-src.a, src.a), // srcover for alpha
1535 };
1536 };
1537
1538 auto non_sep = [&](auto R, auto G, auto B) {
1539 return Color{
1540 R + mma(src.r, 1-dst.a, dst.r, 1-src.a),
1541 G + mma(src.g, 1-dst.a, dst.g, 1-src.a),
1542 B + mma(src.b, 1-dst.a, dst.b, 1-src.a),
1543 mad(dst.a, 1-src.a, src.a), // srcover for alpha
1544 };
1545 };
1546
1547 switch (mode) {
1548 default:
1549 SkASSERT(false);
1550 [[fallthrough]]; /*but also, for safety, fallthrough*/
1551
1552 case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };
1553
1554 case SkBlendMode::kSrc: return src;
1555 case SkBlendMode::kDst: return dst;
1556
1557 case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]];
1558 case SkBlendMode::kSrcOver:
1559 return apply_rgba([&](auto s, auto d) {
1560 return mad(d,1-src.a, s);
1561 });
1562
1563 case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]];
1564 case SkBlendMode::kSrcIn:
1565 return apply_rgba([&](auto s, auto d) {
1566 return s * dst.a;
1567 });
1568
1569 case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]];
1570
1571 case SkBlendMode::kSrcOut:
1572 return apply_rgba([&](auto s, auto d) {
1573 return s * (1-dst.a);
1574 });
1575
1576 case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]];
1577 case SkBlendMode::kSrcATop:
1578 return apply_rgba([&](auto s, auto d) {
1579 return mma(s, dst.a, d, 1-src.a);
1580 });
1581
1582 case SkBlendMode::kXor:
1583 return apply_rgba([&](auto s, auto d) {
1584 return mma(s, 1-dst.a, d, 1-src.a);
1585 });
1586
1587 case SkBlendMode::kPlus:
1588 return apply_rgba([&](auto s, auto d) {
1589 return min(s+d, 1.0f);
1590 });
1591
1592 case SkBlendMode::kModulate:
1593 return apply_rgba([&](auto s, auto d) {
1594 return s * d;
1595 });
1596
1597 case SkBlendMode::kScreen:
                // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
                // It's plausible that s + (d - s*d) keeps more precision.
1600 return apply_rgba([&](auto s, auto d) {
1601 return s + (d - s*d);
1602 });
1603
1604 case SkBlendMode::kDarken:
1605 return apply_rgb_srcover_a([&](auto s, auto d) {
1606 return s + (d - max(s * dst.a,
1607 d * src.a));
1608 });
1609
1610 case SkBlendMode::kLighten:
1611 return apply_rgb_srcover_a([&](auto s, auto d) {
1612 return s + (d - min(s * dst.a,
1613 d * src.a));
1614 });
1615
1616 case SkBlendMode::kDifference:
1617 return apply_rgb_srcover_a([&](auto s, auto d) {
1618 return s + (d - two(min(s * dst.a,
1619 d * src.a)));
1620 });
1621
1622 case SkBlendMode::kExclusion:
1623 return apply_rgb_srcover_a([&](auto s, auto d) {
1624 return s + (d - two(s * d));
1625 });
1626
1627 case SkBlendMode::kColorBurn:
1628 return apply_rgb_srcover_a([&](auto s, auto d) {
1629 auto mn = min(dst.a,
1630 src.a * (dst.a - d) / s),
1631 burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
1632 return select(d == dst.a , s * (1-dst.a) + d,
1633 select(is_finite(burn), burn
1634 , d * (1-src.a) + s));
1635 });
1636
1637 case SkBlendMode::kColorDodge:
1638 return apply_rgb_srcover_a([&](auto s, auto d) {
1639 auto dodge = src.a * min(dst.a,
1640 d * src.a / (src.a - s))
1641 + mma(s, 1-dst.a, d, 1-src.a);
1642 return select(d == 0.0f , s * (1-dst.a) + d,
1643 select(is_finite(dodge), dodge
1644 , d * (1-src.a) + s));
1645 });
1646
1647 case SkBlendMode::kHardLight:
1648 return apply_rgb_srcover_a([&](auto s, auto d) {
1649 return mma(s, 1-dst.a, d, 1-src.a) +
1650 select(two(s) <= src.a,
1651 two(s * d),
1652 src.a * dst.a - two((dst.a - d) * (src.a - s)));
1653 });
1654
1655 case SkBlendMode::kOverlay:
1656 return apply_rgb_srcover_a([&](auto s, auto d) {
1657 return mma(s, 1-dst.a, d, 1-src.a) +
1658 select(two(d) <= dst.a,
1659 two(s * d),
1660 src.a * dst.a - two((dst.a - d) * (src.a - s)));
1661 });
1662
1663 case SkBlendMode::kMultiply:
1664 return apply_rgba([&](auto s, auto d) {
1665 return mma(s, 1-dst.a, d, 1-src.a) + s * d;
1666 });
1667
1668 case SkBlendMode::kSoftLight:
1669 return apply_rgb_srcover_a([&](auto s, auto d) {
1670 auto m = select(dst.a > 0.0f, d / dst.a
1671 , 0.0f),
1672 s2 = two(s),
1673 m4 = 4*m;
1674
1675 // The logic forks three ways:
1676 // 1. dark src?
1677 // 2. light src, dark dst?
1678 // 3. light src, light dst?
1679
                // Used in case 1.
                auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
                // Used in case 2.
                     darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
                // Used in case 3.
                     liteDst = sqrt(m) - m,
                // Used in case 2 or 3, chosen by the select below.
                     liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
                                                                         , liteDst)
                               + d * src.a;
1690 return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
1691 , liteSrc);
1692 });
1693
1694 case SkBlendMode::kHue: {
1695 skvm::F32 R = src.r * src.a,
1696 G = src.g * src.a,
1697 B = src.b * src.a;
1698
1699 set_sat (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
1700 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1701 clip_color(&R, &G, &B, src.a * dst.a);
1702
1703 return non_sep(R, G, B);
1704 }
1705
1706 case SkBlendMode::kSaturation: {
1707 skvm::F32 R = dst.r * src.a,
1708 G = dst.g * src.a,
1709 B = dst.b * src.a;
1710
1711 set_sat (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
1712 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1713 clip_color(&R, &G, &B, src.a * dst.a);
1714
1715 return non_sep(R, G, B);
1716 }
1717
1718 case SkBlendMode::kColor: {
1719 skvm::F32 R = src.r * dst.a,
1720 G = src.g * dst.a,
1721 B = src.b * dst.a;
1722
1723 set_lum (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
1724 clip_color(&R, &G, &B, src.a * dst.a);
1725
1726 return non_sep(R, G, B);
1727 }
1728
1729 case SkBlendMode::kLuminosity: {
1730 skvm::F32 R = dst.r * src.a,
1731 G = dst.g * src.a,
1732 B = dst.b * src.a;
1733
1734 set_lum (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
1735 clip_color(&R, &G, &B, dst.a * src.a);
1736
1737 return non_sep(R, G, B);
1738 }
1739 }
1740 }
1741
1742 // For a given program we'll store each Instruction's users contiguously in a table,
1743 // and track where each Instruction's span of users starts and ends in another index.
1744 // Here's a simple program that loads x and stores kx+k:
1745 //
1746 // v0 = splat(k)
1747 // v1 = load(...)
1748 // v2 = mul(v1, v0)
1749 // v3 = add(v2, v0)
1750 // v4 = store(..., v3)
1751 //
1752 // This program has 5 instructions v0-v4.
1753 // - v0 is used by v2 and v3
1754 // - v1 is used by v2
1755 // - v2 is used by v3
1756 // - v3 is used by v4
1757 // - v4 has a side-effect
1758 //
1759 // For this program we fill out these two arrays:
1760 // table: [v2,v3, v2, v3, v4]
1761 // index: [0, 2, 3, 4, 5]
1762 //
1763 // The table is just those "is used by ..." I wrote out above in order,
1764 // and the index tracks where an Instruction's span of users starts, table[index[id]].
1765 // The span continues up until the start of the next Instruction, table[index[id+1]].
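    // e.g. v0's users are table[index[0] .. index[1]) == {v2, v3}.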
1766 SkSpan<const Val> Usage::operator[](Val id) const {
1767 int begin = fIndex[id];
1768 int end = fIndex[id + 1];
1769 return SkMakeSpan(fTable.data() + begin, end - begin);
1770 }
1771
1772 Usage::Usage(const std::vector<Instruction>& program) {
1773 // uses[id] counts the number of times each Instruction is used.
1774 std::vector<int> uses(program.size(), 0);
1775 for (Val id = 0; id < (Val)program.size(); id++) {
1776 Instruction inst = program[id];
1777 if (inst.x != NA) { ++uses[inst.x]; }
1778 if (inst.y != NA) { ++uses[inst.y]; }
1779 if (inst.z != NA) { ++uses[inst.z]; }
1780 }
1781
1782 // Build our index into fTable, with an extra entry marking the final Instruction's end.
1783 fIndex.reserve(program.size() + 1);
1784 int total_uses = 0;
1785 for (int n : uses) {
1786 fIndex.push_back(total_uses);
1787 total_uses += n;
1788 }
1789 fIndex.push_back(total_uses);
1790
1791 // Tick down each Instruction's uses to fill in fTable.
1792 fTable.resize(total_uses, NA);
1793 for (Val id = (Val)program.size(); id --> 0; ) {
1794 Instruction inst = program[id];
1795 if (inst.x != NA) { fTable[fIndex[inst.x] + --uses[inst.x]] = id; }
1796 if (inst.y != NA) { fTable[fIndex[inst.y] + --uses[inst.y]] = id; }
1797 if (inst.z != NA) { fTable[fIndex[inst.z] + --uses[inst.z]] = id; }
1798 }
1799 for (int n : uses ) { (void)n; SkASSERT(n == 0 ); }
1800 for (Val id : fTable) { (void)id; SkASSERT(id != NA); }
1801 }
1802
1803 // ~~~~ Program::eval() and co. ~~~~ //
1804
1805 // Handy references for x86-64 instruction encoding:
1806 // https://wiki.osdev.org/X86-64_Instruction_Encoding
1807 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1808 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1809 // http://ref.x86asm.net/coder64.html
1810
1811 // Used for ModRM / immediate instruction encoding.
1812 static uint8_t _233(int a, int b, int c) {
1813 return (a & 3) << 6
1814 | (b & 7) << 3
1815 | (c & 7) << 0;
1816 }
1817
1818 // ModRM byte encodes the arguments of an opcode.
1819 enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
1820 static uint8_t mod_rm(Mod mod, int reg, int rm) {
1821 return _233((int)mod, reg, rm);
1822 }
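    // e.g. mod_rm(Mod::Direct, 0 /*rax*/, 1 /*rcx*/) == 0b11'000'001 == 0xc1,
    // the ModRM byte of `add rcx, rax` (48 01 c1).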
1823
1824 static Mod mod(int imm) {
1825 if (imm == 0) { return Mod::Indirect; }
1826 if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1827 return Mod::FourByteImm;
1828 }
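    // e.g. mod(0) == Mod::Indirect, mod(12) == Mod::OneByteImm, mod(0x1000) == Mod::FourByteImm.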
1829
1830 static int imm_bytes(Mod mod) {
1831 switch (mod) {
1832 case Mod::Indirect: return 0;
1833 case Mod::OneByteImm: return 1;
1834 case Mod::FourByteImm: return 4;
1835 case Mod::Direct: SkUNREACHABLE;
1836 }
1837 SkUNREACHABLE;
1838 }
1839
1840 // SIB byte encodes a memory address, base + (index * scale).
1841 static uint8_t sib(Assembler::Scale scale, int index, int base) {
1842 return _233((int)scale, index, base);
1843 }
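    // e.g. with scale=x1 (0b00), index=rcx (1), and base=rax (0), sib() == 0x08,
    // encoding the address [rax + rcx*1].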
1844
1845 // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
1846 static uint8_t rex(bool W, // If set, operation is 64-bit, otherwise default, usually 32-bit.
1847 bool R, // Extra top bit to select ModRM reg, registers 8-15.
1848 bool X, // Extra top bit for SIB index register.
1849 bool B) { // Extra top bit for SIB base or ModRM rm register.
1850 return 0b01000000 // Fixed 0100 for top four bits.
1851 | (W << 3)
1852 | (R << 2)
1853 | (X << 1)
1854 | (B << 0);
1855 }
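    // e.g. rex(1,0,0,0) == 0b0100'1000 == 0x48, the familiar REX.W prefix on 64-bit operations.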
1856
1857
1858 // The VEX prefix extends SSE operations to AVX. Used generally, even with XMM.
1859 struct VEX {
1860 int len;
1861 uint8_t bytes[3];
1862 };
1863
1864 static VEX vex(bool WE, // Like REX W for int operations, or opcode extension for float?
1865 bool R, // Same as REX R. Pass high bit of dst register, dst>>3.
1866 bool X, // Same as REX X.
1867 bool B, // Same as REX B. Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
1868 int map, // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
1869 int vvvv, // 4-bit second operand register. Pass our x for 3-arg ops.
1870 bool L, // Set for 256-bit ymm operations, off for 128-bit xmm.
1871 int pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
1872
1873 // Pack x86 opcode map selector to 5-bit VEX encoding.
1874 map = [map]{
1875 switch (map) {
1876 case 0x0f: return 0b00001;
1877 case 0x380f: return 0b00010;
1878 case 0x3a0f: return 0b00011;
1879 // Several more cases only used by XOP / TBM.
1880 }
1881 SkUNREACHABLE;
1882 }();
1883
1884 // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding.
1885 pp = [pp]{
1886 switch (pp) {
1887 case 0x66: return 0b01;
1888 case 0xf3: return 0b10;
1889 case 0xf2: return 0b11;
1890 }
1891 return 0b00;
1892 }();
1893
1894 VEX vex = {0, {0,0,0}};
1895 if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
1896 // With these conditions met, we can optionally compress VEX to 2-byte.
1897 vex.len = 2;
1898 vex.bytes[0] = 0xc5;
1899 vex.bytes[1] = (pp & 3) << 0
1900 | (L & 1) << 2
1901 | (~vvvv & 15) << 3
1902 | (~(int)R & 1) << 7;
1903 } else {
1904 // We could use this 3-byte VEX prefix all the time if we like.
1905 vex.len = 3;
1906 vex.bytes[0] = 0xc4;
1907 vex.bytes[1] = (map & 31) << 0
1908 | (~(int)B & 1) << 5
1909 | (~(int)X & 1) << 6
1910 | (~(int)R & 1) << 7;
1911 vex.bytes[2] = (pp & 3) << 0
1912 | (L & 1) << 2
1913 | (~vvvv & 15) << 3
1914 | (WE & 1) << 7;
1915 }
1916 return vex;
1917 }
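    // e.g. vaddps ymm0, ymm1, ymm2 uses prefix=0, map=0x0f, opcode=0x58:
    // vex(0, 0,0,0, 0x0f, /*vvvv=*/1, /*L=*/1, 0) compresses to the 2-byte form c5 f4,
    // and the full instruction assembles as c5 f4 58 c2.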
1918
1919 Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {}
1920
1921 size_t Assembler::size() const { return fSize; }
1922
1923 void Assembler::bytes(const void* p, int n) {
1924 if (fCurr) {
1925 memcpy(fCurr, p, n);
1926 fCurr += n;
1927 }
1928 fSize += n;
1929 }
1930
1931 void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
1932 void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
1933
1934 void Assembler::align(int mod) {
1935 while (this->size() % mod) {
1936 this->byte(0x00);
1937 }
1938 }
1939
1940 void Assembler::int3() {
1941 this->byte(0xcc);
1942 }
1943
1944 void Assembler::vzeroupper() {
1945 this->byte(0xc5);
1946 this->byte(0xf8);
1947 this->byte(0x77);
1948 }
1949 void Assembler::ret() { this->byte(0xc3); }
1950
1951 void Assembler::op(int opcode, Operand dst, GP64 x) {
1952 if (dst.kind == Operand::REG) {
1953 this->byte(rex(W1,x>>3,0,dst.reg>>3));
1954 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
1955 this->byte(mod_rm(Mod::Direct, x, dst.reg&7));
1956 } else {
1957 SkASSERT(dst.kind == Operand::MEM);
1958 const Mem& m = dst.mem;
1959 const bool need_SIB = (m.base&7) == rsp
1960 || m.index != rsp;
1961
1962 this->byte(rex(W1,x>>3,m.index>>3,m.base>>3));
1963 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
1964 this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7));
1965 if (need_SIB) {
1966 this->byte(sib(m.scale, m.index&7, m.base&7));
1967 }
1968 this->bytes(&m.disp, imm_bytes(mod(m.disp)));
1969 }
1970 }
1971
1972 void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) {
1973 opcode |= 0b1000'0000; // top bit set for instructions with any immediate
1974
1975 int imm_bytes = 4;
1976 if (SkTFitsIn<int8_t>(imm)) {
1977 imm_bytes = 1;
1978 opcode |= 0b0000'0010; // second bit set for 8-bit immediate, else 32-bit.
1979 }
1980
1981 this->op(opcode, dst, (GP64)opcode_ext);
1982 this->bytes(&imm, imm_bytes);
1983 }
1984
1985 void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); }
1986 void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); }
1987 void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); }
1988
    // These don't work quite like the other instructions with immediates:
    // their immediates are fixed-size, always 4 bytes for mov and 1 byte for movb.
1991 void Assembler::mov(Operand dst, int imm) {
1992 this->op(0xC7,dst,(GP64)0b000);
1993 this->word(imm);
1994 }
1995 void Assembler::movb(Operand dst, int imm) {
1996 this->op(0xC6,dst,(GP64)0b000);
1997 this->byte(imm);
1998 }
1999
2000 void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); }
2001 void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); }
2002 void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); }
2003 void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); }
2004 void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); }
2005
2006 void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); }
2007 void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); }
2008 void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); }
2009 void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); }
2010 void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); }
2011
2012 void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); }
2013 void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); }
2014
2015 void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfe, dst,x,y); }
2016 void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfa, dst,x,y); }
2017 void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); }
2018
2019 void Assembler::vpsubw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
2020 void Assembler::vpmullw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xd5, dst,x,y); }
2021
2022 void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
2023 void Assembler::vpor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
2024 void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); }
2025 void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); }
2026
2027 void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); }
2028 void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); }
2029 void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); }
2030 void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); }
2031 void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); }
2032 void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); }
2033
2034 void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); }
2035 void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
2036 void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); }
2037
2038 void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
2039 void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
2040 void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); }
2041
2042 void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
2043 void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); }
2044 void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); }
2045
2046 void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
2047 void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0x67, dst,x,y); }
2048
2049 void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); }
2050 void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }
2051
2052 void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
2053 void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
2054
2055
2056 void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
2057 // When we've embedded a label displacement in the middle of an instruction,
2058 // we need to tweak it a little so that the resolved displacement starts
2059 // from the end of the instruction and not the end of the displacement.
2060 if (operand.kind == Operand::LABEL && fCode) {
2061 int disp;
2062 memcpy(&disp, fCurr-4, 4);
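            // The 1-byte immediate that follows moves the end of the instruction one
            // byte past the end of the displacement, so back the target up by one.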
2063 disp--;
2064 memcpy(fCurr-4, &disp, 4);
2065 }
2066 this->byte(imm);
2067 }
2068
2069 void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) {
2070 this->op(0,0x0f,0xc2, dst,x,y);
2071 this->imm_byte_after_operand(y, imm);
2072 }
2073
2074 void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) {
2075 this->op(0x66,0x3a0f,0x4c, dst,x,y);
2076 this->imm_byte_after_operand(y, z << 4);
2077 }
2078
2079 // Shift instructions encode their opcode extension as "dst", dst as x, and x as y.
2080 void Assembler::vpslld(Ymm dst, Ymm x, int imm) {
2081 this->op(0x66,0x0f,0x72,(Ymm)6, dst,x);
2082 this->byte(imm);
2083 }
2084 void Assembler::vpsrld(Ymm dst, Ymm x, int imm) {
2085 this->op(0x66,0x0f,0x72,(Ymm)2, dst,x);
2086 this->byte(imm);
2087 }
2088 void Assembler::vpsrad(Ymm dst, Ymm x, int imm) {
2089 this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
2090 this->byte(imm);
2091 }
2092 void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
2093 this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
2094 this->byte(imm);
2095 }
2096
2097 void Assembler::vpermq(Ymm dst, Operand x, int imm) {
        // A bit unusual among the instructions we use, this is a 64-bit operation, so we set W.
2099 this->op(0x66,0x3a0f,0x00, dst,x,W1);
2100 this->imm_byte_after_operand(x, imm);
2101 }
2102
2103 void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) {
2104 this->op(0x66,0x3a0f,0x06, dst,x,y);
2105 this->imm_byte_after_operand(y, imm);
2106 }
2107
2108 void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) {
2109 this->op(0x66,0x380f,0x16, dst,ix,src);
2110 }
2111
2112 void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) {
2113 this->op(0x66,0x3a0f,0x08, dst,x);
2114 this->imm_byte_after_operand(x, imm);
2115 }
2116
2117 void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); }
2118 void Assembler::vmovups(Ymm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); }
2119 void Assembler::vmovups(Xmm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); }
2120 void Assembler::vmovups(Operand dst, Ymm src) { this->op( 0,0x0f,0x11, src,dst); }
2121 void Assembler::vmovups(Operand dst, Xmm src) { this->op( 0,0x0f,0x11, src,dst); }
2122
2123 void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op( 0,0x0f,0x5b, dst,x); }
2124 void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); }
2125 void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); }
2126 void Assembler::vsqrtps (Ymm dst, Operand x) { this->op( 0,0x0f,0x51, dst,x); }
2127
2128 void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) {
2129 this->op(0x66,0x3a0f,0x1d, x,dst);
2130 this->imm_byte_after_operand(dst, imm);
2131 }
2132 void Assembler::vcvtph2ps(Ymm dst, Operand x) {
2133 this->op(0x66,0x380f,0x13, dst,x);
2134 }
2135
2136 int Assembler::disp19(Label* l) {
2137 SkASSERT(l->kind == Label::NotYetSet ||
2138 l->kind == Label::ARMDisp19);
2139 int here = (int)this->size();
2140 l->kind = Label::ARMDisp19;
2141 l->references.push_back(here);
2142 // ARM 19-bit instruction count, from the beginning of this instruction.
2143 return (l->offset - here) / 4;
2144 }
2145
2146 int Assembler::disp32(Label* l) {
2147 SkASSERT(l->kind == Label::NotYetSet ||
2148 l->kind == Label::X86Disp32);
2149 int here = (int)this->size();
2150 l->kind = Label::X86Disp32;
2151 l->references.push_back(here);
2152 // x86 32-bit byte count, from the end of this instruction.
2153 return l->offset - (here + 4);
2154 }
2155
2156 void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) {
2157 switch (y.kind) {
2158 case Operand::REG: {
2159 VEX v = vex(w, dst>>3, 0, y.reg>>3,
2160 map, x, l, prefix);
2161 this->bytes(v.bytes, v.len);
2162 this->byte(opcode);
2163 this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7));
2164 } return;
2165
2166 case Operand::MEM: {
2167 // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows;
2168 // without an SIB byte, that's where the base register would usually go.
2169 // This means we have to use an SIB byte if we want to use rsp as a base register.
2170 const Mem& m = y.mem;
2171 const bool need_SIB = m.base == rsp
2172 || m.index != rsp;
2173
2174 VEX v = vex(w, dst>>3, m.index>>3, m.base>>3,
2175 map, x, l, prefix);
2176 this->bytes(v.bytes, v.len);
2177 this->byte(opcode);
2178 this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7));
2179 if (need_SIB) {
2180 this->byte(sib(m.scale, m.index&7, m.base&7));
2181 }
2182 this->bytes(&m.disp, imm_bytes(mod(m.disp)));
2183 } return;
2184
2185 case Operand::LABEL: {
            // IP-relative addressing uses Mod::Indirect with R/M encoded as if it were rbp or r13.
2187 const int rip = rbp;
2188
2189 VEX v = vex(w, dst>>3, 0, rip>>3,
2190 map, x, l, prefix);
2191 this->bytes(v.bytes, v.len);
2192 this->byte(opcode);
2193 this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
2194 this->word(this->disp32(y.label));
2195 } return;
2196 }
2197 }
2198
2199 void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); }
2200
2201 void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); }
2202
2203 void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); }
2204
2205 void Assembler::jump(uint8_t condition, Label* l) {
2206 // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
2207 // 7? one-byte-disp
2208 // 0F 8? four-byte-disp
2209 // We always use the near displacement to make updating labels simpler (no resizing).
2210 this->byte(0x0f);
2211 this->byte(condition);
2212 this->word(this->disp32(l));
2213 }
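    // e.g. a je to a label 16 bytes past the start of this 6-byte instruction encodes
    // as 0f 84 0a 00 00 00: the disp32 of 0x0a counts from the end of the instruction.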
2214 void Assembler::je (Label* l) { this->jump(0x84, l); }
2215 void Assembler::jne(Label* l) { this->jump(0x85, l); }
2216 void Assembler::jl (Label* l) { this->jump(0x8c, l); }
2217 void Assembler::jc (Label* l) { this->jump(0x82, l); }
2218
2219 void Assembler::jmp(Label* l) {
2220 // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
2221 this->byte(0xe9);
2222 this->word(this->disp32(l));
2223 }
2224
2225 void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); }
2226 void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); }
2227
2228 void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); }
2229
2230 void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); }
2231 void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); }
2232
2233 void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) {
2234 this->op(0x66,0x3a0f,0x22, dst,src,y);
2235 this->imm_byte_after_operand(y, imm);
2236 }
2237 void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) {
2238 this->op(0x66,0x0f,0xc4, dst,src,y);
2239 this->imm_byte_after_operand(y, imm);
2240 }
2241 void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) {
2242 this->op(0x66,0x3a0f,0x20, dst,src,y);
2243 this->imm_byte_after_operand(y, imm);
2244 }
2245
2246 void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
2247 this->op(0x66,0x3a0f,0x39, src,dst);
2248 SkASSERT(dst.kind != Operand::LABEL);
2249 this->byte(imm);
2250 }
2251 void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
2252 this->op(0x66,0x3a0f,0x16, src,dst);
2253 SkASSERT(dst.kind != Operand::LABEL);
2254 this->byte(imm);
2255 }
2256 void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
2257 this->op(0x66,0x3a0f,0x15, src,dst);
2258 SkASSERT(dst.kind != Operand::LABEL);
2259 this->byte(imm);
2260 }
2261 void Assembler::vpextrb(Operand dst, Xmm src, int imm) {
2262 this->op(0x66,0x3a0f,0x14, src,dst);
2263 SkASSERT(dst.kind != Operand::LABEL);
2264 this->byte(imm);
2265 }
2266
2267 void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
2268 // Unlike most instructions, no aliasing is permitted here.
2269 SkASSERT(dst != ix);
2270 SkASSERT(dst != mask);
2271 SkASSERT(mask != ix);
2272
2273 int prefix = 0x66,
2274 map = 0x380f,
2275 opcode = 0x92;
2276 VEX v = vex(0, dst>>3, ix>>3, base>>3,
2277 map, mask, /*ymm?*/1, prefix);
2278 this->bytes(v.bytes, v.len);
2279 this->byte(opcode);
2280 this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/));
2281 this->byte(sib(scale, ix&7, base&7));
2282 }
2283
2284 // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
2285
2286 static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
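    // e.g. 5_mask == 0b11111; each field below is masked to its width before packing.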
2287
2288 void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
2289 this->word( (hi & 11_mask) << 21
2290 | (m & 5_mask) << 16
2291 | (lo & 6_mask) << 10
2292 | (n & 5_mask) << 5
2293 | (d & 5_mask) << 0);
2294 }
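    // e.g. add4s(v0,v1,v2) below packs 0b01001110101'00010'100001'00001'00000
    // == 0x4ea28420, the A64 encoding of `add v0.4s, v1.4s, v2.4s`.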
2295 void Assembler::op(uint32_t op22, V n, V d, int imm) {
2296 this->word( (op22 & 22_mask) << 10
                   | imm // size and location depend on the instruction
2298 | (n & 5_mask) << 5
2299 | (d & 5_mask) << 0);
2300 }
2301
2302 void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
2303 void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
2304 void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
2305 void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
2306 void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
2307 void Assembler::not16b(V d, V n) { this->op(0b0'1'1'01110'00'10000'00101'10, n, d); }
2308
2309 void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
2310 void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
2311 void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }
2312
2313 void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
2314 void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }
2315
2316 void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
2317 void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }
2318
2319 void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
2320 void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
2321 void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
2322 void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
2323 void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
2324 void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }
2325 void Assembler::fneg4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n, d); }
2326
2327 void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
2328 void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
2329 void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }
2330
2331 void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
2332 void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }
2333
2334 void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
2335
2336 void Assembler::sli4s(V d, V n, int imm5) {
2337 this->op(0b0'1'1'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16);
2338 }
2339 void Assembler::shl4s(V d, V n, int imm5) {
2340 this->op(0b0'1'0'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16);
2341 }
2342 void Assembler::sshr4s(V d, V n, int imm5) {
2343 this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
2344 }
2345 void Assembler::ushr4s(V d, V n, int imm5) {
2346 this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
2347 }
2348 void Assembler::ushr8h(V d, V n, int imm4) {
2349 this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16);
2350 }
2351
2352 void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
2353 void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
2354 void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
2355
2356 void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
2357 void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }
2358
2359 void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
2360 void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }
2361
2362 void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
2363
2364 void Assembler::brk(int imm16) {
2365 this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5);
2366 }
2367
2368 void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); }
2369
2370 void Assembler::add(X d, X n, int imm12) {
2371 this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2372 }
2373 void Assembler::sub(X d, X n, int imm12) {
2374 this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2375 }
2376 void Assembler::subs(X d, X n, int imm12) {
2377 this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2378 }
2379
2380 void Assembler::b(Condition cond, Label* l) {
2381 const int imm19 = this->disp19(l);
2382 this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5);
2383 }
2384 void Assembler::cbz(X t, Label* l) {
2385 const int imm19 = this->disp19(l);
2386 this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
2387 }
2388 void Assembler::cbnz(X t, Label* l) {
2389 const int imm19 = this->disp19(l);
2390 this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
2391 }
2392
2393 void Assembler::ldrq(V dst, X src, int imm12) {
2394 this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10);
2395 }
2396 void Assembler::ldrs(V dst, X src, int imm12) {
2397 this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2398 }
2399 void Assembler::ldrb(V dst, X src, int imm12) {
2400 this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2401 }
2402
2403 void Assembler::strq(V src, X dst, int imm12) {
2404 this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10);
2405 }
2406 void Assembler::strs(V src, X dst, int imm12) {
2407 this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2408 }
2409 void Assembler::strb(V src, X dst, int imm12) {
2410 this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2411 }
2412
2413 void Assembler::fmovs(X dst, V src) {
2414 this->op(0b0'0'0'11110'00'1'00'110'000000, src, dst);
2415 }
2416
2417 void Assembler::ldrq(V dst, Label* l) {
2418 const int imm19 = this->disp19(l);
2419 this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5);
2420 }
2421
2422 void Assembler::label(Label* l) {
2423 if (fCode) {
2424 // The instructions all currently point to l->offset.
2425 // We'll want to add a delta to point them to here.
2426 int here = (int)this->size();
2427 int delta = here - l->offset;
2428 l->offset = here;
2429
2430 if (l->kind == Label::ARMDisp19) {
2431 for (int ref : l->references) {
2432 // ref points to a 32-bit instruction with 19-bit displacement in instructions.
2433 uint32_t inst;
2434 memcpy(&inst, fCode + ref, 4);
2435
2436 // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
2437 int disp = (int)(inst << 8) >> 13;
2438
2439 disp += delta/4; // delta is in bytes, we want instructions.
2440
2441 // Put it all back together, preserving the high 8 bits and low 5.
2442 inst = ((disp << 5) & (19_mask << 5))
2443 | ((inst ) & ~(19_mask << 5));
2444
2445 memcpy(fCode + ref, &inst, 4);
2446 }
2447 }
2448
2449 if (l->kind == Label::X86Disp32) {
2450 for (int ref : l->references) {
2451 // ref points to a 32-bit displacement in bytes.
2452 int disp;
2453 memcpy(&disp, fCode + ref, 4);
2454
2455 disp += delta;
2456
2457 memcpy(fCode + ref, &disp, 4);
2458 }
2459 }
2460 }
2461 }
2462
2463 void Program::eval(int n, void* args[]) const {
2464 #define SKVM_JIT_STATS 0
2465 #if SKVM_JIT_STATS
2466 static std::atomic<int64_t> calls{0}, jits{0},
2467 pixels{0}, fast{0};
2468 pixels += n;
2469 if (0 == calls++) {
2470 atexit([]{
2471 int64_t num = jits .load(),
2472 den = calls.load();
2473 SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den);
2474 num = fast .load();
2475 den = pixels.load();
2476 SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den);
2477 });
2478 }
2479 #endif
2480
2481 #if !defined(SKVM_JIT_BUT_IGNORE_IT)
2482 const void* jit_entry = fImpl->jit_entry.load();
2483 // jit_entry may be null either simply because we can't JIT, or when using LLVM
2484 // if the work represented by fImpl->llvm_compiling hasn't finished yet.
2485 //
2486 // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it
2487 // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off,
2488 // due to timing or program caching.
2489 if (jit_entry != nullptr && gSkVMAllowJIT) {
2490 #if SKVM_JIT_STATS
2491 jits++;
2492 fast += n;
2493 #endif
2494 void** a = args;
2495 switch (fImpl->strides.size()) {
2496 case 0: return ((void(*)(int ))jit_entry)(n );
2497 case 1: return ((void(*)(int,void* ))jit_entry)(n,a[0] );
2498 case 2: return ((void(*)(int,void*,void* ))jit_entry)(n,a[0],a[1] );
2499 case 3: return ((void(*)(int,void*,void*,void* ))jit_entry)(n,a[0],a[1],a[2]);
2500 case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry)
2501 (n,a[0],a[1],a[2],a[3]);
2502 case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry)
2503 (n,a[0],a[1],a[2],a[3],a[4]);
2504 case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry)
2505 (n,a[0],a[1],a[2],a[3],a[4],a[5]);
2506 default: SkASSERT(false); // TODO: >6 args?
2507 }
2508 }
2509 #endif
2510
2511 // So we'll sometimes use the interpreter here even if later calls will use the JIT.
2512 SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
2513 this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(),
2514 n, args);
2515 }
2516
2517#if defined(SKVM_LLVM)
2518 void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions,
2519 const char* debug_name) {
2520 auto ctx = std::make_unique<llvm::LLVMContext>();
2521
2522 auto mod = std::make_unique<llvm::Module>("", *ctx);
2523 // All the scary bare pointers from here on are owned by ctx or mod, I think.
2524
2525 // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines.
2526 const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4;
2527
2528 llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(),
2529 *i32 = llvm::Type::getInt32Ty(*ctx);
2530
2531 std::vector<llvm::Type*> arg_types = { i32 };
2532 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2533 arg_types.push_back(ptr);
2534 }
2535
2536 llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx),
2537 arg_types, /*vararg?=*/false);
2538 llvm::Function* fn
2539 = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod);
2540 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2541 fn->addParamAttr(i+1, llvm::Attribute::NoAlias);
2542 }
2543
2544 llvm::BasicBlock *enter = llvm::BasicBlock::Create(*ctx, "enter" , fn),
2545 *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn),
2546 *testK = llvm::BasicBlock::Create(*ctx, "testK" , fn),
2547 *loopK = llvm::BasicBlock::Create(*ctx, "loopK" , fn),
2548 *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn),
2549 *test1 = llvm::BasicBlock::Create(*ctx, "test1" , fn),
2550 *loop1 = llvm::BasicBlock::Create(*ctx, "loop1" , fn),
2551 *leave = llvm::BasicBlock::Create(*ctx, "leave" , fn);
2552
2553 using IRBuilder = llvm::IRBuilder<>;
2554
2555 llvm::PHINode* n;
2556 std::vector<llvm::PHINode*> args;
2557 std::vector<llvm::Value*> vals(instructions.size());
2558
2559 auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
2560 auto [op, x,y,z, immy,immz, death,can_hoist] = instructions[i];
2561
2562 llvm::Type *i1 = llvm::Type::getInt1Ty (*ctx),
2563 *i8 = llvm::Type::getInt8Ty (*ctx),
2564 *i16 = llvm::Type::getInt16Ty(*ctx),
2565 *f32 = llvm::Type::getFloatTy(*ctx),
2566 *I1 = scalar ? i1 : llvm::VectorType::get(i1 , K ),
2567 *I8 = scalar ? i8 : llvm::VectorType::get(i8 , K ),
2568 *I16 = scalar ? i16 : llvm::VectorType::get(i16, K ),
2569 *I32 = scalar ? i32 : llvm::VectorType::get(i32, K ),
2570 *F32 = scalar ? f32 : llvm::VectorType::get(f32, K );
2571
2572 auto I = [&](llvm::Value* v) { return b->CreateBitCast(v, I32 ); };
2573 auto F = [&](llvm::Value* v) { return b->CreateBitCast(v, F32 ); };
2574
2575 auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); };
2576
2577 switch (llvm::Type* t = nullptr; op) {
2578 default:
2579 SkDebugf("can't llvm %s (%d)\n", name(op), op);
2580 return false;
2581
2582 case Op::assert_true: /*TODO*/ break;
2583
2584 case Op::index:
2585 if (I32->isVectorTy()) {
2586 std::vector<llvm::Constant*> iota(K);
2587 for (int j = 0; j < K; j++) {
2588 iota[j] = b->getInt32(j);
2589 }
2590 vals[i] = b->CreateSub(b->CreateVectorSplat(K, n),
2591 llvm::ConstantVector::get(iota));
2592 } else {
2593 vals[i] = n;
2594 } break;
2595
2596 case Op::load8: t = I8 ; goto load;
2597 case Op::load16: t = I16; goto load;
2598 case Op::load32: t = I32; goto load;
2599 load: {
2600 llvm::Value* ptr = b->CreateBitCast(args[immy], t->getPointerTo());
2601 vals[i] = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), I32);
2602 } break;
2603
2604
2605 case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immy); break;
2606
2607 case Op::uniform8: t = i8 ; goto uniform;
2608 case Op::uniform16: t = i16; goto uniform;
2609 case Op::uniform32: t = i32; goto uniform;
2610 uniform: {
2611 llvm::Value* ptr = b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2612 args[immy],
2613 immz),
2614 t->getPointerTo());
2615 llvm::Value* val = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), i32);
2616 vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val)
2617 : val;
2618 } break;
2619
2620 case Op::gather8: t = i8 ; goto gather;
2621 case Op::gather16: t = i16; goto gather;
2622 case Op::gather32: t = i32; goto gather;
2623 gather: {
2624 // Our gather base pointer is immz bytes off of uniform immy.
2625 llvm::Value* base =
2626 b->CreateLoad(b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2627 args[immy],
2628 immz),
2629 t->getPointerTo()->getPointerTo()));
2630
2631 llvm::Value* ptr = b->CreateInBoundsGEP(nullptr, base, vals[x]);
2632 llvm::Value* gathered;
2633 if (ptr->getType()->isVectorTy()) {
2634 gathered = b->CreateMaskedGather(ptr, 1);
2635 } else {
2636 gathered = b->CreateAlignedLoad(ptr, 1);
2637 }
2638 vals[i] = b->CreateZExt(gathered, I32);
2639 } break;
2640
2641 case Op::store8: t = I8 ; goto store;
2642 case Op::store16: t = I16; goto store;
2643 case Op::store32: t = I32; goto store;
2644 store: {
2645 llvm::Value* val = b->CreateTrunc(vals[x], t);
2646 llvm::Value* ptr = b->CreateBitCast(args[immy],
2647 val->getType()->getPointerTo());
2648 vals[i] = b->CreateAlignedStore(val, ptr, 1);
2649 } break;
2650
2651 case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break;
2652 case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break;
2653 case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break;
2654 case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;
2655
2656 case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break;
2657
2658 case Op::select:
2659 vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
2660 break;
2661
2662 case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break;
2663 case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break;
2664 case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break;
2665
2666 case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immy); break;
2667 case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immy); break;
2668 case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immy); break;
2669
2670 case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break;
2671 case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break;
2672
2673 case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break;
2674 case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break;
2675 case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break;
2676 case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break;
2677
2678 case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break;
2679 case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break;
2680 case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break;
2681 case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break;
2682
2683 case Op::fma_f32:
2684 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2685 {F(vals[x]), F(vals[y]), F(vals[z])}));
2686 break;
2687
2688 case Op::fms_f32:
2689 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2690 {F(vals[x]), F(vals[y]),
2691 b->CreateFNeg(F(vals[z]))}));
2692 break;
2693
2694 case Op::fnma_f32:
2695 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2696 {b->CreateFNeg(F(vals[x])), F(vals[y]),
2697 F(vals[z])}));
2698 break;
2699
2700 case Op::ceil:
2701 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x])));
2702 break;
2703 case Op::floor:
2704 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x])));
2705 break;
2706
2707 case Op::max_f32:
2708 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])),
2709 F(vals[y]), F(vals[x])));
2710 break;
2711 case Op::min_f32:
2712 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])),
2713 F(vals[y]), F(vals[x])));
2714 break;
2715
2716 case Op::sqrt_f32:
2717 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x])));
2718 break;
2719
2720 case Op::to_f32: vals[i] = I(b->CreateSIToFP( vals[x] , F32)); break;
2721 case Op::trunc : vals[i] = b->CreateFPToSI(F(vals[x]), I32) ; break;
2722 case Op::round : {
2723 // Basic impl when we can't use cvtps2dq and co.
2724 auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x]));
2725 vals[i] = b->CreateFPToSI(round, I32);
2726
2727 #if 1 && defined(SK_CPU_X86)
2728 // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling.
2729 if (scalar) {
2730 // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3. ¯\_(ツ)_/¯
2731 llvm::Value* v = llvm::UndefValue::get(llvm::VectorType::get(f32, 4));
2732 v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0);
2733 vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v});
2734 } else {
2735 SkASSERT(K == 4 || K == 8);
2736 auto intr = K == 4 ? llvm::Intrinsic::x86_sse2_cvtps2dq :
2737 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256;
2738 vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])});
2739 }
2740 #endif
2741 } break;
2742
2743 }
2744 return true;
2745 };
2746
2747 {
2748 IRBuilder b(enter);
2749 b.CreateBr(hoistK);
2750 }
2751
2752 // hoistK: emit each hoistable vector instruction; goto testK;
2753 // LLVM can do this sort of thing itself, but we've got the information cheap,
2754 // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe.
2755 {
2756 IRBuilder b(hoistK);
2757
2758 // Hoisted instructions will need args (think, uniforms), so set that up now.
2759 // These phi nodes are degenerate... they'll always be the passed-in args from enter.
2760 // Later on when we start looping the phi nodes will start looking useful.
2761 llvm::Argument* arg = fn->arg_begin();
2762 (void)arg++; // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction.
2763 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2764 args.push_back(b.CreatePHI(arg->getType(), 1));
2765 args.back()->addIncoming(arg++, enter);
2766 }
2767
2768 for (size_t i = 0; i < instructions.size(); i++) {
2769 if (instructions[i].can_hoist && !emit(i, false, &b)) {
2770 return;
2771 }
2772 }
2773
2774 b.CreateBr(testK);
2775 }
2776
2777 // testK: if (N >= K) goto loopK; else goto hoist1;
2778 {
2779 IRBuilder b(testK);
2780
2781 // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
2782 // These also start as the initial function arguments; hoistK can't have changed them.
2783 llvm::Argument* arg = fn->arg_begin();
2784
2785 n = b.CreatePHI(arg->getType(), 2);
2786 n->addIncoming(arg++, hoistK);
2787
2788 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2789 args[i] = b.CreatePHI(arg->getType(), 2);
2790 args[i]->addIncoming(arg++, hoistK);
2791 }
2792
2793 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
2794 }
2795
2796 // loopK: ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
2797 {
2798 IRBuilder b(loopK);
2799 for (size_t i = 0; i < instructions.size(); i++) {
2800 if (!instructions[i].can_hoist && !emit(i, false, &b)) {
2801 return;
2802 }
2803 }
2804
2805 // n -= K
2806 llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
2807 n->addIncoming(n_next, loopK);
2808
            // Each arg ptr += K*stride
2810 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2811 llvm::Value* arg_next
2812 = b.CreateConstInBoundsGEP1_32(nullptr, args[i], K*fImpl->strides[i]);
2813 args[i]->addIncoming(arg_next, loopK);
2814 }
2815 b.CreateBr(testK);
2816 }
2817
2818 // hoist1: emit each hoistable scalar instruction; goto test1;
2819 {
2820 IRBuilder b(hoist1);
2821 for (size_t i = 0; i < instructions.size(); i++) {
2822 if (instructions[i].can_hoist && !emit(i, true, &b)) {
2823 return;
2824 }
2825 }
2826 b.CreateBr(test1);
2827 }
2828
2829 // test1: if (N >= 1) goto loop1; else goto leave;
2830 {
2831 IRBuilder b(test1);
2832
2833 // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
2834 llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
2835 n_new->addIncoming(n, hoist1);
2836 n = n_new;
2837
2838 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2839 llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
2840 arg_new->addIncoming(args[i], hoist1);
2841 args[i] = arg_new;
2842 }
2843
2844 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave);
2845 }
2846
2847 // loop1: ... insts on scalars; N -= 1, args += stride; goto test1;
2848 {
2849 IRBuilder b(loop1);
2850 for (size_t i = 0; i < instructions.size(); i++) {
2851 if (!instructions[i].can_hoist && !emit(i, true, &b)) {
2852 return;
2853 }
2854 }
2855
2856 // n -= 1
2857 llvm::Value* n_next = b.CreateSub(n, b.getInt32(1));
2858 n->addIncoming(n_next, loop1);
2859
            // Each arg ptr += stride
2861 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2862 llvm::Value* arg_next
2863 = b.CreateConstInBoundsGEP1_32(nullptr, args[i], fImpl->strides[i]);
2864 args[i]->addIncoming(arg_next, loop1);
2865 }
2866 b.CreateBr(test1);
2867 }
2868
2869 // leave: ret
2870 {
2871 IRBuilder b(leave);
2872 b.CreateRetVoid();
2873 }
2874
2875 SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));
2876
2877 if (true) {
2878 SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
2879 std::error_code err;
2880 llvm::raw_fd_ostream os(path.c_str(), err);
2881 if (err) {
2882 return;
2883 }
2884 llvm::WriteBitcodeToFile(*mod, os);
2885 }
2886
2887 static SkOnce once;
2888 once([]{
2889 SkAssertResult(false == llvm::InitializeNativeTarget());
2890 SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
2891 });
2892
2893 if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
2894 .setEngineKind(llvm::EngineKind::JIT)
2895 .setMCPU(llvm::sys::getHostCPUName())
2896 .create()) {
2897 fImpl->llvm_ctx = std::move(ctx);
2898 fImpl->llvm_ee.reset(ee);
2899
2900 // We have to be careful here about what we close over and how, in case fImpl moves.
2901 // fImpl itself may change, but its pointee fields won't, so close over them by value.
2902 // Also, debug_name will almost certainly leave scope, so copy it.
2903 fImpl->llvm_compiling = std::async(std::launch::async, [dst = &fImpl->jit_entry,
2904 ee = fImpl->llvm_ee.get(),
2905 name = std::string(debug_name)]{
2906 // std::atomic<void*>* dst;
2907 // llvm::ExecutionEngine* ee;
2908 // std::string name;
2909 dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
2910 });
2911 }
2912 }
2913#endif
2914
2915 void Program::waitForLLVM() const {
2916 #if defined(SKVM_LLVM)
2917 if (fImpl->llvm_compiling.valid()) {
2918 fImpl->llvm_compiling.wait();
2919 }
2920 #endif
2921 }
2922
2923 bool Program::hasJIT() const {
2924 // Program::hasJIT() is really just a debugging / test aid,
2925 // so we don't mind adding a sync point here to wait for compilation.
2926 this->waitForLLVM();
2927
2928 return fImpl->jit_entry.load() != nullptr;
2929 }
2930
2931 void Program::dropJIT() {
2932 #if defined(SKVM_LLVM)
2933 this->waitForLLVM();
2934 fImpl->llvm_ee .reset(nullptr);
2935 fImpl->llvm_ctx.reset(nullptr);
2936 #elif defined(SKVM_JIT)
2937 if (fImpl->dylib) {
2938 close_dylib(fImpl->dylib);
2939 } else if (auto jit_entry = fImpl->jit_entry.load()) {
2940 unmap_jit_buffer(jit_entry, fImpl->jit_size);
2941 }
2942 #else
2943 SkASSERT(!this->hasJIT());
2944 #endif
2945
2946 fImpl->jit_entry.store(nullptr);
2947 fImpl->jit_size = 0;
2948 fImpl->dylib = nullptr;
2949 }
2950
2951 Program::Program() : fImpl(std::make_unique<Impl>()) {}
2952
2953 Program::~Program() {
2954 // Moved-from Programs may have fImpl == nullptr.
2955 if (fImpl) {
2956 this->dropJIT();
2957 }
2958 }
2959
2960 Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}
2961
2962 Program& Program::operator=(Program&& other) {
2963 fImpl = std::move(other.fImpl);
2964 return *this;
2965 }
2966
2967 Program::Program(const std::vector<OptimizedInstruction>& instructions,
2968 const std::vector<int>& strides,
2969 const char* debug_name) : Program() {
2970 fImpl->strides = strides;
2971 if (gSkVMAllowJIT) {
2972 #if 1 && defined(SKVM_LLVM)
2973 this->setupLLVM(instructions, debug_name);
2974 #elif 1 && defined(SKVM_JIT)
2975 this->setupJIT(instructions, debug_name);
2976 #endif
2977 }
2978
2979 // Might as well do this after setupLLVM() to get a little more time to compile.
2980 this->setupInterpreter(instructions);
2981 }
2982
2983 std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
2984 int Program::nargs() const { return (int)fImpl->strides.size(); }
2985 int Program::nregs() const { return fImpl->regs; }
2986 int Program::loop () const { return fImpl->loop; }
2987 bool Program::empty() const { return fImpl->instructions.empty(); }
2988
2989 // Translate OptimizedInstructions to InterpreterInstructions.
2990 void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
2991 // Register each instruction is assigned to.
2992 std::vector<Reg> reg(instructions.size());
2993
2994 // This next bit is a bit more complicated than strictly necessary;
2995 // we could just assign every instruction to its own register.
2996 //
2997 // But recycling registers is fairly cheap, and good practice for the
2998 // JITs where minimizing register pressure really is important.
2999 //
3000 // Since we have effectively infinite registers, we hoist any value we can.
3001 // (The JIT may choose a more complex policy to reduce register pressure.)
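        //
        // For example, in "v2 = mul(v1, v0)" from the Usage comment earlier, v1 dies
        // at v2, so v2 can immediately reuse v1's register.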
3002
3003 fImpl->regs = 0;
3004 std::vector<Reg> avail;
3005
3006 // Assign this value to a register, recycling them where we can.
3007 auto assign_register = [&](Val id) {
3008 const OptimizedInstruction& inst = instructions[id];
3009
            // If this is a real input and its lifetime ends at this instruction,
            // we can recycle the register it's occupying.
3012 auto maybe_recycle_register = [&](Val input) {
3013 if (input != NA && instructions[input].death == id) {
3014 avail.push_back(reg[input]);
3015 }
3016 };
3017
3018 // Take care to not recycle the same register twice.
3019 if (true ) { maybe_recycle_register(inst.x); }
3020 if (inst.y != inst.x ) { maybe_recycle_register(inst.y); }
3021 if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); }
3022
3023 // Instructions that die at themselves (stores) don't need a register.
3024 if (inst.death != id) {
3025 // Allocate a register if we have to, preferring to reuse anything available.
3026 if (avail.empty()) {
3027 reg[id] = fImpl->regs++;
3028 } else {
3029 reg[id] = avail.back();
3030 avail.pop_back();
3031 }
3032 }
3033 };
3034
3035 // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
3036 for (Val id = 0; id < (Val)instructions.size(); id++) {
3037 if ( instructions[id].can_hoist) { assign_register(id); }
3038 }
3039 for (Val id = 0; id < (Val)instructions.size(); id++) {
3040 if (!instructions[id].can_hoist) { assign_register(id); }
3041 }
3042
        // Translate OptimizedInstructions to InterpreterInstructions by mapping values to
3044 // registers. This will be two passes, first hoisted instructions, then inside the loop.
3045
3046 // The loop begins at the fImpl->loop'th Instruction.
3047 fImpl->loop = 0;
3048 fImpl->instructions.reserve(instructions.size());
3049
3050 // Add a dummy mapping for the N/A sentinel Val to any arbitrary register
3051 // so lookups don't have to know which arguments are used by which Ops.
3052 auto lookup_register = [&](Val id) {
3053 return id == NA ? (Reg)0
3054 : reg[id];
3055 };
3056
3057 auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
3058 InterpreterInstruction pinst{
3059 inst.op,
3060 lookup_register(id),
3061 lookup_register(inst.x),
3062 {lookup_register(inst.y)},
3063 {lookup_register(inst.z)},
3064 };
3065 if (inst.y == NA) { pinst.immy = inst.immy; }
3066 if (inst.z == NA) { pinst.immz = inst.immz; }
3067 fImpl->instructions.push_back(pinst);
3068 };

    for (Val id = 0; id < (Val)instructions.size(); id++) {
        const OptimizedInstruction& inst = instructions[id];
        if (inst.can_hoist) {
            push_instruction(id, inst);
            fImpl->loop++;
        }
    }
    for (Val id = 0; id < (Val)instructions.size(); id++) {
        const OptimizedInstruction& inst = instructions[id];
        if (!inst.can_hoist) {
            push_instruction(id, inst);
        }
    }
}

#if defined(SKVM_JIT)

    bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
                      int* stack_hint,
                      uint32_t* registers_used,
                      Assembler* a) const {
        using A = Assembler;

        SkTHashMap<int, A::Label> constants;    // Constants (mostly splats) share the same pool.
        A::Label                  iota;         // Varies per lane, for Op::index.
        A::Label                  load64_index; // Used to load low or high half of 64-bit lanes.

        // The `regs` array tracks everything we know about each register's state:
        //   - NA:  empty
        //   - RES: reserved by ABI
        //   - TMP: holding a temporary
        //   - id:  holding Val id
        constexpr Val RES = NA-1,
                      TMP = RES-1;

        // Map val -> stack slot.
        std::vector<int> stack_slot(instructions.size(), NA);
        int next_stack_slot = 0;

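        // setupJIT() calls jit() twice.  The first call runs with no stack hint
        // and sizes the frame as if every value could spill; the second reuses
        // the first call's high-water mark instead (see restore_incoming_regs()).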
        const int nstack_slots = *stack_hint >= 0 ? *stack_hint
                                                  : stack_slot.size();

    #if defined(__x86_64__) || defined(_M_X64)
        if (!SkCpu::Supports(SkCpu::HSW)) {
            return false;
        }
        const int K = 8;
        using Reg = A::Ymm;
        #if defined(_M_X64)  // Important to check this first; clang-cl defines both.
            const A::GP64 N = A::rcx,
                        GP0 = A::rax,
                        GP1 = A::r11,
                      arg[] = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi };

            // xmm6-15 are callee-saved.
            std::array<Val,16> regs = {
                 NA, NA, NA, NA,  NA, NA,RES,RES,
                RES,RES,RES,RES, RES,RES,RES,RES,
            };
            const uint32_t incoming_registers_used = *registers_used;

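            // Incoming Win64 stack, for reference: [rsp+0] is our return address,
            // [rsp+8..39] the caller-provided 32-byte shadow space, then any stack
            // args.  With N in rcx and the first three pointers in rdx/r8/r9, the
            // 4th, 5th, and 6th pointers sit at rsp+40, +48, and +56.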
            auto enter = [&]{
                // rcx,rdx,r8,r9 are all already holding their correct values.
                // Load caller-saved r10 from rsp+40 if there's a fourth arg.
                if (fImpl->strides.size() >= 4) {
                    a->mov(A::r10, A::Mem{A::rsp, 40});
                }
                // Load callee-saved rdi from rsp+48 if there's a fifth arg,
                // first saving it to ABI reserved shadow area rsp+8.
                if (fImpl->strides.size() >= 5) {
                    a->mov(A::Mem{A::rsp, 8}, A::rdi);
                    a->mov(A::rdi, A::Mem{A::rsp, 48});
                }
                // Load callee-saved rsi from rsp+56 if there's a sixth arg,
                // first saving it to ABI reserved shadow area rsp+16.
                if (fImpl->strides.size() >= 6) {
                    a->mov(A::Mem{A::rsp, 16}, A::rsi);
                    a->mov(A::rsi, A::Mem{A::rsp, 56});
                }

                // Allocate stack for our values and callee-saved xmm6-15.
                int stack_needed = nstack_slots*K*4;
                for (int r = 6; r < 16; r++) {
                    if (incoming_registers_used & (1<<r)) {
                        stack_needed += 16;
                    }
                }
                if (stack_needed) { a->sub(A::rsp, stack_needed); }

                int next_saved_xmm = nstack_slots*K*4;
                for (int r = 6; r < 16; r++) {
                    if (incoming_registers_used & (1<<r)) {
                        a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r);
                        next_saved_xmm += 16;
                        regs[r] = NA;
                    }
                }
            };
            auto exit  = [&]{
                // The second pass of jit() shouldn't use any register it didn't use in the first pass.
                SkASSERT((*registers_used & incoming_registers_used) == *registers_used);

                // Restore callee-saved xmm6-15 and the stack pointer.
                int stack_used = nstack_slots*K*4;
                for (int r = 6; r < 16; r++) {
                    if (incoming_registers_used & (1<<r)) {
                        a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used});
                        stack_used += 16;
                    }
                }
                if (stack_used) { a->add(A::rsp, stack_used); }

                // Restore callee-saved rdi/rsi if we used them.
                if (fImpl->strides.size() >= 5) {
                    a->mov(A::rdi, A::Mem{A::rsp, 8});
                }
                if (fImpl->strides.size() >= 6) {
                    a->mov(A::rsi, A::Mem{A::rsp, 16});
                }

                a->vzeroupper();
                a->ret();
            };
        #elif defined(__x86_64__)
            const A::GP64 N = A::rdi,
                        GP0 = A::rax,
                        GP1 = A::r11,
                      arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 };

            // All 16 ymm registers are available to use.
            std::array<Val,16> regs = {
                NA,NA,NA,NA, NA,NA,NA,NA,
                NA,NA,NA,NA, NA,NA,NA,NA,
            };

            auto enter = [&]{
                // Load caller-saved r10 from rsp+8 if there's a sixth arg.
                if (fImpl->strides.size() >= 6) {
                    a->mov(A::r10, A::Mem{A::rsp, 8});
                }
                if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }
            };
            auto exit  = [&]{
                if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
                a->vzeroupper();
                a->ret();
            };
        #endif

        auto load_from_memory = [&](Reg r, Val v) {
            if (instructions[v].op == Op::splat) {
                if (instructions[v].immy == 0) {
                    a->vpxor(r,r,r);
                } else {
                    a->vmovups(r, constants.find(instructions[v].immy));
                }
            } else {
                SkASSERT(stack_slot[v] != NA);
                a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4});
            }
        };
        auto store_to_stack = [&](Reg r, Val v) {
            SkASSERT(next_stack_slot < nstack_slots);
            stack_slot[v] = next_stack_slot++;
            a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r);
        };
    #elif defined(__aarch64__)
        const int K = 4;
        using Reg = A::V;
        const A::X N = A::x0,
                 GP0 = A::x8,
               arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };

        // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
        std::array<Val,32> regs = {
             NA, NA, NA, NA,  NA, NA, NA, NA,
            RES,RES,RES,RES, RES,RES,RES,RES,
             NA, NA, NA, NA,  NA, NA, NA, NA,
             NA, NA, NA, NA,  NA, NA, NA, NA,
        };

        auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
        auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
                          a->ret(A::x30); };

        auto load_from_memory = [&](Reg r, Val v) {
            if (instructions[v].op == Op::splat) {
                if (instructions[v].immy == 0) {
                    a->eor16b(r,r,r);
                } else {
                    a->ldrq(r, constants.find(instructions[v].immy));
                }
            } else {
                SkASSERT(stack_slot[v] != NA);
                a->ldrq(r, A::sp, stack_slot[v]);
            }
        };
        auto store_to_stack = [&](Reg r, Val v) {
            SkASSERT(next_stack_slot < nstack_slots);
            stack_slot[v] = next_stack_slot++;
            a->strq(r, A::sp, stack_slot[v]);
        };
    #endif

        *registers_used = 0;  // We'll update this as we go.

        if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) {
            return false;
        }

        auto emit = [&](Val id, bool scalar) {
            const OptimizedInstruction& inst = instructions[id];
            const Op op = inst.op;
            const Val x = inst.x,
                      y = inst.y,
                      z = inst.z;
            const int immy = inst.immy,
                      immz = inst.immz;

            // alloc_tmp() returns a temporary register, freed manually with free_tmp().
            auto alloc_tmp = [&]() -> Reg {
                // Find an available register, or spill an occupied one if nothing's available.
                auto avail = std::find_if(regs.begin(), regs.end(), [](Val v) { return v == NA; });
                if (avail == regs.end()) {
                    auto score_spills = [&](Val v) -> int {
                        // We cannot spill REServed registers,
                        // nor any registers we need for this instruction.
                        if (v == RES ||
                            v == TMP || v == id || v == x || v == y || v == z) {
                            return 0x7fff'ffff;
                        }
                        // At this point spilling is arbitrary, so we're in the realm of heuristics.
                        // Here, spill the oldest value.  This is nice because,
                        //    A) it's very predictable, even in assembly, and
                        //    B) it's as cheap as you can get.
                        return v;
                    };
                    avail = std::min_element(regs.begin(), regs.end(), [&](Val a, Val b) {
                        return score_spills(a) < score_spills(b);
                    });
                }
                SkASSERT(avail != regs.end());

                Reg r = (Reg)std::distance(regs.begin(), avail);
                Val& v = regs[r];
                *registers_used |= (1<<r);

                SkASSERT(v == NA || v >= 0);
                if (v >= 0) {
                    if (stack_slot[v] == NA && instructions[v].op != Op::splat) {
                        store_to_stack(r, v);
                    }
                    v = NA;
                }
                SkASSERT(v == NA);

                v = TMP;
                return r;
            };

            #if defined(__x86_64__) || defined(_M_X64)  // Nothing special... just unused on ARM.
                auto free_tmp = [&](Reg r) {
                    SkASSERT(regs[r] == TMP);
                    regs[r] = NA;
                };
            #endif

            // Which register holds dst,x,y,z for this instruction?  NA if none does yet.
            int rd = NA,
                rx = NA,
                ry = NA,
                rz = NA;

            auto update_regs = [&](Reg r, Val v) {
                if (v == id) { rd = r; }
                if (v ==  x) { rx = r; }
                if (v ==  y) { ry = r; }
                if (v ==  z) { rz = r; }
                return r;
            };

            auto find_existing_reg = [&](Val v) -> int {
                // Quick-check our working registers.
                if (v == id && rd != NA) { return rd; }
                if (v ==  x && rx != NA) { return rx; }
                if (v ==  y && ry != NA) { return ry; }
                if (v ==  z && rz != NA) { return rz; }

                // Search inter-instruction register map.
                for (auto [r,val] : SkMakeEnumerate(regs)) {
                    if (val == v) {
                        return update_regs((Reg)r, v);
                    }
                }
                return NA;
            };

            // Return a register for Val, holding that value if it already exists.
            // During this instruction all calls to r(v) will return the same register.
            auto r = [&](Val v) -> Reg {
                SkASSERT(v >= 0);

                if (int found = find_existing_reg(v); found != NA) {
                    return (Reg)found;
                }

                Reg r = alloc_tmp();
                SkASSERT(regs[r] == TMP);

                SkASSERT(v <= id);
                if (v < id) {
                    // If v < id, we're loading one of this instruction's inputs.
                    // If v == id we're just allocating its destination register.
                    load_from_memory(r, v);
                }
                regs[r] = v;
                return update_regs(r, v);
            };

            auto dies_here = [&](Val v) -> bool {
                SkASSERT(v >= 0);
                return instructions[v].death == id;
            };

            // Alias dst() to r(v) if dies_here(v).
            auto try_alias = [&](Val v) -> bool {
                SkASSERT(v == x || v == y || v == z);
                if (dies_here(v)) {
                    rd = r(v);      // Vals v and id share a register for this instruction.
                    regs[rd] = id;  // Next instruction, Val id will be in the register, not Val v.
                    return true;
                }
                return false;
            };

            // Generally r(id),
            // but with a hint, try to alias dst() to r(v) if dies_here(v).
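            // E.g. for v2 = add_f32 v1, v0 where v1 dies at v2, dst(x) hands back
            // v1's register and marks it as now holding v2, letting the add
            // overwrite its dying input in place.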
            auto dst = [&](Val hint = NA) -> Reg {
                if (hint != NA) {
                    (void)try_alias(hint);
                }
                return r(id);
            };

            #if defined(__x86_64__) || defined(_M_X64)
                // On x86 we can work with many values directly from the stack or program constant pool.
                auto any = [&](Val v) -> A::Operand {
                    SkASSERT(v >= 0);
                    SkASSERT(v < id);

                    if (int found = find_existing_reg(v); found != NA) {
                        return (Reg)found;
                    }
                    if (instructions[v].op == Op::splat) {
                        return constants.find(instructions[v].immy);
                    }
                    return A::Mem{A::rsp, stack_slot[v]*K*4};
                };

                // This is never really worth asking except when any() might be used;
                // if we need this value on ARM, we might as well just call r(v) to get it into a register.
                auto in_reg = [&](Val v) -> bool {
                    return find_existing_reg(v) != NA;
                };
            #endif

            switch (op) {
                case Op::splat:
                    // Make sure splat constants can be found by load_from_memory() or any().
                    (void)constants[immy];
                    break;

            #if defined(__x86_64__) || defined(_M_X64)
                case Op::assert_true: {
                    a->vptest (r(x), &constants[0xffffffff]);
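                    // vptest sets CF when (constant & ~x) == 0, i.e. when every bit
                    // of every lane of x is 1, so jc skips the int3 trap exactly
                    // when all lanes are true.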
                    A::Label all_true;
                    a->jc(&all_true);
                    a->int3();
                    a->label(&all_true);
                } break;

                case Op::store8:
                    if (scalar) {
                        a->vpextrb(A::Mem{arg[immy]}, (A::Xmm)r(x), 0);
                    } else {
                        a->vpackusdw(dst(x), r(x), r(x));
                        a->vpermq   (dst(), dst(), 0xd8);
                        a->vpackuswb(dst(), dst(), dst());
                        a->vmovq    (A::Mem{arg[immy]}, (A::Xmm)dst());
                    } break;

                case Op::store16:
                    if (scalar) {
                        a->vpextrw(A::Mem{arg[immy]}, (A::Xmm)r(x), 0);
                    } else {
                        a->vpackusdw(dst(x), r(x), r(x));
                        a->vpermq   (dst(), dst(), 0xd8);
                        a->vmovups  (A::Mem{arg[immy]}, (A::Xmm)dst());
                    } break;

                case Op::store32: if (scalar) { a->vmovd  (A::Mem{arg[immy]}, (A::Xmm)r(x)); }
                                  else        { a->vmovups(A::Mem{arg[immy]},         r(x)); }
                                  break;

                case Op::store64: if (scalar) {
                                      a->vmovd(A::Mem{arg[immz],0}, (A::Xmm)r(x));
                                      a->vmovd(A::Mem{arg[immz],4}, (A::Xmm)r(y));
                                  } else {
                                      // r(x) = {a,b,c,d|e,f,g,h}
                                      // r(y) = {i,j,k,l|m,n,o,p}
                                      // We want to write a,i,b,j,c,k,d,l,e,m...
                                      A::Ymm L = alloc_tmp(),
                                             H = alloc_tmp();
                                      a->vpunpckldq(L, r(x), any(y));  // L = {a,i,b,j|e,m,f,n}
                                      a->vpunpckhdq(H, r(x), any(y));  // H = {c,k,d,l|g,o,h,p}
                                      a->vperm2f128(dst(), L,H, 0x20); //    = {a,i,b,j|c,k,d,l}
                                      a->vmovups(A::Mem{arg[immz], 0}, dst());
                                      a->vperm2f128(dst(), L,H, 0x31); //    = {e,m,f,n|g,o,h,p}
                                      a->vmovups(A::Mem{arg[immz],32}, dst());
                                      free_tmp(L);
                                      free_tmp(H);
                                  } break;

                case Op::store128: {
                    // TODO: 8 64-bit stores instead of 16 32-bit stores?
                    int ptr  = immz>>1,
                        lane = immz&1;
                    a->vmovd  (A::Mem{arg[ptr], 0*16 + 8*lane + 0}, (A::Xmm)r(x)   );
                    a->vmovd  (A::Mem{arg[ptr], 0*16 + 8*lane + 4}, (A::Xmm)r(y)   );
                    if (scalar) { break; }
                    a->vpextrd(A::Mem{arg[ptr], 1*16 + 8*lane + 0}, (A::Xmm)r(x), 1);
                    a->vpextrd(A::Mem{arg[ptr], 1*16 + 8*lane + 4}, (A::Xmm)r(y), 1);
                    a->vpextrd(A::Mem{arg[ptr], 2*16 + 8*lane + 0}, (A::Xmm)r(x), 2);
                    a->vpextrd(A::Mem{arg[ptr], 2*16 + 8*lane + 4}, (A::Xmm)r(y), 2);
                    a->vpextrd(A::Mem{arg[ptr], 3*16 + 8*lane + 0}, (A::Xmm)r(x), 3);
                    a->vpextrd(A::Mem{arg[ptr], 3*16 + 8*lane + 4}, (A::Xmm)r(y), 3);
                    // Now we need to store the upper 128 bits of x and y.
                    // Storing x then y rather than interlacing minimizes temporaries.
                    a->vextracti128(dst(), r(x), 1);
                    a->vmovd  (A::Mem{arg[ptr], 4*16 + 8*lane + 0}, (A::Xmm)dst()   );
                    a->vpextrd(A::Mem{arg[ptr], 5*16 + 8*lane + 0}, (A::Xmm)dst(), 1);
                    a->vpextrd(A::Mem{arg[ptr], 6*16 + 8*lane + 0}, (A::Xmm)dst(), 2);
                    a->vpextrd(A::Mem{arg[ptr], 7*16 + 8*lane + 0}, (A::Xmm)dst(), 3);
                    a->vextracti128(dst(), r(y), 1);
                    a->vmovd  (A::Mem{arg[ptr], 4*16 + 8*lane + 4}, (A::Xmm)dst()   );
                    a->vpextrd(A::Mem{arg[ptr], 5*16 + 8*lane + 4}, (A::Xmm)dst(), 1);
                    a->vpextrd(A::Mem{arg[ptr], 6*16 + 8*lane + 4}, (A::Xmm)dst(), 2);
                    a->vpextrd(A::Mem{arg[ptr], 7*16 + 8*lane + 4}, (A::Xmm)dst(), 3);
                } break;

                case Op::load8:  if (scalar) {
                                     a->vpxor  (dst(), dst(), dst());
                                     a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immy]}, 0);
                                 } else {
                                     a->vpmovzxbd(dst(), A::Mem{arg[immy]});
                                 } break;

                case Op::load16: if (scalar) {
                                     a->vpxor  (dst(), dst(), dst());
                                     a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immy]}, 0);
                                 } else {
                                     a->vpmovzxwd(dst(), A::Mem{arg[immy]});
                                 } break;

                case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), A::Mem{arg[immy]}); }
                                 else        { a->vmovups(        dst(), A::Mem{arg[immy]}); }
                                 break;

                case Op::load64: if (scalar) {
                                     a->vmovd((A::Xmm)dst(), A::Mem{arg[immy], 4*immz});
                                 } else {
                                     A::Ymm tmp = alloc_tmp();
                                     a->vmovups(tmp, &load64_index);
                                     a->vpermps(dst(), tmp, A::Mem{arg[immy],  0});
                                     a->vpermps( tmp, tmp,  A::Mem{arg[immy], 32});
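                                     // Now dst() = {lo0,lo1,lo2,lo3|hi0,hi1,hi2,hi3}
                                     // and tmp   = {lo4,lo5,lo6,lo7|hi4,hi5,hi6,hi7}.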
                                     // Low 128 bits hold the immz=0 lanes, high 128 bits hold immz=1.
                                     a->vperm2f128(dst(), dst(),tmp, immz ? 0x31 : 0x20);
                                     free_tmp(tmp);
                                 } break;

                case Op::load128: if (scalar) {
                                      a->vmovd((A::Xmm)dst(), A::Mem{arg[immy], 4*immz});
                                  } else {
                                      // Load 4 low values into xmm tmp,
                                      A::Ymm tmp = alloc_tmp();
                                      A::Xmm t = (A::Xmm)tmp;
                                      a->vmovd  (t,   A::Mem{arg[immy], 0*16 + 4*immz}   );
                                      a->vpinsrd(t,t, A::Mem{arg[immy], 1*16 + 4*immz}, 1);
                                      a->vpinsrd(t,t, A::Mem{arg[immy], 2*16 + 4*immz}, 2);
                                      a->vpinsrd(t,t, A::Mem{arg[immy], 3*16 + 4*immz}, 3);

                                      // Load 4 high values into xmm dst(),
                                      A::Xmm d = (A::Xmm)dst();
                                      a->vmovd  (d,   A::Mem{arg[immy], 4*16 + 4*immz}   );
                                      a->vpinsrd(d,d, A::Mem{arg[immy], 5*16 + 4*immz}, 1);
                                      a->vpinsrd(d,d, A::Mem{arg[immy], 6*16 + 4*immz}, 2);
                                      a->vpinsrd(d,d, A::Mem{arg[immy], 7*16 + 4*immz}, 3);

                                      // Merge the two, ymm dst() = {xmm tmp|xmm dst()}
                                      a->vperm2f128(dst(), tmp,dst(), 0x20);
                                      free_tmp(tmp);
                                  } break;

                case Op::gather8: {
                    // As usual, the gather base pointer is immz bytes off of uniform immy.
                    a->mov(GP0, A::Mem{arg[immy], immz});

                    A::Ymm tmp = alloc_tmp();
                    a->vmovups(tmp, any(x));

                    for (int i = 0; i < (scalar ? 1 : 8); i++) {
                        if (i == 4) {
                            // vpextrd can only pluck indices out from an Xmm register,
                            // so we manually swap over to the top when we're halfway through.
                            a->vextracti128((A::Xmm)tmp, tmp, 1);
                        }
                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                        a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i);
                    }
                    a->vpmovzxbd(dst(), dst());
                    free_tmp(tmp);
                } break;

                case Op::gather16: {
                    // Just like gather8, except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
                    a->mov(GP0, A::Mem{arg[immy], immz});

                    A::Ymm tmp = alloc_tmp();
                    a->vmovups(tmp, any(x));

                    for (int i = 0; i < (scalar ? 1 : 8); i++) {
                        if (i == 4) {
                            a->vextracti128((A::Xmm)tmp, tmp, 1);
                        }
                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                        a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i);
                    }
                    a->vpmovzxwd(dst(), dst());
                    free_tmp(tmp);
                } break;

                case Op::gather32:
                    if (scalar) {
                        // Our gather base pointer is immz bytes off of uniform immy.
                        a->mov(GP0, A::Mem{arg[immy], immz});

                        // Grab our index from lane 0 of the index argument.
                        a->vmovd(GP1, (A::Xmm)r(x));

                        // dst = *(base + 4*index)
                        a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR});
                    } else {
                        a->mov(GP0, A::Mem{arg[immy], immz});

                        A::Ymm mask = alloc_tmp();
                        a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)

                        a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask);
                        free_tmp(mask);
                    }
                    break;

                case Op::uniform8:  a->movzbq(GP0, A::Mem{arg[immy], immz});
                                    a->vmovd((A::Xmm)dst(), GP0);
                                    a->vbroadcastss(dst(), dst());
                                    break;

                case Op::uniform16: a->movzwq(GP0, A::Mem{arg[immy], immz});
                                    a->vmovd((A::Xmm)dst(), GP0);
                                    a->vbroadcastss(dst(), dst());
                                    break;

                case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immy], immz});
                                    break;

                case Op::index: a->vmovd((A::Xmm)dst(), N);
                                a->vbroadcastss(dst(), dst());
                                a->vpsubd(dst(), dst(), &iota);
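                                // dst = {N, N-1, N-2, ...}; N counts down as the loop
                                // strides along, so each lane ends up with a distinct index.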
                                break;

                // We can swap the arguments of symmetric instructions to make better use of any().
                case Op::add_f32:
                    if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); }
                    else           { a->vaddps(dst(y), r(y), any(x)); }
                    break;

                case Op::mul_f32:
                    if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); }
                    else           { a->vmulps(dst(y), r(y), any(x)); }
                    break;

                case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break;
                case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break;
                case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break; // Order matters,
                case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break; // see test SkVM_min_max.
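                // (vminps/vmaxps return their second operand when the inputs are
                // unordered, so putting x second pins down the NaN behavior.)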

                case Op::fma_f32:
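                    // The 132/213/231 suffixes say where dst's old value lands:
                    // vfmadd132ps d,a,b is d = d*b + a, 213 is d = a*d + b, and
                    // 231 is d = a*b + d, so each branch keeps computing x*y + z.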
                    if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups    (dst(), any(x));
                                        a->vfmadd132ps(dst(), r(z), any(y)); }
                    break;

                case Op::fms_f32:
                    if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups    (dst(), any(x));
                                        a->vfmsub132ps(dst(), r(z), any(y)); }
                    break;

                case Op::fnma_f32:
                    if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups     (dst(), any(x));
                                        a->vfnmadd132ps(dst(), r(z), any(y)); }
                    break;

                // In situations like this we want to try aliasing dst(x) when x is
                // already in a register, but not if we'd have to load it from the stack
                // just to alias it.  That's done better directly into the new register.
                case Op::sqrt_f32:
                    if (in_reg(x)) { a->vsqrtps(dst(x),  r(x)); }
                    else           { a->vsqrtps(dst(), any(x)); }
                    break;

                case Op::add_i32:
                    if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); }
                    else           { a->vpaddd(dst(y), r(y), any(x)); }
                    break;
                case Op::mul_i32:
                    if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); }
                    else           { a->vpmulld(dst(y), r(y), any(x)); }
                    break;

                case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break;

                case Op::bit_and:
                    if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); }
                    else           { a->vpand(dst(y), r(y), any(x)); }
                    break;
                case Op::bit_or:
                    if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); }
                    else           { a->vpor(dst(y), r(y), any(x)); }
                    break;
                case Op::bit_xor:
                    if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); }
                    else           { a->vpxor(dst(y), r(y), any(x)); }
                    break;

                case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x.

                case Op::select:
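                    // vpblendvb takes each byte from y where the sign bit of x's
                    // byte is set, else from z; our masks are all-0s or all-1s per
                    // lane, so dst = x ? y : z, just like bsl16b below on ARM.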
                    if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); }
                    else              { a->vpblendvb(dst(x), r(z), any(y), r(x)); }
                    break;

                case Op::shl_i32: a->vpslld(dst(x), r(x), immy); break;
                case Op::shr_i32: a->vpsrld(dst(x), r(x), immy); break;
                case Op::sra_i32: a->vpsrad(dst(x), r(x), immy); break;

                case Op::eq_i32:
                    if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); }
                    else           { a->vpcmpeqd(dst(y), r(y), any(x)); }
                    break;

                case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break;

                case Op::eq_f32:
                    if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); }
                    else           { a->vcmpeqps(dst(y), r(y), any(x)); }
                    break;
                case Op::neq_f32:
                    if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); }
                    else           { a->vcmpneqps(dst(y), r(y), any(x)); }
                    break;

                case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break;
                case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break;

                // It's safe to alias dst(y) only when y != x.  Otherwise we'd overwrite x!
                case Op::pack: a->vpslld(dst(y != x ? y : NA),  r(y), immz);
                               a->vpor  (dst(), dst(), any(x));
                               break;

                case Op::ceil:
                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::CEIL); }
                    else           { a->vroundps(dst(), any(x), Assembler::CEIL); }
                    break;

                case Op::floor:
                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::FLOOR); }
                    else           { a->vroundps(dst(), any(x), Assembler::FLOOR); }
                    break;

                case Op::to_f32:
                    if (in_reg(x)) { a->vcvtdq2ps(dst(x),  r(x)); }
                    else           { a->vcvtdq2ps(dst(), any(x)); }
                    break;

                case Op::trunc:
                    if (in_reg(x)) { a->vcvttps2dq(dst(x),  r(x)); }
                    else           { a->vcvttps2dq(dst(), any(x)); }
                    break;

                case Op::round:
                    if (in_reg(x)) { a->vcvtps2dq(dst(x),  r(x)); }
                    else           { a->vcvtps2dq(dst(), any(x)); }
                    break;

                case Op::to_half:
                    a->vcvtps2ph(dst(x), r(x), A::CURRENT);  // f32 ymm -> f16 xmm
                    a->vpmovzxwd(dst(), dst());              // f16 xmm -> f16 ymm
                    break;

                case Op::from_half:
                    a->vpackusdw(dst(x), r(x), r(x));  // f16 ymm -> f16 xmm
                    a->vpermq   (dst(), dst(), 0xd8);  // swap middle two 64-bit lanes
                    a->vcvtph2ps(dst(), dst());        // f16 xmm -> f32 ymm
                    break;

            #elif defined(__aarch64__)
                default:  // TODO
                    if (false) {
                        SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
                    }
                    return false;

                case Op::assert_true: {
                    a->uminv4s(dst(), r(x));   // uminv acts like an all() across the vector.
                    a->fmovs(GP0, dst());
                    A::Label all_true;
                    a->cbnz(GP0, &all_true);
                    a->brk(0);
                    a->label(&all_true);
                } break;

                case Op::store8: a->xtns2h(dst(), r(x));
                                 a->xtnh2b(dst(), dst());
                   if (scalar) { a->strb  (dst(), arg[immy]); }
                   else        { a->strs  (dst(), arg[immy]); }
                                 break;

                case Op::store32: if (scalar) { a->strs(r(x), arg[immy]); }
                                  else        { a->strq(r(x), arg[immy]); }
                                  break;

                case Op::load8: if (scalar) { a->ldrb(dst(), arg[immy]); }
                                else        { a->ldrs(dst(), arg[immy]); }
                                a->uxtlb2h(dst(), dst());
                                a->uxtlh2s(dst(), dst());
                                break;

                case Op::load32: if (scalar) { a->ldrs(dst(), arg[immy]); }
                                 else        { a->ldrq(dst(), arg[immy]); }
                                 break;

                case Op::add_f32: a->fadd4s(dst(), r(x), r(y)); break;
                case Op::sub_f32: a->fsub4s(dst(), r(x), r(y)); break;
                case Op::mul_f32: a->fmul4s(dst(), r(x), r(y)); break;
                case Op::div_f32: a->fdiv4s(dst(), r(x), r(y)); break;

                case Op::fma_f32: // fmla.4s is z += x*y
                    if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmla4s(dst(), r(x), r(y)); }
                    break;

                case Op::fnma_f32: // fmls.4s is z -= x*y
                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmls4s(dst(), r(x), r(y)); }
                    break;

                case Op::fms_f32: // calculate z - xy, then negate to xy - z
                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmls4s(dst(), r(x), r(y)); }
                    a->fneg4s(dst(), dst());
                    break;

                case Op:: gt_f32: a->fcmgt4s (dst(), r(x), r(y)); break;
                case Op::gte_f32: a->fcmge4s (dst(), r(x), r(y)); break;
                case Op:: eq_f32: a->fcmeq4s (dst(), r(x), r(y)); break;
                case Op::neq_f32: a->fcmeq4s (dst(), r(x), r(y));
                                  a->not16b  (dst(), dst());      break;

                case Op::add_i32: a->add4s(dst(), r(x), r(y)); break;
                case Op::sub_i32: a->sub4s(dst(), r(x), r(y)); break;
                case Op::mul_i32: a->mul4s(dst(), r(x), r(y)); break;

                case Op::bit_and  : a->and16b(dst(), r(x), r(y)); break;
                case Op::bit_or   : a->orr16b(dst(), r(x), r(y)); break;
                case Op::bit_xor  : a->eor16b(dst(), r(x), r(y)); break;
                case Op::bit_clear: a->bic16b(dst(), r(x), r(y)); break;

                case Op::select:  // bsl16b is x = x ? y : z
                    if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
                    else              { a->orr16b(dst(), r(x), r(x));
                                        a->bsl16b(dst(), r(y), r(z)); }
                    break;

                // fmin4s and fmax4s don't work the way we want with NaN,
                // so we write them the long way:
                case Op::min_f32: // min(x,y) = y<x ? y : x
                    a->fcmgt4s(dst(), r(x), r(y));
                    a->bsl16b (dst(), r(y), r(x));
                    break;

                case Op::max_f32: // max(x,y) = x<y ? y : x
                    a->fcmgt4s(dst(), r(y), r(x));
                    a->bsl16b (dst(), r(y), r(x));
                    break;

                case Op::shl_i32: a-> shl4s(dst(), r(x), immy); break;
                case Op::shr_i32: a->ushr4s(dst(), r(x), immy); break;
                case Op::sra_i32: a->sshr4s(dst(), r(x), immy); break;

                case Op::eq_i32: a->cmeq4s(dst(), r(x), r(y)); break;
                case Op::gt_i32: a->cmgt4s(dst(), r(x), r(y)); break;

                case Op::pack:
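                    // sli4s shifts y left by immz and inserts it into x, keeping
                    // x's low immz bits: exactly x | (y << immz) when the two
                    // fields don't overlap.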
                    if (try_alias(x)) { a->sli4s ( r(x),  r(y), immz); }
                    else              { a->shl4s (dst(),  r(y), immz);
                                        a->orr16b(dst(), dst(), r(x)); }
                    break;

                case Op::to_f32: a->scvtf4s (dst(), r(x)); break;
                case Op::trunc:  a->fcvtzs4s(dst(), r(x)); break;
                case Op::round:  a->fcvtns4s(dst(), r(x)); break;
                // TODO: fcvtns.4s rounds to nearest even.
                // I think we actually want frintx -> fcvtzs to round to current mode.
            #endif
            }

            // Proactively free the registers holding any value that dies here.
            if (rd != NA &&                   dies_here(regs[rd])) { regs[rd] = NA; }
            if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; }
            if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; }
            if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; }
            return true;
        };

    #if defined(__x86_64__) || defined(_M_X64)
        auto jump_if_less = [&](A::Label* l) { a->jl (l); };
        auto jump         = [&](A::Label* l) { a->jmp(l); };

        auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
        auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
    #elif defined(__aarch64__)
        auto jump_if_less = [&](A::Label* l) { a->blt(l); };
        auto jump         = [&](A::Label* l) { a->b  (l); };

        auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
        auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
    #endif

        A::Label body,
                 tail,
                 done;
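
        // From here on out, the code we emit has this shape:
        //
        //     enter()
        //     hoisted instructions
        // body:
        //     if (N < K) goto tail
        //     loop instructions, K lanes at a time
        //     bump each varying arg along by K*stride; N -= K
        //     goto body
        // tail:
        //     if (N < 1) goto done
        //     loop instructions, one lane at a time
        //     bump each varying arg along by stride; N -= 1
        //     goto tail
        // done:
        //     exit()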
        enter();
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
                return false;
            }
        }

        // This point marks a kind of canonical fixed point for register contents: if loop
        // code is generated as if these registers are holding these values, the next time
        // the loop comes around we'd better find those same registers holding those same values.
        auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot,
                                        saved_next_stack_slot=next_stack_slot]{
            for (int r = 0; r < (int)regs.size(); r++) {
                if (regs[r] != incoming[r]) {
                    regs[r] = incoming[r];
                    if (regs[r] >= 0) {
                        load_from_memory((Reg)r, regs[r]);
                    }
                }
            }
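            // Record the high-water mark of spill slots for jit()'s second pass,
            // then reset the spill bookkeeping: the scalar tail re-emits the same
            // instructions and must see the same starting state we saw here.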
            *stack_hint = std::max(*stack_hint, next_stack_slot);
            stack_slot = saved_stack_slot;
            next_stack_slot = saved_next_stack_slot;
        };

        a->label(&body);
        {
            a->cmp(N, K);
            jump_if_less(&tail);
            for (Val id = 0; id < (Val)instructions.size(); id++) {
                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
                    return false;
                }
            }
            restore_incoming_regs();
            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
                if (fImpl->strides[i]) {
                    add(arg[i], K*fImpl->strides[i]);
                }
            }
            sub(N, K);
            jump(&body);
        }

        a->label(&tail);
        {
            a->cmp(N, 1);
            jump_if_less(&done);
            for (Val id = 0; id < (Val)instructions.size(); id++) {
                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) {
                    return false;
                }
            }
            restore_incoming_regs();
            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
                if (fImpl->strides[i]) {
                    add(arg[i], 1*fImpl->strides[i]);
                }
            }
            sub(N, 1);
            jump(&tail);
        }

        a->label(&done);
        {
            exit();
        }

        // Except for explicit aligned load and store instructions, AVX allows
        // memory operands to be unaligned.  So even though we're creating 16-byte
        // patterns on ARM or 32-byte patterns on x86, we only need to align to
        // 4 bytes, the element size and alignment requirement.

        constants.foreach([&](int imm, A::Label* label) {
            a->align(4);
            a->label(label);
            for (int i = 0; i < K; i++) {
                a->word(imm);
            }
        });

        if (!iota.references.empty()) {
            a->align(4);
            a->label(&iota);   // 0,1,2,3,4,...
            for (int i = 0; i < K; i++) {
                a->word(i);
            }
        }

        if (!load64_index.references.empty()) {
            a->align(4);
            a->label(&load64_index);   // {0,2,4,6|1,3,5,7}
            a->word(0); a->word(2); a->word(4); a->word(6);
            a->word(1); a->word(3); a->word(5); a->word(7);
        }

        return true;
    }

    void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
                           const char* debug_name) {
        // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
        // and stack_hint/registers_used to feed forward into the next jit() call.
        Assembler a{nullptr};
        int stack_hint = -1;
        uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
        if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
            return;
        }

        fImpl->jit_size = a.size();
        void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
        fImpl->jit_entry.store(jit_entry);

        // Assemble the program for real, with stack_hint/registers_used as feedback from the first call.
        a = Assembler{jit_entry};
        SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
        SkASSERT(a.size() <= fImpl->jit_size);

        // Remap as executable, and flush caches on platforms that need that.
        remap_as_executable(jit_entry, fImpl->jit_size);

        notify_vtune(debug_name, jit_entry, fImpl->jit_size);

    #if !defined(SK_BUILD_FOR_WIN)
        // For profiling and debugging, it's helpful to have this code loaded
        // dynamically rather than just jumping into fImpl->jit_entry.
        if (gSkVMJITViaDylib) {
            // Dump the raw program binary.
            SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
            int fd = mkstemp(path.writable_str());
            ::write(fd, jit_entry, a.size());
            close(fd);

            this->dropJIT();  // (unmap and null out fImpl->jit_entry.)

            // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
            SkString cmd = SkStringPrintf(
                    "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
                    " | clang -x assembler -shared - -o %s",
                    path.c_str(), path.c_str());
            system(cmd.c_str());

            // Load that dynamic library and look up skvm_jit().
            fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
            void* sym = nullptr;
            for (const char* name : {"skvm_jit", "_skvm_jit"} ) {
                if (!sym) { sym = dlsym(fImpl->dylib, name); }
            }
            fImpl->jit_entry.store(sym);
        }
    #endif
    }
#endif

}  // namespace skvm
