SkVM.cpp source code [Skia/src/core/SkVM.cpp]

1	/*
2	* Copyright 2019 Google LLC
3	*
4	* Use of this source code is governed by a BSD-style license that can be
5	* found in the LICENSE file.
6	*/
7
8	#include "include/core/SkStream.h"
9	#include "include/core/SkString.h"
10	#include "include/private/SkChecksum.h"
11	#include "include/private/SkSpinlock.h"
12	#include "include/private/SkTFitsIn.h"
13	#include "include/private/SkThreadID.h"
14	#include "include/private/SkVx.h"
15	#include "src/core/SkColorSpaceXformSteps.h"
16	#include "src/core/SkCpu.h"
17	#include "src/core/SkOpts.h"
18	#include "src/core/SkVM.h"
19	#include <algorithm>
20	#include <atomic>
21	#include <queue>
22
23	#if defined(SKVM_LLVM)
24	#include <future>
25	#include <llvm/Bitcode/BitcodeWriter.h>
26	#include <llvm/ExecutionEngine/ExecutionEngine.h>
27	#include <llvm/IR/IRBuilder.h>
28	#include <llvm/IR/Verifier.h>
29	#include <llvm/Support/TargetSelect.h>
30
31	// Platform-specific intrinsics got their own files in LLVM 10.
32	#if __has_include(<llvm/IR/IntrinsicsX86.h>)
33	#include <llvm/IR/IntrinsicsX86.h>
34	#endif
35	#endif
36
// Global switch, off by default.
// NOTE(review): judging by the name, this selects loading JIT output through a
// dylib; confirm against the code that reads it.
bool gSkVMJITViaDylib = false;
38
39	// JIT code isn't MSAN-instrumented, so we won't see when it uses
40	// uninitialized memory, and we'll not see the writes it makes as properly
41	// initializing memory. Instead force the interpreter, which should let
42	// MSAN see everything our programs do properly.
43	//
44	// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
// Turn the JIT off when building under MSAN or ASAN (see the comment above):
// only toolchains that provide __has_feature can report those sanitizers.
#if defined(__has_feature)
    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
        #undef SKVM_JIT
    #endif
#endif
50
51	#if defined(SKVM_JIT)
52	#include <dlfcn.h> // dlopen, dlsym
53	#include <sys/mman.h> // mmap, mprotect
54	#endif
55
56	namespace skvm {
57
58	struct Program::Impl {
59	std::vector<InterpreterInstruction> instructions;
60	int regs = `0`;
61	int loop = `0`;
62	std::vector<int> strides;
63
64	std::atomic<void> jit_entry{nullptr}; // TODO: minimal std::memory_orders*
65	size_t jit_size = `0`;
66	void* dylib = nullptr;
67
68	#if defined(SKVM_LLVM)
69	std::unique_ptr<llvm::LLVMContext> llvm_ctx;
70	std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
71	std::future<void> llvm_compiling;
72	#endif
73	};
74
75	// Debugging tools, mostly for printing various data structures out to a stream.
76
77	namespace {
78	class SkDebugfStream final : public SkWStream {
79	size_t fBytesWritten = `0`;
80
81	bool write(const void* buffer, size_t size) override {
82	SkDebugf("%.*s", size, buffer);
83	fBytesWritten += size;
84	return true;
85	}
86
87	size_t bytesWritten() const override {
88	return fBytesWritten;
89	}
90	};
91
92	struct V { Val id; };
93	struct R { Reg id; };
94	struct Shift { int bits; };
95	struct Splat { int bits; };
96	struct Hex { int bits; };
97
98	static void write(SkWStream* o, const char* s) {
99	o->writeText(s);
100	}
101
102	static const char* name(Op op) {
103	switch (op) {
104	#define M(x) case Op::x: return #x;
105	SKVM_OPS(M)
106	#undef M
107	}
108	return "unknown op";
109	}
110
111	static void write(SkWStream* o, Op op) {
112	const char* raw = name(op);
113	if (const char* found = strstr(raw, "_imm")) {
114	o->write(raw, found-raw);
115	} else {
116	o->writeText(raw);
117	}
118	}
119	static void write(SkWStream* o, Arg a) {
120	write(o, "arg(");
121	o->writeDecAsText(a.ix);
122	write(o, ")");
123	}
124	static void write(SkWStream* o, V v) {
125	write(o, "v");
126	o->writeDecAsText(v.id);
127	}
128	static void write(SkWStream* o, R r) {
129	write(o, "r");
130	o->writeDecAsText(r.id);
131	}
132	static void write(SkWStream* o, Shift s) {
133	o->writeDecAsText(s.bits);
134	}
135	static void write(SkWStream* o, Splat s) {
136	float f;
137	memcpy(&f, &s.bits, `4`);
138	o->writeHexAsText(s.bits);
139	write(o, " (");
140	o->writeScalarAsText(f);
141	write(o, ")");
142	}
143	static void write(SkWStream* o, Hex h) {
144	o->writeHexAsText(h.bits);
145	}
146
147	template <typename T, typename... Ts>
148	static void write(SkWStream* o, T first, Ts... rest) {
149	write(o, first);
150	write(o, " ");
151	write(o, rest...);
152	}
153	}
154
155	void Builder::dot(SkWStream* o, bool for_jit) const {
156	SkDebugfStream debug;
157	if (!o) { o = &debug; }
158
159	std::vector<OptimizedInstruction> optimized = this->optimize(for_jit);
160
161	o->writeText("digraph {\n");
162	for (Val id = `0`; id < (Val)optimized.size(); id++) {
163	auto [op, x,y,z, immy,immz, death,can_hoist,used_in_loop] = optimized [id];
164
165	switch (op) {
166	default:
167	write(o, "\t", V{id}, " [label = \"", V{id}, op);
168	// Not a perfect heuristic; sometimes y/z == NA and there is no immy/z.
169	// On the other hand, sometimes immy/z=0 is meaningful and should be printed.
170	if (y == NA) { write(o, "", Hex{immy}); }
171	if (z == NA) { write(o, "", Hex{immz}); }
172	write(o, "\"]\n");
173
174	write(o, "\t", V{id}, " -> {");
175	// In contrast to the heuristic imm labels, these dependences are exact.
176	if (x != NA) { write(o, "", V{x}); }
177	if (y != NA) { write(o, "", V{y}); }
178	if (z != NA) { write(o, "", V{z}); }
179	write(o, " }\n");
180
181	break;
182
183	// That default: impl works pretty well for most instructions,
184	// but some are nicer to see with a specialized label.
185
186	case Op::splat:
187	write(o, "\t", V{id}, " [label = \"", V{id}, op, Splat{immy}, "\"]\n");
188	break;
189	}
190	}
191	o->writeText("}\n");
192	}
193
194	void Builder::dump(SkWStream* o) const {
195	SkDebugfStream debug;
196	if (!o) { o = &debug; }
197
198	std::vector<OptimizedInstruction> optimized = this->optimize();
199	o->writeDecAsText(optimized.size());
200	o->writeText(" values (originally ");
201	o->writeDecAsText(fProgram.size());
202	o->writeText("):\n");
203	for (Val id = `0`; id < (Val)optimized.size(); id++) {
204	const OptimizedInstruction& inst = optimized [id];
205	Op op = inst.op;
206	Val x = inst.x,
207	y = inst.y,
208	z = inst.z;
209	int immy = inst.immy,
210	immz = inst.immz;
211	write(o, !inst.can_hoist ? " " :
212	inst.used_in_loop ? "↑ " :
213	"↟ ");
214	switch (op) {
215	case Op::assert_true: write(o, op, V{x}, V{y}); break;
216
217	case Op::store8: write(o, op, Arg{immy}, V{x}); break;
218	case Op::store16: write(o, op, Arg{immy}, V{x}); break;
219	case Op::store32: write(o, op, Arg{immy}, V{x}); break;
220
221	case Op::index: write(o, V{id}, "=", op); break;
222
223	case Op::load8: write(o, V{id}, "=", op, Arg{immy}); break;
224	case Op::load16: write(o, V{id}, "=", op, Arg{immy}); break;
225	case Op::load32: write(o, V{id}, "=", op, Arg{immy}); break;
226
227	case Op::gather8: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
228	case Op::gather16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
229	case Op::gather32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
230
231	case Op::uniform8: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;
232	case Op::uniform16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;
233	case Op::uniform32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;
234
235	case Op::splat: write(o, V{id}, "=", op, Splat{immy}); break;
236
237
238	case Op::add_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break;
239	case Op::sub_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break;
240	case Op::mul_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break;
241	case Op::div_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break;
242	case Op::min_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break;
243	case Op::max_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break;
244	case Op::fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
245	case Op::fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
246	case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
247
248
249	case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;
250
251	case Op::add_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
252	case Op::sub_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
253	case Op::mul_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
254	case Op::min_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
255	case Op::max_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
256
257	case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
258	case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
259	case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
260	case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
261
262
263	case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
264	case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
265	case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
266
267	case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
268	case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
269	case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
270
271	case Op:: eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
272	case Op::neq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
273	case Op:: gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
274	case Op::gte_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
275
276	case Op::add_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
277	case Op::sub_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
278	case Op::mul_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
279
280	case Op::shl_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
281	case Op::shr_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
282	case Op::sra_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
283
284	case Op:: eq_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
285	case Op::neq_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
286	case Op:: gt_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
287	case Op::gte_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
288
289	case Op::bit_and : write(o, V{id}, "=", op, V{x}, V{y} ); break;
290	case Op::bit_or : write(o, V{id}, "=", op, V{x}, V{y} ); break;
291	case Op::bit_xor : write(o, V{id}, "=", op, V{x}, V{y} ); break;
292	case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y} ); break;
293
294	case Op::bit_and_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
295	case Op::bit_or_imm : write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
296	case Op::bit_xor_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
297
298	case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
299	case Op::pack: write(o, V{id}, "=", op, V{x}, V{y}, Shift{immz}); break;
300
301	case Op::floor: write(o, V{id}, "=", op, V{x}); break;
302	case Op::to_f32: write(o, V{id}, "=", op, V{x}); break;
303	case Op::trunc: write(o, V{id}, "=", op, V{x}); break;
304	case Op::round: write(o, V{id}, "=", op, V{x}); break;
305	}
306
307	write(o, "\n");
308	}
309	}
310
311	void Program::dump(SkWStream* o) const {
312	SkDebugfStream debug;
313	if (!o) { o = &debug; }
314
315	o->writeDecAsText(fImpl ->regs);
316	o->writeText(" registers, ");
317	o->writeDecAsText(fImpl ->instructions.size());
318	o->writeText(" instructions:\n");
319	for (Val i = `0`; i < (Val)fImpl ->instructions.size(); i++) {
320	if (i == fImpl ->loop) { write(o, "loop:\n"); }
321	o->writeDecAsText(i);
322	o->writeText("\t");
323	if (i >= fImpl ->loop) { write(o, " "); }
324	const InterpreterInstruction& inst = fImpl ->instructions [i];
325	Op op = inst.op;
326	Reg d = inst.d,
327	x = inst.x,
328	y = inst.y,
329	z = inst.z;
330	int immy = inst.immy,
331	immz = inst.immz;
332	switch (op) {
333	case Op::assert_true: write(o, op, R{x}, R{y}); break;
334
335	case Op::store8: write(o, op, Arg{immy}, R{x}); break;
336	case Op::store16: write(o, op, Arg{immy}, R{x}); break;
337	case Op::store32: write(o, op, Arg{immy}, R{x}); break;
338
339	case Op::index: write(o, R{d}, "=", op); break;
340
341	case Op::load8: write(o, R{d}, "=", op, Arg{immy}); break;
342	case Op::load16: write(o, R{d}, "=", op, Arg{immy}); break;
343	case Op::load32: write(o, R{d}, "=", op, Arg{immy}); break;
344
345	case Op::gather8: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
346	case Op::gather16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
347	case Op::gather32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
348
349	case Op::uniform8: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
350	case Op::uniform16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
351	case Op::uniform32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
352
353	case Op::splat: write(o, R{d}, "=", op, Splat{immy}); break;
354
355
356	case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break;
357	case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break;
358	case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break;
359	case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break;
360	case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break;
361	case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break;
362	case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
363	case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
364	case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
365
366	case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;
367
368	case Op::add_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
369	case Op::sub_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
370	case Op::mul_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
371	case Op::min_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
372	case Op::max_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
373
374	case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
375	case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
376	case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
377	case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
378
379
380	case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
381	case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
382	case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
383
384	case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
385	case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
386	case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
387
388	case Op:: eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
389	case Op::neq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
390	case Op:: gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
391	case Op::gte_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
392
393
394	case Op::add_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
395	case Op::sub_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
396	case Op::mul_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
397
398	case Op::shl_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
399	case Op::shr_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
400	case Op::sra_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
401
402	case Op:: eq_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
403	case Op::neq_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
404	case Op:: gt_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
405	case Op::gte_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
406
407
408	case Op::bit_and : write(o, R{d}, "=", op, R{x}, R{y} ); break;
409	case Op::bit_or : write(o, R{d}, "=", op, R{x}, R{y} ); break;
410	case Op::bit_xor : write(o, R{d}, "=", op, R{x}, R{y} ); break;
411	case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y} ); break;
412
413	case Op::bit_and_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
414	case Op::bit_or_imm : write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
415	case Op::bit_xor_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
416
417	case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
418	case Op::pack: write(o, R{d}, "=", op, R{x}, R{y}, Shift{immz}); break;
419
420	case Op::floor: write(o, R{d}, "=", op, R{x}); break;
421	case Op::to_f32: write(o, R{d}, "=", op, R{x}); break;
422	case Op::trunc: write(o, R{d}, "=", op, R{x}); break;
423	case Op::round: write(o, R{d}, "=", op, R{x}); break;
424	}
425	write(o, "\n");
426	}
427	}
428
429	std::vector<Instruction> specialize_for_jit(std::vector<Instruction> program) {
430	// We could use a temporary Builder to let new Instructions participate in common
431	// sub-expression elimination, but we'll never hit anything valuable with the
432	// specializations we've got today. Worth keeping in mind for the future though.
433	for (Val i = `0`; i < (Val)program.size(); i++) {
434	#if defined(SK_CPU_X86)
435	Instruction& inst = program [i];
436
437	auto is_imm = [&](Val id, int* bits) {
438	*bits = program [id].immy;
439	return program [id].op == Op::splat;
440	};
441
442	switch (Op imm_op; inst.op) {
443	default: break;
444
445	case Op::add_f32: imm_op = Op::add_f32_imm; goto try_imm_x_and_y;
446	case Op::mul_f32: imm_op = Op::mul_f32_imm; goto try_imm_x_and_y;
447	case Op::bit_and: imm_op = Op::bit_and_imm; goto try_imm_x_and_y;
448	case Op::bit_or: imm_op = Op::bit_or_imm ; goto try_imm_x_and_y;
449	case Op::bit_xor: imm_op = Op::bit_xor_imm; goto try_imm_x_and_y;
450	case Op::min_f32: imm_op = Op::min_f32_imm; goto try_imm_x;
451	case Op::max_f32: imm_op = Op::max_f32_imm; goto try_imm_x;
452	case Op::sub_f32: imm_op = Op::sub_f32_imm; goto try_imm_y;
453
454	try_imm_x_and_y:
455	if (int bits; is_imm (inst.x, &bits)) {
456	inst.op = imm_op;
457	inst.x = inst.y;
458	inst.y = NA;
459	inst.immy = bits;
460	} else if (int bits; is_imm (inst.y, &bits)) {
461	inst.op = imm_op;
462	inst.y = NA;
463	inst.immy = bits;
464	} break;
465
466	try_imm_x:
467	if (int bits; is_imm (inst.x, &bits)) {
468	inst.op = imm_op;
469	inst.x = inst.y;
470	inst.y = NA;
471	inst.immy = bits;
472	} break;
473
474	try_imm_y:
475	if (int bits; is_imm (inst.y, &bits)) {
476	inst.op = imm_op;
477	inst.y = NA;
478	inst.immy = bits;
479	} break;
480
481	case Op::bit_clear:
482	if (int bits; is_imm (inst.y, &bits)) {
483	inst.op = Op::bit_and_imm;
484	inst.y = NA;
485	inst.immy = ~bits;
486	} break;
487	}
488	#endif
489	}
490	return program;
491	}
492
493	std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) {
494	// Determine which Instructions are live by working back from side effects.
495	std::vector<bool> live(program.size(), false);
496	auto mark_live = [&](Val id, auto& recurse) -> void {
497	if (live [id] == false) {
498	live [id] = true;
499	Instruction inst = program [id];
500	for (Val arg : {inst.x, inst.y, inst.z}) {
501	if (arg != NA) { recurse(arg, recurse); }
502	}
503	}
504	};
505	for (Val id = `0`; id < (Val)program.size(); id++) {
506	if (has_side_effect(program [id].op)) {
507	mark_live (id, mark_live);
508	}
509	}
510
511	// Rewrite the program with only live Instructions:
512	// - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
513	// - then actually remove the dead Instructions.
514	std::vector<Val> new_id(program.size(), NA);
515	for (Val id = `0`, next = `0`; id < (Val)program.size(); id++) {
516	if (live [id]) {
517	Instruction& inst = program [id];
518	for (Val* arg : {&inst.x, &inst.y, &inst.z}) {
519	if (*arg != NA) {
520	arg = new_id [arg];
521	SkASSERT(*arg != NA);
522	}
523	}
524	new_id [id] = next++;
525	}
526	}
527	auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
528	Val id = (Val)(&inst - program.data());
529	return !live [id];
530	});
531	program.erase(it, program.end());
532
533	return program;
534	}
535
536	std::vector<Instruction> schedule(std::vector<Instruction> program) {
537	Usage usage{program};
538
539	std::vector<int> uses(program.size());
540	for (Val id = `0`; id < (Val)program.size(); id++) {
541	uses [id] = (int)usage [id].size();
542	}
543
544	auto pressure_change = [&](Val id) -> int {
545	Instruction inst = program [id];
546
547	// If this Instruction is not a sink, its result needs a register.
548	int change = has_side_effect(inst.op) ? `0` : `1`;
549
550	// If this is the final user of an argument, the argument's register becomes free.
551	for (Val arg : {inst.x, inst.y, inst.z}) {
552	if (arg != NA && uses [arg] == `1`) { change -= `1`; }
553	}
554	return change;
555	};
556
557	auto compare = [&](Val lhs, Val rhs) {
558	SkASSERT(lhs != rhs);
559	int lhs_change = pressure_change (lhs);
560	int rhs_change = pressure_change (rhs);
561
562	// This comparison operator orders instructions from least (likely negative) register
563	// pressure to most register pressure, breaking ties arbitrarily using original
564	// program order comparing the instruction index itself.
565	//
566	// We'll use this operator with std::{make,push,pop}_heap() to maintain a max heap
567	// frontier of instructions that are ready to schedule. We iterate backwards through
568	// the program, scheduling later instruction slots before earlier ones, and that means
569	// an instruction becomes ready to schedule once all instructions using its result have
570	// been scheduled (in later slots).
571	//
572	// All together that means we'll be issuing the instructions that hurt register pressure
573	// as late as possible, and issuing the instructions that help register pressure as soon
574	// as possible.
575	//
576	// This heuristic of greedily issuing the instruction that most immediately decreases
577	// register pressure approximates a more expensive search to find a schedule that
578	// minimizes the high-water maximum register pressure, the number of registers we'll
579	// need to run this program.
580	//
581	// The tie-breaker heuristic was found through experimentation.
582	return lhs_change < rhs_change \|\| (lhs_change == rhs_change && lhs > rhs);
583	};
584
585	auto ready_to_schedule = [&](Val id) { return uses [id] == `0`; };
586
587	std::vector<Val> frontier;
588	for (Val id = `0`; id < (Val)program.size(); id++) {
589	Instruction inst = program [id];
590	if (has_side_effect(inst.op)) {
591	frontier.push_back(id);
592	}
593	// Having eliminated dead code, the only Instructions that should start
594	// with no users remaining to schedule are those with side effects.
595	SkASSERT(has_side_effect(inst.op) == (uses[id] == `0`));
596	}
597	std::make_heap(frontier.begin(), frontier.end(), compare);
598
599	// Figure out our new Instruction schedule.
600	std::vector<Val> new_id(program.size(), NA);
601	for (Val n = (Val)program.size(); n --> `0`;) {
602	SkASSERT(!frontier.empty());
603	std::pop_heap(frontier.begin(), frontier.end(), compare);
604	Val id = frontier.back();
605	frontier.pop_back();
606
607	SkASSERT(ready_to_schedule(id));
608
609	Instruction inst = program [id];
610	new_id [id] = n;
611
612	for (Val arg : {inst.x, inst.y, inst.z}) {
613	if (arg != NA) {
614	uses [arg]--;
615	if (ready_to_schedule (arg)) {
616	frontier.push_back(arg);
617	std::push_heap(frontier.begin(), frontier.end(), compare);
618	}
619	}
620	}
621	}
622	SkASSERT(frontier.empty());
623
624	// Remap each Instruction's arguments to their new IDs.
625	for (Val id = `0`; id < (Val)program.size(); id++) {
626	Instruction& inst = program [id];
627	for (Val* arg : {&inst.x, &inst.y, &inst.z}) {
628	if (*arg != NA) {
629	arg = new_id [arg];
630	SkASSERT(*arg != NA);
631	}
632	}
633	}
634
635	// Finally, reorder the Instructions themselves according to the new schedule.
636	// This is O(N)... wish I had a good reference link breaking it down.
637	for (Val id = `0`; id < (Val)program.size(); id++) {
638	while (id != new_id [id]) {
639	std::swap(program [id], program [new_id [id]]);
640	std::swap( new_id [id], new_id [new_id [id]]);
641	}
642	}
643	return program;
644	}
645
646	std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) {
647	std::vector<OptimizedInstruction> optimized(program.size());
648	for (Val id = `0`; id < (Val)program.size(); id++) {
649	Instruction inst = program [id];
650	optimized [id] = {inst.op, inst.x,inst.y,inst.z, inst.immy,inst.immz,
651	/death=/id, /can_hoist=/true, /used_in_loop=/false};
652	}
653
654	// Each Instruction's inputs need to live at least until that Instruction issues.
655	for (Val id = `0`; id < (Val)optimized.size(); id++) {
656	OptimizedInstruction& inst = optimized [id];
657	for (Val arg : {inst.x, inst.y, inst.z}) {
658	// (We're walking in order, so this is the same as max()ing with the existing Val.)
659	if (arg != NA) { optimized [arg].death = id; }
660	}
661	}
662
663	// Mark which values don't depend on the loop and can be hoisted.
664	for (Val id = `0`; id < (Val)optimized.size(); id++) {
665	OptimizedInstruction& inst = optimized [id];
666
667	// Varying loads (and gathers) and stores cannot be hoisted out of the loop.
668	if (is_always_varying(inst.op)) {
669	inst.can_hoist = false;
670	}
671
672	// If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
673	if (inst.can_hoist) {
674	for (Val arg : {inst.x, inst.y, inst.z}) {
675	if (arg != NA) { inst.can_hoist &= optimized [arg].can_hoist; }
676	}
677	}
678
679	// We'll want to know if hoisted values are used in the loop;
680	// if not, we can recycle their registers like we do loop values.
681	if (!inst.can_hoist /i.e. we're in the loop, so the arguments are used_in_loop/) {
682	for (Val arg : {inst.x, inst.y, inst.z}) {
683	if (arg != NA) { optimized [arg].used_in_loop = true; }
684	}
685	}
686	}
687
688	return optimized;
689	}
690
691	std::vector<OptimizedInstruction> Builder::optimize(bool for_jit) const {
692	std::vector<Instruction> program = this->program();
693	if (for_jit) {
694	program = specialize_for_jit(std::move(program));
695	}
696	program = eliminate_dead_code(std::move(program));
697	program = schedule (std::move(program));
698	return finalize (std::move(program));
699	}
700
701	Program Builder::done(const char* debug_name) const {
702	char buf[`64`] = "skvm-jit-";
703	if (!debug_name) {
704	SkStrAppendU32(buf+`9`, this*->hash()) = `'\0'`;
705	debug_name = buf;
706	}
707
708	#if defined(SKVM_LLVM) \|\| defined(SKVM_JIT)
709	return {this->optimize(false), this->optimize(true), fStrides, debug_name};
710	#else
711	return {this->optimize(false), fStrides};
712	#endif
713	}
714
715	uint64_t Builder::hash() const {
716	uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), `0`),
717	hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), `1`);
718	return (uint64_t)lo \| (uint64_t)hi << `32`;
719	}
720
721	bool operator==(const Instruction& a, const Instruction& b) {
722	return a.op == b.op
723	&& a.x == b.x
724	&& a.y == b.y
725	&& a.z == b.z
726	&& a.immy == b.immy
727	&& a.immz == b.immz;
728	}
729
730	uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
731	return SkOpts::hash(&inst, sizeof(inst), seed);
732	}
733
734
735	// Most instructions produce a value and return it by ID,
736	// the value-producing instruction's own index in the program vector.
737	Val Builder::push(Instruction inst) {
738	// Basic common subexpression elimination:
739	// if we've already seen this exact Instruction, use it instead of creating a new one.
740	if (Val* id = fIndex.find(inst)) {
741	return *id;
742	}
743	Val id = static_cast<Val>(fProgram.size());
744	fProgram.push_back(inst);
745	fIndex.set(inst, id);
746	return id;
747	}
748
749	bool Builder::allImm() const { return true; }
750
751	template <typename T, typename... Rest>
752	bool Builder::allImm(Val id, T* imm, Rest... rest) const {
753	if (fProgram [id].op == Op::splat) {
754	static_assert(sizeof(T) == `4`);
755	memcpy(imm, &fProgram [id].immy, `4`);
756	return this->allImm(rest...);
757	}
758	return false;
759	}
760
761	Arg Builder::arg(int stride) {
762	int ix = (int)fStrides.size();
763	fStrides.push_back(stride);
764	return {ix};
765	}
766
767	void Builder::assert_true(I32 cond, I32 debug) {
768	#ifdef SK_DEBUG
769	int imm;
770	if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
771	(void)push(Op::assert_true, cond.id,debug.id,NA);
772	#endif
773	}
774
775	void Builder::store8 (Arg ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA, ptr.ix); }
776	void Builder::store16(Arg ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA, ptr.ix); }
777	void Builder::store32(Arg ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA, ptr.ix); }
778
779	I32 Builder::index() { return {this, push(Op::index , NA,NA,NA,`0`) }; }
780
781	I32 Builder::load8 (Arg ptr) { return {this, push(Op::load8 , NA,NA,NA, ptr.ix) }; }
782	I32 Builder::load16(Arg ptr) { return {this, push(Op::load16, NA,NA,NA, ptr.ix) }; }
783	I32 Builder::load32(Arg ptr) { return {this, push(Op::load32, NA,NA,NA, ptr.ix) }; }
784
785	I32 Builder::gather8 (Arg ptr, int offset, I32 index) {
786	return {this, push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)};
787	}
788	I32 Builder::gather16(Arg ptr, int offset, I32 index) {
789	return {this, push(Op::gather16, index.id,NA,NA, ptr.ix,offset)};
790	}
791	I32 Builder::gather32(Arg ptr, int offset, I32 index) {
792	return {this, push(Op::gather32, index.id,NA,NA, ptr.ix,offset)};
793	}
794
795	I32 Builder::uniform8(Arg ptr, int offset) {
796	return {this, push(Op::uniform8, NA,NA,NA, ptr.ix, offset)};
797	}
798	I32 Builder::uniform16(Arg ptr, int offset) {
799	return {this, push(Op::uniform16, NA,NA,NA, ptr.ix, offset)};
800	}
801	I32 Builder::uniform32(Arg ptr, int offset) {
802	return {this, push(Op::uniform32, NA,NA,NA, ptr.ix, offset)};
803	}
804
805	// The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern.
806	I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA, n) }; }
807	F32 Builder::splat(float f) {
808	int bits;
809	memcpy(&bits, &f, `4`);
810	return {this, push(Op::splat, NA,NA,NA, bits)};
811	}
812
// Whether fused multiply-add instructions may be emitted:
//   SK_CPU_X86   - decided by SkCpu::Supports(SkCpu::HSW), queried once and cached;
//   SK_CPU_ARM64 - always;
//   otherwise    - never.
static bool fma_supported() {
#if defined(SK_CPU_X86)
    static const bool kHasFma = SkCpu::Supports(SkCpu::HSW);
    return kHasFma;
#elif defined(SK_CPU_ARM64)
    return true;
#else
    return false;
#endif
}
824
825	// Be careful peepholing float math! Transformations you might expect to
826	// be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
827	// Float peepholes must pass this equivalence test for all ~4B floats:
828	//
829	// bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
830	//
831	// unsigned bits = 0;
832	// do {
833	// float f;
834	// memcpy(&f, &bits, 4);
835	// if (!equiv(f, ...)) {
836	// abort();
837	// }
838	// } while (++bits != 0);
839
840	F32 Builder::add(F32 x, F32 y) {
841	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
842	if (this->isImm(y.id, `0.0f`)) { return x; } // x+0 == x
843	if (this->isImm(x.id, `0.0f`)) { return y; } // 0+y == y
844
845	if (fma_supported()) {
846	if (fProgram [x.id].op == Op::mul_f32) {
847	return {this, this->push(Op::fma_f32, fProgram [x.id].x, fProgram [x.id].y, y.id)};
848	}
849	if (fProgram [y.id].op == Op::mul_f32) {
850	return {this, this->push(Op::fma_f32, fProgram [y.id].x, fProgram [y.id].y, x.id)};
851	}
852	}
853	return {this, this->push(Op::add_f32, x.id, y.id)};
854	}
855
856	F32 Builder::sub(F32 x, F32 y) {
857	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
858	if (this->isImm(y.id, `0.0f`)) { return x; } // x-0 == x
859	if (fma_supported()) {
860	if (fProgram [x.id].op == Op::mul_f32) {
861	return {this, this->push(Op::fms_f32, fProgram [x.id].x, fProgram [x.id].y, y.id)};
862	}
863	if (fProgram [y.id].op == Op::mul_f32) {
864	return {this, this->push(Op::fnma_f32, fProgram [y.id].x, fProgram [y.id].y, x.id)};
865	}
866	}
867	return {this, this->push(Op::sub_f32, x.id, y.id)};
868	}
869
870	F32 Builder::mul(F32 x, F32 y) {
871	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
872	if (this->isImm(y.id, `1.0f`)) { return x; } // x1 == x*
873	if (this->isImm(x.id, `1.0f`)) { return y; } // 1y == y*
874	return {this, this->push(Op::mul_f32, x.id, y.id)};
875	}
876
877	F32 Builder::div(F32 x, F32 y) {
878	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X/Y); }
879	if (this->isImm(y.id, `1.0f`)) { return x; } // x/1 == x
880	return {this, this->push(Op::div_f32, x.id, y.id)};
881	}
882
883	F32 Builder::sqrt(F32 x) {
884	if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
885	return {this, this->push(Op::sqrt_f32, x.id,NA,NA)};
886	}
887
888	// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
889	F32 Builder::approx_log2(F32 x) {
890	// e - 127 is a fair approximation of log2(x) in its own right...
891	F32 e = mul(to_f32(bit_cast(x)), splat(`1.0f` / (`1`<<`23`)));
892
893	// ... but using the mantissa to refine its error is _much_ better.
894	F32 m = bit_cast(bit_or(bit_and(bit_cast(x), `0x007fffff`),
895	`0x3f000000`));
896	F32 approx = sub(e, `124.225514990f`);
897	approx = sub(approx, mul(`1.498030302f`, m));
898	approx = sub(approx, div(`1.725879990f`, add(`0.3520887068f`, m)));
899
900	return approx;
901	}
902
903	F32 Builder::approx_pow2(F32 x) {
904	F32 f = fract(x);
905	F32 approx = add(x, `121.274057500f`);
906	approx = sub(approx, mul( `1.490129070f`, f));
907	approx = add(approx, div(`27.728023300f`, sub(`4.84252568f`, f)));
908
909	return bit_cast(round(mul(`1.0f` * (`1`<<`23`), approx)));
910	}
911
912	F32 Builder::approx_powf(F32 x, F32 y) {
913	auto is_x = bit_or(eq(x, `0.0f`),
914	eq(x, `1.0f`));
915	return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
916	}
917
918	// Bhaskara I's sine approximation
919	// 16x(pi - x) / (5pi^2 - 4x(pi - x)*
920	// ... divide by 4
921	// 4x(pi - x) / 5pi^2/4 - x(pi - x)*
922	//
923	// This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
924	// radians into that range first.
925	//
926	F32 Builder::approx_sin(F32 radians) {
927	constexpr float Pi = SK_ScalarPI;
928	// x = radians mod 2pi
929	F32 x = fract(radians * (`0.5f`/Pi)) * (`2`*Pi);
930	I32 neg = x > Pi; // are we pi < x < 2pi --> need to negate result
931	x = select(neg, x - Pi, x);
932
933	F32 pair = x * (Pi - x);
934	x = `4.0f` * pair / ((`5`PiPi/`4`) - pair);
935	x = select(neg, -x, x);
936	return x;
937	}
938
939	/ "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"*
940	https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf
941
942	approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9
943
944	Some simplifications:
945	1. tan(x) is periodic, -PI/2 < x < PI/2
946	2. tan(x) is odd, so tan(-x) = -tan(x)
947	3. Our polynomial approximation is best near zero, so we use the following identity
948	tan(x) + tan(y)
949	tan(x + y) = -----------------
950	1 - tan(x)tan(y)*
951	tan(PI/4) = 1
952
953	So for x > PI/8, we do the following refactor:
954	x' = x - PI/4
955
956	1 + tan(x')
957	tan(x) = ------------
958	1 - tan(x')
959	*/
960	F32 Builder::approx_tan(F32 x) {
961	constexpr float Pi = SK_ScalarPI;
962	// periodic between -pi/2 ... pi/2
963	// shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
964	x = fract((`1`/Pi)x + `0.5f`) Pi - (Pi/`2`);
965
966	I32 neg = (x < `0.0f`);
967	x = select(neg, -x, x);
968
969	// minimize total error by shifting if x > pi/8
970	I32 use_quotient = (x > (Pi/`8`));
971	x = select(use_quotient, x - (Pi/`4`), x);
972
973	// 9th order poly = 4th order(x^2) x*
974	x = poly(x x, `62`/`2835.0f`, `17`/`315.0f`, `2`/`15.0f`, `1`/`3.0f`, `1.0f`) x;
975	x = select(use_quotient, (`1`+x)/(`1`-x), x);
976	x = select(neg, -x, x);
977	return x;
978	}
979
980	F32 Builder::min(F32 x, F32 y) {
981	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
982	return {this, this->push(Op::min_f32, x.id, y.id)};
983	}
984	F32 Builder::max(F32 x, F32 y) {
985	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
986	return {this, this->push(Op::max_f32, x.id, y.id)};
987	}
988
989	I32 Builder::add(I32 x, I32 y) {
990	if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
991	if (this->isImm(x.id, `0`)) { return y; }
992	if (this->isImm(y.id, `0`)) { return x; }
993	return {this, this->push(Op::add_i32, x.id, y.id)};
994	}
995	I32 Builder::sub(I32 x, I32 y) {
996	if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
997	if (this->isImm(y.id, `0`)) { return x; }
998	return {this, this->push(Op::sub_i32, x.id, y.id)};
999	}
1000	I32 Builder::mul(I32 x, I32 y) {
1001	if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
1002	if (this->isImm(x.id, `0`)) { return splat(`0`); }
1003	if (this->isImm(y.id, `0`)) { return splat(`0`); }
1004	if (this->isImm(x.id, `1`)) { return y; }
1005	if (this->isImm(y.id, `1`)) { return x; }
1006	return {this, this->push(Op::mul_i32, x.id, y.id)};
1007	}
1008
1009	I32 Builder::add_16x2(I32 x, I32 y) { return {this, this->push(Op::add_i16x2, x.id, y.id)}; }
1010	I32 Builder::sub_16x2(I32 x, I32 y) { return {this, this->push(Op::sub_i16x2, x.id, y.id)}; }
1011	I32 Builder::mul_16x2(I32 x, I32 y) { return {this, this->push(Op::mul_i16x2, x.id, y.id)}; }
1012
1013	I32 Builder::shl(I32 x, int bits) {
1014	if (bits == `0`) { return x; }
1015	if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
1016	return {this, this->push(Op::shl_i32, x.id,NA,NA, bits)};
1017	}
1018	I32 Builder::shr(I32 x, int bits) {
1019	if (bits == `0`) { return x; }
1020	if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
1021	return {this, this->push(Op::shr_i32, x.id,NA,NA, bits)};
1022	}
1023	I32 Builder::sra(I32 x, int bits) {
1024	if (bits == `0`) { return x; }
1025	if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
1026	return {this, this->push(Op::sra_i32, x.id,NA,NA, bits)};
1027	}
1028
1029	I32 Builder::shl_16x2(I32 x, int k) { return {this, this->push(Op::shl_i16x2, x.id,NA,NA, k)}; }
1030	I32 Builder::shr_16x2(I32 x, int k) { return {this, this->push(Op::shr_i16x2, x.id,NA,NA, k)}; }
1031	I32 Builder::sra_16x2(I32 x, int k) { return {this, this->push(Op::sra_i16x2, x.id,NA,NA, k)}; }
1032
1033	I32 Builder:: eq(F32 x, F32 y) {
1034	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~`0` : `0`); }
1035	return {this, this->push(Op::eq_f32, x.id, y.id)};
1036	}
1037	I32 Builder::neq(F32 x, F32 y) {
1038	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~`0` : `0`); }
1039	return {this, this->push(Op::neq_f32, x.id, y.id)};
1040	}
1041	I32 Builder::lt(F32 x, F32 y) {
1042	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~`0` : `0`); }
1043	return {this, this->push(Op::gt_f32, y.id, x.id)};
1044	}
1045	I32 Builder::lte(F32 x, F32 y) {
1046	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~`0` : `0`); }
1047	return {this, this->push(Op::gte_f32, y.id, x.id)};
1048	}
1049	I32 Builder::gt(F32 x, F32 y) {
1050	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~`0` : `0`); }
1051	return {this, this->push(Op::gt_f32, x.id, y.id)};
1052	}
1053	I32 Builder::gte(F32 x, F32 y) {
1054	if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~`0` : `0`); }
1055	return {this, this->push(Op::gte_f32, x.id, y.id)};
1056	}
1057
1058	I32 Builder:: eq(I32 x, I32 y) {
1059	if (x.id == y.id) { return splat(~`0`); }
1060	return {this, this->push(Op:: eq_i32, x.id, y.id)};
1061	}
1062	I32 Builder::neq(I32 x, I32 y) {
1063	return {this, this->push(Op::neq_i32, x.id, y.id)};
1064	}
1065	I32 Builder:: gt(I32 x, I32 y) {
1066	return {this, this->push(Op:: gt_i32, x.id, y.id)};
1067	}
1068	I32 Builder::gte(I32 x, I32 y) {
1069	if (x.id == y.id) { return splat(~`0`); }
1070	return {this, this->push(Op::gte_i32, x.id, y.id)};
1071	}
1072	I32 Builder:: lt(I32 x, I32 y) { return y >x; }
1073	I32 Builder::lte(I32 x, I32 y) { return y >=x; }
1074
1075	I32 Builder:: eq_16x2(I32 x, I32 y) { return {this, this->push(Op:: eq_i16x2, x.id, y.id)}; }
1076	I32 Builder::neq_16x2(I32 x, I32 y) { return {this, this->push(Op::neq_i16x2, x.id, y.id)}; }
1077	I32 Builder:: lt_16x2(I32 x, I32 y) { return {this, this->push(Op:: gt_i16x2, y.id, x.id)}; }
1078	I32 Builder::lte_16x2(I32 x, I32 y) { return {this, this->push(Op::gte_i16x2, y.id, x.id)}; }
1079	I32 Builder:: gt_16x2(I32 x, I32 y) { return {this, this->push(Op:: gt_i16x2, x.id, y.id)}; }
1080	I32 Builder::gte_16x2(I32 x, I32 y) { return {this, this->push(Op::gte_i16x2, x.id, y.id)}; }
1081
1082	I32 Builder::bit_and(I32 x, I32 y) {
1083	if (x.id == y.id) { return x; }
1084	if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
1085	if (this->isImm(y.id, `0`)) { return splat(`0`); } // (x & false) == false
1086	if (this->isImm(x.id, `0`)) { return splat(`0`); } // (false & y) == false
1087	if (this->isImm(y.id,~`0`)) { return x; } // (x & true) == x
1088	if (this->isImm(x.id,~`0`)) { return y; } // (true & y) == y
1089	return {this, this->push(Op::bit_and, x.id, y.id)};
1090	}
1091	I32 Builder::bit_or(I32 x, I32 y) {
1092	if (x.id == y.id) { return x; }
1093	if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X\|Y); }
1094	if (this->isImm(y.id, `0`)) { return x; } // (x \| false) == x
1095	if (this->isImm(x.id, `0`)) { return y; } // (false \| y) == y
1096	if (this->isImm(y.id,~`0`)) { return splat(~`0`); } // (x \| true) == true
1097	if (this->isImm(x.id,~`0`)) { return splat(~`0`); } // (true \| y) == true
1098	return {this, this->push(Op::bit_or, x.id, y.id)};
1099	}
1100	I32 Builder::bit_xor(I32 x, I32 y) {
1101	if (x.id == y.id) { return splat(`0`); }
1102	if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
1103	if (this->isImm(y.id, `0`)) { return x; } // (x ^ false) == x
1104	if (this->isImm(x.id, `0`)) { return y; } // (false ^ y) == y
1105	return {this, this->push(Op::bit_xor, x.id, y.id)};
1106	}
1107
1108	I32 Builder::bit_clear(I32 x, I32 y) {
1109	if (x.id == y.id) { return splat(`0`); }
1110	if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
1111	if (this->isImm(y.id, `0`)) { return x; } // (x & ~false) == x
1112	if (this->isImm(y.id,~`0`)) { return splat(`0`); } // (x & ~true) == false
1113	if (this->isImm(x.id, `0`)) { return splat(`0`); } // (false & ~y) == false
1114	return {this, this->push(Op::bit_clear, x.id, y.id)};
1115	}
1116
1117	I32 Builder::select(I32 x, I32 y, I32 z) {
1118	if (y.id == z.id) { return y; }
1119	if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
1120	if (this->isImm(x.id,~`0`)) { return y; } // true ? y : z == y
1121	if (this->isImm(x.id, `0`)) { return z; } // false ? y : z == z
1122	if (this->isImm(y.id, `0`)) { return bit_clear(z,x); } // x ? 0 : z == ~x&z
1123	if (this->isImm(z.id, `0`)) { return bit_and (y,x); } // x ? y : 0 == x&y
1124	return {this, this->push(Op::select, x.id, y.id, z.id)};
1125	}
1126
1127	I32 Builder::extract(I32 x, int bits, I32 z) {
1128	if (unsigned Z; this->allImm(z.id,&Z) && (~`0u`>>bits) == Z) { return shr(x, bits); }
1129	return bit_and(z, shr(x, bits));
1130	}
1131
1132	I32 Builder::pack(I32 x, I32 y, int bits) {
1133	if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X\|(Y<<bits)); }
1134	return {this, this->push(Op::pack, x.id,y.id,NA, `0`,bits)};
1135	}
1136
1137	F32 Builder::floor(F32 x) {
1138	if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
1139	return {this, this->push(Op::floor, x.id)};
1140	}
1141	F32 Builder::to_f32(I32 x) {
1142	if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
1143	return {this, this->push(Op::to_f32, x.id)};
1144	}
1145	I32 Builder::trunc(F32 x) {
1146	if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
1147	return {this, this->push(Op::trunc, x.id)};
1148	}
1149	I32 Builder::round(F32 x) {
1150	if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
1151	return {this, this->push(Op::round, x.id)};
1152	}
1153
1154	F32 Builder::from_unorm(int bits, I32 x) {
1155	F32 limit = splat(`1` / ((`1`<<bits)-`1.0f`));
1156	return mul(to_f32(x), limit);
1157	}
1158	I32 Builder::to_unorm(int bits, F32 x) {
1159	F32 limit = splat((`1`<<bits)-`1.0f`);
1160	return round(mul(x, limit));
1161	}
1162
1163	Color Builder::unpack_1010102(I32 rgba) {
1164	return {
1165	from_unorm(`10`, extract(rgba, `0`, `0x3ff`)),
1166	from_unorm(`10`, extract(rgba, `10`, `0x3ff`)),
1167	from_unorm(`10`, extract(rgba, `20`, `0x3ff`)),
1168	from_unorm( `2`, extract(rgba, `30`, `0x3` )),
1169	};
1170	}
1171	Color Builder::unpack_8888(I32 rgba) {
1172	return {
1173	from_unorm(`8`, extract(rgba, `0`, `0xff`)),
1174	from_unorm(`8`, extract(rgba, `8`, `0xff`)),
1175	from_unorm(`8`, extract(rgba, `16`, `0xff`)),
1176	from_unorm(`8`, extract(rgba, `24`, `0xff`)),
1177	};
1178	}
1179	Color Builder::unpack_565(I32 bgr) {
1180	return {
1181	from_unorm(`5`, extract(bgr, `11`, `0b011'111`)),
1182	from_unorm(`6`, extract(bgr, `5`, `0b111'111`)),
1183	from_unorm(`5`, extract(bgr, `0`, `0b011'111`)),
1184	splat(`1.0f`),
1185	};
1186	}
1187
1188	void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
1189	skvm::F32 invA = `1.0f` / a,
1190	inf = bit_cast(splat(`0x7f800000`));
1191	// If a is 0, so are r,g,b, so set invA to 0 to avoid 0inf=NaN (instead 00 = 0).*
1192	invA = select(invA < inf, invA
1193	, `0.0f`);
1194	r = invA;
1195	g = invA;
1196	b = invA;
1197	}
1198
1199	void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
1200	r = a;
1201	g = a;
1202	b = a;
1203	}
1204
1205	Color Builder::uniformPremul(SkColor4f color, SkColorSpace* src,
1206	Uniforms* uniforms, SkColorSpace* dst) {
1207	SkColorSpaceXformSteps (src, kUnpremul_SkAlphaType,
1208	dst, kPremul_SkAlphaType).apply(color.vec());
1209	return {
1210	uniformF(uniforms->pushF(color.fR)),
1211	uniformF(uniforms->pushF(color.fG)),
1212	uniformF(uniforms->pushF(color.fB)),
1213	uniformF(uniforms->pushF(color.fA)),
1214	};
1215	}
1216
1217	Color Builder::lerp(Color lo, Color hi, F32 t) {
1218	return {
1219	lerp(lo.r, hi.r, t),
1220	lerp(lo.g, hi.g, t),
1221	lerp(lo.b, hi.b, t),
1222	lerp(lo.a, hi.a, t),
1223	};
1224	}
1225
1226	HSLA Builder::to_hsla(Color c) {
1227	F32 mx = max(max(c.r,c.g),c.b),
1228	mn = min(min(c.r,c.g),c.b),
1229	d = mx - mn,
1230	invd = `1.0f` / d,
1231	g_lt_b = select(c.g < c.b, splat(`6.0f`)
1232	, splat(`0.0f`));
1233
1234	F32 h = (`1`/`6.0f`) * select(mx == mn, `0.0f`,
1235	select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
1236	select(mx == c.g, invd * (c.b - c.r) + `2.0f`
1237	, invd * (c.r - c.g) + `4.0f`)));
1238
1239	F32 sum = mx + mn,
1240	l = sum * `0.5f`,
1241	s = select(mx == mn, `0.0f`
1242	, d / select(l > `0.5f`, `2.0f` - sum
1243	, sum));
1244	return {h, s, l, c.a};
1245	}
1246
1247	Color Builder::to_rgba(HSLA c) {
1248	// See GrRGBToHSLFilterEffect.fp
1249
1250	auto [h,s,l,a] = c;
1251	F32 x = s * (`1.0f` - abs(l + l - `1.0f`));
1252
1253	auto hue_to_rgb = [&,l=l](auto hue) {
1254	auto q = abs(`6.0f` * fract(hue) - `3.0f`) - `1.0f`;
1255	return x * (clamp01(q) - `0.5f`) + l;
1256	};
1257
1258	return {
1259	hue_to_rgb (h + `0`/`3.0f`),
1260	hue_to_rgb (h + `2`/`3.0f`),
1261	hue_to_rgb (h + `1`/`3.0f`),
1262	c.a,
1263	};
1264	}
1265
1266	// We're basing our implementation of non-separable blend modes on
1267	// https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1268	// and
1269	// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1270	// They're equivalent, but ES' math has been better simplified.
1271	//
1272	// Anything extra we add beyond that is to make the math work with premul inputs.
1273
1274	static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1275	return max(r, max(g, b))
1276	- min(r, min(g, b));
1277	}
1278
1279	static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1280	return r `0.30f` + g `0.59f` + b *`0.11f`;
1281	}
1282
1283	static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
1284	F32 mn = min(r, min(g, *b)),
1285	mx = max(r, max(g, *b)),
1286	sat = mx - mn;
1287
1288	// Map min channel to 0, max channel to s, and scale the middle proportionally.
1289	auto scale = [&](auto c) {
1290	// TODO: better to divide and check for non-finite result?
1291	return select(sat == `0.0f`, `0.0f`
1292	, ((c - mn) * s) / sat);
1293	};
1294	r = scale (r);
1295	g = scale (g);
1296	b = scale (b);
1297	}
1298
1299	static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1300	auto diff = lu - luminance(r, g, *b);
1301	*r += diff;
1302	*g += diff;
1303	*b += diff;
1304	}
1305
1306	static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
1307	F32 mn = min(r, min(g, *b)),
1308	mx = max(r, max(g, *b)),
1309	lu = luminance(r, g, *b);
1310
1311	auto clip = [&](auto c) {
1312	c = select(mn >= `0`, c
1313	, lu + ((c-lu)*( lu)) / (lu -mn));
1314	c = select(mx > a, lu + ((c-lu)*(a -lu)) / (mx -lu)
1315	, c);
1316	return clamp01(c); // May be a little negative, or worse, NaN.
1317	};
1318	r = clip (r);
1319	g = clip (g);
1320	b = clip (b);
1321	}
1322
1323	Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
1324	auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
1325	return x y + z w;
1326	};
1327
1328	auto two = [](skvm::F32 x) { return x +x; };
1329
1330	auto apply_rgba = [&](auto fn) {
1331	return Color {
1332	fn(src.r, dst.r),
1333	fn(src.g, dst.g),
1334	fn(src.b, dst.b),
1335	fn(src.a, dst.a),
1336	};
1337	};
1338
1339	auto apply_rgb_srcover_a = [&](auto fn) {
1340	return Color {
1341	fn(src.r, dst.r),
1342	fn(src.g, dst.g),
1343	fn(src.b, dst.b),
1344	mad(dst.a, `1`-src.a, src.a), // srcover for alpha
1345	};
1346	};
1347
1348	auto non_sep = [&](auto R, auto G, auto B) {
1349	return Color{
1350	R + mma (src.r, `1`-dst.a, dst.r, `1`-src.a),
1351	G + mma (src.g, `1`-dst.a, dst.g, `1`-src.a),
1352	B + mma (src.b, `1`-dst.a, dst.b, `1`-src.a),
1353	mad(dst.a, `1`-src.a, src.a), // srcover for alpha
1354	};
1355	};
1356
1357	switch (mode) {
1358	default: SkASSERT(false); /but also, for safety, fallthrough/
1359
1360	case SkBlendMode::kClear: return { splat(`0.0f`), splat(`0.0f`), splat(`0.0f`), splat(`0.0f`) };
1361
1362	case SkBlendMode::kSrc: return src;
1363	case SkBlendMode::kDst: return dst;
1364
1365	case SkBlendMode::kDstOver: std::swap(src, dst); // fall-through
1366	case SkBlendMode::kSrcOver:
1367	return apply_rgba ([&](auto s, auto d) {
1368	return mad(d,`1`-src.a, s);
1369	});
1370
1371	case SkBlendMode::kDstIn: std::swap(src, dst); // fall-through
1372	case SkBlendMode::kSrcIn:
1373	return apply_rgba ([&](auto s, auto d) {
1374	return s * dst.a;
1375	});
1376
1377	case SkBlendMode::kDstOut: std::swap(src, dst); // fall-through
1378	case SkBlendMode::kSrcOut:
1379	return apply_rgba ([&](auto s, auto d) {
1380	return s * (`1`-dst.a);
1381	});
1382
1383	case SkBlendMode::kDstATop: std::swap(src, dst); // fall-through
1384	case SkBlendMode::kSrcATop:
1385	return apply_rgba ([&](auto s, auto d) {
1386	return mma(s, dst.a, d, `1`-src.a);
1387	});
1388
1389	case SkBlendMode::kXor:
1390	return apply_rgba ([&](auto s, auto d) {
1391	return mma(s, `1`-dst.a, d, `1`-src.a);
1392	});
1393
1394	case SkBlendMode::kPlus:
1395	return apply_rgba ([&](auto s, auto d) {
1396	return min(s+d, `1.0f`);
1397	});
1398
1399	case SkBlendMode::kModulate:
1400	return apply_rgba ([&](auto s, auto d) {
1401	return s * d;
1402	});
1403
1404	case SkBlendMode::kScreen:
1405	// (s+d)-(sd) gave us trouble with our "r,g,b <= after blending" asserts.*
1406	// It's kind of plausible that s + (d - sd) keeps more precision?
1407	return apply_rgba ([&](auto s, auto d) {
1408	return s + (d - s*d);
1409	});
1410
1411	case SkBlendMode::kDarken:
1412	return apply_rgb_srcover_a ([&](auto s, auto d) {
1413	return s + (d - max(s * dst.a,
1414	d * src.a));
1415	});
1416
1417	case SkBlendMode::kLighten:
1418	return apply_rgb_srcover_a ([&](auto s, auto d) {
1419	return s + (d - min(s * dst.a,
1420	d * src.a));
1421	});
1422
1423	case SkBlendMode::kDifference:
1424	return apply_rgb_srcover_a ([&](auto s, auto d) {
1425	return s + (d - two(min(s * dst.a,
1426	d * src.a)));
1427	});
1428
1429	case SkBlendMode::kExclusion:
1430	return apply_rgb_srcover_a ([&](auto s, auto d) {
1431	return s + (d - two(s * d));
1432	});
1433
1434	case SkBlendMode::kColorBurn:
1435	return apply_rgb_srcover_a ([&](auto s, auto d) {
1436	// TODO: divide and check for non-finite result instead of checking for s == 0.
1437	auto mn = min(dst.a,
1438	src.a * (dst.a - d) / s),
1439	burn = src.a * (dst.a - mn) + mma(s, `1`-dst.a, d, `1`-src.a);
1440	return select(d == dst.a, s * (`1`-dst.a) + d,
1441	select(s == `0.0f` , d * (`1`-src.a)
1442	, burn));
1443	});
1444
1445	case SkBlendMode::kColorDodge:
1446	return apply_rgb_srcover_a ([&](auto s, auto d) {
1447	// TODO: divide and check for non-finite result instead of checking for s == sa.
1448	auto dodge = src.a * min(dst.a,
1449	d * src.a / (src.a - s))
1450	+ mma(s, `1`-dst.a, d, `1`-src.a);
1451	return select(d == `0.0f` , s * (`1`-dst.a),
1452	select(s == src.a, d * (`1`-src.a) + s
1453	, dodge));
1454	});
1455
1456	case SkBlendMode::kHardLight:
1457	return apply_rgb_srcover_a ([&](auto s, auto d) {
1458	return mma(s, `1`-dst.a, d, `1`-src.a) +
1459	select(two(s) <= src.a,
1460	two(s * d),
1461	src.a * dst.a - two((dst.a - d) * (src.a - s)));
1462	});
1463
1464	case SkBlendMode::kOverlay:
1465	return apply_rgb_srcover_a ([&](auto s, auto d) {
1466	return mma(s, `1`-dst.a, d, `1`-src.a) +
1467	select(two(d) <= dst.a,
1468	two(s * d),
1469	src.a * dst.a - two((dst.a - d) * (src.a - s)));
1470	});
1471
1472	case SkBlendMode::kMultiply:
1473	return apply_rgba ([&](auto s, auto d) {
1474	return mma(s, `1`-dst.a, d, `1`-src.a) + s * d;
1475	});
1476
1477	case SkBlendMode::kSoftLight:
1478	return apply_rgb_srcover_a ([&](auto s, auto d) {
1479	auto m = select(dst.a > `0.0f`, d / dst.a
1480	, `0.0f`),
1481	s2 = two(s),
1482	m4 = `4`*m;
1483
1484	// The logic forks three ways:
1485	// 1. dark src?
1486	// 2. light src, dark dst?
1487	// 3. light src, light dst?
1488
1489	// Used in case 1
1490	auto darkSrc = d * ((s2-src.a) * (`1`-m) + src.a),
1491	// Used in case 2
1492	darkDst = (m4 * m4 + m4) * (m-`1`) + `7`*m,
1493	// Used in case 3.
1494	liteDst = sqrt(m) - m,
1495	// Used in 2 or 3?
1496	liteSrc = dst.a * (s2 - src.a) * select(`4`*d <= dst.a, darkDst
1497	, liteDst)
1498	+ d * src.a;
1499	return s * (`1`-dst.a) + d * (`1`-src.a) + select(s2 <= src.a, darkSrc
1500	, liteSrc);
1501	});
1502
1503	case SkBlendMode::kHue: {
1504	skvm::F32 R = src.r * src.a,
1505	G = src.g * src.a,
1506	B = src.b * src.a;
1507
1508	set_sat (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
1509	set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1510	clip_color(&R, &G, &B, src.a * dst.a);
1511
1512	return non_sep (R, G, B);
1513	}
1514
1515	case SkBlendMode::kSaturation: {
1516	skvm::F32 R = dst.r * src.a,
1517	G = dst.g * src.a,
1518	B = dst.b * src.a;
1519
1520	set_sat (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
1521	set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1522	clip_color(&R, &G, &B, src.a * dst.a);
1523
1524	return non_sep (R, G, B);
1525	}
1526
1527	case SkBlendMode::kColor: {
1528	skvm::F32 R = src.r * dst.a,
1529	G = src.g * dst.a,
1530	B = src.b * dst.a;
1531
1532	set_lum (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
1533	clip_color(&R, &G, &B, src.a * dst.a);
1534
1535	return non_sep (R, G, B);
1536	}
1537
1538	case SkBlendMode::kLuminosity: {
1539	skvm::F32 R = dst.r * src.a,
1540	G = dst.g * src.a,
1541	B = dst.b * src.a;
1542
1543	set_lum (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
1544	clip_color(&R, &G, &B, dst.a * src.a);
1545
1546	return non_sep (R, G, B);
1547	}
1548	}
1549	}
1550
1551	// For a given program we'll store each Instruction's users contiguously in a table,
1552	// and track where each Instruction's span of users starts and ends in another index.
1553	// Here's a simple program that loads x and stores kx+k:
1554	//
1555	// v0 = splat(k)
1556	// v1 = load(...)
1557	// v2 = mul(v1, v0)
1558	// v3 = add(v2, v0)
1559	// v4 = store(..., v3)
1560	//
1561	// This program has 5 instructions v0-v4.
1562	// - v0 is used by v2 and v3
1563	// - v1 is used by v2
1564	// - v2 is used by v3
1565	// - v3 is used by v4
1566	// - v4 has a side-effect
1567	//
1568	// For this program we fill out these two arrays:
//     table:  [v2,v3, v2, v3, v4]
//     index:  [0,     2,  3,  4,  5, 5]   (last entry marks the end of v4's empty span)
1571	//
1572	// The table is just those "is used by ..." I wrote out above in order,
1573	// and the index tracks where an Instruction's span of users starts, table[index[id]].
1574	// The span continues up until the start of the next Instruction, table[index[id+1]].
1575	SkSpan<const Val> Usage::operator[](Val id) const {
1576	int begin = fIndex [id];
1577	int end = fIndex [id + `1`];
1578	return SkMakeSpan(fTable.data() + begin, end - begin);
1579	}
1580
1581	Usage::Usage(const std::vector<Instruction>& program) {
1582	// uses[id] counts the number of times each Instruction is used.
1583	std::vector<int> uses(program.size(), `0`);
1584	for (Val id = `0`; id < (Val)program.size(); id++) {
1585	Instruction inst = program [id];
1586	if (inst.x != NA) { ++uses [inst.x]; }
1587	if (inst.y != NA) { ++uses [inst.y]; }
1588	if (inst.z != NA) { ++uses [inst.z]; }
1589	}
1590
1591	// Build our index into fTable, with an extra entry marking the final Instruction's end.
1592	fIndex.reserve(program.size() + `1`);
1593	int total_uses = `0`;
1594	for (int n : uses) {
1595	fIndex.push_back(total_uses);
1596	total_uses += n;
1597	}
1598	fIndex.push_back(total_uses);
1599
1600	// Tick down each Instruction's uses to fill in fTable.
1601	fTable.resize(total_uses, NA);
1602	for (Val id = (Val)program.size(); id --> `0`; ) {
1603	Instruction inst = program [id];
1604	if (inst.x != NA) { fTable [fIndex [inst.x] + --uses [inst.x]] = id; }
1605	if (inst.y != NA) { fTable [fIndex [inst.y] + --uses [inst.y]] = id; }
1606	if (inst.z != NA) { fTable [fIndex [inst.z] + --uses [inst.z]] = id; }
1607	}
1608	for (int n : uses ) { (void)n; SkASSERT(n == `0` ); }
1609	for (Val id : fTable) { (void)id; SkASSERT(id != NA); }
1610	}
1611
1612	// ~~~~ Program::eval() and co. ~~~~ //
1613
1614	// Handy references for x86-64 instruction encoding:
1615	// https://wiki.osdev.org/X86-64_Instruction_Encoding
1616	// https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1617	// https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1618	// http://ref.x86asm.net/coder64.html
1619
1620	// Used for ModRM / immediate instruction encoding.
// Packs three fields into one byte as 2 bits | 3 bits | 3 bits (hence "_233"):
// a lands in bits 7-6, b in bits 5-3, c in bits 2-0.  Each argument is masked
// to its field width, so out-of-range bits are dropped.
static uint8_t _233(int a, int b, int c) {
    return (a & 3) << 6
         | (b & 7) << 3
         | (c & 7) << 0;
}
1626
1627	// ModRM byte encodes the arguments of an opcode.
1628	enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
1629	static uint8_t mod_rm(Mod mod, int reg, int rm) {
1630	return _233((int)mod, reg, rm);
1631	}
1632
1633	static Mod mod(int imm) {
1634	if (imm == `0`) { return Mod::Indirect; }
1635	if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1636	return Mod::FourByteImm;
1637	}
1638
1639	static int imm_bytes(Mod mod) {
1640	switch (mod) {
1641	case Mod::Indirect: return `0`;
1642	case Mod::OneByteImm: return `1`;
1643	case Mod::FourByteImm: return `4`;
1644	case Mod::Direct: SkUNREACHABLE;
1645	}
1646	SkUNREACHABLE;
1647	}
1648
1649	// SIB byte encodes a memory address, base + (index scale).*
1650	static uint8_t sib(Assembler::Scale scale, int index, int base) {
1651	return _233((int)scale, index, base);
1652	}
1653
1654	// The REX prefix is used to extend most old 32-bit instructions to 64-bit.
// Builds a REX prefix byte: 0100WRXB.
static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
                   bool R,   // Extra top bit to select ModRM reg, registers 8-15.
                   bool X,   // Extra top bit for SIB index register.
                   bool B) { // Extra top bit for SIB base or ModRM rm register.
    return 0b01000000   // Fixed 0100 for top four bits.
         | (W << 3)
         | (R << 2)
         | (X << 1)
         | (B << 0);
}
1665
1666
1667	// The VEX prefix extends SSE operations to AVX. Used generally, even with XMM.
// An encoded VEX prefix, which is either 2 or 3 bytes long.
struct VEX {
    int     len;        // Number of valid entries in bytes.
    uint8_t bytes[3];   // The prefix bytes; only the first len are meaningful.
};
1672
1673	static VEX vex(bool WE, // Like REX W for int operations, or opcode extension for float?
1674	bool R, // Same as REX R. Pass high bit of dst register, dst>>3.
1675	bool X, // Same as REX X.
1676	bool B, // Same as REX B. Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
1677	int map, // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
1678	int vvvv, // 4-bit second operand register. Pass our x for 3-arg ops.
1679	bool L, // Set for 256-bit ymm operations, off for 128-bit xmm.
1680	int pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
1681
1682	// Pack x86 opcode map selector to 5-bit VEX encoding.
1683	map = [map]{
1684	switch (map) {
1685	case `0x0f`: return `0b00001`;
1686	case `0x380f`: return `0b00010`;
1687	case `0x3a0f`: return `0b00011`;
1688	// Several more cases only used by XOP / TBM.
1689	}
1690	SkUNREACHABLE;
1691	}();
1692
1693	// Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding.
1694	pp = [pp]{
1695	switch (pp) {
1696	case `0x66`: return `0b01`;
1697	case `0xf3`: return `0b10`;
1698	case `0xf2`: return `0b11`;
1699	}
1700	return `0b00`;
1701	}();
1702
1703	VEX vex = {`0`, {`0`,`0`,`0`}};
1704	if (X == `0` && B == `0` && WE == `0` && map == `0b00001`) {
1705	// With these conditions met, we can optionally compress VEX to 2-byte.
1706	vex.len = `2`;
1707	vex.bytes[`0`] = `0xc5`;
1708	vex.bytes[`1`] = (pp & `3`) << `0`
1709	\| (L & `1`) << `2`
1710	\| (~vvvv & `15`) << `3`
1711	\| (~(int)R & `1`) << `7`;
1712	} else {
1713	// We could use this 3-byte VEX prefix all the time if we like.
1714	vex.len = `3`;
1715	vex.bytes[`0`] = `0xc4`;
1716	vex.bytes[`1`] = (map & `31`) << `0`
1717	\| (~(int)B & `1`) << `5`
1718	\| (~(int)X & `1`) << `6`
1719	\| (~(int)R & `1`) << `7`;
1720	vex.bytes[`2`] = (pp & `3`) << `0`
1721	\| (L & `1`) << `2`
1722	\| (~vvvv & `15`) << `3`
1723	\| (WE & `1`) << `7`;
1724	}
1725	return vex;
1726	}
1727
1728	Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(`0`) {}
1729
1730	size_t Assembler::size() const { return fSize; }
1731
1732	void Assembler::bytes(const void* p, int n) {
1733	if (fCurr) {
1734	memcpy(fCurr, p, n);
1735	fCurr += n;
1736	}
1737	fSize += n;
1738	}
1739
1740	void Assembler::byte(uint8_t b) { this->bytes(&b, `1`); }
1741	void Assembler::word(uint32_t w) { this->bytes(&w, `4`); }
1742
1743	void Assembler::align(int mod) {
1744	while (this->size() % mod) {
1745	this->byte(`0x00`);
1746	}
1747	}
1748
1749	void Assembler::int3() {
1750	this->byte(`0xcc`);
1751	}
1752
1753	void Assembler::vzeroupper() {
1754	this->byte(`0xc5`);
1755	this->byte(`0xf8`);
1756	this->byte(`0x77`);
1757	}
1758	void Assembler::ret() { this->byte(`0xc3`); }
1759
1760	// Common instruction building for 64-bit opcodes with an immediate argument.
1761	void Assembler::op(int opcode, int opcode_ext, GP64 dst, int imm) {
1762	opcode \|= `0b0000'0001`; // low bit set for 64-bit operands
1763	opcode \|= `0b1000'0000`; // top bit set for instructions with any immediate
1764
1765	int imm_bytes = `4`;
1766	if (SkTFitsIn<int8_t>(imm)) {
1767	imm_bytes = `1`;
1768	opcode \|= `0b0000'0010`; // second bit set for 8-bit immediate, else 32-bit.
1769	}
1770
1771	this->byte(rex(`1`,`0`,`0`,dst>>`3`));
1772	this->byte(opcode);
1773	this->byte(mod_rm(Mod::Direct, opcode_ext, dst&`7`));
1774	this->bytes(&imm, imm_bytes);
1775	}
1776
1777	void Assembler::add(GP64 dst, int imm) { this->op(`0`,`0b000`, dst,imm); }
1778	void Assembler::sub(GP64 dst, int imm) { this->op(`0`,`0b101`, dst,imm); }
1779	void Assembler::cmp(GP64 reg, int imm) { this->op(`0`,`0b111`, reg,imm); }
1780
1781	void Assembler::movq(GP64 dst, GP64 src, int off) {
1782	this->byte(rex(`1`,dst>>`3`,`0`,src>>`3`));
1783	this->byte(`0x8b`);
1784	this->byte(mod_rm(mod(off), dst&`7`, src&`7`));
1785	this->bytes(&off, imm_bytes(mod(off)));
1786	}
1787
1788	void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W/=false/) {
1789	VEX v = vex(W, dst>>`3`, `0`, y>>`3`,
1790	map, x, `1`/ymm, not xmm/, prefix);
1791	this->bytes(v.bytes, v.len);
1792	this->byte(opcode);
1793	this->byte(mod_rm(Mod::Direct, dst&`7`, y&`7`));
1794	}
1795
1796	void Assembler::vpaddd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0x66`, `0x0f`,`0xfe`, dst,x,y); }
1797	void Assembler::vpsubd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0x66`, `0x0f`,`0xfa`, dst,x,y); }
1798	void Assembler::vpmulld(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0x40`, dst,x,y); }
1799
1800	void Assembler::vpsubw (Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x0f`,`0xf9`, dst,x,y); }
1801	void Assembler::vpmullw(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x0f`,`0xd5`, dst,x,y); }
1802
1803	void Assembler::vpand (Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0x66`,`0x0f`,`0xdb`, dst,x,y); }
1804	void Assembler::vpor (Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0x66`,`0x0f`,`0xeb`, dst,x,y); }
1805	void Assembler::vpxor (Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0x66`,`0x0f`,`0xef`, dst,x,y); }
1806	void Assembler::vpandn(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x0f`,`0xdf`, dst,x,y); }
1807
1808	void Assembler::vaddps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0`,`0x0f`,`0x58`, dst,x,y); }
1809	void Assembler::vsubps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0`,`0x0f`,`0x5c`, dst,x,y); }
1810	void Assembler::vmulps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0`,`0x0f`,`0x59`, dst,x,y); }
1811	void Assembler::vdivps(Ymm dst, Ymm x, Ymm y) { this->op(`0`,`0x0f`,`0x5e`, dst,x,y); }
1812	void Assembler::vminps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0`,`0x0f`,`0x5d`, dst,x,y); }
1813	void Assembler::vmaxps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(`0`,`0x0f`,`0x5f`, dst,x,y); }
1814
1815	void Assembler::vfmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0x98`, dst,x,y); }
1816	void Assembler::vfmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0xa8`, dst,x,y); }
1817	void Assembler::vfmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0xb8`, dst,x,y); }
1818
1819	void Assembler::vfmsub132ps(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0x9a`, dst,x,y); }
1820	void Assembler::vfmsub213ps(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0xaa`, dst,x,y); }
1821	void Assembler::vfmsub231ps(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0xba`, dst,x,y); }
1822
1823	void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0x9c`, dst,x,y); }
1824	void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0xac`, dst,x,y); }
1825	void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0xbc`, dst,x,y); }
1826
1827	void Assembler::vpackusdw(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x380f`,`0x2b`, dst,x,y); }
1828	void Assembler::vpackuswb(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`, `0x0f`,`0x67`, dst,x,y); }
1829
1830	void Assembler::vpcmpeqd(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x0f`,`0x76`, dst,x,y); }
1831	void Assembler::vpcmpgtd(Ymm dst, Ymm x, Ymm y) { this->op(`0x66`,`0x0f`,`0x66`, dst,x,y); }
1832
1833	void Assembler::vcmpps(Ymm dst, Ymm x, Ymm y, int imm) {
1834	this->op(`0`,`0x0f`,`0xc2`, dst,x,y);
1835	this->byte(imm);
1836	}
1837
1838	void Assembler::vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z) {
1839	int prefix = `0x66`,
1840	map = `0x3a0f`,
1841	opcode = `0x4c`;
1842	VEX v = vex(`0`, dst>>`3`, `0`, y>>`3`,
1843	map, x, /ymm?/`1`, prefix);
1844	this->bytes(v.bytes, v.len);
1845	this->byte(opcode);
1846	this->byte(mod_rm(Mod::Direct, dst&`7`, y&`7`));
1847	this->byte(z << `4`);
1848	}
1849
1850	// dst = x op /opcode_ext imm
1851	void Assembler::op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm) {
1852	// This is a little weird, but if we pass the opcode_ext as if it were the dst register,
1853	// the dst register as if x, and the x register as if y, all the bits end up where we want.
1854	this->op(prefix, map, opcode, (Ymm)opcode_ext,dst,x);
1855	this->byte(imm);
1856	}
1857
1858	void Assembler::vpslld(Ymm dst, Ymm x, int imm) { this->op(`0x66`,`0x0f`,`0x72`,`6`, dst,x,imm); }
1859	void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { this->op(`0x66`,`0x0f`,`0x72`,`2`, dst,x,imm); }
1860	void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { this->op(`0x66`,`0x0f`,`0x72`,`4`, dst,x,imm); }
1861
1862	void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { this->op(`0x66`,`0x0f`,`0x71`,`2`, dst,x,imm); }
1863
1864
1865	void Assembler::vpermq(Ymm dst, Ymm x, int imm) {
1866	// A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
1867	bool W = true;
1868	this->op(`0x66`,`0x3a0f`,`0x00`, dst,x,W);
1869	this->byte(imm);
1870	}
1871
1872	void Assembler::vroundps(Ymm dst, Ymm x, int imm) {
1873	this->op(`0x66`,`0x3a0f`,`0x08`, dst,x);
1874	this->byte(imm);
1875	}
1876
1877	void Assembler::vmovdqa(Ymm dst, Ymm src) { this->op(`0x66`,`0x0f`,`0x6f`, dst,src); }
1878
1879	void Assembler::vcvtdq2ps (Ymm dst, Ymm x) { this->op( `0`,`0x0f`,`0x5b`, dst,x); }
1880	void Assembler::vcvttps2dq(Ymm dst, Ymm x) { this->op(`0xf3`,`0x0f`,`0x5b`, dst,x); }
1881	void Assembler::vcvtps2dq (Ymm dst, Ymm x) { this->op(`0x66`,`0x0f`,`0x5b`, dst,x); }
1882	void Assembler::vsqrtps (Ymm dst, Ymm x) { this->op( `0`,`0x0f`,`0x51`, dst,x); }
1883
1884	Assembler::Label Assembler::here() {
1885	return { (int)this->size(), Label::NotYetSet, {} };
1886	}
1887
1888	int Assembler::disp19(Label* l) {
1889	SkASSERT(l->kind == Label::NotYetSet \|\|
1890	l->kind == Label::ARMDisp19);
1891	l->kind = Label::ARMDisp19;
1892	l->references.push_back(here().offset);
1893	// ARM 19-bit instruction count, from the beginning of this instruction.
1894	return (l->offset - here().offset) / `4`;
1895	}
1896
1897	int Assembler::disp32(Label* l) {
1898	SkASSERT(l->kind == Label::NotYetSet \|\|
1899	l->kind == Label::X86Disp32);
1900	l->kind = Label::X86Disp32;
1901	l->references.push_back(here().offset);
1902	// x86 32-bit byte count, from the end of this instruction.
1903	return l->offset - (here().offset + `4`);
1904	}
1905
1906	void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l) {
1907	// IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
1908	const int rip = rbp;
1909
1910	VEX v = vex(`0`, dst>>`3`, `0`, rip>>`3`,
1911	map, x, /ymm?/`1`, prefix);
1912	this->bytes(v.bytes, v.len);
1913	this->byte(opcode);
1914	this->byte(mod_rm(Mod::Indirect, dst&`7`, rip&`7`));
1915	this->word(this->disp32(l));
1916	}
1917
1918	void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, YmmOrLabel y) {
1919	y.label ? this->op(prefix,map,opcode,dst,x, y.label)
1920	: this->op(prefix,map,opcode,dst,x, y.ymm );
1921	}
1922
1923	void Assembler::vpshufb(Ymm dst, Ymm x, Label* l) { this->op(`0x66`,`0x380f`,`0x00`, dst,x,l); }
1924	void Assembler::vptest(Ymm dst, Label* l) { this->op(`0x66`, `0x380f`, `0x17`, dst, (Ymm)`0`, l); }
1925
1926	void Assembler::vbroadcastss(Ymm dst, Label* l) { this->op(`0x66`,`0x380f`,`0x18`, dst, (Ymm)`0`, l); }
1927	void Assembler::vbroadcastss(Ymm dst, Xmm src) { this->op(`0x66`,`0x380f`,`0x18`, dst, (Ymm)src); }
1928	void Assembler::vbroadcastss(Ymm dst, GP64 ptr, int off) {
1929	int prefix = `0x66`,
1930	map = `0x380f`,
1931	opcode = `0x18`;
1932	VEX v = vex(`0`, dst>>`3`, `0`, ptr>>`3`,
1933	map, `0`, /ymm?/`1`, prefix);
1934	this->bytes(v.bytes, v.len);
1935	this->byte(opcode);
1936
1937	this->byte(mod_rm(mod(off), dst&`7`, ptr&`7`));
1938	this->bytes(&off, imm_bytes(mod(off)));
1939	}
1940
1941	void Assembler::jump(uint8_t condition, Label* l) {
1942	// These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
1943	// 7? one-byte-disp
1944	// 0F 8? four-byte-disp
1945	// We always use the near displacement to make updating labels simpler (no resizing).
1946	this->byte(`0x0f`);
1947	this->byte(condition);
1948	this->word(this->disp32(l));
1949	}
1950	void Assembler::je (Label* l) { this->jump(`0x84`, l); }
1951	void Assembler::jne(Label* l) { this->jump(`0x85`, l); }
1952	void Assembler::jl (Label* l) { this->jump(`0x8c`, l); }
1953	void Assembler::jc (Label* l) { this->jump(`0x82`, l); }
1954
1955	void Assembler::jmp(Label* l) {
1956	// Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
1957	this->byte(`0xe9`);
1958	this->word(this->disp32(l));
1959	}
1960
1961	void Assembler::load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr) {
1962	VEX v = vex(`0`, ymm>>`3`, `0`, ptr>>`3`,
1963	map, `0`, /ymm?/`1`, prefix);
1964	this->bytes(v.bytes, v.len);
1965	this->byte(opcode);
1966	this->byte(mod_rm(Mod::Indirect, ymm&`7`, ptr&`7`));
1967	}
1968
1969	void Assembler::vmovups (Ymm dst, GP64 src) { this->load_store(`0` , `0x0f`,`0x10`, dst,src); }
1970	void Assembler::vpmovzxwd(Ymm dst, GP64 src) { this->load_store(`0x66`,`0x380f`,`0x33`, dst,src); }
1971	void Assembler::vpmovzxbd(Ymm dst, GP64 src) { this->load_store(`0x66`,`0x380f`,`0x31`, dst,src); }
1972
1973	void Assembler::vmovups (GP64 dst, Ymm src) { this->load_store(`0` , `0x0f`,`0x11`, src,dst); }
1974	void Assembler::vmovups (GP64 dst, Xmm src) {
1975	// Same as vmovups(GP64,YMM) and load_store() except ymm? is 0.
1976	int prefix = `0`,
1977	map = `0x0f`,
1978	opcode = `0x11`;
1979	VEX v = vex(`0`, src>>`3`, `0`, dst>>`3`,
1980	map, `0`, /ymm?/`0`, prefix);
1981	this->bytes(v.bytes, v.len);
1982	this->byte(opcode);
1983	this->byte(mod_rm(Mod::Indirect, src&`7`, dst&`7`));
1984	}
1985
1986	void Assembler::stack_load_store(int prefix, int map, int opcode, Ymm ymm, int off) {
1987	VEX v = vex(`0`, ymm>>`3`, `0`, rsp>>`3`/i.e. 0/,
1988	map, `0`, /ymm?/`1`, prefix);
1989	this->bytes(v.bytes, v.len);
1990	this->byte(opcode);
1991	this->byte(mod_rm(mod(off), ymm&`7`, rsp/use SIB/));
1992	this->byte(sib(ONE, rsp/no index/, rsp));
1993	this->bytes(&off, imm_bytes(mod(off)));
1994	}
1995	void Assembler::vmovups(Ymm dst, int off) { this->stack_load_store(`0`, `0x0f`, `0x10`, dst,off); }
1996	void Assembler::vmovups(int off, Ymm src) { this->stack_load_store(`0`, `0x0f`, `0x11`, src,off); }
1997
1998	void Assembler::vmovq(GP64 dst, Xmm src) {
1999	int prefix = `0x66`,
2000	map = `0x0f`,
2001	opcode = `0xd6`;
2002	VEX v = vex(`0`, src>>`3`, `0`, dst>>`3`,
2003	map, `0`, /ymm?/`0`, prefix);
2004	this->bytes(v.bytes, v.len);
2005	this->byte(opcode);
2006	this->byte(mod_rm(Mod::Indirect, src&`7`, dst&`7`));
2007	}
2008
2009	void Assembler::vmovd(GP64 dst, Xmm src) {
2010	int prefix = `0x66`,
2011	map = `0x0f`,
2012	opcode = `0x7e`;
2013	VEX v = vex(`0`, src>>`3`, `0`, dst>>`3`,
2014	map, `0`, /ymm?/`0`, prefix);
2015	this->bytes(v.bytes, v.len);
2016	this->byte(opcode);
2017	this->byte(mod_rm(Mod::Indirect, src&`7`, dst&`7`));
2018	}
2019
2020	void Assembler::vmovd_direct(GP64 dst, Xmm src) {
2021	int prefix = `0x66`,
2022	map = `0x0f`,
2023	opcode = `0x7e`;
2024	VEX v = vex(`0`, src>>`3`, `0`, dst>>`3`,
2025	map, `0`, /ymm?/`0`, prefix);
2026	this->bytes(v.bytes, v.len);
2027	this->byte(opcode);
2028	this->byte(mod_rm(Mod::Direct, src&`7`, dst&`7`));
2029	}
2030
2031	void Assembler::vmovd(Xmm dst, GP64 src) {
2032	int prefix = `0x66`,
2033	map = `0x0f`,
2034	opcode = `0x6e`;
2035	VEX v = vex(`0`, dst>>`3`, `0`, src>>`3`,
2036	map, `0`, /ymm?/`0`, prefix);
2037	this->bytes(v.bytes, v.len);
2038	this->byte(opcode);
2039	this->byte(mod_rm(Mod::Indirect, dst&`7`, src&`7`));
2040	}
2041
2042	void Assembler::vmovd(Xmm dst, Scale scale, GP64 index, GP64 base) {
2043	int prefix = `0x66`,
2044	map = `0x0f`,
2045	opcode = `0x6e`;
2046	VEX v = vex(`0`, dst>>`3`, index>>`3`, base>>`3`,
2047	map, `0`, /ymm?/`0`, prefix);
2048	this->bytes(v.bytes, v.len);
2049	this->byte(opcode);
2050	this->byte(mod_rm(Mod::Indirect, dst&`7`, rsp/use SIB/));
2051	this->byte(sib(scale, index&`7`, base&`7`));
2052	}
2053
2054	void Assembler::vmovd_direct(Xmm dst, GP64 src) {
2055	int prefix = `0x66`,
2056	map = `0x0f`,
2057	opcode = `0x6e`;
2058	VEX v = vex(`0`, dst>>`3`, `0`, src>>`3`,
2059	map, `0`, /ymm?/`0`, prefix);
2060	this->bytes(v.bytes, v.len);
2061	this->byte(opcode);
2062	this->byte(mod_rm(Mod::Direct, dst&`7`, src&`7`));
2063	}
2064
2065	void Assembler::movzbl(GP64 dst, GP64 src, int off) {
2066	if ((dst>>`3`) \|\| (src>>`3`)) {
2067	this->byte(rex(`0`,dst>>`3`,`0`,src>>`3`));
2068	}
2069	this->byte(`0x0f`);
2070	this->byte(`0xb6`);
2071	this->byte(mod_rm(mod(off), dst&`7`, src&`7`));
2072	this->bytes(&off, imm_bytes(mod(off)));
2073	}
2074
2075
2076	void Assembler::movb(GP64 dst, GP64 src) {
2077	if ((dst>>`3`) \|\| (src>>`3`)) {
2078	this->byte(rex(`0`,src>>`3`,`0`,dst>>`3`));
2079	}
2080	this->byte(`0x88`);
2081	this->byte(mod_rm(Mod::Indirect, src&`7`, dst&`7`));
2082	}
2083
2084	void Assembler::vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm) {
2085	int prefix = `0x66`,
2086	map = `0x0f`,
2087	opcode = `0xc4`;
2088	VEX v = vex(`0`, dst>>`3`, `0`, ptr>>`3`,
2089	map, src, /ymm?/`0`, prefix);
2090	this->bytes(v.bytes, v.len);
2091	this->byte(opcode);
2092	this->byte(mod_rm(Mod::Indirect, dst&`7`, ptr&`7`));
2093	this->byte(imm);
2094	}
2095
2096	void Assembler::vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm) {
2097	int prefix = `0x66`,
2098	map = `0x3a0f`,
2099	opcode = `0x20`;
2100	VEX v = vex(`0`, dst>>`3`, `0`, ptr>>`3`,
2101	map, src, /ymm?/`0`, prefix);
2102	this->bytes(v.bytes, v.len);
2103	this->byte(opcode);
2104	this->byte(mod_rm(Mod::Indirect, dst&`7`, ptr&`7`));
2105	this->byte(imm);
2106	}
2107
2108	void Assembler::vpextrw(GP64 ptr, Xmm src, int imm) {
2109	int prefix = `0x66`,
2110	map = `0x3a0f`,
2111	opcode = `0x15`;
2112
2113	VEX v = vex(`0`, src>>`3`, `0`, ptr>>`3`,
2114	map, `0`, /ymm?/`0`, prefix);
2115	this->bytes(v.bytes, v.len);
2116	this->byte(opcode);
2117	this->byte(mod_rm(Mod::Indirect, src&`7`, ptr&`7`));
2118	this->byte(imm);
2119	}
2120	void Assembler::vpextrb(GP64 ptr, Xmm src, int imm) {
2121	int prefix = `0x66`,
2122	map = `0x3a0f`,
2123	opcode = `0x14`;
2124
2125	VEX v = vex(`0`, src>>`3`, `0`, ptr>>`3`,
2126	map, `0`, /ymm?/`0`, prefix);
2127	this->bytes(v.bytes, v.len);
2128	this->byte(opcode);
2129	this->byte(mod_rm(Mod::Indirect, src&`7`, ptr&`7`));
2130	this->byte(imm);
2131	}
2132
2133	void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
2134	// Unlike most instructions, no aliasing is permitted here.
2135	SkASSERT(dst != ix);
2136	SkASSERT(dst != mask);
2137	SkASSERT(mask != ix);
2138
2139	int prefix = `0x66`,
2140	map = `0x380f`,
2141	opcode = `0x92`;
2142	VEX v = vex(`0`, dst>>`3`, ix>>`3`, base>>`3`,
2143	map, mask, /ymm?/`1`, prefix);
2144	this->bytes(v.bytes, v.len);
2145	this->byte(opcode);
2146	this->byte(mod_rm(Mod::Indirect, dst&`7`, rsp/use SIB/));
2147	this->byte(sib(scale, ix&`7`, base&`7`));
2148	}
2149
2150	// https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
2151
// Literal suffix: N_mask is an int with the low N bits set, e.g. 5_mask == 0b11111.
// The shift is done on an unsigned value so that 31_mask does not overflow a signed
// int, and 32 or more bits yields all bits set rather than an out-of-range shift.
static constexpr int operator"" _mask(unsigned long long bits) {
    return bits >= 32 ? -1
                      : static_cast<int>((1u << static_cast<unsigned>(bits)) - 1u);
}
2153
2154	void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
2155	this->word( (hi & `11_mask`) << `21`
2156	\| (m & `5_mask`) << `16`
2157	\| (lo & `6_mask`) << `10`
2158	\| (n & `5_mask`) << `5`
2159	\| (d & `5_mask`) << `0`);
2160	}
2161
2162	void Assembler::and16b(V d, V n, V m) { this->op(`0b0'1'0'01110'00'1`, m, `0b00011'1`, n, d); }
2163	void Assembler::orr16b(V d, V n, V m) { this->op(`0b0'1'0'01110'10'1`, m, `0b00011'1`, n, d); }
2164	void Assembler::eor16b(V d, V n, V m) { this->op(`0b0'1'1'01110'00'1`, m, `0b00011'1`, n, d); }
2165	void Assembler::bic16b(V d, V n, V m) { this->op(`0b0'1'0'01110'01'1`, m, `0b00011'1`, n, d); }
2166	void Assembler::bsl16b(V d, V n, V m) { this->op(`0b0'1'1'01110'01'1`, m, `0b00011'1`, n, d); }
2167	void Assembler::not16b(V d, V n) { this->op(`0b0'1'1'01110'00'10000'00101'10`, n, d); }
2168
2169	void Assembler::add4s(V d, V n, V m) { this->op(`0b0'1'0'01110'10'1`, m, `0b10000'1`, n, d); }
2170	void Assembler::sub4s(V d, V n, V m) { this->op(`0b0'1'1'01110'10'1`, m, `0b10000'1`, n, d); }
2171	void Assembler::mul4s(V d, V n, V m) { this->op(`0b0'1'0'01110'10'1`, m, `0b10011'1`, n, d); }
2172
2173	void Assembler::cmeq4s(V d, V n, V m) { this->op(`0b0'1'1'01110'10'1`, m, `0b10001'1`, n, d); }
2174	void Assembler::cmgt4s(V d, V n, V m) { this->op(`0b0'1'0'01110'10'1`, m, `0b0011'0'1`, n, d); }
2175
2176	void Assembler::sub8h(V d, V n, V m) { this->op(`0b0'1'1'01110'01'1`, m, `0b10000'1`, n, d); }
2177	void Assembler::mul8h(V d, V n, V m) { this->op(`0b0'1'0'01110'01'1`, m, `0b10011'1`, n, d); }
2178
2179	void Assembler::fadd4s(V d, V n, V m) { this->op(`0b0'1'0'01110'0'0'1`, m, `0b11010'1`, n, d); }
2180	void Assembler::fsub4s(V d, V n, V m) { this->op(`0b0'1'0'01110'1'0'1`, m, `0b11010'1`, n, d); }
2181	void Assembler::fmul4s(V d, V n, V m) { this->op(`0b0'1'1'01110'0'0'1`, m, `0b11011'1`, n, d); }
2182	void Assembler::fdiv4s(V d, V n, V m) { this->op(`0b0'1'1'01110'0'0'1`, m, `0b11111'1`, n, d); }
2183	void Assembler::fmin4s(V d, V n, V m) { this->op(`0b0'1'0'01110'1'0'1`, m, `0b11110'1`, n, d); }
2184	void Assembler::fmax4s(V d, V n, V m) { this->op(`0b0'1'0'01110'0'0'1`, m, `0b11110'1`, n, d); }
2185	void Assembler::fneg4s(V d, V n) { this->op(`0b0'1'1'01110'1'0'10000'01111'10`, n, d); }
2186
2187	void Assembler::fcmeq4s(V d, V n, V m) { this->op(`0b0'1'0'01110'0'0'1`, m, `0b1110'0'1`, n, d); }
2188	void Assembler::fcmgt4s(V d, V n, V m) { this->op(`0b0'1'1'01110'1'0'1`, m, `0b1110'0'1`, n, d); }
2189	void Assembler::fcmge4s(V d, V n, V m) { this->op(`0b0'1'1'01110'0'0'1`, m, `0b1110'0'1`, n, d); }
2190
2191	void Assembler::fmla4s(V d, V n, V m) { this->op(`0b0'1'0'01110'0'0'1`, m, `0b11001'1`, n, d); }
2192	void Assembler::fmls4s(V d, V n, V m) { this->op(`0b0'1'0'01110'1'0'1`, m, `0b11001'1`, n, d); }
2193
2194	void Assembler::tbl(V d, V n, V m) { this->op(`0b0'1'001110'00'0`, m, `0b0'00'0'00`, n, d); }
2195
2196	void Assembler::op(uint32_t op22, int imm, V n, V d) {
2197	this->word( (op22 & `22_mask`) << `10`
2198	\| imm << `16` // imm is embedded inside op, bit size depends on op
2199	\| (n & `5_mask`) << `5`
2200	\| (d & `5_mask`) << `0`);
2201	}
2202
2203	void Assembler::sli4s(V d, V n, int imm) {
2204	this->op(`0b0'1'1'011110'0100'000'01010'1`, ( imm&`31`), n, d);
2205	}
2206	void Assembler::shl4s(V d, V n, int imm) {
2207	this->op(`0b0'1'0'011110'0100'000'01010'1`, ( imm&`31`), n, d);
2208	}
2209	void Assembler::sshr4s(V d, V n, int imm) {
2210	this->op(`0b0'1'0'011110'0100'000'00'0'0'0'1`, (-imm&`31`), n, d);
2211	}
2212	void Assembler::ushr4s(V d, V n, int imm) {
2213	this->op(`0b0'1'1'011110'0100'000'00'0'0'0'1`, (-imm&`31`), n, d);
2214	}
2215	void Assembler::ushr8h(V d, V n, int imm) {
2216	this->op(`0b0'1'1'011110'0010'000'00'0'0'0'1`, (-imm&`15`), n, d);
2217	}
2218
2219	void Assembler::scvtf4s (V d, V n) { this->op(`0b0'1'0'01110'0'0'10000'11101'10`, n,d); }
2220	void Assembler::fcvtzs4s(V d, V n) { this->op(`0b0'1'0'01110'1'0'10000'1101'1'10`, n,d); }
2221	void Assembler::fcvtns4s(V d, V n) { this->op(`0b0'1'0'01110'0'0'10000'1101'0'10`, n,d); }
2222
2223	void Assembler::xtns2h(V d, V n) { this->op(`0b0'0'0'01110'01'10000'10010'10`, n,d); }
2224	void Assembler::xtnh2b(V d, V n) { this->op(`0b0'0'0'01110'00'10000'10010'10`, n,d); }
2225
2226	void Assembler::uxtlb2h(V d, V n) { this->op(`0b0'0'1'011110'0001'000'10100'1`, n,d); }
2227	void Assembler::uxtlh2s(V d, V n) { this->op(`0b0'0'1'011110'0010'000'10100'1`, n,d); }
2228
2229	void Assembler::uminv4s(V d, V n) { this->op(`0b0'1'1'01110'10'11000'1'1010'10`, n,d); }
2230
2231	void Assembler::brk(int imm16) {
2232	this->word(`0b11010100'001'0000000000000000'000'00`
2233	\| (imm16 & `16_mask`) << `5`);
2234	}
2235
2236	void Assembler::ret(X n) {
2237	this->word(`0b1101011'0'0'10'11111'0000'0'0` << `10`
2238	\| (n & `5_mask`) << `5`);
2239	}
2240
2241	void Assembler::add(X d, X n, int imm12) {
2242	this->word(`0b1'0'0'10001'00` << `22`
2243	\| (imm12 & `12_mask`) << `10`
2244	\| (n & `5_mask`) << `5`
2245	\| (d & `5_mask`) << `0`);
2246	}
2247	void Assembler::sub(X d, X n, int imm12) {
2248	this->word( `0b1'1'0'10001'00` << `22`
2249	\| (imm12 & `12_mask`) << `10`
2250	\| (n & `5_mask`) << `5`
2251	\| (d & `5_mask`) << `0`);
2252	}
2253	void Assembler::subs(X d, X n, int imm12) {
2254	this->word( `0b1'1'1'10001'00` << `22`
2255	\| (imm12 & `12_mask`) << `10`
2256	\| (n & `5_mask`) << `5`
2257	\| (d & `5_mask`) << `0`);
2258	}
2259
2260	void Assembler::b(Condition cond, Label* l) {
2261	const int imm19 = this->disp19(l);
2262	this->word( `0b0101010'0` << `24`
2263	\| (imm19 & `19_mask`) << `5`
2264	\| ((int)cond & `4_mask`) << `0`);
2265	}
2266	void Assembler::cbz(X t, Label* l) {
2267	const int imm19 = this->disp19(l);
2268	this->word( `0b1'011010'0` << `24`
2269	\| (imm19 & `19_mask`) << `5`
2270	\| (t & `5_mask`) << `0`);
2271	}
2272	void Assembler::cbnz(X t, Label* l) {
2273	const int imm19 = this->disp19(l);
2274	this->word( `0b1'011010'1` << `24`
2275	\| (imm19 & `19_mask`) << `5`
2276	\| (t & `5_mask`) << `0`);
2277	}
2278
2279	void Assembler::ldrq(V dst, X src) { this->op(`0b00'111'1'01'11'000000000000`, src, dst); }
2280	void Assembler::ldrs(V dst, X src) { this->op(`0b10'111'1'01'01'000000000000`, src, dst); }
2281	void Assembler::ldrb(V dst, X src) { this->op(`0b00'111'1'01'01'000000000000`, src, dst); }
2282
2283	void Assembler::strq(V src, X dst) { this->op(`0b00'111'1'01'10'000000000000`, dst, src); }
2284	void Assembler::strs(V src, X dst) { this->op(`0b10'111'1'01'00'000000000000`, dst, src); }
2285	void Assembler::strb(V src, X dst) { this->op(`0b00'111'1'01'00'000000000000`, dst, src); }
2286
2287	void Assembler::fmovs(X dst, V src) {
2288	this->word(`0b0'0'0'11110'00'1'00'110'000000` << `10`
2289	\| (src & `5_mask`) << `5`
2290	\| (dst & `5_mask`) << `0`);
2291	}
2292
2293	void Assembler::ldrq(V dst, Label* l) {
2294	const int imm19 = this->disp19(l);
2295	this->word( `0b10'011'1'00` << `24`
2296	\| (imm19 & `19_mask`) << `5`
2297	\| (dst & `5_mask`) << `0`);
2298	}
2299
2300	void Assembler::label(Label* l) {
2301	if (fCode) {
2302	// The instructions all currently point to l->offset.
2303	// We'll want to add a delta to point them to here().
2304	int delta = here().offset - l->offset;
2305	l->offset = here().offset;
2306
2307	if (l->kind == Label::ARMDisp19) {
2308	for (int ref : l->references) {
2309	// ref points to a 32-bit instruction with 19-bit displacement in instructions.
2310	uint32_t inst;
2311	memcpy(&inst, fCode + ref, `4`);
2312
2313	// [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
2314	int disp = (int)(inst << `8`) >> `13`;
2315
2316	disp += delta/`4`; // delta is in bytes, we want instructions.
2317
2318	// Put it all back together, preserving the high 8 bits and low 5.
2319	inst = ((disp << `5`) & (`19_mask` << `5`))
2320	\| ((inst ) & ~(`19_mask` << `5`));
2321
2322	memcpy(fCode + ref, &inst, `4`);
2323	}
2324	}
2325
2326	if (l->kind == Label::X86Disp32) {
2327	for (int ref : l->references) {
2328	// ref points to a 32-bit displacement in bytes.
2329	int disp;
2330	memcpy(&disp, fCode + ref, `4`);
2331
2332	disp += delta;
2333
2334	memcpy(fCode + ref, &disp, `4`);
2335	}
2336	}
2337	}
2338	}
2339
2340	void Program::eval(int n, void* args[]) const {
2341	#define SKVM_JIT_STATS 0
2342	#if SKVM_JIT_STATS
2343	static std::atomic<int64_t> calls{`0`}, jits{`0`},
2344	pixels{`0`}, fast{`0`};
2345	pixels += n;
2346	if (`0` == calls++) {
2347	atexit([]{
2348	int64_t num = jits .load(),
2349	den = calls.load();
2350	SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (`100.0` * num)/den, den);
2351	num = fast .load();
2352	den = pixels.load();
2353	SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (`100.0` * num)/den, den);
2354	});
2355	}
2356	#endif
2357	// This may fail either simply because we can't JIT, or when using LLVM,
2358	// because the work represented by fImpl->llvm_compiling hasn't finished yet.
2359	if (const void* b = fImpl ->jit_entry.load()) {
2360	#if SKVM_JIT_STATS
2361	jits++;
2362	fast += n;
2363	#endif
2364	void** a = args;
2365	switch (fImpl ->strides.size()) {
2366	case `0`: return ((void()(int* ))b)(n );
2367	case `1`: return ((void()(int,void** ))b)(n,a[`0`] );
2368	case `2`: return ((void()(int,void*,void** ))b)(n,a[`0`],a[`1`] );
2369	case `3`: return ((void()(int,void*,void*,void** ))b)(n,a[`0`],a[`1`],a[`2`] );
2370	case `4`: return ((void()(int,void*,void*,void*,void**))b)(n,a[`0`],a[`1`],a[`2`],a[`3`]);
2371	case `5`: return ((void()(int,void*,void*,void*,void*,void**))b)
2372	(n,a[`0`],a[`1`],a[`2`],a[`3`],a[`4`]);
2373	default: SkUNREACHABLE; // TODO
2374	}
2375	}
2376
2377	// So we'll sometimes use the interpreter here even if later calls will use the JIT.
2378	SkOpts::interpret_skvm(fImpl ->instructions.data(), (int)fImpl ->instructions.size(),
2379	this->nregs(), this->loop(), fImpl ->strides.data(), this->nargs(),
2380	n, args);
2381	}
2382
2383	#if defined(SKVM_LLVM)
2384	void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions,
2385	const char* debug_name) {
2386	auto ctx = std::make_unique<llvm::LLVMContext>();
2387
2388	auto mod = std::make_unique<llvm::Module>("", *ctx);
2389	// All the scary bare pointers from here on are owned by ctx or mod, I think.
2390
2391	// Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines.
2392	const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? `8` : `4`;
2393
2394	llvm::Type ptr = llvm::Type::getInt8Ty(ctx)->getPointerTo(),
2395	i32 = llvm::Type::getInt32Ty(ctx);
2396
2397	std::vector<llvm::Type*> arg_types = { i32 };
2398	for (size_t i = `0`; i < fImpl->strides.size(); i++) {
2399	arg_types.push_back(ptr);
2400	}
2401
2402	llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx),
2403	arg_types, /vararg?=/false);
2404	llvm::Function* fn
2405	= llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod);
2406	for (size_t i = `0`; i < fImpl->strides.size(); i++) {
2407	fn->addParamAttr(i+`1`, llvm::Attribute::NoAlias);
2408	}
2409
2410	llvm::BasicBlock enter = llvm::BasicBlock::Create(ctx, "enter" , fn),
2411	hoistK = llvm::BasicBlock::Create(ctx, "hoistK", fn),
2412	testK = llvm::BasicBlock::Create(ctx, "testK" , fn),
2413	loopK = llvm::BasicBlock::Create(ctx, "loopK" , fn),
2414	hoist1 = llvm::BasicBlock::Create(ctx, "hoist1", fn),
2415	test1 = llvm::BasicBlock::Create(ctx, "test1" , fn),
2416	loop1 = llvm::BasicBlock::Create(ctx, "loop1" , fn),
2417	leave = llvm::BasicBlock::Create(ctx, "leave" , fn);
2418
2419	using IRBuilder = llvm::IRBuilder<>;
2420
2421	llvm::PHINode* n;
2422	std::vector<llvm::PHINode*> args;
2423	std::vector<llvm::Value*> vals(instructions.size());
2424
2425	auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
2426	auto [op, x,y,z, immy,immz, death,can_hoist,used_in_loop] = instructions[i];
2427
2428	llvm::Type i1 = llvm::Type::getInt1Ty (ctx),
2429	i8 = llvm::Type::getInt8Ty (ctx),
2430	i16 = llvm::Type::getInt16Ty(ctx),
2431	*i16x2 = llvm::VectorType::get(i16, `2`),
2432	f32 = llvm::Type::getFloatTy(ctx),
2433	*I1 = scalar ? i1 : llvm::VectorType::get(i1 , K ),
2434	*I8 = scalar ? i8 : llvm::VectorType::get(i8 , K ),
2435	*I16 = scalar ? i16 : llvm::VectorType::get(i16, K ),
2436	I16x2 = scalar ? i16x2 : llvm::VectorType::get(i16, K`2`),
2437	*I32 = scalar ? i32 : llvm::VectorType::get(i32, K ),
2438	*F32 = scalar ? f32 : llvm::VectorType::get(f32, K );
2439
2440	auto I = [&](llvm::Value* v) { return b->CreateBitCast(v, I32 ); };
2441	auto F = [&](llvm::Value* v) { return b->CreateBitCast(v, F32 ); };
2442	auto x2 = [&](llvm::Value* v) { return b->CreateBitCast(v, I16x2); };
2443
2444	auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); };
2445
2446	switch (llvm::Type* t = nullptr; op) {
2447	default:
2448	SkDebugf("can't llvm %s (%d)\n", name(op), op);
2449	return false;
2450
2451	case Op::assert_true: /TODO/ break;
2452
2453	case Op::index:
2454	if (I32->isVectorTy()) {
2455	std::vector<llvm::Constant*> iota(K);
2456	for (int j = `0`; j < K; j++) {
2457	iota[j] = b->getInt32(j);
2458	}
2459	vals[i] = b->CreateSub(b->CreateVectorSplat(K, n),
2460	llvm::ConstantVector::get(iota));
2461	} else {
2462	vals[i] = n;
2463	} break;
2464
2465	case Op::load8: t = I8 ; goto load;
2466	case Op::load16: t = I16; goto load;
2467	case Op::load32: t = I32; goto load;
2468	load: {
2469	llvm::Value* ptr = b->CreateBitCast(args[immy], t->getPointerTo());
2470	vals[i] = b->CreateZExt(b->CreateAlignedLoad(ptr, `1`), I32);
2471	} break;
2472
2473
2474	case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immy); break;
2475
2476	case Op::uniform8: t = i8 ; goto uniform;
2477	case Op::uniform16: t = i16; goto uniform;
2478	case Op::uniform32: t = i32; goto uniform;
2479	uniform: {
2480	llvm::Value* ptr = b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2481	args[immy],
2482	immz),
2483	t->getPointerTo());
2484	llvm::Value* val = b->CreateZExt(b->CreateAlignedLoad(ptr, `1`), i32);
2485	vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val)
2486	: val;
2487	} break;
2488
2489	case Op::gather8: t = i8 ; goto gather;
2490	case Op::gather16: t = i16; goto gather;
2491	case Op::gather32: t = i32; goto gather;
2492	gather: {
2493	// Our gather base pointer is immz bytes off of uniform immy.
2494	llvm::Value* base =
2495	b->CreateLoad(b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2496	args[immy],
2497	immz),
2498	t->getPointerTo()->getPointerTo()));
2499
2500	llvm::Value* ptr = b->CreateInBoundsGEP(nullptr, base, vals[x]);
2501	llvm::Value* gathered;
2502	if (ptr->getType()->isVectorTy()) {
2503	gathered = b->CreateMaskedGather(ptr, `1`);
2504	} else {
2505	gathered = b->CreateAlignedLoad(ptr, `1`);
2506	}
2507	vals[i] = b->CreateZExt(gathered, I32);
2508	} break;
2509
2510	case Op::store8: t = I8 ; goto store;
2511	case Op::store16: t = I16; goto store;
2512	case Op::store32: t = I32; goto store;
2513	store: {
2514	llvm::Value* val = b->CreateTrunc(vals[x], t);
2515	llvm::Value* ptr = b->CreateBitCast(args[immy],
2516	val->getType()->getPointerTo());
2517	vals[i] = b->CreateAlignedStore(val, ptr, `1`);
2518	} break;
2519
2520	case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break;
2521	case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break;
2522	case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break;
2523	case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;
2524
2525	case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break;
2526
2527	case Op::select:
2528	vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
2529	break;
2530
2531	case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break;
2532	case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break;
2533	case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break;
2534
2535	case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immy); break;
2536	case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immy); break;
2537	case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immy); break;
2538
2539	case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break;
2540	case Op::neq_i32: vals[i] = S(I32, b->CreateICmpNE (vals[x], vals[y])); break;
2541	case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break;
2542	case Op::gte_i32: vals[i] = S(I32, b->CreateICmpSGE(vals[x], vals[y])); break;
2543
2544	case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break;
2545	case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break;
2546	case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break;
2547	case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break;
2548
2549	case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break;
2550	case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break;
2551	case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break;
2552	case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break;
2553
2554	case Op::fma_f32:
2555	vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2556	{F(vals[x]), F(vals[y]), F(vals[z])}));
2557	break;
2558
2559	case Op::fms_f32:
2560	vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2561	{F(vals[x]), F(vals[y]),
2562	b->CreateFNeg(F(vals[z]))}));
2563	break;
2564
2565	case Op::fnma_f32:
2566	vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2567	{b->CreateFNeg(F(vals[x])), F(vals[y]),
2568	F(vals[z])}));
2569	break;
2570
2571	case Op::floor:
2572	vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x])));
2573	break;
2574
2575	case Op::max_f32:
2576	vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])),
2577	F(vals[y]), F(vals[x])));
2578	break;
2579	case Op::min_f32:
2580	vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])),
2581	F(vals[y]), F(vals[x])));
2582	break;
2583
2584	case Op::sqrt_f32:
2585	vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x])));
2586	break;
2587
2588	case Op::to_f32: vals[i] = I(b->CreateSIToFP( vals[x] , F32)); break;
2589	case Op::trunc : vals[i] = b->CreateFPToSI(F(vals[x]), I32) ; break;
2590	case Op::round : {
2591	// Basic impl when we can't use cvtps2dq and co.
2592	auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x]));
2593	vals[i] = b->CreateFPToSI(round, I32);
2594
2595	#if 1 && defined(SK_CPU_X86)
2596	// Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling.
2597	if (scalar) {
2598	// cvtss2si is float x4 -> int, ignoring input lanes 1,2,3. ¯\_(ツ)_/¯
2599	llvm::Value* v = llvm::UndefValue::get(llvm::VectorType::get(f32, `4`));
2600	v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)`0`);
2601	vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v});
2602	} else {
2603	SkASSERT(K == `4` \|\| K == `8`);
2604	auto intr = K == `4` ? llvm::Intrinsic::x86_sse2_cvtps2dq :
2605	/ K == 8 ?/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256;
2606	vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])});
2607	}
2608	#endif
2609	} break;
2610
2611	case Op::add_i16x2: vals[i] = I(b->CreateAdd(x2(vals[x]), x2(vals[y]))); break;
2612	case Op::sub_i16x2: vals[i] = I(b->CreateSub(x2(vals[x]), x2(vals[y]))); break;
2613	case Op::mul_i16x2: vals[i] = I(b->CreateMul(x2(vals[x]), x2(vals[y]))); break;
2614
2615	case Op::shl_i16x2: vals[i] = I(b->CreateShl (x2(vals[x]), immy)); break;
2616	case Op::sra_i16x2: vals[i] = I(b->CreateAShr(x2(vals[x]), immy)); break;
2617	case Op::shr_i16x2: vals[i] = I(b->CreateLShr(x2(vals[x]), immy)); break;
2618
2619	case Op:: eq_i16x2:
2620	vals[i] = I(S(I16x2, b->CreateICmpEQ (x2(vals[x]), x2(vals[y]))));
2621	break;
2622	case Op::neq_i16x2:
2623	vals[i] = I(S(I16x2, b->CreateICmpNE (x2(vals[x]), x2(vals[y]))));
2624	break;
2625	case Op:: gt_i16x2:
2626	vals[i] = I(S(I16x2, b->CreateICmpSGT(x2(vals[x]), x2(vals[y]))));
2627	break;
2628	case Op::gte_i16x2:
2629	vals[i] = I(S(I16x2, b->CreateICmpSGE(x2(vals[x]), x2(vals[y]))));
2630	break;
2631	}
2632	return true;
2633	};
2634
2635	{
2636	IRBuilder b(enter);
2637	b.CreateBr(hoistK);
2638	}
2639
2640	// hoistK: emit each hoistable vector instruction; goto testK;
2641	// LLVM can do this sort of thing itself, but we've got the information cheap,
2642	// and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe.
2643	{
2644	IRBuilder b(hoistK);
2645
2646	// Hoisted instructions will need args (think, uniforms), so set that up now.
2647	// These phi nodes are degenerate... they'll always be the passed-in args from enter.
2648	// Later on when we start looping the phi nodes will start looking useful.
2649	llvm::Argument* arg = fn->arg_begin();
2650	(void)arg++; // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction.
2651	for (size_t i = `0`; i < fImpl->strides.size(); i++) {
2652	args.push_back(b.CreatePHI(arg->getType(), `1`));
2653	args.back()->addIncoming(arg++, enter);
2654	}
2655
2656	for (size_t i = `0`; i < instructions.size(); i++) {
2657	if (instructions[i].can_hoist && !emit(i, false, &b)) {
2658	return;
2659	}
2660	}
2661
2662	b.CreateBr(testK);
2663	}
2664
2665	// testK: if (N >= K) goto loopK; else goto hoist1;
2666	{
2667	IRBuilder b(testK);
2668
2669	// New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
2670	// These also start as the initial function arguments; hoistK can't have changed them.
2671	llvm::Argument* arg = fn->arg_begin();
2672
2673	n = b.CreatePHI(arg->getType(), `2`);
2674	n->addIncoming(arg++, hoistK);
2675
2676	for (size_t i = `0`; i < fImpl->strides.size(); i++) {
2677	args[i] = b.CreatePHI(arg->getType(), `2`);
2678	args[i]->addIncoming(arg++, hoistK);
2679	}
2680
2681	b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
2682	}
2683
2684	// loopK: ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
2685	{
2686	IRBuilder b(loopK);
2687	for (size_t i = `0`; i < instructions.size(); i++) {
2688	if (!instructions[i].can_hoist && !emit(i, false, &b)) {
2689	return;
2690	}
2691	}
2692
2693	// n -= K
2694	llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
2695	n->addIncoming(n_next, loopK);
2696
2697	// Each arg ptr += K
2698	for (size_t i = `0`; i < fImpl->strides.size(); i++) {
2699	llvm::Value* arg_next
2700	= b.CreateConstInBoundsGEP1_32(nullptr, args[i], K*fImpl->strides[i]);
2701	args[i]->addIncoming(arg_next, loopK);
2702	}
2703	b.CreateBr(testK);
2704	}
2705
2706	// hoist1: emit each hoistable scalar instruction; goto test1;
2707	{
2708	IRBuilder b(hoist1);
2709	for (size_t i = `0`; i < instructions.size(); i++) {
2710	if (instructions[i].can_hoist && !emit(i, true, &b)) {
2711	return;
2712	}
2713	}
2714	b.CreateBr(test1);
2715	}
2716
2717	// test1: if (N >= 1) goto loop1; else goto leave;
2718	{
2719	IRBuilder b(test1);
2720
2721	// Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
2722	llvm::PHINode* n_new = b.CreatePHI(n->getType(), `2`);
2723	n_new->addIncoming(n, hoist1);
2724	n = n_new;
2725
2726	for (size_t i = `0`; i < fImpl->strides.size(); i++) {
2727	llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), `2`);
2728	arg_new->addIncoming(args[i], hoist1);
2729	args[i] = arg_new;
2730	}
2731
2732	b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(`1`)), loop1, leave);
2733	}
2734
2735	// loop1: ... insts on scalars; N -= 1, args += stride; goto test1;
2736	{
2737	IRBuilder b(loop1);
2738	for (size_t i = `0`; i < instructions.size(); i++) {
2739	if (!instructions[i].can_hoist && !emit(i, true, &b)) {
2740	return;
2741	}
2742	}
2743
2744	// n -= 1
2745	llvm::Value* n_next = b.CreateSub(n, b.getInt32(`1`));
2746	n->addIncoming(n_next, loop1);
2747
2748	// Each arg ptr += 1
2749	for (size_t i = `0`; i < fImpl->strides.size(); i++) {
2750	llvm::Value* arg_next
2751	= b.CreateConstInBoundsGEP1_32(nullptr, args[i], fImpl->strides[i]);
2752	args[i]->addIncoming(arg_next, loop1);
2753	}
2754	b.CreateBr(test1);
2755	}
2756
2757	// leave: ret
2758	{
2759	IRBuilder b(leave);
2760	b.CreateRetVoid();
2761	}
2762
2763	SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));
2764
2765	if (true) {
2766	SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
2767	std::error_code err;
2768	llvm::raw_fd_ostream os(path.c_str(), err);
2769	if (err) {
2770	return;
2771	}
2772	llvm::WriteBitcodeToFile(*mod, os);
2773	}
2774
2775	static SkOnce once;
2776	once([]{
2777	SkAssertResult(false == llvm::InitializeNativeTarget());
2778	SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
2779	});
2780
2781	if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
2782	.setEngineKind(llvm::EngineKind::JIT)
2783	.setMCPU(llvm::sys::getHostCPUName())
2784	.create()) {
2785	fImpl->llvm_ctx = std::move(ctx);
2786	fImpl->llvm_ee.reset(ee);
2787
2788	// We have to be careful here about what we close over and how, in case fImpl moves.
2789	// fImpl itself may change, but its pointee fields won't, so close over them by value.
2790	// Also, debug_name will almost certainly leave scope, so copy it.
2791	fImpl->llvm_compiling = std::async(std::launch::async, [dst = &fImpl->jit_entry,
2792	ee = fImpl->llvm_ee.get(),
2793	name = std::string(debug_name)]{
2794	// std::atomic<void*>* dst;
2795	// llvm::ExecutionEngine* ee;
2796	// std::string name;
2797	dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
2798	});
2799	}
2800	}
2801	#endif
2802
2803	void Program::waitForLLVM() const {
2804	#if defined(SKVM_LLVM)
2805	if (fImpl->llvm_compiling.valid()) {
2806	fImpl->llvm_compiling.wait();
2807	}
2808	#endif
2809	}
2810
2811	bool Program::hasJIT() const {
2812	// Program::hasJIT() is really just a debugging / test aid,
2813	// so we don't mind adding a sync point here to wait for compilation.
2814	this->waitForLLVM();
2815
2816	return fImpl ->jit_entry.load() != nullptr;
2817	}
2818
2819	void Program::dropJIT() {
2820	#if defined(SKVM_LLVM)
2821	this->waitForLLVM();
2822	fImpl->llvm_ee .reset(nullptr);
2823	fImpl->llvm_ctx.reset(nullptr);
2824	#elif defined(SKVM_JIT)
2825	if (fImpl ->dylib) {
2826	dlclose(fImpl ->dylib);
2827	} else if (auto jit_entry = fImpl ->jit_entry.load()) {
2828	munmap(jit_entry, fImpl ->jit_size);
2829	}
2830	#else
2831	SkASSERT(!this->hasJIT());
2832	#endif
2833
2834	fImpl ->jit_entry.store(nullptr);
2835	fImpl ->jit_size = `0`;
2836	fImpl ->dylib = nullptr;
2837	}
2838
2839	Program::Program() : fImpl(std::make_unique<Impl>()) {}
2840
2841	Program::~Program() {
2842	// Moved-from Programs may have fImpl == nullptr.
2843	if (fImpl) {
2844	this->dropJIT();
2845	}
2846	}
2847
2848	Program::Program(Program&& other) : fImpl (std::move(other.fImpl)) {}
2849
2850	Program& Program::operator=(Program&& other) {
2851	fImpl = std::move(other.fImpl);
2852	return *this;
2853	}
2854
2855	Program::Program(const std::vector<OptimizedInstruction>& interpreter,
2856	const std::vector<int>& strides) : Program () {
2857	fImpl ->strides = strides;
2858	this->setupInterpreter(interpreter);
2859	}
2860
2861	Program::Program(const std::vector<OptimizedInstruction>& interpreter,
2862	const std::vector<OptimizedInstruction>& jit,
2863	const std::vector<int>& strides,
2864	const char* debug_name) : Program () {
2865	fImpl ->strides = strides;
2866	#if 1 && defined(SKVM_LLVM)
2867	this->setupLLVM(interpreter, debug_name);
2868	#elif 1 && defined(SKVM_JIT)
2869	this->setupJIT(jit, debug_name);
2870	#endif
2871
2872	// Might as well do this after setupLLVM() to get a little more time to compile.
2873	this->setupInterpreter(interpreter);
2874	}
2875
2876	std::vector<InterpreterInstruction> Program::instructions() const { return fImpl ->instructions; }
2877	int Program::nargs() const { return (int)fImpl ->strides.size(); }
2878	int Program::nregs() const { return fImpl ->regs; }
2879	int Program::loop () const { return fImpl ->loop; }
2880	bool Program::empty() const { return fImpl ->instructions.empty(); }
2881
2882	// Translate OptimizedInstructions to InterpreterInstructions.
2883	void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
2884	// Register each instruction is assigned to.
2885	std::vector<Reg> reg(instructions.size());
2886
2887	// This next bit is a bit more complicated than strictly necessary;
2888	// we could just assign every instruction to its own register.
2889	//
2890	// But recycling registers is fairly cheap, and good practice for the
2891	// JITs where minimizing register pressure really is important.
2892	//
2893	// Since we have effectively infinite registers, we hoist any value we can.
2894	// (The JIT may choose a more complex policy to reduce register pressure.)
2895	auto hoisted = [&](Val id) { return instructions [id].can_hoist; };
2896
2897	fImpl ->regs = `0`;
2898	std::vector<Reg> avail;
2899
2900	// Assign this value to a register, recycling them where we can.
2901	auto assign_register = [&](Val id) {
2902	const OptimizedInstruction& inst = instructions [id];
2903
2904	// If this is a real input and it's lifetime ends at this instruction,
2905	// we can recycle the register it's occupying.
2906	auto maybe_recycle_register = [&](Val input) {
2907	if (input != NA
2908	&& instructions [input].death == id
2909	&& !(hoisted (input) && instructions [input].used_in_loop)) {
2910	avail.push_back(reg [input]);
2911	}
2912	};
2913
2914	// Take care to not recycle the same register twice.
2915	if (true ) { maybe_recycle_register (inst.x); }
2916	if (inst.y != inst.x ) { maybe_recycle_register (inst.y); }
2917	if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register (inst.z); }
2918
2919	// Instructions that die at themselves (stores) don't need a register.
2920	if (inst.death != id) {
2921	// Allocate a register if we have to, preferring to reuse anything available.
2922	if (avail.empty()) {
2923	reg [id] = fImpl ->regs++;
2924	} else {
2925	reg [id] = avail.back();
2926	avail.pop_back();
2927	}
2928	}
2929	};
2930
2931	// Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
2932	for (Val id = `0`; id < (Val)instructions.size(); id++) {
2933	if ( hoisted (id)) { assign_register (id); }
2934	}
2935	for (Val id = `0`; id < (Val)instructions.size(); id++) {
2936	if (!hoisted (id)) { assign_register (id); }
2937	}
2938
2939	// Translate OptimizedInstructions to InterpreterIstructions by mapping values to
2940	// registers. This will be two passes, first hoisted instructions, then inside the loop.
2941
2942	// The loop begins at the fImpl->loop'th Instruction.
2943	fImpl ->loop = `0`;
2944	fImpl ->instructions.reserve(instructions.size());
2945
2946	// Add a dummy mapping for the N/A sentinel Val to any arbitrary register
2947	// so lookups don't have to know which arguments are used by which Ops.
2948	auto lookup_register = [&](Val id) {
2949	return id == NA ? (Reg)`0`
2950	: reg [id];
2951	};
2952
2953	auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
2954	InterpreterInstruction pinst{
2955	inst.op,
2956	lookup_register (id),
2957	lookup_register (inst.x),
2958	{lookup_register (inst.y)},
2959	{lookup_register (inst.z)},
2960	};
2961	if (inst.y == NA) { pinst.immy = inst.immy; }
2962	if (inst.z == NA) { pinst.immz = inst.immz; }
2963	fImpl ->instructions.push_back(pinst);
2964	};
2965
2966	for (Val id = `0`; id < (Val)instructions.size(); id++) {
2967	const OptimizedInstruction& inst = instructions [id];
2968	if (hoisted (id)) {
2969	push_instruction (id, inst);
2970	fImpl ->loop++;
2971	}
2972	}
2973	for (Val id = `0`; id < (Val)instructions.size(); id++) {
2974	const OptimizedInstruction& inst = instructions [id];
2975	if (!hoisted (id)) {
2976	push_instruction (id, inst);
2977	}
2978	}
2979	}
2980
2981	#if defined(SKVM_JIT)
2982
2983	bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
2984	const JITMode mode,
2985	Assembler* a) const {
2986	using A = Assembler;
2987	const bool try_hoisting = mode != JITMode::RegisterNoHoist;
2988
2989	auto debug_dump = [&] {
2990	#if 0
2991	SkDebugfStream stream;
2992	this->dump(&stream);
2993	return true;
2994	#else
2995	return false;
2996	#endif
2997	};
2998
2999	#if defined(__x86_64__)
3000	if (!SkCpu::Supports(SkCpu::HSW)) {
3001	return false;
3002	}
3003	const int K = `8`;
3004	const bool stack_only = mode == JITMode::Stack;
3005	A::GP64 N = A::rdi,
3006	scratch = A::rax,
3007	scratch2 = A::r11,
3008	arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };
3009
3010	// All 16 ymm registers are available to use.
3011	using Reg = A::Ymm;
3012	const uint32_t all_regs = `0xffff`;
3013	uint32_t avail = all_regs;
3014
3015	#elif defined(__aarch64__)
3016	const int K = `4`;
3017	const bool stack_only = false; // TODO
3018	A::X N = A::x0,
3019	scratch = A::x8,
3020	arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
3021
3022	// We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15.
3023	using Reg = A::V;
3024	const uint32_t all_regs = `0xffff00ff`;
3025	uint32_t avail = all_regs;
3026	#endif
3027
3028	if (SK_ARRAY_COUNT(arg) < fImpl ->strides.size()) {
3029	return false;
3030	}
3031
3032	auto hoisted = [&](Val id) { return try_hoisting && instructions [id].can_hoist; };
3033
3034	std::vector<Reg> r(instructions.size());
3035
3036	struct LabelAndReg {
3037	A::Label label;
3038	Reg reg;
3039	};
3040	SkTHashMap<int, LabelAndReg> constants; // All constants share the same pool.
3041	LabelAndReg iota; // Exists _only_ to vary per-lane.
3042
3043	auto emit = [&](Val id, bool scalar) {
3044	if (stack_only) {
3045	SkASSERT(avail == all_regs);
3046	}
3047
3048	const OptimizedInstruction& inst = instructions [id];
3049	Op op = inst.op;
3050	Val x = inst.x,
3051	y = inst.y,
3052	z = inst.z;
3053	int immy = inst.immy,
3054	immz = inst.immz;
3055
3056	// Most (but not all) ops create an output value and need a register to hold it, dst.
3057	// We track each instruction's dst in r[] so we can thread it through as an input
3058	// to any future instructions needing that value.
3059	//
3060	// And some ops may need a temporary register, tmp. Some need both tmp and dst.
3061	//
3062	// tmp and dst are very similar and can and will often be assigned the same register,
3063	// but tmp may never alias any of the instructions's inputs, while dst may when this
3064	// instruction consumes that input, i.e. if the input reaches its end of life here.
3065	//
3066	// We'll assign both registers lazily to keep register pressure as low as possible.
3067	bool tmp_is_set = false,
3068	dst_is_set = false;
3069	Reg tmp_reg = (Reg)`0`; // This initial value won't matter... anything legal is fine.
3070
3071	bool ok = true; // Set to false if we need to assign a register and none's available.
3072
3073	if (stack_only) {
3074	// Move each unique argument into a temporary register.
3075	auto load_from_stack = [&](Val arg) {
3076	if (int found = __builtin_ffs(avail)) {
3077	Reg reg = (Reg)(found - `1`);
3078	avail ^= `1` << reg;
3079	r [arg] = reg;
3080	#if defined(__x86_64__)
3081	a->vmovups(r [arg], argK`4`);
3082	#else
3083	SkASSERT(false); // TODO
3084	#endif
3085	} else {
3086	if (debug_dump ()) {
3087	SkDebugf("\nCould not find temporary register for %d\n", arg);
3088	}
3089	ok = false;
3090	}
3091	};
3092	if (x != NA ) { load_from_stack (x); }
3093	if (y != NA && y != x ) { load_from_stack (y); }
3094	if (z != NA && z != x && z != y) { load_from_stack (z); }
3095	}
3096
3097	// First lock in how to choose tmp if we need to based on the registers
3098	// available before this instruction, not including any of its input registers.
3099	auto tmp = [&,avail/important, closing over avail's current value/]{
3100	if (!tmp_is_set) {
3101	tmp_is_set = true;
3102	if (int found = __builtin_ffs(avail)) {
3103	// This is a temporary register just for this op,
3104	// so we leave it marked available for future ops.
3105	tmp_reg = (Reg)(found - `1`);
3106	} else {
3107	// We needed a tmp register but couldn't find one available. :'(
3108	// This will cause emit() to return false, in turn causing jit() to fail.
3109	if (debug_dump ()) {
3110	SkDebugf("\nCould not find a register to hold tmp\n");
3111	}
3112	ok = false;
3113	}
3114	}
3115	return tmp_reg;
3116	};
3117
3118	// Now make available any registers that are consumed by this instruction.
3119	// (The register pool we can pick dst from is >= the pool for tmp, adding any of these.)
3120	auto maybe_recycle_register = [&](Val input) {
3121	if (input != NA
3122	&& instructions [input].death == id
3123	&& !(hoisted (input) && instructions [input].used_in_loop)) {
3124	avail \|= `1` << r [input];
3125	}
3126	};
3127	maybe_recycle_register (x);
3128	maybe_recycle_register (y);
3129	maybe_recycle_register (z);
3130	// set_dst() and dst() will work read/write with this perhaps-just-updated avail.
3131
3132	// Some ops may decide dst on their own to best fit the instruction (see Op::fma_f32).
3133	auto set_dst = [&](Reg reg){
3134	SkASSERT(dst_is_set == false);
3135	dst_is_set = true;
3136
3137	SkASSERT(avail & (`1`<<reg));
3138	avail ^= `1`<<reg;
3139
3140	r [id] = reg;
3141	};
3142
3143	// Thanks to AVX and NEON's 3-argument instruction sets,
3144	// most ops can use any register as dst.
3145	auto dst = [&]{
3146	if (!dst_is_set) {
3147	if (int found = __builtin_ffs(avail)) {
3148	set_dst ((Reg)(found-`1`));
3149	} else {
3150	// Same deal as with tmp... all the registers are occupied. Time to fail!
3151	if (debug_dump ()) {
3152	SkDebugf("\nCould not find a register to hold value %d\n", id);
3153	}
3154	ok = false;
3155	}
3156	}
3157	return r [id];
3158	};
3159
3160	// Because we use the same logic to pick an arbitrary dst and to pick tmp,
3161	// and we know that tmp will never overlap any of the inputs, `dst() == tmp()`
3162	// is a simple idiom to check that the destination does not overlap any of the inputs.
3163	// Sometimes we can use this knowledge to do better instruction selection.
3164
3165	// Ok! Keep in mind that we haven't assigned tmp or dst yet,
3166	// just laid out hooks for how to do so if we need them, depending on the instruction.
3167	//
3168	// Now let's actually assemble the instruction!
3169	switch (op) {
3170	default:
3171	if (debug_dump ()) {
3172	SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
3173	}
3174	return false; // TODO: many new ops
3175
3176	#if defined(__x86_64__)
3177	case Op::assert_true: {
3178	a->vptest (r [x], &constants [`0xffffffff`].label);
3179	A::Label all_true;
3180	a->jc(&all_true);
3181	a->int3();
3182	a->label(&all_true);
3183	} break;
3184
3185	case Op::store8: if (scalar) { a->vpextrb (arg[immy], (A::Xmm)r [x], `0`); }
3186	else { a->vpackusdw(tmp (), r [x], r [x]);
3187	a->vpermq (tmp (), tmp (), `0xd8`);
3188	a->vpackuswb(tmp (), tmp (), tmp ());
3189	a->vmovq (arg[immy], (A::Xmm)tmp ()); }
3190	break;
3191
3192	case Op::store16: if (scalar) { a->vpextrw (arg[immy], (A::Xmm)r [x], `0`); }
3193	else { a->vpackusdw(tmp (), r [x], r [x]);
3194	a->vpermq (tmp (), tmp (), `0xd8`);
3195	a->vmovups (arg[immy], (A::Xmm)tmp ()); }
3196	break;
3197
3198	case Op::store32: if (scalar) { a->vmovd (arg[immy], (A::Xmm)r [x]); }
3199	else { a->vmovups(arg[immy], r [x]); }
3200	break;
3201
3202	case Op::load8: if (scalar) {
3203	a->vpxor (dst (), dst (), dst ());
3204	a->vpinsrb((A::Xmm)dst (), (A::Xmm)dst (), arg[immy], `0`);
3205	} else {
3206	a->vpmovzxbd(dst (), arg[immy]);
3207	} break;
3208
3209	case Op::load16: if (scalar) {
3210	a->vpxor (dst (), dst (), dst ());
3211	a->vpinsrw((A::Xmm)dst (), (A::Xmm)dst (), arg[immy], `0`);
3212	} else {
3213	a->vpmovzxwd(dst (), arg[immy]);
3214	} break;
3215
3216	case Op::load32: if (scalar) { a->vmovd ((A::Xmm)dst (), arg[immy]); }
3217	else { a->vmovups( dst (), arg[immy]); }
3218	break;
3219
3220	case Op::gather32:
3221	if (scalar) {
3222	auto base = scratch,
3223	index = scratch2;
3224	// Our gather base pointer is immz bytes off of uniform immy.
3225	a->movq(base, arg[immy], immz);
3226
3227	// Grab our index from lane 0 of the index argument.
3228	a->vmovd_direct(index, (A::Xmm)r [x]);
3229
3230	// dst = *(base + 4*index)
3231	a->vmovd((A::Xmm)dst (), A::FOUR, index, base);
3232	} else {
3233	// We may not let any of dst(), index, or mask use the same register,
3234	// so we must allocate registers manually and very carefully.
3235
3236	// index is argument x and has already been maybe_recycle_register()'d,
3237	// so we explicitly ignore its availability during this op.
3238	A::Ymm index = r [x];
3239	uint32_t avail_during_gather = avail & ~(`1`<<index);
3240
3241	// Choose dst() to not overlap with index.
3242	if (int found = __builtin_ffs(avail_during_gather)) {
3243	set_dst ((A::Ymm)(found-`1`));
3244	avail_during_gather ^= (`1`<<dst ());
3245	} else {
3246	ok = false;
3247	break;
3248	}
3249
3250	// Choose (temporary) mask to not overlap with dst() or index.
3251	A::Ymm mask;
3252	if (int found = __builtin_ffs(avail_during_gather)) {
3253	mask = (A::Ymm)(found-`1`);
3254	} else {
3255	ok = false;
3256	break;
3257	}
3258
3259	// Our gather base pointer is immz bytes off of uniform immy.
3260	auto base = scratch;
3261	a->movq(base, arg[immy], immz);
3262	a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.)
3263	a->vgatherdps(dst (), A::FOUR, index, base, mask);
3264	}
3265	break;
3266
3267	case Op::uniform8: a->movzbl(scratch, arg[immy], immz);
3268	a->vmovd_direct((A::Xmm)dst (), scratch);
3269	a->vbroadcastss(dst (), (A::Xmm)dst ());
3270	break;
3271
3272	case Op::uniform32: a->vbroadcastss(dst (), arg[immy], immz);
3273	break;
3274
3275	case Op::index: a->vmovd_direct((A::Xmm)tmp (), N);
3276	a->vbroadcastss(tmp (), (A::Xmm)tmp ());
3277	a->vpsubd(dst (), tmp (), &iota.label);
3278	break;
3279
3280	case Op::splat: if (immy) { a->vbroadcastss(dst (), &constants [immy].label); }
3281	else { a->vpxor(dst (), dst (), dst ()); }
3282	break;
3283
3284	case Op::add_f32: a->vaddps(dst (), r [x], r [y]); break;
3285	case Op::sub_f32: a->vsubps(dst (), r [x], r [y]); break;
3286	case Op::mul_f32: a->vmulps(dst (), r [x], r [y]); break;
3287	case Op::div_f32: a->vdivps(dst (), r [x], r [y]); break;
3288	case Op::min_f32: a->vminps(dst (), r [y], r [x]); break; // Order matters,
3289	case Op::max_f32: a->vmaxps(dst (), r [y], r [x]); break; // see test SkVM_min_max.
3290
3291	case Op::fma_f32:
3292	if (avail & (`1`<<r [x])) { set_dst (r [x]); a->vfmadd132ps(r [x], r [z], r [y]); }
3293	else if (avail & (`1`<<r [y])) { set_dst (r [y]); a->vfmadd213ps(r [y], r [x], r [z]); }
3294	else if (avail & (`1`<<r [z])) { set_dst (r [z]); a->vfmadd231ps(r [z], r [x], r [y]); }
3295	else { SkASSERT(dst() == tmp());
3296	a->vmovdqa (dst (),r [x]);
3297	a->vfmadd132ps(dst (),r [z], r [y]); }
3298	break;
3299
3300	case Op::fms_f32:
3301	if (avail & (`1`<<r [x])) { set_dst (r [x]); a->vfmsub132ps(r [x], r [z], r [y]); }
3302	else if (avail & (`1`<<r [y])) { set_dst (r [y]); a->vfmsub213ps(r [y], r [x], r [z]); }
3303	else if (avail & (`1`<<r [z])) { set_dst (r [z]); a->vfmsub231ps(r [z], r [x], r [y]); }
3304	else { SkASSERT(dst() == tmp());
3305	a->vmovdqa (dst (),r [x]);
3306	a->vfmsub132ps(dst (),r [z], r [y]); }
3307	break;
3308
3309	case Op::fnma_f32:
3310	if (avail & (`1`<<r [x])) { set_dst (r [x]); a->vfnmadd132ps(r [x],r [z], r [y]); }
3311	else if (avail & (`1`<<r [y])) { set_dst (r [y]); a->vfnmadd213ps(r [y],r [x], r [z]); }
3312	else if (avail & (`1`<<r [z])) { set_dst (r [z]); a->vfnmadd231ps(r [z],r [x], r [y]); }
3313	else { SkASSERT(dst() == tmp());
3314	a->vmovdqa (dst (),r [x]);
3315	a->vfnmadd132ps(dst (),r [z],r [y]); }
3316	break;
3317
3318	case Op::sqrt_f32: a->vsqrtps(dst (), r [x]); break;
3319
3320	case Op::add_f32_imm: a->vaddps(dst (), r [x], &constants [immy].label); break;
3321	case Op::sub_f32_imm: a->vsubps(dst (), r [x], &constants [immy].label); break;
3322	case Op::mul_f32_imm: a->vmulps(dst (), r [x], &constants [immy].label); break;
3323	case Op::min_f32_imm: a->vminps(dst (), r [x], &constants [immy].label); break;
3324	case Op::max_f32_imm: a->vmaxps(dst (), r [x], &constants [immy].label); break;
3325
3326	case Op::add_i32: a->vpaddd (dst (), r [x], r [y]); break;
3327	case Op::sub_i32: a->vpsubd (dst (), r [x], r [y]); break;
3328	case Op::mul_i32: a->vpmulld(dst (), r [x], r [y]); break;
3329
3330	case Op::sub_i16x2: a->vpsubw (dst (), r [x], r [y]); break;
3331	case Op::mul_i16x2: a->vpmullw(dst (), r [x], r [y]); break;
3332	case Op::shr_i16x2: a->vpsrlw (dst (), r [x], immy); break;
3333
3334	case Op::bit_and : a->vpand (dst (), r [x], r [y]); break;
3335	case Op::bit_or : a->vpor (dst (), r [x], r [y]); break;
3336	case Op::bit_xor : a->vpxor (dst (), r [x], r [y]); break;
3337	case Op::bit_clear: a->vpandn(dst (), r [y], r [x]); break; // Notice, y then x.
3338	case Op::select : a->vpblendvb(dst (), r [z], r [y], r [x]); break;
3339
3340	case Op::bit_and_imm: a->vpand (dst (), r [x], &constants [immy].label); break;
3341	case Op::bit_or_imm : a->vpor (dst (), r [x], &constants [immy].label); break;
3342	case Op::bit_xor_imm: a->vpxor (dst (), r [x], &constants [immy].label); break;
3343
3344	case Op::shl_i32: a->vpslld(dst (), r [x], immy); break;
3345	case Op::shr_i32: a->vpsrld(dst (), r [x], immy); break;
3346	case Op::sra_i32: a->vpsrad(dst (), r [x], immy); break;
3347
3348	case Op::eq_i32: a->vpcmpeqd(dst (), r [x], r [y]); break;
3349	case Op::gt_i32: a->vpcmpgtd(dst (), r [x], r [y]); break;
3350
3351	case Op:: eq_f32: a->vcmpeqps (dst (), r [x], r [y]); break;
3352	case Op::neq_f32: a->vcmpneqps(dst (), r [x], r [y]); break;
3353	case Op:: gt_f32: a->vcmpltps (dst (), r [y], r [x]); break;
3354	case Op::gte_f32: a->vcmpleps (dst (), r [y], r [x]); break;
3355
3356	case Op::pack: a->vpslld(tmp (), r [y], immz);
3357	a->vpor (dst (), tmp (), r [x]);
3358	break;
3359
3360	case Op::floor : a->vroundps (dst (), r [x], Assembler::FLOOR); break;
3361	case Op::to_f32: a->vcvtdq2ps (dst (), r [x]); break;
3362	case Op::trunc : a->vcvttps2dq(dst (), r [x]); break;
3363	case Op::round : a->vcvtps2dq (dst (), r [x]); break;
3364
3365	#elif defined(__aarch64__)
3366	case Op::assert_true: {
3367	a->uminv4s(tmp(), r[x]); // uminv acts like an all() across the vector.
3368	a->fmovs(scratch, tmp());
3369	A::Label all_true;
3370	a->cbnz(scratch, &all_true);
3371	a->brk(`0`);
3372	a->label(&all_true);
3373	} break;
3374
3375	case Op::store8: a->xtns2h(tmp(), r[x]);
3376	a->xtnh2b(tmp(), tmp());
3377	if (scalar) { a->strb (tmp(), arg[immy]); }
3378	else { a->strs (tmp(), arg[immy]); }
3379	break;
3380	// TODO: another case where it'd be okay to alias r[x] and tmp if r[x] dies here.
3381
3382	case Op::store32: if (scalar) { a->strs(r[x], arg[immy]); }
3383	else { a->strq(r[x], arg[immy]); }
3384	break;
3385
3386	case Op::load8: if (scalar) { a->ldrb(tmp(), arg[immy]); }
3387	else { a->ldrs(tmp(), arg[immy]); }
3388	a->uxtlb2h(tmp(), tmp());
3389	a->uxtlh2s(dst(), tmp());
3390	break;
3391
3392	case Op::load32: if (scalar) { a->ldrs(dst(), arg[immy]); }
3393	else { a->ldrq(dst(), arg[immy]); }
3394	break;
3395
3396	case Op::splat: if (immy) { a->ldrq(dst(), &constants[immy].label); }
3397	else { a->eor16b(dst(), dst(), dst()); }
3398	break;
3399	// TODO: If we hoist these, pack 4 values in each register
3400	// and use vector/lane operations, cutting the register
3401	// pressure cost of hoisting by 4?
3402
3403	case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break;
3404	case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
3405	case Op::mul_f32: a->fmul4s(dst(), r[x], r[y]); break;
3406	case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break;
3407
3408	case Op::fma_f32: // fmla.4s is z += x*y
3409	if (avail & (`1`<<r[z])) { set_dst(r[z]); a->fmla4s( r[z], r[x], r[y]); }
3410	else { a->orr16b(tmp(), r[z], r[z]);
3411	a->fmla4s(tmp(), r[x], r[y]);
3412	if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
3413	break;
3414
3415	case Op::fnma_f32: // fmls.4s is z -= x*y
3416	if (avail & (`1`<<r[z])) { set_dst(r[z]); a->fmls4s( r[z], r[x], r[y]); }
3417	else { a->orr16b(tmp(), r[z], r[z]);
3418	a->fmls4s(tmp(), r[x], r[y]);
3419	if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
3420	break;
3421
3422	case Op::fms_f32:
3423	// first dst() = xy - z as if fnma_f32
3424	if (avail & (`1`<<r[z])) { set_dst(r[z]); a->fmls4s( r[z], r[x], r[y]); }
3425	else { a->orr16b(tmp(), r[z], r[z]);
3426	a->fmls4s(tmp(), r[x], r[y]);
3427	if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
3428	// then dst() = -dst() (i.e. xy - z)
3429	a->fneg4s(dst(), dst());
3430	break;
3431
3432	// These _imm instructions are all x86/JIT only.
3433	case Op::add_f32_imm :
3434	case Op::sub_f32_imm :
3435	case Op::mul_f32_imm :
3436	case Op::min_f32_imm :
3437	case Op::max_f32_imm :
3438	case Op::bit_and_imm :
3439	case Op::bit_or_imm :
3440	case Op::bit_xor_imm : SkUNREACHABLE; break;
3441
3442	case Op:: gt_f32: a->fcmgt4s (dst(), r[x], r[y]); break;
3443	case Op::gte_f32: a->fcmge4s (dst(), r[x], r[y]); break;
3444	case Op:: eq_f32: a->fcmeq4s (dst(), r[x], r[y]); break;
3445	case Op::neq_f32: a->fcmeq4s (tmp(), r[x], r[y]);
3446	a->not16b (dst(), tmp()); break;
3447
3448
3449	case Op::add_i32: a->add4s(dst(), r[x], r[y]); break;
3450	case Op::sub_i32: a->sub4s(dst(), r[x], r[y]); break;
3451	case Op::mul_i32: a->mul4s(dst(), r[x], r[y]); break;
3452
3453	case Op::sub_i16x2: a->sub8h (dst(), r[x], r[y]); break;
3454	case Op::mul_i16x2: a->mul8h (dst(), r[x], r[y]); break;
3455	case Op::shr_i16x2: a->ushr8h(dst(), r[x], immy); break;
3456
3457	case Op::bit_and : a->and16b(dst(), r[x], r[y]); break;
3458	case Op::bit_or : a->orr16b(dst(), r[x], r[y]); break;
3459	case Op::bit_xor : a->eor16b(dst(), r[x], r[y]); break;
3460	case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break;
3461
3462	case Op::select: // bsl16b is x = x ? y : z
3463	if (avail & (`1`<<r[x])) { set_dst(r[x]); a->bsl16b( r[x], r[y], r[z]); }
3464	else { a->orr16b(tmp(), r[x], r[x]);
3465	a->bsl16b(tmp(), r[y], r[z]);
3466	if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
3467	break;
3468
3469	// fmin4s and fmax4s don't work the way we want with NaN,
3470	// so we write them the long way:
3471	case Op::min_f32: // min(x,y) = y<x ? y : x
3472	a->fcmgt4s(tmp(), r[x],r[y]);
3473	a->bsl16b (tmp(), r[y],r[x]);
3474	if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); }
3475	break;
3476
3477	case Op::max_f32: // max(x,y) = x<y ? y : x
3478	a->fcmgt4s(tmp(), r[y],r[x]);
3479	a->bsl16b (tmp(), r[y],r[x]);
3480	if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); }
3481	break;
3482
3483	case Op::shl_i32: a-> shl4s(dst(), r[x], immy); break;
3484	case Op::shr_i32: a->ushr4s(dst(), r[x], immy); break;
3485	case Op::sra_i32: a->sshr4s(dst(), r[x], immy); break;
3486
3487	case Op::eq_i32: a->cmeq4s(dst(), r[x], r[y]); break;
3488	case Op::gt_i32: a->cmgt4s(dst(), r[x], r[y]); break;
3489
3490	case Op::pack:
3491	if (avail & (`1`<<r[x])) { set_dst(r[x]); a->sli4s ( r[x], r[y], immz); }
3492	else { a->shl4s (tmp(), r[y], immz);
3493	a->orr16b(dst(), tmp(), r[x]); }
3494	break;
3495
3496	case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
3497	case Op::trunc: a->fcvtzs4s(dst(), r[x]); break;
3498	case Op::round: a->fcvtns4s(dst(), r[x]); break;
3499	// TODO: fcvtns.4s rounds to nearest even.
3500	// I think we actually want frintx -> fcvtzs to round to current mode.
3501	#endif
3502	}
3503
3504	if (stack_only) {
3505	if (dst_is_set) {
3506	#if defined(__x86_64__)
3507	a->vmovups(idK`4`, r [id]);
3508	#else
3509	SkASSERT(false); // TODO
3510	#endif
3511	avail \|= `1` << r [id];
3512	}
3513	for (Val arg : {x,y,z}) {
3514	if (arg != NA) {
3515	avail \|= `1` << r [arg];
3516	}
3517	}
3518	SkASSERT(avail == all_regs);
3519	}
3520
3521	// Calls to tmp() or dst() might have flipped this false from its default true state.
3522	return ok;
3523	};
3524
3525
3526	#if defined(__x86_64__)
3527	auto jump_if_less = [&](A::Label* l) { a->jl (l); };
3528	auto jump = [&](A::Label* l) { a->jmp(l); };
3529
3530	auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
3531	auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
3532
3533	auto enter = [&]{ a->sub(A::rsp, instructions.size()K`4`); };
3534	auto exit = [&]{ a->add(A::rsp, instructions.size()K`4`); a->vzeroupper(); a->ret(); };
3535	#elif defined(__aarch64__)
3536	auto jump_if_less = [&](A::Label* l) { a->blt(l); };
3537	auto jump = [&](A::Label* l) { a->b (l); };
3538
3539	auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
3540	auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
3541
3542	auto enter = [&]{};
3543	auto exit = [&]{ a->ret(A::x30); };
3544	#endif
3545
3546	A::Label body,
3547	tail,
3548	done;
3549
3550	enter ();
3551	for (Val id = `0`; id < (Val)instructions.size(); id++) {
3552	if (hoisted (id) && !emit (id, /scalar=/false)) {
3553	return false;
3554	}
3555	}
3556
3557	a->label(&body);
3558	{
3559	a->cmp(N, K);
3560	jump_if_less (&tail);
3561	for (Val id = `0`; id < (Val)instructions.size(); id++) {
3562	if (!hoisted (id) && !emit (id, /scalar=/false)) {
3563	return false;
3564	}
3565	}
3566	for (int i = `0`; i < (int)fImpl ->strides.size(); i++) {
3567	if (fImpl ->strides [i]) {
3568	add (arg[i], K*fImpl ->strides [i]);
3569	}
3570	}
3571	sub (N, K);
3572	jump (&body);
3573	}
3574
3575	a->label(&tail);
3576	{
3577	a->cmp(N, `1`);
3578	jump_if_less (&done);
3579	for (Val id = `0`; id < (Val)instructions.size(); id++) {
3580	if (!hoisted (id) && !emit (id, /scalar=/true)) {
3581	return false;
3582	}
3583	}
3584	for (int i = `0`; i < (int)fImpl ->strides.size(); i++) {
3585	if (fImpl ->strides [i]) {
3586	add (arg[i], `1`*fImpl ->strides [i]);
3587	}
3588	}
3589	sub (N, `1`);
3590	jump (&tail);
3591	}
3592
3593	a->label(&done);
3594	{
3595	exit ();
3596	}
3597
3598	// Except for explicit aligned load and store instructions, AVX allows
3599	// memory operands to be unaligned. So even though we're creating 16
3600	// byte patterns on ARM or 32-byte patterns on x86, we only need to
3601	// align to 4 bytes, the element size and alignment requirement.
3602
3603	constants.foreach([&](int imm, LabelAndReg* entry) {
3604	a->align(`4`);
3605	a->label(&entry->label);
3606	for (int i = `0`; i < K; i++) {
3607	a->word(imm);
3608	}
3609	});
3610
3611	if (!iota.label.references.empty()) {
3612	a->align(`4`);
3613	a->label(&iota.label);
3614	for (int i = `0`; i < K; i++) {
3615	a->word(i);
3616	}
3617	}
3618
3619	return true;
3620	}
3621
3622	void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
3623	const char* debug_name) {
3624	// Assemble with no buffer to determine a.size(), the number of bytes we'll assemble.
3625	Assembler a{nullptr};
3626
3627	// First try allowing code hoisting (faster code)
3628	// then again without if that fails (lower register pressure).
3629	JITMode mode = JITMode::Register;
3630	bool ok = false;
3631	for (JITMode m : {JITMode::Register, JITMode::RegisterNoHoist, JITMode::Stack}) {
3632	if (this->jit(instructions, m, &a)) {
3633	ok = true;
3634	mode = m;
3635	break;
3636	}
3637	}
3638	if (!ok) { return; }
3639
3640	// Allocate space that we can remap as executable.
3641	const size_t page = sysconf(_SC_PAGESIZE);
3642
3643	// mprotect works at page granularity.
3644	fImpl ->jit_size = ((a.size() + page - `1`) / page) * page;
3645
3646	void* jit_entry
3647	= mmap(nullptr,fImpl ->jit_size, PROT_READ\|PROT_WRITE, MAP_ANONYMOUS\|MAP_PRIVATE, -`1`,`0`);
3648	fImpl ->jit_entry.store(jit_entry);
3649
3650	// Assemble the program for real.
3651	a = Assembler {jit_entry};
3652	SkAssertResult(this->jit(instructions, mode, &a));
3653	SkASSERT(a.size() <= fImpl ->jit_size);
3654
3655	// Remap as executable, and flush caches on platforms that need that.
3656	mprotect(jit_entry, fImpl ->jit_size, PROT_READ\|PROT_EXEC);
3657	__builtin___clear_cache((char*)jit_entry,
3658	(char*)jit_entry + fImpl ->jit_size);
3659
3660	// For profiling and debugging, it's helpful to have this code loaded
3661	// dynamically rather than just jumping info fImpl->jit_entry.
3662	if (gSkVMJITViaDylib) {
3663	// Dump the raw program binary.
3664	SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
3665	int fd = mkstemp(path.writable_str());
3666	::write(fd, jit_entry, a.size());
3667	close(fd);
3668
3669	this->dropJIT(); // (unmap and null out fImpl->jit_entry.)
3670
3671	// Convert it in-place to a dynamic library with a single symbol "skvm_jit":
3672	SkString cmd = SkStringPrintf(
3673	"echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
3674	" \| clang -x assembler -shared - -o %s",
3675	path.c_str(), path.c_str());
3676	system(cmd.c_str());
3677
3678	// Load that dynamic library and look up skvm_jit().
3679	fImpl ->dylib = dlopen(path.c_str(), RTLD_NOW\|RTLD_LOCAL);
3680	fImpl ->jit_entry.store(dlsym(fImpl ->dylib, "skvm_jit"));
3681	}
3682	}
3683	#endif
3684
3685	} // namespace skvm
3686

Browse the source code of Skia/src/core/SkVM.cpp