/*
 * Copyright 2019 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "include/core/SkStream.h"
#include "include/core/SkString.h"
#include "include/private/SkChecksum.h"
#include "include/private/SkHalf.h"
#include "include/private/SkSpinlock.h"
#include "include/private/SkTFitsIn.h"
#include "include/private/SkThreadID.h"
#include "include/private/SkVx.h"
#include "src/core/SkColorSpaceXformSteps.h"
#include "src/core/SkCpu.h"
#include "src/core/SkEnumerate.h"
#include "src/core/SkOpts.h"
#include "src/core/SkVM.h"
#include <algorithm>
#include <atomic>
#include <cmath>      // std::sqrt, ceilf, floorf, lrintf
#include <cstring>    // memcpy, memset
#include <queue>

#if defined(SKVM_LLVM)
    #include <future>
    #include <llvm/Bitcode/BitcodeWriter.h>
    #include <llvm/ExecutionEngine/ExecutionEngine.h>
    #include <llvm/IR/IRBuilder.h>
    #include <llvm/IR/Verifier.h>
    #include <llvm/Support/TargetSelect.h>

    // Platform-specific intrinsics got their own files in LLVM 10.
    #if __has_include(<llvm/IR/IntrinsicsX86.h>)
        #include <llvm/IR/IntrinsicsX86.h>
    #endif
#endif

bool gSkVMAllowJIT{false};
bool gSkVMJITViaDylib{false};

#if defined(SKVM_JIT)
    #if defined(SK_BUILD_FOR_WIN)
        #include "src/core/SkLeanWindows.h"
        #include <memoryapi.h>

        static void* alloc_jit_buffer(size_t* len) {
            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
        }
        static void unmap_jit_buffer(void* ptr, size_t len) {
            VirtualFree(ptr, 0, MEM_RELEASE);
        }
        static void remap_as_executable(void* ptr, size_t len) {
            DWORD old;
            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
            SkASSERT(old == PAGE_READWRITE);
        }
        static void close_dylib(void* dylib) {
            SkASSERT(false);  // TODO?  For now just assert we never make one.
        }
    #else
        #include <dlfcn.h>
        #include <sys/mman.h>
        #include <unistd.h>   // sysconf

        static void* alloc_jit_buffer(size_t* len) {
            // While mprotect and VirtualAlloc both work at page granularity,
            // mprotect doesn't round up for you, and instead requires that *len
            // be a multiple of the page size.
            const size_t page = sysconf(_SC_PAGESIZE);
            *len = ((*len + page - 1) / page) * page;
            return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
        }
        static void unmap_jit_buffer(void* ptr, size_t len) {
            munmap(ptr, len);
        }
        static void remap_as_executable(void* ptr, size_t len) {
            mprotect(ptr, len, PROT_READ|PROT_EXEC);
            __builtin___clear_cache((char*)ptr,
                                    (char*)ptr + len);
        }
        static void close_dylib(void* dylib) {
            dlclose(dylib);
        }
    #endif

    #if defined(SKVM_JIT_VTUNE)
        #include <jitprofiling.h>
        static void notify_vtune(const char* name, void* addr, size_t len) {
            if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) {
                iJIT_Method_Load event;
                memset(&event, 0, sizeof(event));
                event.method_id           = iJIT_GetNewMethodID();
                event.method_name         = const_cast<char*>(name);
                event.method_load_address = addr;
                event.method_size         = len;
                iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event);
            }
        }
    #else
        static void notify_vtune(const char* name, void* addr, size_t len) {}
    #endif
#endif

// JIT code isn't MSAN-instrumented, so we won't see when it uses
// uninitialized memory, and we'll not see the writes it makes as properly
// initializing memory.  Instead force the interpreter, which should let
// MSAN see everything our programs do properly.
//
// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
#if defined(__has_feature)
    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
        #define SKVM_JIT_BUT_IGNORE_IT
    #endif
#endif



namespace skvm {

    struct Program::Impl {
        std::vector<InterpreterInstruction> instructions;
        int regs = 0;
        int loop = 0;
        std::vector<int> strides;

        std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
        size_t jit_size = 0;
        void*  dylib    = nullptr;

        #if defined(SKVM_LLVM)
            std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
            std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
            std::future<void>                      llvm_compiling;
        #endif
    };

    // Debugging tools, mostly for printing various data structures out to a stream.

    namespace {
        class SkDebugfStream final : public SkWStream {
            size_t fBytesWritten = 0;

            bool write(const void* buffer, size_t size) override {
                // %.*s takes an int precision, and %s wants char*, so cast both.
                SkDebugf("%.*s", (int)size, (const char*)buffer);
                fBytesWritten += size;
                return true;
            }

            size_t bytesWritten() const override {
                return fBytesWritten;
            }
        };

        struct V { Val id; };
        struct R { Reg id; };
        struct Shift { int bits; };
        struct Splat { int bits; };
        struct Hex   { int bits; };
        struct Attr  { const char* label; int v; };

        static void write(SkWStream* o, const char* s) {
            o->writeText(s);
        }

        static const char* name(Op op) {
            switch (op) {
            #define M(x) case Op::x: return #x;
                SKVM_OPS(M)
            #undef M
            }
            return "unknown op";
        }

        static void write(SkWStream* o, Op op) {
            o->writeText(name(op));
        }
        static void write(SkWStream* o, Arg a) {
            write(o, "arg(");
            o->writeDecAsText(a.ix);
            write(o, ")");
        }
        static void write(SkWStream* o, V v) {
            write(o, "v");
            o->writeDecAsText(v.id);
        }
        static void write(SkWStream* o, R r) {
            write(o, "r");
            o->writeDecAsText(r.id);
        }
        static void write(SkWStream* o, Shift s) {
            o->writeDecAsText(s.bits);
        }
        static void write(SkWStream* o, Splat s) {
            float f;
            memcpy(&f, &s.bits, 4);
            o->writeHexAsText(s.bits);
            write(o, " (");
            o->writeScalarAsText(f);
            write(o, ")");
        }
        static void write(SkWStream* o, Hex h) {
            o->writeHexAsText(h.bits);
        }
        [[maybe_unused]] static void write(SkWStream* o, Attr a) {
            write(o, a.label);
            write(o, " ");
            o->writeDecAsText(a.v);
        }

        template <typename T, typename... Ts>
        static void write(SkWStream* o, T first, Ts... rest) {
            write(o, first);
            write(o, " ");
            write(o, rest...);
        }
    }  // namespace

    void Builder::dot(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        std::vector<OptimizedInstruction> optimized = this->optimize();

        o->writeText("digraph {\n");
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            auto [op, x,y,z, immy,immz, death,can_hoist] = optimized[id];

            switch (op) {
                default:
                    write(o, "\t", V{id}, " [label = \"", V{id}, op);
                    // Not a perfect heuristic; sometimes y/z == NA and there is no immy/z.
                    // On the other hand, sometimes immy/z=0 is meaningful and should be printed.
                    if (y == NA) { write(o, "", Hex{immy}); }
                    if (z == NA) { write(o, "", Hex{immz}); }
                    write(o, "\"]\n");

                    write(o, "\t", V{id}, " -> {");
                    // In contrast to the heuristic imm labels, these dependences are exact.
                    if (x != NA) { write(o, "", V{x}); }
                    if (y != NA) { write(o, "", V{y}); }
                    if (z != NA) { write(o, "", V{z}); }
                    write(o, " }\n");

                    break;

                // That default: impl works pretty well for most instructions,
                // but some are nicer to see with a specialized label.

                case Op::splat:
                    write(o, "\t", V{id}, " [label = \"", V{id}, op, Splat{immy}, "\"]\n");
                    break;
            }
        }
        o->writeText("}\n");
    }

    template <typename I, typename... Fs>
    static void write_one_instruction(Val id, const I& inst, SkWStream* o, Fs... fs) {
        Op  op = inst.op;
        Val  x = inst.x,
             y = inst.y,
             z = inst.z;
        int immy = inst.immy,
            immz = inst.immz;
        switch (op) {
            case Op::assert_true: write(o, op, V{x}, V{y}, fs(id)...); break;

            case Op::store8:   write(o, op, Arg{immy}   , V{x}, fs(id)...); break;
            case Op::store16:  write(o, op, Arg{immy}   , V{x}, fs(id)...); break;
            case Op::store32:  write(o, op, Arg{immy}   , V{x}, fs(id)...); break;
            case Op::store64:  write(o, op, Arg{immz}   , V{x},V{y}, fs(id)...); break;
            case Op::store128: write(o, op, Arg{immz>>1}, V{x},V{y},Hex{immz&1}, fs(id)...); break;

            case Op::index: write(o, V{id}, "=", op, fs(id)...); break;

            case Op::load8:   write(o, V{id}, "=", op, Arg{immy}, fs(id)...); break;
            case Op::load16:  write(o, V{id}, "=", op, Arg{immy}, fs(id)...); break;
            case Op::load32:  write(o, V{id}, "=", op, Arg{immy}, fs(id)...); break;
            case Op::load64:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;
            case Op::load128: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;

            case Op::gather8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}, fs(id)...); break;
            case Op::gather16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}, fs(id)...); break;
            case Op::gather32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}, fs(id)...); break;

            case Op::uniform8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;
            case Op::uniform16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;
            case Op::uniform32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, fs(id)...); break;

            case Op::splat: write(o, V{id}, "=", op, Splat{immy}, fs(id)...); break;

            case Op::add_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::sub_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::mul_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::div_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::min_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::max_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::fma_f32:  write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;
            case Op::fms_f32:  write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;
            case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;

            case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}, fs(id)...); break;

            case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;

            case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;

            case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}, fs(id)...); break;
            case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}, fs(id)...); break;
            case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}, fs(id)...); break;

            case Op:: eq_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op:: gt_i32: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;

            case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;
            case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}, fs(id)...); break;

            case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}, fs(id)...); break;
            case Op::pack:   write(o, V{id}, "=", op, V{x}, V{y}, Shift{immz}, fs(id)...); break;

            case Op::ceil:      write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::floor:     write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::to_f32:    write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::to_half:   write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::from_half: write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::trunc:     write(o, V{id}, "=", op, V{x}, fs(id)...); break;
            case Op::round:     write(o, V{id}, "=", op, V{x}, fs(id)...); break;
        }

        write(o, "\n");
    }

    void Builder::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        std::vector<OptimizedInstruction> optimized = this->optimize();
        o->writeDecAsText(optimized.size());
        o->writeText(" values (originally ");
        o->writeDecAsText(fProgram.size());
        o->writeText("):\n");
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            const OptimizedInstruction& inst = optimized[id];
            write(o, inst.can_hoist ? "↑ " : "  ");
            write_one_instruction(id, inst, o);
        }
    }

    template <typename... Fs>
    void dump_instructions(const std::vector<Instruction>& instructions, SkWStream* o, Fs... fs) {
        SkDebugfStream debug;
        if (o == nullptr) {
            o = &debug;
        }
        write(o, Attr{"Instruction count:", (int)instructions.size()});
        write(o, "\n");   // write(Attr) doesn't end the line itself.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            write_one_instruction(id, instructions[id], o, std::forward<Fs>(fs)...);
        }
    }

    void Program::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        o->writeDecAsText(fImpl->regs);
        o->writeText(" registers, ");
        o->writeDecAsText(fImpl->instructions.size());
        o->writeText(" instructions:\n");
        for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
            if (i == fImpl->loop) { write(o, "loop:\n"); }
            o->writeDecAsText(i);
            o->writeText("\t");
            if (i >= fImpl->loop) { write(o, "    "); }
            const InterpreterInstruction& inst = fImpl->instructions[i];
            Op  op = inst.op;
            Reg  d = inst.d,
                 x = inst.x,
                 y = inst.y,
                 z = inst.z;
            int immy = inst.immy,
                immz = inst.immz;
            switch (op) {
                case Op::assert_true: write(o, op, R{x}, R{y}); break;

                case Op::store8:   write(o, op, Arg{immy}   , R{x}                  ); break;
                case Op::store16:  write(o, op, Arg{immy}   , R{x}                  ); break;
                case Op::store32:  write(o, op, Arg{immy}   , R{x}                  ); break;
                case Op::store64:  write(o, op, Arg{immz}   , R{x}, R{y}            ); break;
                case Op::store128: write(o, op, Arg{immz>>1}, R{x}, R{y}, Hex{immz&1}); break;

                case Op::index: write(o, R{d}, "=", op); break;

                case Op::load8:   write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load16:  write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load32:  write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load64:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::load128: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;

                case Op::gather8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                case Op::gather16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                case Op::gather32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;

                case Op::uniform8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;

                case Op::splat: write(o, R{d}, "=", op, Splat{immy}); break;

                case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::fma_f32:  write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fms_f32:  write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;

                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;

                case Op:: eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::pack:   write(o, R{d}, "=", op, R{x}, R{y}, Shift{immz}); break;

                case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
                case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
                case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
                case Op::to_half:   write(o, R{d}, "=", op, R{x}); break;
                case Op::from_half: write(o, R{d}, "=", op, R{x}); break;
                case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
                case Op::round:     write(o, R{d}, "=", op, R{x}); break;
            }
            write(o, "\n");
        }
    }

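    // A quick illustration of the pass below (a sketch, not part of the build;
    // it only uses the public Builder API defined later in this file):
    //
    //     skvm::Builder b;
    //     skvm::Arg ptr = b.arg(4);
    //     skvm::I32 x = b.load32(ptr),
    //               y = b.add(x, b.splat(1)),   // stored below, so live
    //               z = b.mul(x, b.splat(2));   // never used, so dead
    //     b.store32(ptr, y);
    //
    // Only the store has a side effect, so liveness is traced back from it:
    // store32 -> y -> {x, splat(1)}.  z and splat(2) are never marked live,
    // so they're removed and the surviving IDs are renumbered densely.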
    std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) {
        // Determine which Instructions are live by working back from side effects.
        std::vector<bool> live(program.size(), false);
        auto mark_live = [&](Val id, auto& recurse) -> void {
            if (live[id] == false) {
                live[id] = true;
                Instruction inst = program[id];
                for (Val arg : {inst.x, inst.y, inst.z}) {
                    if (arg != NA) { recurse(arg, recurse); }
                }
            }
        };
        for (Val id = 0; id < (Val)program.size(); id++) {
            if (has_side_effect(program[id].op)) {
                mark_live(id, mark_live);
            }
        }

        // Rewrite the program with only live Instructions:
        //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
        //   - then actually remove the dead Instructions.
        std::vector<Val> new_id(program.size(), NA);
        for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
            if (live[id]) {
                Instruction& inst = program[id];
                for (Val* arg : {&inst.x, &inst.y, &inst.z}) {
                    if (*arg != NA) {
                        *arg = new_id[*arg];
                        SkASSERT(*arg != NA);
                    }
                }
                new_id[id] = next++;
            }
        }
        auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
            Val id = (Val)(&inst - program.data());
            return !live[id];
        });
        program.erase(it, program.end());

        return program;
    }

    // Impose a deterministic scheduling of Instructions based on data flow alone,
    // eliminating any influence from original program order.  We'll schedule back-to-front,
    // starting at the end of the program with Instructions that have side effects and
    // recursing through arguments to Instructions that issue earlier in the program.
    // We schedule each argument once all its users have been scheduled, which means it
    // issues just before its first use.  We arbitrarily schedule x, then y, then z, and so
    // issue z, then y, then x.
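    //
    // A small worked example of that ordering (a sketch, not a unit test):
    // given the program
    //
    //     v0 = splat(1)
    //     v1 = splat(2)
    //     v2 = add v0 v1          (x = v0, y = v1)
    //     store32 arg(0) v2
    //
    // we start from the store (the only side effect) and assign slots
    // back-to-front: the store, then v2, then v2's x argument v0, then its
    // y argument v1.  The scheduled program therefore issues v1, v0, v2,
    // store -- each value lands immediately before its first use, no matter
    // what order the Builder originally recorded the splats in.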
    std::vector<Instruction> schedule(std::vector<Instruction> program) {

        std::vector<int> uses(program.size());
        for (const Instruction& inst : program) {
            for (Val arg : {inst.x, inst.y, inst.z}) {
                if (arg != NA) { uses[arg]++; }
            }
        }

        std::vector<Val> new_id(program.size(), NA);
        Val next = (Val)program.size();
        auto reorder = [&](Val id, auto& recurse) -> void {
            new_id[id] = --next;
            const Instruction& inst = program[id];
            for (Val arg : {inst.x, inst.y, inst.z}) {
                if (arg != NA && --uses[arg] == 0) {
                    recurse(arg, recurse);
                }
            }
        };

        for (Val id = 0; id < (Val)program.size(); id++) {
            if (has_side_effect(program[id].op)) {
                reorder(id, reorder);
            }
        }

        // Remap each Instruction's arguments to their new IDs.
        for (Instruction& inst : program) {
            for (Val* arg : {&inst.x, &inst.y, &inst.z}) {
                if (*arg != NA) {
                    *arg = new_id[*arg];
                    SkASSERT(*arg != NA);
                }
            }
        }

        // Finally, reorder the Instructions themselves according to the new schedule.
        // This is O(N): every swap drops at least one Instruction into its final slot
        // (program[new_id[id]] is final after the swap), so we make at most N swaps,
        // walking each cycle of the permutation exactly once.
        for (Val id = 0; id < (Val)program.size(); id++) {
            while (id != new_id[id]) {
                std::swap(program[id], program[new_id[id]]);
                std::swap( new_id[id],  new_id[new_id[id]]);
            }
        }

        return program;
    }

    std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) {
        std::vector<OptimizedInstruction> optimized(program.size());
        for (Val id = 0; id < (Val)program.size(); id++) {
            Instruction inst = program[id];
            optimized[id] = {inst.op, inst.x,inst.y,inst.z, inst.immy,inst.immz,
                             /*death=*/id, /*can_hoist=*/true};
        }

        // Each Instruction's inputs need to live at least until that Instruction issues.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];
            for (Val arg : {inst.x, inst.y, inst.z}) {
                // (We're walking in order, so this is the same as max()ing with the existing Val.)
                if (arg != NA) { optimized[arg].death = id; }
            }
        }

        // Mark which values don't depend on the loop and can be hoisted.
        for (OptimizedInstruction& inst : optimized) {
            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
            if (is_always_varying(inst.op)) {
                inst.can_hoist = false;
            }

            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
            if (inst.can_hoist) {
                for (Val arg : {inst.x, inst.y, inst.z}) {
                    if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
                }
            }
        }

        // Extend the lifetime of any hoisted value that's used in the loop to infinity.
        for (OptimizedInstruction& inst : optimized) {
            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
                for (Val arg : {inst.x, inst.y, inst.z}) {
                    if (arg != NA && optimized[arg].can_hoist) {
                        optimized[arg].death = (Val)program.size();
                    }
                }
            }
        }

        return optimized;
    }

    std::vector<OptimizedInstruction> Builder::optimize() const {
        std::vector<Instruction> program = this->program();
        program = eliminate_dead_code(std::move(program));
        program = schedule           (std::move(program));
        return    finalize           (std::move(program));
    }

    Program Builder::done(const char* debug_name) const {
        char buf[64] = "skvm-jit-";
        if (!debug_name) {
            *SkStrAppendU32(buf+9, this->hash()) = '\0';
            debug_name = buf;
        }

        return {this->optimize(), fStrides, debug_name};
    }

    uint64_t Builder::hash() const {
        uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
                 hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
        return (uint64_t)lo | (uint64_t)hi << 32;
    }

    bool operator==(const Instruction& a, const Instruction& b) {
        return a.op   == b.op
            && a.x    == b.x
            && a.y    == b.y
            && a.z    == b.z
            && a.immy == b.immy
            && a.immz == b.immz;
    }

    uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
        return SkOpts::hash(&inst, sizeof(inst), seed);
    }


    // Most instructions produce a value and return it by ID,
    // the value-producing instruction's own index in the program vector.
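    //
    // Because push() de-duplicates, building the same expression twice is free.
    // For example (an illustrative sketch, not part of the build):
    //
    //     skvm::Builder b;
    //     skvm::I32 x = b.splat(42),
    //               y = b.splat(42);   // hits fIndex and returns x's Val
    //     SkASSERT(x.id == y.id);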
    Val Builder::push(Instruction inst) {
        // Basic common subexpression elimination:
        // if we've already seen this exact Instruction, use it instead of creating a new one.
        if (Val* id = fIndex.find(inst)) {
            return *id;
        }
        Val id = static_cast<Val>(fProgram.size());
        fProgram.push_back(inst);
        fIndex.set(inst, id);
        return id;
    }

    bool Builder::allImm() const { return true; }

    template <typename T, typename... Rest>
    bool Builder::allImm(Val id, T* imm, Rest... rest) const {
        if (fProgram[id].op == Op::splat) {
            static_assert(sizeof(T) == 4);
            memcpy(imm, &fProgram[id].immy, 4);
            return this->allImm(rest...);
        }
        return false;
    }

    Arg Builder::arg(int stride) {
        int ix = (int)fStrides.size();
        fStrides.push_back(stride);
        return {ix};
    }

    void Builder::assert_true(I32 cond, I32 debug) {
    #ifdef SK_DEBUG
        int imm;
        if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
        (void)push(Op::assert_true, cond.id,debug.id,NA);
    #endif
    }

    void Builder::store8 (Arg ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA, ptr.ix); }
    void Builder::store16(Arg ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA, ptr.ix); }
    void Builder::store32(Arg ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA, ptr.ix); }
    void Builder::store64(Arg ptr, I32 lo, I32 hi) {
        (void)push(Op::store64, lo.id,hi.id,NA, NA,ptr.ix);
    }
    void Builder::store128(Arg ptr, I32 lo, I32 hi, int lane) {
        (void)push(Op::store128, lo.id,hi.id,NA, NA,(ptr.ix<<1)|(lane&1));
    }

    I32 Builder::index() { return {this, push(Op::index , NA,NA,NA,0) }; }

    I32 Builder::load8 (Arg ptr) { return {this, push(Op::load8 , NA,NA,NA, ptr.ix) }; }
    I32 Builder::load16(Arg ptr) { return {this, push(Op::load16, NA,NA,NA, ptr.ix) }; }
    I32 Builder::load32(Arg ptr) { return {this, push(Op::load32, NA,NA,NA, ptr.ix) }; }
    I32 Builder::load64(Arg ptr, int lane) {
        return {this, push(Op::load64 , NA,NA,NA, ptr.ix,lane) };
    }
    I32 Builder::load128(Arg ptr, int lane) {
        return {this, push(Op::load128, NA,NA,NA, ptr.ix,lane) };
    }

    I32 Builder::gather8 (Arg ptr, int offset, I32 index) {
        return {this, push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather16(Arg ptr, int offset, I32 index) {
        return {this, push(Op::gather16, index.id,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather32(Arg ptr, int offset, I32 index) {
        return {this, push(Op::gather32, index.id,NA,NA, ptr.ix,offset)};
    }

    I32 Builder::uniform8(Arg ptr, int offset) {
        return {this, push(Op::uniform8, NA,NA,NA, ptr.ix, offset)};
    }
    I32 Builder::uniform16(Arg ptr, int offset) {
        return {this, push(Op::uniform16, NA,NA,NA, ptr.ix, offset)};
    }
    I32 Builder::uniform32(Arg ptr, int offset) {
        return {this, push(Op::uniform32, NA,NA,NA, ptr.ix, offset)};
    }

    // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern.
    I32 Builder::splat(int   n) { return {this, push(Op::splat, NA,NA,NA, n) }; }
    F32 Builder::splat(float f) {
        int bits;
        memcpy(&bits, &f, 4);
        return {this, push(Op::splat, NA,NA,NA, bits)};
    }
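    // So, for instance, splat(1.0f) pushes Op::splat with immy = 0x3f800000
    // (the bit pattern of 1.0f), exactly as splat(0x3f800000) would; only the
    // F32 vs. I32 type of the returned value differs.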

    bool fma_supported() {
        static const bool supported =
        #if defined(SK_CPU_X86)
            SkCpu::Supports(SkCpu::HSW);
        #elif defined(SK_CPU_ARM64)
            true;
        #else
            false;
        #endif
        return supported;
    }

    // Be careful peepholing float math!  Transformations you might expect to
    // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
    // Float peepholes must pass this equivalence test for all ~4B floats:
    //
    //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
    //
    //     unsigned bits = 0;
    //     do {
    //         float f;
    //         memcpy(&f, &bits, 4);
    //         if (!equiv(f, ...)) {
    //             abort();
    //         }
    //     } while (++bits != 0);

    F32 Builder::add(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
        if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y

        if (fma_supported()) {
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, this->push(Op::add_f32, x.id, y.id)};
    }

    F32 Builder::sub(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
        if (fma_supported()) {
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, this->push(Op::sub_f32, x.id, y.id)};
    }

    F32 Builder::mul(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(y.id, 1.0f)) { return x; }   // x*1 == x
        if (this->isImm(x.id, 1.0f)) { return y; }   // 1*y == y
        return {this, this->push(Op::mul_f32, x.id, y.id)};
    }

    F32 Builder::div(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X/Y); }
        if (this->isImm(y.id, 1.0f)) { return x; }   // x/1 == x
        return {this, this->push(Op::div_f32, x.id, y.id)};
    }

    F32 Builder::sqrt(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
        return {this, this->push(Op::sqrt_f32, x.id,NA,NA)};
    }

    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
    F32 Builder::approx_log2(F32 x) {
        // e - 127 is a fair approximation of log2(x) in its own right...
        F32 e = mul(to_f32(bit_cast(x)), splat(1.0f / (1<<23)));

        // ... but using the mantissa to refine its error is _much_ better.
        F32 m = bit_cast(bit_or(bit_and(bit_cast(x), 0x007fffff),
                                0x3f000000));
        F32 approx = sub(e, 124.225514990f);
        approx = sub(approx, mul(1.498030302f, m));
        approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));

        return approx;
    }
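    // A hand-worked sanity check of those constants (so take the last digits
    // with a grain of salt): for x = 1.0f, bits = 0x3f800000, so e = 127.0
    // exactly and m = 0.5.  Then
    //
    //     approx = 127 - 124.225514990
    //                  - 1.498030302 * 0.5
    //                  - 1.725879990 / (0.3520887068 + 0.5)
    //            ~ 0.0000002,
    //
    // i.e. log2(1) == 0 to within float noise.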
    F32 Builder::approx_pow2(F32 x) {
        F32 f = fract(x);
        F32 approx = add(x, 121.274057500f);
        approx = sub(approx, mul( 1.490129070f, f));
        approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));

        return bit_cast(round(mul(1.0f * (1<<23), approx)));
    }

    F32 Builder::approx_powf(F32 x, F32 y) {
        // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
        x = max(0.0f, x);

        auto is_x = bit_or(eq(x, 0.0f),
                           eq(x, 1.0f));
        return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
    }

    // Bhaskara I's sine approximation:
    //     16x(pi - x) / (5*pi^2 - 4x(pi - x))
    // ... divide numerator and denominator by 4:
    //     4x(pi - x) / (5*pi^2/4 - x(pi - x))
    //
    // (Sanity check: at x = pi/2 the numerator is pi^2 and the denominator is
    // 5*pi^2/4 - pi^2/4 = pi^2, so the formula gives sin(pi/2) = 1 as it should.)
    //
    // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
    // radians into that range first.
    //
    F32 Builder::approx_sin(F32 radians) {
        constexpr float Pi = SK_ScalarPI;
        // x = radians mod 2pi
        F32 x = fract(radians * (0.5f/Pi)) * (2*Pi);
        I32 neg = x > Pi;   // pi < x < 2pi?  Then we'll need to negate the result.
        x = select(neg, x - Pi, x);

        F32 pair = x * (Pi - x);
        x = 4.0f * pair / ((5*Pi*Pi/4) - pair);
        x = select(neg, -x, x);
        return x;
    }

    /*  "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
        https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf

        approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9

        Some simplifications:
        1. tan(x) is periodic with period PI, so it's enough to handle -PI/2 < x < PI/2.
        2. tan(x) is odd, so tan(-x) = -tan(x).
        3. Our polynomial approximation is best near zero, so we use the following identity:

                            tan(x) + tan(y)
            tan(x + y) = -----------------
                          1 - tan(x)*tan(y)

            with tan(PI/4) = 1.  So for x > PI/8, we do the following refactor:

            x' = x - PI/4

                       1 + tan(x')
            tan(x) = -------------
                       1 - tan(x')
    */
    F32 Builder::approx_tan(F32 x) {
        constexpr float Pi = SK_ScalarPI;
        // periodic between -pi/2 ... pi/2
        // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
        x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2);

        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);

        // minimize total error by shifting if x > pi/8
        I32 use_quotient = (x > (Pi/8));
        x = select(use_quotient, x - (Pi/4), x);

        // 9th order poly = 4th order(x^2) * x
        x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x;
        x = select(use_quotient, (1+x)/(1-x), x);
        x = select(neg, -x, x);
        return x;
    }

    // http://mathforum.org/library/drmath/view/54137.html
    // referencing Handbook of Mathematical Functions,
    // by Milton Abramowitz and Irene Stegun
    F32 Builder::approx_asin(F32 x) {
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);
        x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f);
        x = select(neg, -x, x);
        return x;
    }

    /*  Use a 4th-order polynomial approximation from https://arachnoid.com/polysolve/
     *  fit to 129 values of x,atan(x) for x:[0...1].
     *  This only works for 0 <= x <= 1.
     */
    static F32 approx_atan_unit(F32 x) {
        // for now we might be given NaN... let that through
        x->assert_true((x != x) | ((x >= 0) & (x <= 1)));
        return poly(x,  0.14130025741326729f,
                       -0.34312835980675116f,
                       -0.016172900528248768f,
                        1.0037696976200385f,
                       -0.00014758242182738969f);
    }

    /*  Use the identity atan(x) = pi/2 - atan(1/x) for x > 1.
     */
    F32 Builder::approx_atan(F32 x) {
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);
        I32 flip = (x > 1.0f);
        x = select(flip, 1/x, x);
        x = approx_atan_unit(x);
        x = select(flip, SK_ScalarPI/2 - x, x);
        x = select(neg, -x, x);
        return x;
    }

    /*  Use the identity atan(x) = pi/2 - atan(1/x) for x > 1.
     *  By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit(),
     *  which avoids a 2nd divide instruction if we had instead called atan().
     */
    F32 Builder::approx_atan2(F32 y0, F32 x0) {

        I32 flip = (abs(y0) > abs(x0));
        F32 y = select(flip, x0, y0);
        F32 x = select(flip, y0, x0);
        F32 arg = y/x;

        I32 neg = (arg < 0.0f);
        arg = select(neg, -arg, arg);

        F32 r = approx_atan_unit(arg);
        r = select(flip, SK_ScalarPI/2 - r, r);
        r = select(neg, -r, r);

        // handle quadrant distinctions
        r = select((y0 >= 0) & (x0  < 0), r + SK_ScalarPI, r);
        r = select((y0  < 0) & (x0 <= 0), r - SK_ScalarPI, r);
        // Note: we don't try to handle 0,0 or infinities (yet)
        return r;
    }

    F32 Builder::min(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
        return {this, this->push(Op::min_f32, x.id, y.id)};
    }
    F32 Builder::max(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
        return {this, this->push(Op::max_f32, x.id, y.id)};
    }

    I32 Builder::add(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(x.id, 0)) { return y; }
        if (this->isImm(y.id, 0)) { return x; }
        return {this, this->push(Op::add_i32, x.id, y.id)};
    }
    I32 Builder::sub(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0)) { return x; }
        return {this, this->push(Op::sub_i32, x.id, y.id)};
    }
    I32 Builder::mul(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(x.id, 0)) { return splat(0); }
        if (this->isImm(y.id, 0)) { return splat(0); }
        if (this->isImm(x.id, 1)) { return y; }
        if (this->isImm(y.id, 1)) { return x; }
        return {this, this->push(Op::mul_i32, x.id, y.id)};
    }

    I32 Builder::shl(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
        return {this, this->push(Op::shl_i32, x.id,NA,NA, bits)};
    }
    I32 Builder::shr(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
        return {this, this->push(Op::shr_i32, x.id,NA,NA, bits)};
    }
    I32 Builder::sra(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
        return {this, this->push(Op::sra_i32, x.id,NA,NA, bits)};
    }

    I32 Builder:: eq(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
        return {this, this->push(Op::eq_f32, x.id, y.id)};
    }
    I32 Builder::neq(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
        return {this, this->push(Op::neq_f32, x.id, y.id)};
    }
    I32 Builder::lt(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
        return {this, this->push(Op::gt_f32, y.id, x.id)};
    }
    I32 Builder::lte(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
        return {this, this->push(Op::gte_f32, y.id, x.id)};
    }
    I32 Builder::gt(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
        return {this, this->push(Op::gt_f32, x.id, y.id)};
    }
    I32 Builder::gte(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
        return {this, this->push(Op::gte_f32, x.id, y.id)};
    }

    I32 Builder:: eq(I32 x, I32 y) {
        if (x.id == y.id) { return splat(~0); }
        return {this, this->push(Op:: eq_i32, x.id, y.id)};
    }
    I32 Builder::neq(I32 x, I32 y) {
        return ~(x == y);
    }
    I32 Builder:: gt(I32 x, I32 y) {
        return {this, this->push(Op:: gt_i32, x.id, y.id)};
    }
    I32 Builder::gte(I32 x, I32 y) {
        if (x.id == y.id) { return splat(~0); }
        return ~(x < y);
    }
    I32 Builder:: lt(I32 x, I32 y) { return y>x; }
    I32 Builder::lte(I32 x, I32 y) { return y>=x; }

    I32 Builder::bit_and(I32 x, I32 y) {
        if (x.id == y.id) { return x; }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
        if (this->isImm(y.id, 0)) { return splat(0); }   // (x & false) == false
        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & y) == false
        if (this->isImm(y.id,~0)) { return x; }          // (x & true) == x
        if (this->isImm(x.id,~0)) { return y; }          // (true & y) == y
        return {this, this->push(Op::bit_and, x.id, y.id)};
    }
    I32 Builder::bit_or(I32 x, I32 y) {
        if (x.id == y.id) { return x; }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
        if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
        if (this->isImm(x.id, 0)) { return y; }           // (false | y) == y
        if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
        if (this->isImm(x.id,~0)) { return splat(~0); }   // (true | y) == true
        return {this, this->push(Op::bit_or, x.id, y.id)};
    }
    I32 Builder::bit_xor(I32 x, I32 y) {
        if (x.id == y.id) { return splat(0); }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
        if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
        if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
        return {this, this->push(Op::bit_xor, x.id, y.id)};
    }

    I32 Builder::bit_clear(I32 x, I32 y) {
        if (x.id == y.id) { return splat(0); }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
        if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
        if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
        return {this, this->push(Op::bit_clear, x.id, y.id)};
    }

    I32 Builder::select(I32 x, I32 y, I32 z) {
        if (y.id == z.id) { return y; }
        if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
        if (this->isImm(x.id,~0)) { return y; }                // true  ? y : z == y
        if (this->isImm(x.id, 0)) { return z; }                // false ? y : z == z
        if (this->isImm(y.id, 0)) { return bit_clear(z,x); }   // x ? 0 : z == ~x&z
        if (this->isImm(z.id, 0)) { return bit_and (y,x); }    // x ? y : 0 ==  x&y
        return {this, this->push(Op::select, x.id, y.id, z.id)};
    }

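    // extract() pulls a bitfield out of x.  For example (illustrative only,
    // mirroring how unpack() below uses it): with RGBA 8888 packing, where
    // green sits at shift 8, extract(px, 8, 0xff) isolates the green byte
    // as an integer in [0,255].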
    I32 Builder::extract(I32 x, int bits, I32 z) {
        if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
        return bit_and(z, shr(x, bits));
    }

    I32 Builder::pack(I32 x, I32 y, int bits) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|(Y<<bits)); }
        return {this, this->push(Op::pack, x.id,y.id,NA, 0,bits)};
    }

    F32 Builder::ceil(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
        return {this, this->push(Op::ceil, x.id)};
    }
    F32 Builder::floor(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
        return {this, this->push(Op::floor, x.id)};
    }
    F32 Builder::to_f32(I32 x) {
        if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
        return {this, this->push(Op::to_f32, x.id)};
    }
    I32 Builder::trunc(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
        return {this, this->push(Op::trunc, x.id)};
    }
    I32 Builder::round(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
        return {this, this->push(Op::round, x.id)};
    }

    I32 Builder::to_half(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
        return {this, this->push(Op::to_half, x.id)};
    }
    F32 Builder::from_half(I32 x) {
        if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
        return {this, this->push(Op::from_half, x.id)};
    }

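    // from_unorm()/to_unorm() map between bits-wide unsigned-normalized
    // integers and floats in [0,1].  A quick worked example for bits = 8:
    // from_unorm(8, 255) = 255 * (1/255.0f) = 1.0f, and
    // to_unorm(8, 1.0f) = round(1.0f * 255.0f) = 255.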
    F32 Builder::from_unorm(int bits, I32 x) {
        F32 limit = splat(1 / ((1<<bits)-1.0f));
        return mul(to_f32(x), limit);
    }
    I32 Builder::to_unorm(int bits, F32 x) {
        F32 limit = splat((1<<bits)-1.0f);
        return round(mul(x, limit));
    }

    bool SkColorType_to_PixelFormat(SkColorType ct, PixelFormat* f) {
        auto UNORM = PixelFormat::UNORM,
             FLOAT = PixelFormat::FLOAT;
        switch (ct) {
            case kUnknown_SkColorType: SkASSERT(false); return false;

            case kRGBA_F32_SkColorType: *f = {FLOAT,32,32,32,32, 0,32,64,96}; return true;

            case kRGBA_F16Norm_SkColorType:       *f = {FLOAT,16,16,16,16, 0,16,32,48}; return true;
            case kRGBA_F16_SkColorType:           *f = {FLOAT,16,16,16,16, 0,16,32,48}; return true;
            case kR16G16B16A16_unorm_SkColorType: *f = {UNORM,16,16,16,16, 0,16,32,48}; return true;

            case kA16_float_SkColorType:    *f = {FLOAT,  0, 0,0,16, 0, 0,0,0}; return true;
            case kR16G16_float_SkColorType: *f = {FLOAT, 16,16,0, 0, 0,16,0,0}; return true;

            case kAlpha_8_SkColorType: *f = {UNORM, 0,0,0,8, 0,0,0,0}; return true;
            case kGray_8_SkColorType:  *f = {UNORM, 8,8,8,0, 0,0,0,0}; return true;  // Subtle.

            case kRGB_565_SkColorType:   *f = {UNORM, 5,6,5,0, 11,5,0,0}; return true;  // (BGR)
            case kARGB_4444_SkColorType: *f = {UNORM, 4,4,4,4, 12,8,4,0}; return true;  // (ABGR)

            case kRGBA_8888_SkColorType: *f = {UNORM, 8,8,8,8,  0,8,16,24}; return true;
            case kRGB_888x_SkColorType:  *f = {UNORM, 8,8,8,0,  0,8,16,32}; return true;  // 32-bit
            case kBGRA_8888_SkColorType: *f = {UNORM, 8,8,8,8, 16,8, 0,24}; return true;

            case kRGBA_1010102_SkColorType: *f = {UNORM, 10,10,10,2,  0,10,20,30}; return true;
            case kBGRA_1010102_SkColorType: *f = {UNORM, 10,10,10,2, 20,10, 0,30}; return true;
            case kRGB_101010x_SkColorType:  *f = {UNORM, 10,10,10,0,  0,10,20, 0}; return true;
            case kBGR_101010x_SkColorType:  *f = {UNORM, 10,10,10,0, 20,10, 0, 0}; return true;

            case kR8G8_unorm_SkColorType:   *f = {UNORM,  8, 8,0, 0, 0, 8,0,0}; return true;
            case kR16G16_unorm_SkColorType: *f = {UNORM, 16,16,0, 0, 0,16,0,0}; return true;
            case kA16_unorm_SkColorType:    *f = {UNORM,  0, 0,0,16, 0, 0,0,0}; return true;
        }
        return false;
    }

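    // For example, kRGB_565 above reads bits up through 11+5 = 16, so its
    // byte_size() is 2; kRGBA_8888 reads through 24+8 = 32, so its is 4.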
    static int byte_size(PixelFormat f) {
        // What's the highest bit we read?
        int bits = std::max(f.r_bits + f.r_shift,
                   std::max(f.g_bits + f.g_shift,
                   std::max(f.b_bits + f.b_shift,
                            f.a_bits + f.a_shift)));
        // Round up to bytes.
        return (bits + 7) / 8;
    }

    static Color unpack(PixelFormat f, I32 x) {
        SkASSERT(byte_size(f) <= 4);
        auto unpack_channel = [=](int bits, int shift) {
            I32 channel = extract(x, shift, (1<<bits)-1);
            switch (f.encoding) {
                case PixelFormat::UNORM: return from_unorm(bits, channel);
                case PixelFormat::FLOAT: return from_half (      channel);
            }
            SkUNREACHABLE;
        };
        return {
            f.r_bits ? unpack_channel(f.r_bits, f.r_shift) : x->splat(0.0f),
            f.g_bits ? unpack_channel(f.g_bits, f.g_shift) : x->splat(0.0f),
            f.b_bits ? unpack_channel(f.b_bits, f.b_shift) : x->splat(0.0f),
            f.a_bits ? unpack_channel(f.a_bits, f.a_shift) : x->splat(1.0f),
        };
    }

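    // For example, kR16G16B16A16_unorm is {UNORM, 16,16,16,16, 0,16,32,48}:
    // R and G live in the low 32 bits and B and A in the high 32, so lo keeps
    // only R,G and hi keeps only B,A (with hi's shifts rebased down by 32).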
    static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
        SkASSERT(byte_size(f) == 8);
        // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
        // The assert on byte_size(lo) will trigger if this assumption is violated.
        *lo = f;
        if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
        if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
        if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
        if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
        SkASSERT(byte_size(*lo) == 4);

        *hi = f;
        if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
        if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
        if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
        if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
        SkASSERT(byte_size(*hi) == 4);
    }

    // The only 16-byte format we support today is RGBA F32,
    // though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
    static void assert_16byte_is_rgba_f32(PixelFormat f) {
    #if defined(SK_DEBUG)
        SkASSERT(byte_size(f) == 16);
        PixelFormat rgba_f32;
        SkAssertResult(SkColorType_to_PixelFormat(kRGBA_F32_SkColorType, &rgba_f32));

        SkASSERT(f.encoding == rgba_f32.encoding);

        SkASSERT(f.r_bits == rgba_f32.r_bits);
        SkASSERT(f.g_bits == rgba_f32.g_bits);
        SkASSERT(f.b_bits == rgba_f32.b_bits);
        SkASSERT(f.a_bits == rgba_f32.a_bits);

        SkASSERT(f.r_shift == rgba_f32.r_shift);
        SkASSERT(f.g_shift == rgba_f32.g_shift);
        SkASSERT(f.b_shift == rgba_f32.b_shift);
        SkASSERT(f.a_shift == rgba_f32.a_shift);
    #endif
    }

    Color Builder::load(PixelFormat f, Arg ptr) {
        switch (byte_size(f)) {
            case 1: return unpack(f, load8 (ptr));
            case 2: return unpack(f, load16(ptr));
            case 4: return unpack(f, load32(ptr));
            case 8: {
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, load64(ptr, 0)),
                      h = unpack(hi, load64(ptr, 1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                assert_16byte_is_rgba_f32(f);
                return {
                    bit_cast(load128(ptr, 0)),
                    bit_cast(load128(ptr, 1)),
                    bit_cast(load128(ptr, 2)),
                    bit_cast(load128(ptr, 3)),
                };
            }
            default: SkUNREACHABLE;
        }
        return {};
    }

    Color Builder::gather(PixelFormat f, Arg ptr, int offset, I32 index) {
        switch (byte_size(f)) {
            case 1: return unpack(f, gather8 (ptr, offset, index));
            case 2: return unpack(f, gather16(ptr, offset, index));
            case 4: return unpack(f, gather32(ptr, offset, index));
            case 8: {
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
                      h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                assert_16byte_is_rgba_f32(f);
                return {
                    gatherF(ptr, offset, (index<<2)+0),
                    gatherF(ptr, offset, (index<<2)+1),
                    gatherF(ptr, offset, (index<<2)+2),
                    gatherF(ptr, offset, (index<<2)+3),
                };
            }
            default: SkUNREACHABLE;
        }
        return {};
    }

    static I32 pack32(PixelFormat f, Color c) {
        SkASSERT(byte_size(f) <= 4);
        I32 packed = c->splat(0);
        auto pack_channel = [&](F32 channel, int bits, int shift) {
            I32 encoded;
            switch (f.encoding) {
                case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
                case PixelFormat::FLOAT: encoded = to_half (      channel); break;
            }
            packed = pack(packed, encoded, shift);
        };
        if (f.r_bits) { pack_channel(c.r, f.r_bits, f.r_shift); }
        if (f.g_bits) { pack_channel(c.g, f.g_bits, f.g_shift); }
        if (f.b_bits) { pack_channel(c.b, f.b_bits, f.b_shift); }
        if (f.a_bits) { pack_channel(c.a, f.a_bits, f.a_shift); }
        return packed;
    }

    bool Builder::store(PixelFormat f, Arg ptr, Color c) {
        // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
        if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
            f.r_shift == f.g_shift && f.g_shift == f.b_shift) {

            // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
            c.r = c.r * 0.2126f
                + c.g * 0.7152f
                + c.b * 0.0722f;
            f.g_bits = f.b_bits = 0;
        }

        switch (byte_size(f)) {
            case 1: store8 (ptr, pack32(f,c)); return true;
            case 2: store16(ptr, pack32(f,c)); return true;
            case 4: store32(ptr, pack32(f,c)); return true;
            case 8: {
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                store64(ptr, pack32(lo,c)
                           , pack32(hi,c));
                return true;
            }
            case 16: {
                assert_16byte_is_rgba_f32(f);
                store128(ptr, bit_cast(c.r), bit_cast(c.g), 0);
                store128(ptr, bit_cast(c.b), bit_cast(c.a), 1);
                return true;
            }
            default: SkUNREACHABLE;
        }
        return false;
    }

    void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
        skvm::F32 invA = 1.0f / a,
                  inf  = bit_cast(splat(0x7f800000));
        // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
        invA = select(invA < inf, invA
                                , 0.0f);
        *r *= invA;
        *g *= invA;
        *b *= invA;
    }

    void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
        *r *= a;
        *g *= a;
        *b *= a;
    }

    Color Builder::uniformPremul(SkColor4f color,    SkColorSpace* src,
                                 Uniforms* uniforms, SkColorSpace* dst) {
        SkColorSpaceXformSteps(src, kUnpremul_SkAlphaType,
                               dst,   kPremul_SkAlphaType).apply(color.vec());
        return {
            uniformF(uniforms->pushF(color.fR)),
            uniformF(uniforms->pushF(color.fG)),
            uniformF(uniforms->pushF(color.fB)),
            uniformF(uniforms->pushF(color.fA)),
        };
    }

    F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
        if (this->isImm(t.id, 0.0f)) { return lo; }
        if (this->isImm(t.id, 1.0f)) { return hi; }
        return mad(sub(hi, lo), t, lo);
    }

    Color Builder::lerp(Color lo, Color hi, F32 t) {
        return {
            lerp(lo.r, hi.r, t),
            lerp(lo.g, hi.g, t),
            lerp(lo.b, hi.b, t),
            lerp(lo.a, hi.a, t),
        };
    }

    HSLA Builder::to_hsla(Color c) {
        F32 mx = max(max(c.r,c.g),c.b),
            mn = min(min(c.r,c.g),c.b),
             d = mx - mn,
          invd = 1.0f / d,
        g_lt_b = select(c.g < c.b, splat(6.0f)
                                 , splat(0.0f));

        F32 h = (1/6.0f) * select(mx == mn,  0.0f,
                           select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
                           select(mx == c.g, invd * (c.b - c.r) + 2.0f
                                           , invd * (c.r - c.g) + 4.0f)));

        F32 sum = mx + mn,
              l = sum * 0.5f,
              s = select(mx == mn, 0.0f
                                 , d / select(l > 0.5f, 2.0f - sum
                                                      , sum));
        return {h, s, l, c.a};
    }

    Color Builder::to_rgba(HSLA c) {
        // See GrRGBToHSLFilterEffect.fp

        auto [h,s,l,a] = c;
        F32 x = s * (1.0f - abs(l + l - 1.0f));

        // (l is init-captured by value: C++17 lambdas can't capture a structured binding.)
        auto hue_to_rgb = [&,l=l](auto hue) {
            auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
            return x * (clamp01(q) - 0.5f) + l;
        };

        return {
            hue_to_rgb(h + 0/3.0f),
            hue_to_rgb(h + 2/3.0f),
            hue_to_rgb(h + 1/3.0f),
            c.a,
        };
    }

    // We're basing our implementation of non-separable blend modes on
    // https://www.w3.org/TR/compositing-1/#blendingnonseparable
    // and
    // https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
    // They're equivalent, but ES' math has been better simplified.
    //
    // Anything extra we add beyond that is to make the math work with premul inputs.

    static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
        return max(r, max(g, b))
             - min(r, min(g, b));
    }

    static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
        return r*0.30f + g*0.59f + b*0.11f;
    }

    static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
        F32 mn  = min(*r, min(*g, *b)),
            mx  = max(*r, max(*g, *b)),
            sat = mx - mn;

        // Map min channel to 0, max channel to s, and scale the middle proportionally.
        auto scale = [&](skvm::F32 c) {
            auto scaled = ((c - mn) * s) / sat;
            return select(is_finite(scaled), scaled, 0.0f);
        };
        *r = scale(*r);
        *g = scale(*g);
        *b = scale(*b);
    }
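    // A quick worked example of set_sat(): with (*r,*g,*b) = (0.2, 0.5, 0.8)
    // and s = 0.3, we get mn = 0.2 and sat = 0.6, so the channels scale to
    // (0.0, 0.15, 0.3): the min channel maps to 0, the max to s, and the
    // middle lands proportionally in between.  When sat == 0 the division
    // produces non-finite values, which is why scale() selects 0.0f for those.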
1488
1489 static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1490 auto diff = lu - luminance(*r, *g, *b);
1491 *r += diff;
1492 *g += diff;
1493 *b += diff;
1494 }
1495
1496 static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
1497 F32 mn = min(*r, min(*g, *b)),
1498 mx = max(*r, max(*g, *b)),
1499 lu = luminance(*r, *g, *b);
1500
1501 auto clip = [&](auto c) {
1502 c = select(mn >= 0, c
1503 , lu + ((c-lu)*( lu)) / (lu-mn));
1504 c = select(mx > a, lu + ((c-lu)*(a-lu)) / (mx-lu)
1505 , c);
1506 return clamp01(c); // May be a little negative, or worse, NaN.
1507 };
1508 *r = clip(*r);
1509 *g = clip(*g);
1510 *b = clip(*b);
1511 }
1512
1513 Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
1514 auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
1515 return x*y + z*w;
1516 };
1517
1518 auto two = [](skvm::F32 x) { return x+x; };
1519
1520 auto apply_rgba = [&](auto fn) {
1521 return Color {
1522 fn(src.r, dst.r),
1523 fn(src.g, dst.g),
1524 fn(src.b, dst.b),
1525 fn(src.a, dst.a),
1526 };
1527 };
1528
1529 auto apply_rgb_srcover_a = [&](auto fn) {
1530 return Color {
1531 fn(src.r, dst.r),
1532 fn(src.g, dst.g),
1533 fn(src.b, dst.b),
1534 mad(dst.a, 1-src.a, src.a), // srcover for alpha
1535 };
1536 };
1537
1538 auto non_sep = [&](auto R, auto G, auto B) {
1539 return Color{
1540 R + mma(src.r, 1-dst.a, dst.r, 1-src.a),
1541 G + mma(src.g, 1-dst.a, dst.g, 1-src.a),
1542 B + mma(src.b, 1-dst.a, dst.b, 1-src.a),
1543 mad(dst.a, 1-src.a, src.a), // srcover for alpha
1544 };
1545 };
1546
1547 switch (mode) {
1548 default:
1549 SkASSERT(false);
1550 [[fallthrough]]; /*but also, for safety, fallthrough*/
1551
1552 case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };
1553
1554 case SkBlendMode::kSrc: return src;
1555 case SkBlendMode::kDst: return dst;
1556
1557 case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]];
1558 case SkBlendMode::kSrcOver:
1559 return apply_rgba([&](auto s, auto d) {
1560 return mad(d,1-src.a, s);
1561 });
1562
1563 case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]];
1564 case SkBlendMode::kSrcIn:
1565 return apply_rgba([&](auto s, auto d) {
1566 return s * dst.a;
1567 });
1568
1569 case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]];
1570
1571 case SkBlendMode::kSrcOut:
1572 return apply_rgba([&](auto s, auto d) {
1573 return s * (1-dst.a);
1574 });
1575
1576 case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]];
1577 case SkBlendMode::kSrcATop:
1578 return apply_rgba([&](auto s, auto d) {
1579 return mma(s, dst.a, d, 1-src.a);
1580 });
1581
1582 case SkBlendMode::kXor:
1583 return apply_rgba([&](auto s, auto d) {
1584 return mma(s, 1-dst.a, d, 1-src.a);
1585 });
1586
1587 case SkBlendMode::kPlus:
1588 return apply_rgba([&](auto s, auto d) {
1589 return min(s+d, 1.0f);
1590 });
1591
1592 case SkBlendMode::kModulate:
1593 return apply_rgba([&](auto s, auto d) {
1594 return s * d;
1595 });
1596
1597 case SkBlendMode::kScreen:
                // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
                // It's plausible that s + (d - s*d) keeps more precision.
1600 return apply_rgba([&](auto s, auto d) {
1601 return s + (d - s*d);
1602 });
1603
1604 case SkBlendMode::kDarken:
1605 return apply_rgb_srcover_a([&](auto s, auto d) {
1606 return s + (d - max(s * dst.a,
1607 d * src.a));
1608 });
1609
1610 case SkBlendMode::kLighten:
1611 return apply_rgb_srcover_a([&](auto s, auto d) {
1612 return s + (d - min(s * dst.a,
1613 d * src.a));
1614 });
1615
1616 case SkBlendMode::kDifference:
1617 return apply_rgb_srcover_a([&](auto s, auto d) {
1618 return s + (d - two(min(s * dst.a,
1619 d * src.a)));
1620 });
1621
1622 case SkBlendMode::kExclusion:
1623 return apply_rgb_srcover_a([&](auto s, auto d) {
1624 return s + (d - two(s * d));
1625 });
1626
1627 case SkBlendMode::kColorBurn:
1628 return apply_rgb_srcover_a([&](auto s, auto d) {
1629 auto mn = min(dst.a,
1630 src.a * (dst.a - d) / s),
1631 burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
1632 return select(d == dst.a , s * (1-dst.a) + d,
1633 select(is_finite(burn), burn
1634 , d * (1-src.a) + s));
1635 });
1636
1637 case SkBlendMode::kColorDodge:
1638 return apply_rgb_srcover_a([&](auto s, auto d) {
1639 auto dodge = src.a * min(dst.a,
1640 d * src.a / (src.a - s))
1641 + mma(s, 1-dst.a, d, 1-src.a);
1642 return select(d == 0.0f , s * (1-dst.a) + d,
1643 select(is_finite(dodge), dodge
1644 , d * (1-src.a) + s));
1645 });
1646
1647 case SkBlendMode::kHardLight:
1648 return apply_rgb_srcover_a([&](auto s, auto d) {
1649 return mma(s, 1-dst.a, d, 1-src.a) +
1650 select(two(s) <= src.a,
1651 two(s * d),
1652 src.a * dst.a - two((dst.a - d) * (src.a - s)));
1653 });
1654
1655 case SkBlendMode::kOverlay:
1656 return apply_rgb_srcover_a([&](auto s, auto d) {
1657 return mma(s, 1-dst.a, d, 1-src.a) +
1658 select(two(d) <= dst.a,
1659 two(s * d),
1660 src.a * dst.a - two((dst.a - d) * (src.a - s)));
1661 });
1662
1663 case SkBlendMode::kMultiply:
1664 return apply_rgba([&](auto s, auto d) {
1665 return mma(s, 1-dst.a, d, 1-src.a) + s * d;
1666 });
1667
1668 case SkBlendMode::kSoftLight:
1669 return apply_rgb_srcover_a([&](auto s, auto d) {
1670 auto m = select(dst.a > 0.0f, d / dst.a
1671 , 0.0f),
1672 s2 = two(s),
1673 m4 = 4*m;
1674
1675 // The logic forks three ways:
1676 // 1. dark src?
1677 // 2. light src, dark dst?
1678 // 3. light src, light dst?
1679
                // Used in case 1.
                auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
                // Used in case 2.
                     darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
                // Used in case 3.
                     liteDst = sqrt(m) - m,
                // Used in case 2 or 3, chosen by the select below.
                     liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
                                                                         , liteDst)
                               + d * src.a;
1690 return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
1691 , liteSrc);
1692 });
1693
1694 case SkBlendMode::kHue: {
1695 skvm::F32 R = src.r * src.a,
1696 G = src.g * src.a,
1697 B = src.b * src.a;
1698
1699 set_sat (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
1700 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1701 clip_color(&R, &G, &B, src.a * dst.a);
1702
1703 return non_sep(R, G, B);
1704 }
1705
1706 case SkBlendMode::kSaturation: {
1707 skvm::F32 R = dst.r * src.a,
1708 G = dst.g * src.a,
1709 B = dst.b * src.a;
1710
1711 set_sat (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
1712 set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1713 clip_color(&R, &G, &B, src.a * dst.a);
1714
1715 return non_sep(R, G, B);
1716 }
1717
1718 case SkBlendMode::kColor: {
1719 skvm::F32 R = src.r * dst.a,
1720 G = src.g * dst.a,
1721 B = src.b * dst.a;
1722
1723 set_lum (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
1724 clip_color(&R, &G, &B, src.a * dst.a);
1725
1726 return non_sep(R, G, B);
1727 }
1728
1729 case SkBlendMode::kLuminosity: {
1730 skvm::F32 R = dst.r * src.a,
1731 G = dst.g * src.a,
1732 B = dst.b * src.a;
1733
1734 set_lum (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
1735 clip_color(&R, &G, &B, dst.a * src.a);
1736
1737 return non_sep(R, G, B);
1738 }
1739 }
1740 }
1741
1742 // For a given program we'll store each Instruction's users contiguously in a table,
1743 // and track where each Instruction's span of users starts and ends in another index.
1744 // Here's a simple program that loads x and stores kx+k:
1745 //
1746 // v0 = splat(k)
1747 // v1 = load(...)
1748 // v2 = mul(v1, v0)
1749 // v3 = add(v2, v0)
1750 // v4 = store(..., v3)
1751 //
1752 // This program has 5 instructions v0-v4.
1753 // - v0 is used by v2 and v3
1754 // - v1 is used by v2
1755 // - v2 is used by v3
1756 // - v3 is used by v4
1757 // - v4 has a side-effect
1758 //
1759 // For this program we fill out these two arrays:
1760 // table: [v2,v3, v2, v3, v4]
1761 // index: [0, 2, 3, 4, 5]
1762 //
1763 // The table is just those "is used by ..." I wrote out above in order,
1764 // and the index tracks where an Instruction's span of users starts, table[index[id]].
1765 // The span continues up until the start of the next Instruction, table[index[id+1]].
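    // e.g. v0's users are table[index[0] .. index[1]) == {v2, v3}.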
1766 SkSpan<const Val> Usage::operator[](Val id) const {
1767 int begin = fIndex[id];
1768 int end = fIndex[id + 1];
1769 return SkMakeSpan(fTable.data() + begin, end - begin);
1770 }
1771
1772 Usage::Usage(const std::vector<Instruction>& program) {
1773 // uses[id] counts the number of times each Instruction is used.
1774 std::vector<int> uses(program.size(), 0);
1775 for (Val id = 0; id < (Val)program.size(); id++) {
1776 Instruction inst = program[id];
1777 if (inst.x != NA) { ++uses[inst.x]; }
1778 if (inst.y != NA) { ++uses[inst.y]; }
1779 if (inst.z != NA) { ++uses[inst.z]; }
1780 }
1781
1782 // Build our index into fTable, with an extra entry marking the final Instruction's end.
1783 fIndex.reserve(program.size() + 1);
1784 int total_uses = 0;
1785 for (int n : uses) {
1786 fIndex.push_back(total_uses);
1787 total_uses += n;
1788 }
1789 fIndex.push_back(total_uses);
1790
1791 // Tick down each Instruction's uses to fill in fTable.
1792 fTable.resize(total_uses, NA);
1793 for (Val id = (Val)program.size(); id --> 0; ) {
1794 Instruction inst = program[id];
1795 if (inst.x != NA) { fTable[fIndex[inst.x] + --uses[inst.x]] = id; }
1796 if (inst.y != NA) { fTable[fIndex[inst.y] + --uses[inst.y]] = id; }
1797 if (inst.z != NA) { fTable[fIndex[inst.z] + --uses[inst.z]] = id; }
1798 }
1799 for (int n : uses ) { (void)n; SkASSERT(n == 0 ); }
1800 for (Val id : fTable) { (void)id; SkASSERT(id != NA); }
1801 }
1802
1803 // ~~~~ Program::eval() and co. ~~~~ //
1804
1805 // Handy references for x86-64 instruction encoding:
1806 // https://wiki.osdev.org/X86-64_Instruction_Encoding
1807 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1808 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1809 // http://ref.x86asm.net/coder64.html
1810
1811 // Used for ModRM / immediate instruction encoding.
1812 static uint8_t _233(int a, int b, int c) {
1813 return (a & 3) << 6
1814 | (b & 7) << 3
1815 | (c & 7) << 0;
1816 }
1817
1818 // ModRM byte encodes the arguments of an opcode.
1819 enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
1820 static uint8_t mod_rm(Mod mod, int reg, int rm) {
1821 return _233((int)mod, reg, rm);
1822 }
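    // e.g. mod_rm(Mod::Direct, 0 /*rax*/, 1 /*rcx*/) == 0b11'000'001 == 0xc1,
    // the ModRM byte of `add rcx, rax` (48 01 c1).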
1823
1824 static Mod mod(int imm) {
1825 if (imm == 0) { return Mod::Indirect; }
1826 if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1827 return Mod::FourByteImm;
1828 }
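    // e.g. mod(0) == Mod::Indirect, mod(12) == Mod::OneByteImm, mod(0x1000) == Mod::FourByteImm.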
1829
1830 static int imm_bytes(Mod mod) {
1831 switch (mod) {
1832 case Mod::Indirect: return 0;
1833 case Mod::OneByteImm: return 1;
1834 case Mod::FourByteImm: return 4;
1835 case Mod::Direct: SkUNREACHABLE;
1836 }
1837 SkUNREACHABLE;
1838 }
1839
1840 // SIB byte encodes a memory address, base + (index * scale).
1841 static uint8_t sib(Assembler::Scale scale, int index, int base) {
1842 return _233((int)scale, index, base);
1843 }
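    // e.g. with scale=x1 (0b00), index=rcx (1), and base=rax (0), sib() == 0x08,
    // encoding the address [rax + rcx*1].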
1844
1845 // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
1846 static uint8_t rex(bool W, // If set, operation is 64-bit, otherwise default, usually 32-bit.
1847 bool R, // Extra top bit to select ModRM reg, registers 8-15.
1848 bool X, // Extra top bit for SIB index register.
1849 bool B) { // Extra top bit for SIB base or ModRM rm register.
1850 return 0b01000000 // Fixed 0100 for top four bits.
1851 | (W << 3)
1852 | (R << 2)
1853 | (X << 1)
1854 | (B << 0);
1855 }
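    // e.g. rex(1,0,0,0) == 0b0100'1000 == 0x48, the familiar REX.W prefix on 64-bit operations.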
1856
1857
1858 // The VEX prefix extends SSE operations to AVX. Used generally, even with XMM.
1859 struct VEX {
1860 int len;
1861 uint8_t bytes[3];
1862 };
1863
1864 static VEX vex(bool WE, // Like REX W for int operations, or opcode extension for float?
1865 bool R, // Same as REX R. Pass high bit of dst register, dst>>3.
1866 bool X, // Same as REX X.
1867 bool B, // Same as REX B. Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
1868 int map, // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
1869 int vvvv, // 4-bit second operand register. Pass our x for 3-arg ops.
1870 bool L, // Set for 256-bit ymm operations, off for 128-bit xmm.
1871 int pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
1872
1873 // Pack x86 opcode map selector to 5-bit VEX encoding.
1874 map = [map]{
1875 switch (map) {
1876 case 0x0f: return 0b00001;
1877 case 0x380f: return 0b00010;
1878 case 0x3a0f: return 0b00011;
1879 // Several more cases only used by XOP / TBM.
1880 }
1881 SkUNREACHABLE;
1882 }();
1883
1884 // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding.
1885 pp = [pp]{
1886 switch (pp) {
1887 case 0x66: return 0b01;
1888 case 0xf3: return 0b10;
1889 case 0xf2: return 0b11;
1890 }
1891 return 0b00;
1892 }();
1893
1894 VEX vex = {0, {0,0,0}};
1895 if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
1896 // With these conditions met, we can optionally compress VEX to 2-byte.
1897 vex.len = 2;
1898 vex.bytes[0] = 0xc5;
1899 vex.bytes[1] = (pp & 3) << 0
1900 | (L & 1) << 2
1901 | (~vvvv & 15) << 3
1902 | (~(int)R & 1) << 7;
1903 } else {
1904 // We could use this 3-byte VEX prefix all the time if we like.
1905 vex.len = 3;
1906 vex.bytes[0] = 0xc4;
1907 vex.bytes[1] = (map & 31) << 0
1908 | (~(int)B & 1) << 5
1909 | (~(int)X & 1) << 6
1910 | (~(int)R & 1) << 7;
1911 vex.bytes[2] = (pp & 3) << 0
1912 | (L & 1) << 2
1913 | (~vvvv & 15) << 3
1914 | (WE & 1) << 7;
1915 }
1916 return vex;
1917 }
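    // e.g. vaddps ymm0, ymm1, ymm2 uses prefix=0, map=0x0f, opcode=0x58:
    // vex(0, 0,0,0, 0x0f, /*vvvv=*/1, /*L=*/1, 0) compresses to the 2-byte form c5 f4,
    // and the full instruction assembles as c5 f4 58 c2.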
1918
1919 Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {}
1920
1921 size_t Assembler::size() const { return fSize; }
1922
1923 void Assembler::bytes(const void* p, int n) {
1924 if (fCurr) {
1925 memcpy(fCurr, p, n);
1926 fCurr += n;
1927 }
1928 fSize += n;
1929 }
1930
1931 void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
1932 void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
1933
1934 void Assembler::align(int mod) {
1935 while (this->size() % mod) {
1936 this->byte(0x00);
1937 }
1938 }
1939
1940 void Assembler::int3() {
1941 this->byte(0xcc);
1942 }
1943
1944 void Assembler::vzeroupper() {
1945 this->byte(0xc5);
1946 this->byte(0xf8);
1947 this->byte(0x77);
1948 }
1949 void Assembler::ret() { this->byte(0xc3); }
1950
1951 void Assembler::op(int opcode, Operand dst, GP64 x) {
1952 if (dst.kind == Operand::REG) {
1953 this->byte(rex(W1,x>>3,0,dst.reg>>3));
1954 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
1955 this->byte(mod_rm(Mod::Direct, x, dst.reg&7));
1956 } else {
1957 SkASSERT(dst.kind == Operand::MEM);
1958 const Mem& m = dst.mem;
1959 const bool need_SIB = (m.base&7) == rsp
1960 || m.index != rsp;
1961
1962 this->byte(rex(W1,x>>3,m.index>>3,m.base>>3));
1963 this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
1964 this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7));
1965 if (need_SIB) {
1966 this->byte(sib(m.scale, m.index&7, m.base&7));
1967 }
1968 this->bytes(&m.disp, imm_bytes(mod(m.disp)));
1969 }
1970 }
1971
1972 void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) {
1973 opcode |= 0b1000'0000; // top bit set for instructions with any immediate
1974
1975 int imm_bytes = 4;
1976 if (SkTFitsIn<int8_t>(imm)) {
1977 imm_bytes = 1;
1978 opcode |= 0b0000'0010; // second bit set for 8-bit immediate, else 32-bit.
1979 }
1980
1981 this->op(opcode, dst, (GP64)opcode_ext);
1982 this->bytes(&imm, imm_bytes);
1983 }
1984
1985 void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); }
1986 void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); }
1987 void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); }
1988
    // These don't work quite like the other instructions with immediates:
    // their immediates are fixed-size, always 4 bytes for mov and 1 byte for movb.
1991 void Assembler::mov(Operand dst, int imm) {
1992 this->op(0xC7,dst,(GP64)0b000);
1993 this->word(imm);
1994 }
1995 void Assembler::movb(Operand dst, int imm) {
1996 this->op(0xC6,dst,(GP64)0b000);
1997 this->byte(imm);
1998 }
1999
2000 void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); }
2001 void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); }
2002 void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); }
2003 void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); }
2004 void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); }
2005
2006 void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); }
2007 void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); }
2008 void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); }
2009 void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); }
2010 void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); }
2011
2012 void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); }
2013 void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); }
2014
2015 void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfe, dst,x,y); }
2016 void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfa, dst,x,y); }
2017 void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); }
2018
2019 void Assembler::vpsubw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
2020 void Assembler::vpmullw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xd5, dst,x,y); }
2021
2022 void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
2023 void Assembler::vpor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
2024 void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); }
2025 void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); }
2026
2027 void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); }
2028 void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); }
2029 void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); }
2030 void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); }
2031 void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); }
2032 void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); }
2033
2034 void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); }
2035 void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
2036 void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); }
2037
2038 void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
2039 void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
2040 void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); }
2041
2042 void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
2043 void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); }
2044 void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); }
2045
2046 void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
2047 void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0x67, dst,x,y); }
2048
2049 void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); }
2050 void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }
2051
2052 void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
2053 void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
2054
2055
2056 void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
2057 // When we've embedded a label displacement in the middle of an instruction,
2058 // we need to tweak it a little so that the resolved displacement starts
2059 // from the end of the instruction and not the end of the displacement.
2060 if (operand.kind == Operand::LABEL && fCode) {
2061 int disp;
2062 memcpy(&disp, fCurr-4, 4);
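            // The 1-byte immediate that follows moves the end of the instruction one
            // byte past the end of the displacement, so back the target up by one.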
2063 disp--;
2064 memcpy(fCurr-4, &disp, 4);
2065 }
2066 this->byte(imm);
2067 }
2068
2069 void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) {
2070 this->op(0,0x0f,0xc2, dst,x,y);
2071 this->imm_byte_after_operand(y, imm);
2072 }
2073
2074 void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) {
2075 this->op(0x66,0x3a0f,0x4c, dst,x,y);
2076 this->imm_byte_after_operand(y, z << 4);
2077 }
2078
2079 // Shift instructions encode their opcode extension as "dst", dst as x, and x as y.
2080 void Assembler::vpslld(Ymm dst, Ymm x, int imm) {
2081 this->op(0x66,0x0f,0x72,(Ymm)6, dst,x);
2082 this->byte(imm);
2083 }
2084 void Assembler::vpsrld(Ymm dst, Ymm x, int imm) {
2085 this->op(0x66,0x0f,0x72,(Ymm)2, dst,x);
2086 this->byte(imm);
2087 }
2088 void Assembler::vpsrad(Ymm dst, Ymm x, int imm) {
2089 this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
2090 this->byte(imm);
2091 }
2092 void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
2093 this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
2094 this->byte(imm);
2095 }
2096
2097 void Assembler::vpermq(Ymm dst, Operand x, int imm) {
        // A bit unusual among the instructions we use, this is a 64-bit operation, so we set W.
2099 this->op(0x66,0x3a0f,0x00, dst,x,W1);
2100 this->imm_byte_after_operand(x, imm);
2101 }
2102
2103 void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) {
2104 this->op(0x66,0x3a0f,0x06, dst,x,y);
2105 this->imm_byte_after_operand(y, imm);
2106 }
2107
2108 void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) {
2109 this->op(0x66,0x380f,0x16, dst,ix,src);
2110 }
2111
2112 void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) {
2113 this->op(0x66,0x3a0f,0x08, dst,x);
2114 this->imm_byte_after_operand(x, imm);
2115 }
2116
2117 void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); }
2118 void Assembler::vmovups(Ymm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); }
2119 void Assembler::vmovups(Xmm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); }
2120 void Assembler::vmovups(Operand dst, Ymm src) { this->op( 0,0x0f,0x11, src,dst); }
2121 void Assembler::vmovups(Operand dst, Xmm src) { this->op( 0,0x0f,0x11, src,dst); }
2122
2123 void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op( 0,0x0f,0x5b, dst,x); }
2124 void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); }
2125 void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); }
2126 void Assembler::vsqrtps (Ymm dst, Operand x) { this->op( 0,0x0f,0x51, dst,x); }
2127
2128 void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) {
2129 this->op(0x66,0x3a0f,0x1d, x,dst);
2130 this->imm_byte_after_operand(dst, imm);
2131 }
2132 void Assembler::vcvtph2ps(Ymm dst, Operand x) {
2133 this->op(0x66,0x380f,0x13, dst,x);
2134 }
2135
2136 int Assembler::disp19(Label* l) {
2137 SkASSERT(l->kind == Label::NotYetSet ||
2138 l->kind == Label::ARMDisp19);
2139 int here = (int)this->size();
2140 l->kind = Label::ARMDisp19;
2141 l->references.push_back(here);
2142 // ARM 19-bit instruction count, from the beginning of this instruction.
2143 return (l->offset - here) / 4;
2144 }
2145
2146 int Assembler::disp32(Label* l) {
2147 SkASSERT(l->kind == Label::NotYetSet ||
2148 l->kind == Label::X86Disp32);
2149 int here = (int)this->size();
2150 l->kind = Label::X86Disp32;
2151 l->references.push_back(here);
2152 // x86 32-bit byte count, from the end of this instruction.
2153 return l->offset - (here + 4);
2154 }
2155
2156 void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) {
2157 switch (y.kind) {
2158 case Operand::REG: {
2159 VEX v = vex(w, dst>>3, 0, y.reg>>3,
2160 map, x, l, prefix);
2161 this->bytes(v.bytes, v.len);
2162 this->byte(opcode);
2163 this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7));
2164 } return;
2165
2166 case Operand::MEM: {
2167 // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows;
2168 // without an SIB byte, that's where the base register would usually go.
2169 // This means we have to use an SIB byte if we want to use rsp as a base register.
2170 const Mem& m = y.mem;
2171 const bool need_SIB = m.base == rsp
2172 || m.index != rsp;
2173
2174 VEX v = vex(w, dst>>3, m.index>>3, m.base>>3,
2175 map, x, l, prefix);
2176 this->bytes(v.bytes, v.len);
2177 this->byte(opcode);
2178 this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7));
2179 if (need_SIB) {
2180 this->byte(sib(m.scale, m.index&7, m.base&7));
2181 }
2182 this->bytes(&m.disp, imm_bytes(mod(m.disp)));
2183 } return;
2184
2185 case Operand::LABEL: {
            // IP-relative addressing uses Mod::Indirect with R/M encoded as if it were rbp or r13.
2187 const int rip = rbp;
2188
2189 VEX v = vex(w, dst>>3, 0, rip>>3,
2190 map, x, l, prefix);
2191 this->bytes(v.bytes, v.len);
2192 this->byte(opcode);
2193 this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
2194 this->word(this->disp32(y.label));
2195 } return;
2196 }
2197 }
2198
2199 void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); }
2200
2201 void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); }
2202
2203 void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); }
2204
2205 void Assembler::jump(uint8_t condition, Label* l) {
2206 // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
2207 // 7? one-byte-disp
2208 // 0F 8? four-byte-disp
2209 // We always use the near displacement to make updating labels simpler (no resizing).
2210 this->byte(0x0f);
2211 this->byte(condition);
2212 this->word(this->disp32(l));
2213 }
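    // e.g. a je to a label 16 bytes past the start of this 6-byte instruction encodes
    // as 0f 84 0a 00 00 00: the disp32 of 0x0a counts from the end of the instruction.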
2214 void Assembler::je (Label* l) { this->jump(0x84, l); }
2215 void Assembler::jne(Label* l) { this->jump(0x85, l); }
2216 void Assembler::jl (Label* l) { this->jump(0x8c, l); }
2217 void Assembler::jc (Label* l) { this->jump(0x82, l); }
2218
2219 void Assembler::jmp(Label* l) {
2220 // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
2221 this->byte(0xe9);
2222 this->word(this->disp32(l));
2223 }
2224
2225 void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); }
2226 void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); }
2227
2228 void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); }
2229
2230 void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); }
2231 void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); }
2232
2233 void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) {
2234 this->op(0x66,0x3a0f,0x22, dst,src,y);
2235 this->imm_byte_after_operand(y, imm);
2236 }
2237 void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) {
2238 this->op(0x66,0x0f,0xc4, dst,src,y);
2239 this->imm_byte_after_operand(y, imm);
2240 }
2241 void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) {
2242 this->op(0x66,0x3a0f,0x20, dst,src,y);
2243 this->imm_byte_after_operand(y, imm);
2244 }
2245
2246 void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
2247 this->op(0x66,0x3a0f,0x39, src,dst);
2248 SkASSERT(dst.kind != Operand::LABEL);
2249 this->byte(imm);
2250 }
2251 void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
2252 this->op(0x66,0x3a0f,0x16, src,dst);
2253 SkASSERT(dst.kind != Operand::LABEL);
2254 this->byte(imm);
2255 }
2256 void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
2257 this->op(0x66,0x3a0f,0x15, src,dst);
2258 SkASSERT(dst.kind != Operand::LABEL);
2259 this->byte(imm);
2260 }
2261 void Assembler::vpextrb(Operand dst, Xmm src, int imm) {
2262 this->op(0x66,0x3a0f,0x14, src,dst);
2263 SkASSERT(dst.kind != Operand::LABEL);
2264 this->byte(imm);
2265 }
2266
2267 void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
2268 // Unlike most instructions, no aliasing is permitted here.
2269 SkASSERT(dst != ix);
2270 SkASSERT(dst != mask);
2271 SkASSERT(mask != ix);
2272
2273 int prefix = 0x66,
2274 map = 0x380f,
2275 opcode = 0x92;
2276 VEX v = vex(0, dst>>3, ix>>3, base>>3,
2277 map, mask, /*ymm?*/1, prefix);
2278 this->bytes(v.bytes, v.len);
2279 this->byte(opcode);
2280 this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/));
2281 this->byte(sib(scale, ix&7, base&7));
2282 }
2283
2284 // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
2285
2286 static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
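    // e.g. 5_mask == 0b11111; each field below is masked to its width before packing.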
2287
2288 void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
2289 this->word( (hi & 11_mask) << 21
2290 | (m & 5_mask) << 16
2291 | (lo & 6_mask) << 10
2292 | (n & 5_mask) << 5
2293 | (d & 5_mask) << 0);
2294 }
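    // e.g. add4s(v0,v1,v2) below packs 0b01001110101'00010'100001'00001'00000
    // == 0x4ea28420, the A64 encoding of `add v0.4s, v1.4s, v2.4s`.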
2295 void Assembler::op(uint32_t op22, V n, V d, int imm) {
2296 this->word( (op22 & 22_mask) << 10
                   | imm // size and location depend on the instruction
2298 | (n & 5_mask) << 5
2299 | (d & 5_mask) << 0);
2300 }
2301
2302 void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
2303 void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
2304 void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
2305 void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
2306 void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
2307 void Assembler::not16b(V d, V n) { this->op(0b0'1'1'01110'00'10000'00101'10, n, d); }
2308
2309 void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
2310 void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
2311 void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }
2312
2313 void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
2314 void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }
2315
2316 void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
2317 void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }
2318
2319 void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
2320 void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
2321 void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
2322 void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
2323 void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
2324 void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }
2325 void Assembler::fneg4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n, d); }
2326
2327 void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
2328 void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
2329 void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }
2330
2331 void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
2332 void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }
2333
2334 void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
2335
2336 void Assembler::sli4s(V d, V n, int imm5) {
2337 this->op(0b0'1'1'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16);
2338 }
2339 void Assembler::shl4s(V d, V n, int imm5) {
2340 this->op(0b0'1'0'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16);
2341 }
2342 void Assembler::sshr4s(V d, V n, int imm5) {
2343 this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
2344 }
2345 void Assembler::ushr4s(V d, V n, int imm5) {
2346 this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
2347 }
2348 void Assembler::ushr8h(V d, V n, int imm4) {
2349 this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16);
2350 }
2351
2352 void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
2353 void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
2354 void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
2355
2356 void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
2357 void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }
2358
2359 void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
2360 void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }
2361
2362 void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
2363
2364 void Assembler::brk(int imm16) {
2365 this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5);
2366 }
2367
2368 void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); }
2369
2370 void Assembler::add(X d, X n, int imm12) {
2371 this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2372 }
2373 void Assembler::sub(X d, X n, int imm12) {
2374 this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2375 }
2376 void Assembler::subs(X d, X n, int imm12) {
2377 this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2378 }
2379
2380 void Assembler::b(Condition cond, Label* l) {
2381 const int imm19 = this->disp19(l);
2382 this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5);
2383 }
2384 void Assembler::cbz(X t, Label* l) {
2385 const int imm19 = this->disp19(l);
2386 this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
2387 }
2388 void Assembler::cbnz(X t, Label* l) {
2389 const int imm19 = this->disp19(l);
2390 this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
2391 }
2392
2393 void Assembler::ldrq(V dst, X src, int imm12) {
2394 this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10);
2395 }
2396 void Assembler::ldrs(V dst, X src, int imm12) {
2397 this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2398 }
2399 void Assembler::ldrb(V dst, X src, int imm12) {
2400 this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2401 }
2402
2403 void Assembler::strq(V src, X dst, int imm12) {
2404 this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10);
2405 }
2406 void Assembler::strs(V src, X dst, int imm12) {
2407 this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2408 }
2409 void Assembler::strb(V src, X dst, int imm12) {
2410 this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2411 }
2412
2413 void Assembler::fmovs(X dst, V src) {
2414 this->op(0b0'0'0'11110'00'1'00'110'000000, src, dst);
2415 }
2416
2417 void Assembler::ldrq(V dst, Label* l) {
2418 const int imm19 = this->disp19(l);
2419 this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5);
2420 }
2421
2422 void Assembler::label(Label* l) {
2423 if (fCode) {
2424 // The instructions all currently point to l->offset.
2425 // We'll want to add a delta to point them to here.
2426 int here = (int)this->size();
2427 int delta = here - l->offset;
2428 l->offset = here;
2429
2430 if (l->kind == Label::ARMDisp19) {
2431 for (int ref : l->references) {
2432 // ref points to a 32-bit instruction with 19-bit displacement in instructions.
2433 uint32_t inst;
2434 memcpy(&inst, fCode + ref, 4);
2435
2436 // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
2437 int disp = (int)(inst << 8) >> 13;
2438
2439 disp += delta/4; // delta is in bytes, we want instructions.
2440
2441 // Put it all back together, preserving the high 8 bits and low 5.
2442 inst = ((disp << 5) & (19_mask << 5))
2443 | ((inst ) & ~(19_mask << 5));
2444
2445 memcpy(fCode + ref, &inst, 4);
2446 }
2447 }
2448
2449 if (l->kind == Label::X86Disp32) {
2450 for (int ref : l->references) {
2451 // ref points to a 32-bit displacement in bytes.
2452 int disp;
2453 memcpy(&disp, fCode + ref, 4);
2454
2455 disp += delta;
2456
2457 memcpy(fCode + ref, &disp, 4);
2458 }
2459 }
2460 }
2461 }
2462
2463 void Program::eval(int n, void* args[]) const {
2464 #define SKVM_JIT_STATS 0
2465 #if SKVM_JIT_STATS
2466 static std::atomic<int64_t> calls{0}, jits{0},
2467 pixels{0}, fast{0};
2468 pixels += n;
2469 if (0 == calls++) {
2470 atexit([]{
2471 int64_t num = jits .load(),
2472 den = calls.load();
2473 SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den);
2474 num = fast .load();
2475 den = pixels.load();
2476 SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den);
2477 });
2478 }
2479 #endif
2480
2481 #if !defined(SKVM_JIT_BUT_IGNORE_IT)
2482 const void* jit_entry = fImpl->jit_entry.load();
2483 // jit_entry may be null either simply because we can't JIT, or when using LLVM
2484 // if the work represented by fImpl->llvm_compiling hasn't finished yet.
2485 //
2486 // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it
2487 // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off,
2488 // due to timing or program caching.
2489 if (jit_entry != nullptr && gSkVMAllowJIT) {
2490 #if SKVM_JIT_STATS
2491 jits++;
2492 fast += n;
2493 #endif
2494 void** a = args;
2495 switch (fImpl->strides.size()) {
2496 case 0: return ((void(*)(int ))jit_entry)(n );
2497 case 1: return ((void(*)(int,void* ))jit_entry)(n,a[0] );
2498 case 2: return ((void(*)(int,void*,void* ))jit_entry)(n,a[0],a[1] );
2499 case 3: return ((void(*)(int,void*,void*,void* ))jit_entry)(n,a[0],a[1],a[2]);
2500 case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry)
2501 (n,a[0],a[1],a[2],a[3]);
2502 case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry)
2503 (n,a[0],a[1],a[2],a[3],a[4]);
2504 case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry)
2505 (n,a[0],a[1],a[2],a[3],a[4],a[5]);
2506 default: SkASSERT(false); // TODO: >6 args?
2507 }
2508 }
2509 #endif
2510
2511 // So we'll sometimes use the interpreter here even if later calls will use the JIT.
2512 SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
2513 this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(),
2514 n, args);
2515 }
2516
2517#if defined(SKVM_LLVM)
2518 void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions,
2519 const char* debug_name) {
2520 auto ctx = std::make_unique<llvm::LLVMContext>();
2521
2522 auto mod = std::make_unique<llvm::Module>("", *ctx);
2523 // All the scary bare pointers from here on are owned by ctx or mod, I think.
2524
2525 // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines.
2526 const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4;
2527
2528 llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(),
2529 *i32 = llvm::Type::getInt32Ty(*ctx);
2530
2531 std::vector<llvm::Type*> arg_types = { i32 };
2532 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2533 arg_types.push_back(ptr);
2534 }
2535
2536 llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx),
2537 arg_types, /*vararg?=*/false);
2538 llvm::Function* fn
2539 = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod);
2540 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2541 fn->addParamAttr(i+1, llvm::Attribute::NoAlias);
2542 }
2543
2544 llvm::BasicBlock *enter = llvm::BasicBlock::Create(*ctx, "enter" , fn),
2545 *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn),
2546 *testK = llvm::BasicBlock::Create(*ctx, "testK" , fn),
2547 *loopK = llvm::BasicBlock::Create(*ctx, "loopK" , fn),
2548 *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn),
2549 *test1 = llvm::BasicBlock::Create(*ctx, "test1" , fn),
2550 *loop1 = llvm::BasicBlock::Create(*ctx, "loop1" , fn),
2551 *leave = llvm::BasicBlock::Create(*ctx, "leave" , fn);
2552
2553 using IRBuilder = llvm::IRBuilder<>;
2554
2555 llvm::PHINode* n;
2556 std::vector<llvm::PHINode*> args;
2557 std::vector<llvm::Value*> vals(instructions.size());
2558
2559 auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
2560 auto [op, x,y,z, immy,immz, death,can_hoist] = instructions[i];
2561
2562 llvm::Type *i1 = llvm::Type::getInt1Ty (*ctx),
2563 *i8 = llvm::Type::getInt8Ty (*ctx),
2564 *i16 = llvm::Type::getInt16Ty(*ctx),
2565 *f32 = llvm::Type::getFloatTy(*ctx),
2566 *I1 = scalar ? i1 : llvm::VectorType::get(i1 , K ),
2567 *I8 = scalar ? i8 : llvm::VectorType::get(i8 , K ),
2568 *I16 = scalar ? i16 : llvm::VectorType::get(i16, K ),
2569 *I32 = scalar ? i32 : llvm::VectorType::get(i32, K ),
2570 *F32 = scalar ? f32 : llvm::VectorType::get(f32, K );
2571
2572 auto I = [&](llvm::Value* v) { return b->CreateBitCast(v, I32 ); };
2573 auto F = [&](llvm::Value* v) { return b->CreateBitCast(v, F32 ); };
2574
2575 auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); };
2576
2577 switch (llvm::Type* t = nullptr; op) {
2578 default:
2579 SkDebugf("can't llvm %s (%d)\n", name(op), op);
2580 return false;
2581
2582 case Op::assert_true: /*TODO*/ break;
2583
2584 case Op::index:
2585 if (I32->isVectorTy()) {
2586 std::vector<llvm::Constant*> iota(K);
2587 for (int j = 0; j < K; j++) {
2588 iota[j] = b->getInt32(j);
2589 }
2590 vals[i] = b->CreateSub(b->CreateVectorSplat(K, n),
2591 llvm::ConstantVector::get(iota));
2592 } else {
2593 vals[i] = n;
2594 } break;
2595
2596 case Op::load8: t = I8 ; goto load;
2597 case Op::load16: t = I16; goto load;
2598 case Op::load32: t = I32; goto load;
2599 load: {
2600 llvm::Value* ptr = b->CreateBitCast(args[immy], t->getPointerTo());
2601 vals[i] = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), I32);
2602 } break;
2603
2604
2605 case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immy); break;
2606
2607 case Op::uniform8: t = i8 ; goto uniform;
2608 case Op::uniform16: t = i16; goto uniform;
2609 case Op::uniform32: t = i32; goto uniform;
2610 uniform: {
2611 llvm::Value* ptr = b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2612 args[immy],
2613 immz),
2614 t->getPointerTo());
2615 llvm::Value* val = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), i32);
2616 vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val)
2617 : val;
2618 } break;
2619
2620 case Op::gather8: t = i8 ; goto gather;
2621 case Op::gather16: t = i16; goto gather;
2622 case Op::gather32: t = i32; goto gather;
2623 gather: {
2624 // Our gather base pointer is immz bytes off of uniform immy.
2625 llvm::Value* base =
2626 b->CreateLoad(b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2627 args[immy],
2628 immz),
2629 t->getPointerTo()->getPointerTo()));
2630
2631 llvm::Value* ptr = b->CreateInBoundsGEP(nullptr, base, vals[x]);
2632 llvm::Value* gathered;
2633 if (ptr->getType()->isVectorTy()) {
2634 gathered = b->CreateMaskedGather(ptr, 1);
2635 } else {
2636 gathered = b->CreateAlignedLoad(ptr, 1);
2637 }
2638 vals[i] = b->CreateZExt(gathered, I32);
2639 } break;
2640
2641 case Op::store8: t = I8 ; goto store;
2642 case Op::store16: t = I16; goto store;
2643 case Op::store32: t = I32; goto store;
2644 store: {
2645 llvm::Value* val = b->CreateTrunc(vals[x], t);
2646 llvm::Value* ptr = b->CreateBitCast(args[immy],
2647 val->getType()->getPointerTo());
2648 vals[i] = b->CreateAlignedStore(val, ptr, 1);
2649 } break;
2650
2651 case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break;
2652 case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break;
2653 case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break;
2654 case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;
2655
2656 case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break;
2657
2658 case Op::select:
2659 vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
2660 break;
2661
2662 case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break;
2663 case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break;
2664 case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break;
2665
2666 case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immy); break;
2667 case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immy); break;
2668 case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immy); break;
2669
2670 case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break;
2671 case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break;
2672
2673 case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break;
2674 case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break;
2675 case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break;
2676 case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break;
2677
2678 case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break;
2679 case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break;
2680 case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break;
2681 case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break;
2682
2683 case Op::fma_f32:
2684 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2685 {F(vals[x]), F(vals[y]), F(vals[z])}));
2686 break;
2687
2688 case Op::fms_f32:
2689 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2690 {F(vals[x]), F(vals[y]),
2691 b->CreateFNeg(F(vals[z]))}));
2692 break;
2693
2694 case Op::fnma_f32:
2695 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2696 {b->CreateFNeg(F(vals[x])), F(vals[y]),
2697 F(vals[z])}));
2698 break;
2699
2700 case Op::ceil:
2701 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x])));
2702 break;
2703 case Op::floor:
2704 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x])));
2705 break;
2706
2707 case Op::max_f32:
2708 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])),
2709 F(vals[y]), F(vals[x])));
2710 break;
2711 case Op::min_f32:
2712 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])),
2713 F(vals[y]), F(vals[x])));
2714 break;
2715
2716 case Op::sqrt_f32:
2717 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x])));
2718 break;
2719
2720 case Op::to_f32: vals[i] = I(b->CreateSIToFP( vals[x] , F32)); break;
2721 case Op::trunc : vals[i] = b->CreateFPToSI(F(vals[x]), I32) ; break;
2722 case Op::round : {
2723 // Basic impl when we can't use cvtps2dq and co.
2724 auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x]));
2725 vals[i] = b->CreateFPToSI(round, I32);
2726
2727 #if 1 && defined(SK_CPU_X86)
2728 // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling.
2729 if (scalar) {
2730 // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3. ¯\_(ツ)_/¯
2731 llvm::Value* v = llvm::UndefValue::get(llvm::VectorType::get(f32, 4));
2732 v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0);
2733 vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v});
2734 } else {
2735 SkASSERT(K == 4 || K == 8);
2736 auto intr = K == 4 ? llvm::Intrinsic::x86_sse2_cvtps2dq :
2737 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256;
2738 vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])});
2739 }
2740 #endif
2741 } break;
2742
2743 }
2744 return true;
2745 };
2746
2747 {
2748 IRBuilder b(enter);
2749 b.CreateBr(hoistK);
2750 }
2751
2752 // hoistK: emit each hoistable vector instruction; goto testK;
2753 // LLVM can do this sort of thing itself, but we've got the information cheap,
2754 // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe.
2755 {
2756 IRBuilder b(hoistK);
2757
2758 // Hoisted instructions will need args (think, uniforms), so set that up now.
2759 // These phi nodes are degenerate... they'll always be the passed-in args from enter.
2760 // Later on when we start looping the phi nodes will start looking useful.
2761 llvm::Argument* arg = fn->arg_begin();
2762 (void)arg++; // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction.
2763 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2764 args.push_back(b.CreatePHI(arg->getType(), 1));
2765 args.back()->addIncoming(arg++, enter);
2766 }
2767
2768 for (size_t i = 0; i < instructions.size(); i++) {
2769 if (instructions[i].can_hoist && !emit(i, false, &b)) {
2770 return;
2771 }
2772 }
2773
2774 b.CreateBr(testK);
2775 }
2776
2777 // testK: if (N >= K) goto loopK; else goto hoist1;
2778 {
2779 IRBuilder b(testK);
2780
2781 // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
2782 // These also start as the initial function arguments; hoistK can't have changed them.
2783 llvm::Argument* arg = fn->arg_begin();
2784
2785 n = b.CreatePHI(arg->getType(), 2);
2786 n->addIncoming(arg++, hoistK);
2787
2788 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2789 args[i] = b.CreatePHI(arg->getType(), 2);
2790 args[i]->addIncoming(arg++, hoistK);
2791 }
2792
2793 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
2794 }
2795
2796 // loopK: ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
2797 {
2798 IRBuilder b(loopK);
2799 for (size_t i = 0; i < instructions.size(); i++) {
2800 if (!instructions[i].can_hoist && !emit(i, false, &b)) {
2801 return;
2802 }
2803 }
2804
2805 // n -= K
2806 llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
2807 n->addIncoming(n_next, loopK);
2808
            // Each arg ptr += K*stride
2810 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2811 llvm::Value* arg_next
2812 = b.CreateConstInBoundsGEP1_32(nullptr, args[i], K*fImpl->strides[i]);
2813 args[i]->addIncoming(arg_next, loopK);
2814 }
2815 b.CreateBr(testK);
2816 }
2817
2818 // hoist1: emit each hoistable scalar instruction; goto test1;
2819 {
2820 IRBuilder b(hoist1);
2821 for (size_t i = 0; i < instructions.size(); i++) {
2822 if (instructions[i].can_hoist && !emit(i, true, &b)) {
2823 return;
2824 }
2825 }
2826 b.CreateBr(test1);
2827 }
2828
2829 // test1: if (N >= 1) goto loop1; else goto leave;
2830 {
2831 IRBuilder b(test1);
2832
2833 // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
2834 llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
2835 n_new->addIncoming(n, hoist1);
2836 n = n_new;
2837
2838 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2839 llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
2840 arg_new->addIncoming(args[i], hoist1);
2841 args[i] = arg_new;
2842 }
2843
2844 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave);
2845 }
2846
2847 // loop1: ... insts on scalars; N -= 1, args += stride; goto test1;
2848 {
2849 IRBuilder b(loop1);
2850 for (size_t i = 0; i < instructions.size(); i++) {
2851 if (!instructions[i].can_hoist && !emit(i, true, &b)) {
2852 return;
2853 }
2854 }
2855
2856 // n -= 1
2857 llvm::Value* n_next = b.CreateSub(n, b.getInt32(1));
2858 n->addIncoming(n_next, loop1);
2859
            // Each arg ptr += stride
2861 for (size_t i = 0; i < fImpl->strides.size(); i++) {
2862 llvm::Value* arg_next
2863 = b.CreateConstInBoundsGEP1_32(nullptr, args[i], fImpl->strides[i]);
2864 args[i]->addIncoming(arg_next, loop1);
2865 }
2866 b.CreateBr(test1);
2867 }
2868
2869 // leave: ret
2870 {
2871 IRBuilder b(leave);
2872 b.CreateRetVoid();
2873 }
2874
2875 SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));
2876
2877 if (true) {
2878 SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
2879 std::error_code err;
2880 llvm::raw_fd_ostream os(path.c_str(), err);
2881 if (err) {
2882 return;
2883 }
2884 llvm::WriteBitcodeToFile(*mod, os);
2885 }
2886
2887 static SkOnce once;
2888 once([]{
2889 SkAssertResult(false == llvm::InitializeNativeTarget());
2890 SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
2891 });
2892
2893 if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
2894 .setEngineKind(llvm::EngineKind::JIT)
2895 .setMCPU(llvm::sys::getHostCPUName())
2896 .create()) {
2897 fImpl->llvm_ctx = std::move(ctx);
2898 fImpl->llvm_ee.reset(ee);
2899
2900 // We have to be careful here about what we close over and how, in case fImpl moves.
2901 // fImpl itself may change, but its pointee fields won't, so close over them by value.
2902 // Also, debug_name will almost certainly leave scope, so copy it.
2903 fImpl->llvm_compiling = std::async(std::launch::async, [dst = &fImpl->jit_entry,
2904 ee = fImpl->llvm_ee.get(),
2905 name = std::string(debug_name)]{
2906 // std::atomic<void*>* dst;
2907 // llvm::ExecutionEngine* ee;
2908 // std::string name;
2909 dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
2910 });
2911 }
2912 }
2913#endif
2914
2915 void Program::waitForLLVM() const {
2916 #if defined(SKVM_LLVM)
2917 if (fImpl->llvm_compiling.valid()) {
2918 fImpl->llvm_compiling.wait();
2919 }
2920 #endif
2921 }
2922
2923 bool Program::hasJIT() const {
2924 // Program::hasJIT() is really just a debugging / test aid,
2925 // so we don't mind adding a sync point here to wait for compilation.
2926 this->waitForLLVM();
2927
2928 return fImpl->jit_entry.load() != nullptr;
2929 }
2930
2931 void Program::dropJIT() {
2932 #if defined(SKVM_LLVM)
2933 this->waitForLLVM();
2934 fImpl->llvm_ee .reset(nullptr);
2935 fImpl->llvm_ctx.reset(nullptr);
2936 #elif defined(SKVM_JIT)
2937 if (fImpl->dylib) {
2938 close_dylib(fImpl->dylib);
2939 } else if (auto jit_entry = fImpl->jit_entry.load()) {
2940 unmap_jit_buffer(jit_entry, fImpl->jit_size);
2941 }
2942 #else
2943 SkASSERT(!this->hasJIT());
2944 #endif
2945
2946 fImpl->jit_entry.store(nullptr);
2947 fImpl->jit_size = 0;
2948 fImpl->dylib = nullptr;
2949 }
2950
2951 Program::Program() : fImpl(std::make_unique<Impl>()) {}
2952
2953 Program::~Program() {
2954 // Moved-from Programs may have fImpl == nullptr.
2955 if (fImpl) {
2956 this->dropJIT();
2957 }
2958 }
2959
2960 Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}
2961
2962 Program& Program::operator=(Program&& other) {
2963 fImpl = std::move(other.fImpl);
2964 return *this;
2965 }
2966
2967 Program::Program(const std::vector<OptimizedInstruction>& instructions,
2968 const std::vector<int>& strides,
2969 const char* debug_name) : Program() {
2970 fImpl->strides = strides;
2971 if (gSkVMAllowJIT) {
2972 #if 1 && defined(SKVM_LLVM)
2973 this->setupLLVM(instructions, debug_name);
2974 #elif 1 && defined(SKVM_JIT)
2975 this->setupJIT(instructions, debug_name);
2976 #endif
2977 }
2978
2979 // Might as well do this after setupLLVM() to get a little more time to compile.
2980 this->setupInterpreter(instructions);
2981 }
2982
2983 std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
2984 int Program::nargs() const { return (int)fImpl->strides.size(); }
2985 int Program::nregs() const { return fImpl->regs; }
2986 int Program::loop () const { return fImpl->loop; }
2987 bool Program::empty() const { return fImpl->instructions.empty(); }
2988
2989 // Translate OptimizedInstructions to InterpreterInstructions.
2990 void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
2991 // Register each instruction is assigned to.
2992 std::vector<Reg> reg(instructions.size());
2993
2994 // This next bit is a bit more complicated than strictly necessary;
2995 // we could just assign every instruction to its own register.
2996 //
2997 // But recycling registers is fairly cheap, and good practice for the
2998 // JITs where minimizing register pressure really is important.
2999 //
3000 // Since we have effectively infinite registers, we hoist any value we can.
3001 // (The JIT may choose a more complex policy to reduce register pressure.)
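        //
        // For example, in "v2 = mul(v1, v0)" from the Usage comment earlier, v1 dies
        // at v2, so v2 can immediately reuse v1's register.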
3002
3003 fImpl->regs = 0;
3004 std::vector<Reg> avail;
3005
3006 // Assign this value to a register, recycling them where we can.
3007 auto assign_register = [&](Val id) {
3008 const OptimizedInstruction& inst = instructions[id];
3009
            // If this is a real input and its lifetime ends at this instruction,
            // we can recycle the register it's occupying.
3012 auto maybe_recycle_register = [&](Val input) {
3013 if (input != NA && instructions[input].death == id) {
3014 avail.push_back(reg[input]);
3015 }
3016 };
3017
3018 // Take care to not recycle the same register twice.
3019 if (true ) { maybe_recycle_register(inst.x); }
3020 if (inst.y != inst.x ) { maybe_recycle_register(inst.y); }
3021 if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); }
3022
3023 // Instructions that die at themselves (stores) don't need a register.
3024 if (inst.death != id) {
3025 // Allocate a register if we have to, preferring to reuse anything available.
3026 if (avail.empty()) {
3027 reg[id] = fImpl->regs++;
3028 } else {
3029 reg[id] = avail.back();
3030 avail.pop_back();
3031 }
3032 }
3033 };
3034
3035 // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
3036 for (Val id = 0; id < (Val)instructions.size(); id++) {
3037 if ( instructions[id].can_hoist) { assign_register(id); }
3038 }
3039 for (Val id = 0; id < (Val)instructions.size(); id++) {
3040 if (!instructions[id].can_hoist) { assign_register(id); }
3041 }
3042
        // Translate OptimizedInstructions to InterpreterInstructions by mapping values to
3044 // registers. This will be two passes, first hoisted instructions, then inside the loop.
3045
3046 // The loop begins at the fImpl->loop'th Instruction.
3047 fImpl->loop = 0;
3048 fImpl->instructions.reserve(instructions.size());
3049
3050 // Add a dummy mapping for the N/A sentinel Val to any arbitrary register
3051 // so lookups don't have to know which arguments are used by which Ops.
3052 auto lookup_register = [&](Val id) {
3053 return id == NA ? (Reg)0
3054 : reg[id];
3055 };
3056
3057 auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
3058 InterpreterInstruction pinst{
3059 inst.op,
3060 lookup_register(id),
3061 lookup_register(inst.x),
3062 {lookup_register(inst.y)},
3063 {lookup_register(inst.z)},
3064 };
3065 if (inst.y == NA) { pinst.immy = inst.immy; }
3066 if (inst.z == NA) { pinst.immz = inst.immz; }
3067 fImpl->instructions.push_back(pinst);
3068 };

    for (Val id = 0; id < (Val)instructions.size(); id++) {
        const OptimizedInstruction& inst = instructions[id];
        if (inst.can_hoist) {
            push_instruction(id, inst);
            fImpl->loop++;
        }
    }
    for (Val id = 0; id < (Val)instructions.size(); id++) {
        const OptimizedInstruction& inst = instructions[id];
        if (!inst.can_hoist) {
            push_instruction(id, inst);
        }
    }
}

#if defined(SKVM_JIT)

    bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
                      int* stack_hint,
                      uint32_t* registers_used,
                      Assembler* a) const {
        using A = Assembler;

        SkTHashMap<int, A::Label> constants;    // Constants (mostly splats) share the same pool.
        A::Label                  iota;         // Varies per lane, for Op::index.
        A::Label                  load64_index; // Used to load low or high half of 64-bit lanes.

        // The `regs` array tracks everything we know about each register's state:
        //   - NA:  empty
        //   - RES: reserved by ABI
        //   - TMP: holding a temporary
        //   - id:  holding Val id
        constexpr Val RES = NA-1,
                      TMP = RES-1;

        // Map val -> stack slot.
        std::vector<int> stack_slot(instructions.size(), NA);
        int next_stack_slot = 0;

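        // setupJIT() calls jit() twice.  The first call runs with no stack hint
        // and sizes the frame as if every value could spill; the second reuses
        // the first call's high-water mark instead (see restore_incoming_regs()).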
        const int nstack_slots = *stack_hint >= 0 ? *stack_hint
                                                  : stack_slot.size();

    #if defined(__x86_64__) || defined(_M_X64)
        if (!SkCpu::Supports(SkCpu::HSW)) {
            return false;
        }
        const int K = 8;
        using Reg = A::Ymm;
        #if defined(_M_X64)  // Important to check this first; clang-cl defines both.
            const A::GP64 N = A::rcx,
                        GP0 = A::rax,
                        GP1 = A::r11,
                      arg[] = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi };

            // xmm6-15 are callee-saved.
            std::array<Val,16> regs = {
                 NA, NA, NA, NA,  NA, NA,RES,RES,
                RES,RES,RES,RES, RES,RES,RES,RES,
            };
            const uint32_t incoming_registers_used = *registers_used;

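            // Incoming Win64 stack, for reference: [rsp+0] is our return address,
            // [rsp+8..39] the caller-provided 32-byte shadow space, then any stack
            // args.  With N in rcx and the first three pointers in rdx/r8/r9, the
            // 4th, 5th, and 6th pointers sit at rsp+40, +48, and +56.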
            auto enter = [&]{
                // rcx,rdx,r8,r9 are all already holding their correct values.
                // Load caller-saved r10 from rsp+40 if there's a fourth arg.
                if (fImpl->strides.size() >= 4) {
                    a->mov(A::r10, A::Mem{A::rsp, 40});
                }
                // Load callee-saved rdi from rsp+48 if there's a fifth arg,
                // first saving it to ABI reserved shadow area rsp+8.
                if (fImpl->strides.size() >= 5) {
                    a->mov(A::Mem{A::rsp, 8}, A::rdi);
                    a->mov(A::rdi, A::Mem{A::rsp, 48});
                }
                // Load callee-saved rsi from rsp+56 if there's a sixth arg,
                // first saving it to ABI reserved shadow area rsp+16.
                if (fImpl->strides.size() >= 6) {
                    a->mov(A::Mem{A::rsp, 16}, A::rsi);
                    a->mov(A::rsi, A::Mem{A::rsp, 56});
                }

                // Allocate stack for our values and callee-saved xmm6-15.
                int stack_needed = nstack_slots*K*4;
                for (int r = 6; r < 16; r++) {
                    if (incoming_registers_used & (1<<r)) {
                        stack_needed += 16;
                    }
                }
                if (stack_needed) { a->sub(A::rsp, stack_needed); }

                int next_saved_xmm = nstack_slots*K*4;
                for (int r = 6; r < 16; r++) {
                    if (incoming_registers_used & (1<<r)) {
                        a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r);
                        next_saved_xmm += 16;
                        regs[r] = NA;
                    }
                }
            };
            auto exit  = [&]{
                // The second pass of jit() shouldn't use any register it didn't use in the first pass.
                SkASSERT((*registers_used & incoming_registers_used) == *registers_used);

                // Restore callee-saved xmm6-15 and the stack pointer.
                int stack_used = nstack_slots*K*4;
                for (int r = 6; r < 16; r++) {
                    if (incoming_registers_used & (1<<r)) {
                        a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used});
                        stack_used += 16;
                    }
                }
                if (stack_used) { a->add(A::rsp, stack_used); }

                // Restore callee-saved rdi/rsi if we used them.
                if (fImpl->strides.size() >= 5) {
                    a->mov(A::rdi, A::Mem{A::rsp, 8});
                }
                if (fImpl->strides.size() >= 6) {
                    a->mov(A::rsi, A::Mem{A::rsp, 16});
                }

                a->vzeroupper();
                a->ret();
            };
        #elif defined(__x86_64__)
            const A::GP64 N = A::rdi,
                        GP0 = A::rax,
                        GP1 = A::r11,
                      arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 };

            // All 16 ymm registers are available to use.
            std::array<Val,16> regs = {
                NA,NA,NA,NA, NA,NA,NA,NA,
                NA,NA,NA,NA, NA,NA,NA,NA,
            };

            auto enter = [&]{
                // Load caller-saved r10 from rsp+8 if there's a sixth arg.
                if (fImpl->strides.size() >= 6) {
                    a->mov(A::r10, A::Mem{A::rsp, 8});
                }
                if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }
            };
            auto exit  = [&]{
                if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
                a->vzeroupper();
                a->ret();
            };
        #endif

        auto load_from_memory = [&](Reg r, Val v) {
            if (instructions[v].op == Op::splat) {
                if (instructions[v].immy == 0) {
                    a->vpxor(r,r,r);
                } else {
                    a->vmovups(r, constants.find(instructions[v].immy));
                }
            } else {
                SkASSERT(stack_slot[v] != NA);
                a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4});
            }
        };
        auto store_to_stack = [&](Reg r, Val v) {
            SkASSERT(next_stack_slot < nstack_slots);
            stack_slot[v] = next_stack_slot++;
            a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r);
        };
    #elif defined(__aarch64__)
        const int K = 4;
        using Reg = A::V;
        const A::X N = A::x0,
                 GP0 = A::x8,
               arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };

        // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
        std::array<Val,32> regs = {
             NA, NA, NA, NA,  NA, NA, NA, NA,
            RES,RES,RES,RES, RES,RES,RES,RES,
             NA, NA, NA, NA,  NA, NA, NA, NA,
             NA, NA, NA, NA,  NA, NA, NA, NA,
        };

        auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
        auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
                          a->ret(A::x30); };

        auto load_from_memory = [&](Reg r, Val v) {
            if (instructions[v].op == Op::splat) {
                if (instructions[v].immy == 0) {
                    a->eor16b(r,r,r);
                } else {
                    a->ldrq(r, constants.find(instructions[v].immy));
                }
            } else {
                SkASSERT(stack_slot[v] != NA);
                a->ldrq(r, A::sp, stack_slot[v]);
            }
        };
        auto store_to_stack = [&](Reg r, Val v) {
            SkASSERT(next_stack_slot < nstack_slots);
            stack_slot[v] = next_stack_slot++;
            a->strq(r, A::sp, stack_slot[v]);
        };
    #endif

        *registers_used = 0;  // We'll update this as we go.

        if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) {
            return false;
        }

        auto emit = [&](Val id, bool scalar) {
            const OptimizedInstruction& inst = instructions[id];
            const Op op = inst.op;
            const Val x = inst.x,
                      y = inst.y,
                      z = inst.z;
            const int immy = inst.immy,
                      immz = inst.immz;

            // alloc_tmp() returns a temporary register, freed manually with free_tmp().
            auto alloc_tmp = [&]() -> Reg {
                // Find an available register, or spill an occupied one if nothing's available.
                auto avail = std::find_if(regs.begin(), regs.end(), [](Val v) { return v == NA; });
                if (avail == regs.end()) {
                    auto score_spills = [&](Val v) -> int {
                        // We cannot spill REServed registers,
                        // nor any registers we need for this instruction.
                        if (v == RES ||
                            v == TMP || v == id || v == x || v == y || v == z) {
                            return 0x7fff'ffff;
                        }
                        // At this point spilling is arbitrary, so we're in the realm of heuristics.
                        // Here, spill the oldest value.  This is nice because,
                        //    A) it's very predictable, even in assembly, and
                        //    B) it's as cheap as you can get.
                        return v;
                    };
                    avail = std::min_element(regs.begin(), regs.end(), [&](Val a, Val b) {
                        return score_spills(a) < score_spills(b);
                    });
                }
                SkASSERT(avail != regs.end());

                Reg r = (Reg)std::distance(regs.begin(), avail);
                Val& v = regs[r];
                *registers_used |= (1<<r);

                SkASSERT(v == NA || v >= 0);
                if (v >= 0) {
                    if (stack_slot[v] == NA && instructions[v].op != Op::splat) {
                        store_to_stack(r, v);
                    }
                    v = NA;
                }
                SkASSERT(v == NA);

                v = TMP;
                return r;
            };

            #if defined(__x86_64__) || defined(_M_X64)  // Nothing special... just unused on ARM.
                auto free_tmp = [&](Reg r) {
                    SkASSERT(regs[r] == TMP);
                    regs[r] = NA;
                };
            #endif

            // Which register holds dst,x,y,z for this instruction?  NA if none does yet.
            int rd = NA,
                rx = NA,
                ry = NA,
                rz = NA;

            auto update_regs = [&](Reg r, Val v) {
                if (v == id) { rd = r; }
                if (v ==  x) { rx = r; }
                if (v ==  y) { ry = r; }
                if (v ==  z) { rz = r; }
                return r;
            };

            auto find_existing_reg = [&](Val v) -> int {
                // Quick-check our working registers.
                if (v == id && rd != NA) { return rd; }
                if (v ==  x && rx != NA) { return rx; }
                if (v ==  y && ry != NA) { return ry; }
                if (v ==  z && rz != NA) { return rz; }

                // Search inter-instruction register map.
                for (auto [r,val] : SkMakeEnumerate(regs)) {
                    if (val == v) {
                        return update_regs((Reg)r, v);
                    }
                }
                return NA;
            };

            // Return a register for Val, holding that value if it already exists.
            // During this instruction all calls to r(v) will return the same register.
            auto r = [&](Val v) -> Reg {
                SkASSERT(v >= 0);

                if (int found = find_existing_reg(v); found != NA) {
                    return (Reg)found;
                }

                Reg r = alloc_tmp();
                SkASSERT(regs[r] == TMP);

                SkASSERT(v <= id);
                if (v < id) {
                    // If v < id, we're loading one of this instruction's inputs.
                    // If v == id we're just allocating its destination register.
                    load_from_memory(r, v);
                }
                regs[r] = v;
                return update_regs(r, v);
            };

            auto dies_here = [&](Val v) -> bool {
                SkASSERT(v >= 0);
                return instructions[v].death == id;
            };

            // Alias dst() to r(v) if dies_here(v).
            auto try_alias = [&](Val v) -> bool {
                SkASSERT(v == x || v == y || v == z);
                if (dies_here(v)) {
                    rd = r(v);      // Vals v and id share a register for this instruction.
                    regs[rd] = id;  // Next instruction, Val id will be in the register, not Val v.
                    return true;
                }
                return false;
            };

            // Generally r(id),
            // but with a hint, try to alias dst() to r(v) if dies_here(v).
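            // E.g. for v2 = add_f32 v1, v0 where v1 dies at v2, dst(x) hands back
            // v1's register and marks it as now holding v2, letting the add
            // overwrite its dying input in place.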
            auto dst = [&](Val hint = NA) -> Reg {
                if (hint != NA) {
                    (void)try_alias(hint);
                }
                return r(id);
            };

            #if defined(__x86_64__) || defined(_M_X64)
                // On x86 we can work with many values directly from the stack or program constant pool.
                auto any = [&](Val v) -> A::Operand {
                    SkASSERT(v >= 0);
                    SkASSERT(v < id);

                    if (int found = find_existing_reg(v); found != NA) {
                        return (Reg)found;
                    }
                    if (instructions[v].op == Op::splat) {
                        return constants.find(instructions[v].immy);
                    }
                    return A::Mem{A::rsp, stack_slot[v]*K*4};
                };

                // This is never really worth asking except when any() might be used;
                // if we need this value on ARM, we might as well just call r(v) to get it into a register.
                auto in_reg = [&](Val v) -> bool {
                    return find_existing_reg(v) != NA;
                };
            #endif

            switch (op) {
                case Op::splat:
                    // Make sure splat constants can be found by load_from_memory() or any().
                    (void)constants[immy];
                    break;

            #if defined(__x86_64__) || defined(_M_X64)
                case Op::assert_true: {
                    a->vptest (r(x), &constants[0xffffffff]);
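                    // vptest sets CF when (constant & ~x) == 0, i.e. when every bit
                    // of every lane of x is 1, so jc skips the int3 trap exactly
                    // when all lanes are true.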
                    A::Label all_true;
                    a->jc(&all_true);
                    a->int3();
                    a->label(&all_true);
                } break;

                case Op::store8:
                    if (scalar) {
                        a->vpextrb(A::Mem{arg[immy]}, (A::Xmm)r(x), 0);
                    } else {
                        a->vpackusdw(dst(x), r(x), r(x));
                        a->vpermq   (dst(), dst(), 0xd8);
                        a->vpackuswb(dst(), dst(), dst());
                        a->vmovq    (A::Mem{arg[immy]}, (A::Xmm)dst());
                    } break;

                case Op::store16:
                    if (scalar) {
                        a->vpextrw(A::Mem{arg[immy]}, (A::Xmm)r(x), 0);
                    } else {
                        a->vpackusdw(dst(x), r(x), r(x));
                        a->vpermq   (dst(), dst(), 0xd8);
                        a->vmovups  (A::Mem{arg[immy]}, (A::Xmm)dst());
                    } break;

                case Op::store32: if (scalar) { a->vmovd  (A::Mem{arg[immy]}, (A::Xmm)r(x)); }
                                  else        { a->vmovups(A::Mem{arg[immy]},         r(x)); }
                                  break;

                case Op::store64: if (scalar) {
                                      a->vmovd(A::Mem{arg[immz],0}, (A::Xmm)r(x));
                                      a->vmovd(A::Mem{arg[immz],4}, (A::Xmm)r(y));
                                  } else {
                                      // r(x) = {a,b,c,d|e,f,g,h}
                                      // r(y) = {i,j,k,l|m,n,o,p}
                                      // We want to write a,i,b,j,c,k,d,l,e,m...
                                      A::Ymm L = alloc_tmp(),
                                             H = alloc_tmp();
                                      a->vpunpckldq(L, r(x), any(y));  // L = {a,i,b,j|e,m,f,n}
                                      a->vpunpckhdq(H, r(x), any(y));  // H = {c,k,d,l|g,o,h,p}
                                      a->vperm2f128(dst(), L,H, 0x20); //    = {a,i,b,j|c,k,d,l}
                                      a->vmovups(A::Mem{arg[immz], 0}, dst());
                                      a->vperm2f128(dst(), L,H, 0x31); //    = {e,m,f,n|g,o,h,p}
                                      a->vmovups(A::Mem{arg[immz],32}, dst());
                                      free_tmp(L);
                                      free_tmp(H);
                                  } break;

                case Op::store128: {
                    // TODO: 8 64-bit stores instead of 16 32-bit stores?
                    int ptr  = immz>>1,
                        lane = immz&1;
                    a->vmovd  (A::Mem{arg[ptr], 0*16 + 8*lane + 0}, (A::Xmm)r(x)   );
                    a->vmovd  (A::Mem{arg[ptr], 0*16 + 8*lane + 4}, (A::Xmm)r(y)   );
                    if (scalar) { break; }
                    a->vpextrd(A::Mem{arg[ptr], 1*16 + 8*lane + 0}, (A::Xmm)r(x), 1);
                    a->vpextrd(A::Mem{arg[ptr], 1*16 + 8*lane + 4}, (A::Xmm)r(y), 1);
                    a->vpextrd(A::Mem{arg[ptr], 2*16 + 8*lane + 0}, (A::Xmm)r(x), 2);
                    a->vpextrd(A::Mem{arg[ptr], 2*16 + 8*lane + 4}, (A::Xmm)r(y), 2);
                    a->vpextrd(A::Mem{arg[ptr], 3*16 + 8*lane + 0}, (A::Xmm)r(x), 3);
                    a->vpextrd(A::Mem{arg[ptr], 3*16 + 8*lane + 4}, (A::Xmm)r(y), 3);
                    // Now we need to store the upper 128 bits of x and y.
                    // Storing x then y rather than interlacing minimizes temporaries.
                    a->vextracti128(dst(), r(x), 1);
                    a->vmovd  (A::Mem{arg[ptr], 4*16 + 8*lane + 0}, (A::Xmm)dst()   );
                    a->vpextrd(A::Mem{arg[ptr], 5*16 + 8*lane + 0}, (A::Xmm)dst(), 1);
                    a->vpextrd(A::Mem{arg[ptr], 6*16 + 8*lane + 0}, (A::Xmm)dst(), 2);
                    a->vpextrd(A::Mem{arg[ptr], 7*16 + 8*lane + 0}, (A::Xmm)dst(), 3);
                    a->vextracti128(dst(), r(y), 1);
                    a->vmovd  (A::Mem{arg[ptr], 4*16 + 8*lane + 4}, (A::Xmm)dst()   );
                    a->vpextrd(A::Mem{arg[ptr], 5*16 + 8*lane + 4}, (A::Xmm)dst(), 1);
                    a->vpextrd(A::Mem{arg[ptr], 6*16 + 8*lane + 4}, (A::Xmm)dst(), 2);
                    a->vpextrd(A::Mem{arg[ptr], 7*16 + 8*lane + 4}, (A::Xmm)dst(), 3);
                } break;

                case Op::load8:  if (scalar) {
                                     a->vpxor  (dst(), dst(), dst());
                                     a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immy]}, 0);
                                 } else {
                                     a->vpmovzxbd(dst(), A::Mem{arg[immy]});
                                 } break;

                case Op::load16: if (scalar) {
                                     a->vpxor  (dst(), dst(), dst());
                                     a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immy]}, 0);
                                 } else {
                                     a->vpmovzxwd(dst(), A::Mem{arg[immy]});
                                 } break;

                case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), A::Mem{arg[immy]}); }
                                 else        { a->vmovups(        dst(), A::Mem{arg[immy]}); }
                                 break;

                case Op::load64: if (scalar) {
                                     a->vmovd((A::Xmm)dst(), A::Mem{arg[immy], 4*immz});
                                 } else {
                                     A::Ymm tmp = alloc_tmp();
                                     a->vmovups(tmp, &load64_index);
                                     a->vpermps(dst(), tmp, A::Mem{arg[immy],  0});
                                     a->vpermps( tmp, tmp,  A::Mem{arg[immy], 32});
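                                     // Now dst() = {lo0,lo1,lo2,lo3|hi0,hi1,hi2,hi3}
                                     // and tmp   = {lo4,lo5,lo6,lo7|hi4,hi5,hi6,hi7}.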
                                     // Low 128 bits hold the immz=0 lanes, high 128 bits hold immz=1.
                                     a->vperm2f128(dst(), dst(),tmp, immz ? 0x31 : 0x20);
                                     free_tmp(tmp);
                                 } break;

                case Op::load128: if (scalar) {
                                      a->vmovd((A::Xmm)dst(), A::Mem{arg[immy], 4*immz});
                                  } else {
                                      // Load 4 low values into xmm tmp,
                                      A::Ymm tmp = alloc_tmp();
                                      A::Xmm t = (A::Xmm)tmp;
                                      a->vmovd  (t,   A::Mem{arg[immy], 0*16 + 4*immz}   );
                                      a->vpinsrd(t,t, A::Mem{arg[immy], 1*16 + 4*immz}, 1);
                                      a->vpinsrd(t,t, A::Mem{arg[immy], 2*16 + 4*immz}, 2);
                                      a->vpinsrd(t,t, A::Mem{arg[immy], 3*16 + 4*immz}, 3);

                                      // Load 4 high values into xmm dst(),
                                      A::Xmm d = (A::Xmm)dst();
                                      a->vmovd  (d,   A::Mem{arg[immy], 4*16 + 4*immz}   );
                                      a->vpinsrd(d,d, A::Mem{arg[immy], 5*16 + 4*immz}, 1);
                                      a->vpinsrd(d,d, A::Mem{arg[immy], 6*16 + 4*immz}, 2);
                                      a->vpinsrd(d,d, A::Mem{arg[immy], 7*16 + 4*immz}, 3);

                                      // Merge the two, ymm dst() = {xmm tmp|xmm dst()}
                                      a->vperm2f128(dst(), tmp,dst(), 0x20);
                                      free_tmp(tmp);
                                  } break;

                case Op::gather8: {
                    // As usual, the gather base pointer is immz bytes off of uniform immy.
                    a->mov(GP0, A::Mem{arg[immy], immz});

                    A::Ymm tmp = alloc_tmp();
                    a->vmovups(tmp, any(x));

                    for (int i = 0; i < (scalar ? 1 : 8); i++) {
                        if (i == 4) {
                            // vpextrd can only pluck indices out from an Xmm register,
                            // so we manually swap over to the top when we're halfway through.
                            a->vextracti128((A::Xmm)tmp, tmp, 1);
                        }
                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                        a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i);
                    }
                    a->vpmovzxbd(dst(), dst());
                    free_tmp(tmp);
                } break;

                case Op::gather16: {
                    // Just like gather8, except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
                    a->mov(GP0, A::Mem{arg[immy], immz});

                    A::Ymm tmp = alloc_tmp();
                    a->vmovups(tmp, any(x));

                    for (int i = 0; i < (scalar ? 1 : 8); i++) {
                        if (i == 4) {
                            a->vextracti128((A::Xmm)tmp, tmp, 1);
                        }
                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
                        a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i);
                    }
                    a->vpmovzxwd(dst(), dst());
                    free_tmp(tmp);
                } break;

                case Op::gather32:
                    if (scalar) {
                        // Our gather base pointer is immz bytes off of uniform immy.
                        a->mov(GP0, A::Mem{arg[immy], immz});

                        // Grab our index from lane 0 of the index argument.
                        a->vmovd(GP1, (A::Xmm)r(x));

                        // dst = *(base + 4*index)
                        a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR});
                    } else {
                        a->mov(GP0, A::Mem{arg[immy], immz});

                        A::Ymm mask = alloc_tmp();
                        a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)

                        a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask);
                        free_tmp(mask);
                    }
                    break;

                case Op::uniform8:  a->movzbq(GP0, A::Mem{arg[immy], immz});
                                    a->vmovd((A::Xmm)dst(), GP0);
                                    a->vbroadcastss(dst(), dst());
                                    break;

                case Op::uniform16: a->movzwq(GP0, A::Mem{arg[immy], immz});
                                    a->vmovd((A::Xmm)dst(), GP0);
                                    a->vbroadcastss(dst(), dst());
                                    break;

                case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immy], immz});
                                    break;

                case Op::index: a->vmovd((A::Xmm)dst(), N);
                                a->vbroadcastss(dst(), dst());
                                a->vpsubd(dst(), dst(), &iota);
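                                // dst = {N, N-1, N-2, ...}; N counts down as the loop
                                // strides along, so each lane ends up with a distinct index.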
                                break;

                // We can swap the arguments of symmetric instructions to make better use of any().
                case Op::add_f32:
                    if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); }
                    else           { a->vaddps(dst(y), r(y), any(x)); }
                    break;

                case Op::mul_f32:
                    if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); }
                    else           { a->vmulps(dst(y), r(y), any(x)); }
                    break;

                case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break;
                case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break;
                case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break; // Order matters,
                case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break; // see test SkVM_min_max.
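                // (vminps/vmaxps return their second operand when the inputs are
                // unordered, so putting x second pins down the NaN behavior.)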

                case Op::fma_f32:
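                    // The 132/213/231 suffixes say where dst's old value lands:
                    // vfmadd132ps d,a,b is d = d*b + a, 213 is d = a*d + b, and
                    // 231 is d = a*b + d, so each branch keeps computing x*y + z.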
                    if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups    (dst(), any(x));
                                        a->vfmadd132ps(dst(), r(z), any(y)); }
                    break;

                case Op::fms_f32:
                    if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups    (dst(), any(x));
                                        a->vfmsub132ps(dst(), r(z), any(y)); }
                    break;

                case Op::fnma_f32:
                    if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else
                    if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else
                    if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else
                                      { a->vmovups     (dst(), any(x));
                                        a->vfnmadd132ps(dst(), r(z), any(y)); }
                    break;

                // In situations like this we want to try aliasing dst(x) when x is
                // already in a register, but not if we'd have to load it from the stack
                // just to alias it.  That's done better directly into the new register.
                case Op::sqrt_f32:
                    if (in_reg(x)) { a->vsqrtps(dst(x),  r(x)); }
                    else           { a->vsqrtps(dst(), any(x)); }
                    break;

                case Op::add_i32:
                    if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); }
                    else           { a->vpaddd(dst(y), r(y), any(x)); }
                    break;
                case Op::mul_i32:
                    if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); }
                    else           { a->vpmulld(dst(y), r(y), any(x)); }
                    break;

                case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break;

                case Op::bit_and:
                    if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); }
                    else           { a->vpand(dst(y), r(y), any(x)); }
                    break;
                case Op::bit_or:
                    if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); }
                    else           { a->vpor(dst(y), r(y), any(x)); }
                    break;
                case Op::bit_xor:
                    if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); }
                    else           { a->vpxor(dst(y), r(y), any(x)); }
                    break;

                case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x.

                case Op::select:
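                    // vpblendvb takes each byte from y where the sign bit of x's
                    // byte is set, else from z; our masks are all-0s or all-1s per
                    // lane, so dst = x ? y : z, just like bsl16b below on ARM.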
                    if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); }
                    else              { a->vpblendvb(dst(x), r(z), any(y), r(x)); }
                    break;

                case Op::shl_i32: a->vpslld(dst(x), r(x), immy); break;
                case Op::shr_i32: a->vpsrld(dst(x), r(x), immy); break;
                case Op::sra_i32: a->vpsrad(dst(x), r(x), immy); break;

                case Op::eq_i32:
                    if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); }
                    else           { a->vpcmpeqd(dst(y), r(y), any(x)); }
                    break;

                case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break;

                case Op::eq_f32:
                    if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); }
                    else           { a->vcmpeqps(dst(y), r(y), any(x)); }
                    break;
                case Op::neq_f32:
                    if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); }
                    else           { a->vcmpneqps(dst(y), r(y), any(x)); }
                    break;

                case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break;
                case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break;

                // It's safe to alias dst(y) only when y != x.  Otherwise we'd overwrite x!
                case Op::pack: a->vpslld(dst(y != x ? y : NA),  r(y), immz);
                               a->vpor  (dst(), dst(), any(x));
                               break;

                case Op::ceil:
                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::CEIL); }
                    else           { a->vroundps(dst(), any(x), Assembler::CEIL); }
                    break;

                case Op::floor:
                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::FLOOR); }
                    else           { a->vroundps(dst(), any(x), Assembler::FLOOR); }
                    break;

                case Op::to_f32:
                    if (in_reg(x)) { a->vcvtdq2ps(dst(x),  r(x)); }
                    else           { a->vcvtdq2ps(dst(), any(x)); }
                    break;

                case Op::trunc:
                    if (in_reg(x)) { a->vcvttps2dq(dst(x),  r(x)); }
                    else           { a->vcvttps2dq(dst(), any(x)); }
                    break;

                case Op::round:
                    if (in_reg(x)) { a->vcvtps2dq(dst(x),  r(x)); }
                    else           { a->vcvtps2dq(dst(), any(x)); }
                    break;

                case Op::to_half:
                    a->vcvtps2ph(dst(x), r(x), A::CURRENT);  // f32 ymm -> f16 xmm
                    a->vpmovzxwd(dst(), dst());              // f16 xmm -> f16 ymm
                    break;

                case Op::from_half:
                    a->vpackusdw(dst(x), r(x), r(x));  // f16 ymm -> f16 xmm
                    a->vpermq   (dst(), dst(), 0xd8);  // swap middle two 64-bit lanes
                    a->vcvtph2ps(dst(), dst());        // f16 xmm -> f32 ymm
                    break;

            #elif defined(__aarch64__)
                default:  // TODO
                    if (false) {
                        SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
                    }
                    return false;

                case Op::assert_true: {
                    a->uminv4s(dst(), r(x));   // uminv acts like an all() across the vector.
                    a->fmovs(GP0, dst());
                    A::Label all_true;
                    a->cbnz(GP0, &all_true);
                    a->brk(0);
                    a->label(&all_true);
                } break;

                case Op::store8: a->xtns2h(dst(), r(x));
                                 a->xtnh2b(dst(), dst());
                   if (scalar) { a->strb  (dst(), arg[immy]); }
                   else        { a->strs  (dst(), arg[immy]); }
                                 break;

                case Op::store32: if (scalar) { a->strs(r(x), arg[immy]); }
                                  else        { a->strq(r(x), arg[immy]); }
                                  break;

                case Op::load8: if (scalar) { a->ldrb(dst(), arg[immy]); }
                                else        { a->ldrs(dst(), arg[immy]); }
                                a->uxtlb2h(dst(), dst());
                                a->uxtlh2s(dst(), dst());
                                break;

                case Op::load32: if (scalar) { a->ldrs(dst(), arg[immy]); }
                                 else        { a->ldrq(dst(), arg[immy]); }
                                 break;

                case Op::add_f32: a->fadd4s(dst(), r(x), r(y)); break;
                case Op::sub_f32: a->fsub4s(dst(), r(x), r(y)); break;
                case Op::mul_f32: a->fmul4s(dst(), r(x), r(y)); break;
                case Op::div_f32: a->fdiv4s(dst(), r(x), r(y)); break;

                case Op::fma_f32: // fmla.4s is z += x*y
                    if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmla4s(dst(), r(x), r(y)); }
                    break;

                case Op::fnma_f32: // fmls.4s is z -= x*y
                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmls4s(dst(), r(x), r(y)); }
                    break;

                case Op::fms_f32: // calculate z - xy, then negate to xy - z
                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
                    else              { a->orr16b(dst(), r(z), r(z));
                                        a->fmls4s(dst(), r(x), r(y)); }
                    a->fneg4s(dst(), dst());
                    break;

                case Op:: gt_f32: a->fcmgt4s (dst(), r(x), r(y)); break;
                case Op::gte_f32: a->fcmge4s (dst(), r(x), r(y)); break;
                case Op:: eq_f32: a->fcmeq4s (dst(), r(x), r(y)); break;
                case Op::neq_f32: a->fcmeq4s (dst(), r(x), r(y));
                                  a->not16b  (dst(), dst());      break;

                case Op::add_i32: a->add4s(dst(), r(x), r(y)); break;
                case Op::sub_i32: a->sub4s(dst(), r(x), r(y)); break;
                case Op::mul_i32: a->mul4s(dst(), r(x), r(y)); break;

                case Op::bit_and  : a->and16b(dst(), r(x), r(y)); break;
                case Op::bit_or   : a->orr16b(dst(), r(x), r(y)); break;
                case Op::bit_xor  : a->eor16b(dst(), r(x), r(y)); break;
                case Op::bit_clear: a->bic16b(dst(), r(x), r(y)); break;

                case Op::select:  // bsl16b is x = x ? y : z
                    if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
                    else              { a->orr16b(dst(), r(x), r(x));
                                        a->bsl16b(dst(), r(y), r(z)); }
                    break;

                // fmin4s and fmax4s don't work the way we want with NaN,
                // so we write them the long way:
                case Op::min_f32: // min(x,y) = y<x ? y : x
                    a->fcmgt4s(dst(), r(x), r(y));
                    a->bsl16b (dst(), r(y), r(x));
                    break;

                case Op::max_f32: // max(x,y) = x<y ? y : x
                    a->fcmgt4s(dst(), r(y), r(x));
                    a->bsl16b (dst(), r(y), r(x));
                    break;

                case Op::shl_i32: a-> shl4s(dst(), r(x), immy); break;
                case Op::shr_i32: a->ushr4s(dst(), r(x), immy); break;
                case Op::sra_i32: a->sshr4s(dst(), r(x), immy); break;

                case Op::eq_i32: a->cmeq4s(dst(), r(x), r(y)); break;
                case Op::gt_i32: a->cmgt4s(dst(), r(x), r(y)); break;

                case Op::pack:
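                    // sli4s shifts y left by immz and inserts it into x, keeping
                    // x's low immz bits: exactly x | (y << immz) when the two
                    // fields don't overlap.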
                    if (try_alias(x)) { a->sli4s ( r(x),  r(y), immz); }
                    else              { a->shl4s (dst(),  r(y), immz);
                                        a->orr16b(dst(), dst(), r(x)); }
                    break;

                case Op::to_f32: a->scvtf4s (dst(), r(x)); break;
                case Op::trunc:  a->fcvtzs4s(dst(), r(x)); break;
                case Op::round:  a->fcvtns4s(dst(), r(x)); break;
                // TODO: fcvtns.4s rounds to nearest even.
                // I think we actually want frintx -> fcvtzs to round to current mode.
            #endif
            }

            // Proactively free the registers holding any value that dies here.
            if (rd != NA &&                   dies_here(regs[rd])) { regs[rd] = NA; }
            if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; }
            if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; }
            if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; }
            return true;
        };

    #if defined(__x86_64__) || defined(_M_X64)
        auto jump_if_less = [&](A::Label* l) { a->jl (l); };
        auto jump         = [&](A::Label* l) { a->jmp(l); };

        auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
        auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
    #elif defined(__aarch64__)
        auto jump_if_less = [&](A::Label* l) { a->blt(l); };
        auto jump         = [&](A::Label* l) { a->b  (l); };

        auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
        auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
    #endif

        A::Label body,
                 tail,
                 done;
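
        // From here on out, the code we emit has this shape:
        //
        //     enter()
        //     hoisted instructions
        // body:
        //     if (N < K) goto tail
        //     loop instructions, K lanes at a time
        //     bump each varying arg along by K*stride; N -= K
        //     goto body
        // tail:
        //     if (N < 1) goto done
        //     loop instructions, one lane at a time
        //     bump each varying arg along by stride; N -= 1
        //     goto tail
        // done:
        //     exit()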
        enter();
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
                return false;
            }
        }

        // This point marks a kind of canonical fixed point for register contents: if loop
        // code is generated as if these registers are holding these values, the next time
        // the loop comes around we'd better find those same registers holding those same values.
        auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot,
                                        saved_next_stack_slot=next_stack_slot]{
            for (int r = 0; r < (int)regs.size(); r++) {
                if (regs[r] != incoming[r]) {
                    regs[r] = incoming[r];
                    if (regs[r] >= 0) {
                        load_from_memory((Reg)r, regs[r]);
                    }
                }
            }
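            // Record the high-water mark of spill slots for jit()'s second pass,
            // then reset the spill bookkeeping: the scalar tail re-emits the same
            // instructions and must see the same starting state we saw here.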
            *stack_hint = std::max(*stack_hint, next_stack_slot);
            stack_slot = saved_stack_slot;
            next_stack_slot = saved_next_stack_slot;
        };

        a->label(&body);
        {
            a->cmp(N, K);
            jump_if_less(&tail);
            for (Val id = 0; id < (Val)instructions.size(); id++) {
                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
                    return false;
                }
            }
            restore_incoming_regs();
            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
                if (fImpl->strides[i]) {
                    add(arg[i], K*fImpl->strides[i]);
                }
            }
            sub(N, K);
            jump(&body);
        }

        a->label(&tail);
        {
            a->cmp(N, 1);
            jump_if_less(&done);
            for (Val id = 0; id < (Val)instructions.size(); id++) {
                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) {
                    return false;
                }
            }
            restore_incoming_regs();
            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
                if (fImpl->strides[i]) {
                    add(arg[i], 1*fImpl->strides[i]);
                }
            }
            sub(N, 1);
            jump(&tail);
        }

        a->label(&done);
        {
            exit();
        }

        // Except for explicit aligned load and store instructions, AVX allows
        // memory operands to be unaligned.  So even though we're creating 16-byte
        // patterns on ARM or 32-byte patterns on x86, we only need to align to
        // 4 bytes, the element size and alignment requirement.

        constants.foreach([&](int imm, A::Label* label) {
            a->align(4);
            a->label(label);
            for (int i = 0; i < K; i++) {
                a->word(imm);
            }
        });

        if (!iota.references.empty()) {
            a->align(4);
            a->label(&iota);   // 0,1,2,3,4,...
            for (int i = 0; i < K; i++) {
                a->word(i);
            }
        }

        if (!load64_index.references.empty()) {
            a->align(4);
            a->label(&load64_index);   // {0,2,4,6|1,3,5,7}
            a->word(0); a->word(2); a->word(4); a->word(6);
            a->word(1); a->word(3); a->word(5); a->word(7);
        }

        return true;
    }

    void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
                           const char* debug_name) {
        // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
        // and stack_hint/registers_used to feed forward into the next jit() call.
        Assembler a{nullptr};
        int stack_hint = -1;
        uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
        if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
            return;
        }

        fImpl->jit_size = a.size();
        void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
        fImpl->jit_entry.store(jit_entry);

        // Assemble the program for real, with stack_hint/registers_used as feedback from the first call.
        a = Assembler{jit_entry};
        SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
        SkASSERT(a.size() <= fImpl->jit_size);

        // Remap as executable, and flush caches on platforms that need that.
        remap_as_executable(jit_entry, fImpl->jit_size);

        notify_vtune(debug_name, jit_entry, fImpl->jit_size);

    #if !defined(SK_BUILD_FOR_WIN)
        // For profiling and debugging, it's helpful to have this code loaded
        // dynamically rather than just jumping into fImpl->jit_entry.
        if (gSkVMJITViaDylib) {
            // Dump the raw program binary.
            SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
            int fd = mkstemp(path.writable_str());
            ::write(fd, jit_entry, a.size());
            close(fd);

            this->dropJIT();  // (unmap and null out fImpl->jit_entry.)

            // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
            SkString cmd = SkStringPrintf(
                    "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
                    " | clang -x assembler -shared - -o %s",
                    path.c_str(), path.c_str());
            system(cmd.c_str());

            // Load that dynamic library and look up skvm_jit().
            fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
            void* sym = nullptr;
            for (const char* name : {"skvm_jit", "_skvm_jit"} ) {
                if (!sym) { sym = dlsym(fImpl->dylib, name); }
            }
            fImpl->jit_entry.store(sym);
        }
    #endif
    }
#endif

}  // namespace skvm
