1 | /* |
2 | * Copyright 2019 Google LLC |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #include "include/core/SkStream.h" |
9 | #include "include/core/SkString.h" |
10 | #include "include/private/SkChecksum.h" |
11 | #include "include/private/SkHalf.h" |
12 | #include "include/private/SkSpinlock.h" |
13 | #include "include/private/SkTFitsIn.h" |
14 | #include "include/private/SkThreadID.h" |
15 | #include "include/private/SkVx.h" |
16 | #include "src/core/SkColorSpaceXformSteps.h" |
17 | #include "src/core/SkCpu.h" |
18 | #include "src/core/SkEnumerate.h" |
19 | #include "src/core/SkOpts.h" |
20 | #include "src/core/SkVM.h" |
21 | #include <algorithm> |
22 | #include <atomic> |
23 | #include <queue> |
24 | |
25 | #if defined(SKVM_LLVM) |
26 | #include <future> |
27 | #include <llvm/Bitcode/BitcodeWriter.h> |
28 | #include <llvm/ExecutionEngine/ExecutionEngine.h> |
29 | #include <llvm/IR/IRBuilder.h> |
30 | #include <llvm/IR/Verifier.h> |
31 | #include <llvm/Support/TargetSelect.h> |
32 | |
33 | // Platform-specific intrinsics got their own files in LLVM 10. |
34 | #if __has_include(<llvm/IR/IntrinsicsX86.h>) |
35 | #include <llvm/IR/IntrinsicsX86.h> |
36 | #endif |
37 | #endif |
38 | |
39 | bool gSkVMAllowJIT{false}; |
40 | bool gSkVMJITViaDylib{false}; |
41 | |
42 | #if defined(SKVM_JIT) |
43 | #if defined(SK_BUILD_FOR_WIN) |
44 | #include "src/core/SkLeanWindows.h" |
45 | #include <memoryapi.h> |
46 | |
47 | static void* alloc_jit_buffer(size_t* len) { |
48 | return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); |
49 | } |
50 | static void unmap_jit_buffer(void* ptr, size_t len) { |
51 | VirtualFree(ptr, 0, MEM_RELEASE); |
52 | } |
53 | static void remap_as_executable(void* ptr, size_t len) { |
54 | DWORD old; |
55 | VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old); |
56 | SkASSERT(old == PAGE_READWRITE); |
57 | } |
58 | static void close_dylib(void* dylib) { |
59 | SkASSERT(false); // TODO? For now just assert we never make one. |
60 | } |
61 | #else |
62 | #include <dlfcn.h> |
63 | #include <sys/mman.h> |
64 | |
65 | static void* alloc_jit_buffer(size_t* len) { |
            // While mprotect and VirtualAlloc both work at page granularity,
            // mprotect doesn't round up for you; it requires that *len
            // already be a multiple of the page size.
68 | const size_t page = sysconf(_SC_PAGESIZE); |
69 | *len = ((*len + page - 1) / page) * page; |
70 | return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0); |
71 | } |
72 | static void unmap_jit_buffer(void* ptr, size_t len) { |
73 | munmap(ptr, len); |
74 | } |
75 | static void remap_as_executable(void* ptr, size_t len) { |
76 | mprotect(ptr, len, PROT_READ|PROT_EXEC); |
77 | __builtin___clear_cache((char*)ptr, |
78 | (char*)ptr + len); |
79 | } |
80 | static void close_dylib(void* dylib) { |
81 | dlclose(dylib); |
82 | } |
83 | #endif |
84 | |
85 | #if defined(SKVM_JIT_VTUNE) |
86 | #include <jitprofiling.h> |
87 | static void notify_vtune(const char* name, void* addr, size_t len) { |
88 | if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) { |
89 | iJIT_Method_Load event; |
90 | memset(&event, 0, sizeof(event)); |
91 | event.method_id = iJIT_GetNewMethodID(); |
92 | event.method_name = const_cast<char*>(name); |
93 | event.method_load_address = addr; |
94 | event.method_size = len; |
95 | iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event); |
96 | } |
97 | } |
98 | #else |
99 | static void notify_vtune(const char* name, void* addr, size_t len) {} |
100 | #endif |
101 | #endif |
102 | |
// JIT code isn't MSAN-instrumented, so we won't see when it uses
// uninitialized memory, and we won't see the writes it makes as properly
// initializing memory. Instead, force the interpreter, which should let
// MSAN see everything our programs do properly.
107 | // |
108 | // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter. |
109 | #if defined(__has_feature) |
110 | #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer) |
111 | #define SKVM_JIT_BUT_IGNORE_IT |
112 | #endif |
113 | #endif |
114 | |
115 | |
116 | |
117 | namespace skvm { |
118 | |
119 | struct Program::Impl { |
120 | std::vector<InterpreterInstruction> instructions; |
121 | int regs = 0; |
122 | int loop = 0; |
123 | std::vector<int> strides; |
124 | |
125 | std::atomic<void*> jit_entry{nullptr}; // TODO: minimal std::memory_orders |
126 | size_t jit_size = 0; |
127 | void* dylib = nullptr; |
128 | |
129 | #if defined(SKVM_LLVM) |
130 | std::unique_ptr<llvm::LLVMContext> llvm_ctx; |
131 | std::unique_ptr<llvm::ExecutionEngine> llvm_ee; |
132 | std::future<void> llvm_compiling; |
133 | #endif |
134 | }; |
135 | |
136 | // Debugging tools, mostly for printing various data structures out to a stream. |
137 | |
138 | namespace { |
139 | class SkDebugfStream final : public SkWStream { |
140 | size_t fBytesWritten = 0; |
141 | |
142 | bool write(const void* buffer, size_t size) override { |
143 | SkDebugf("%.*s" , size, buffer); |
144 | fBytesWritten += size; |
145 | return true; |
146 | } |
147 | |
148 | size_t bytesWritten() const override { |
149 | return fBytesWritten; |
150 | } |
151 | }; |
152 | |
153 | struct V { Val id; }; |
154 | struct R { Reg id; }; |
155 | struct Shift { int bits; }; |
156 | struct Splat { int bits; }; |
157 | struct Hex { int bits; }; |
158 | struct Attr { const char* label; int v; }; |
159 | |
160 | static void write(SkWStream* o, const char* s) { |
161 | o->writeText(s); |
162 | } |
163 | |
164 | static const char* name(Op op) { |
165 | switch (op) { |
166 | #define M(x) case Op::x: return #x; |
167 | SKVM_OPS(M) |
168 | #undef M |
169 | } |
170 | return "unknown op" ; |
171 | } |
172 | |
173 | static void write(SkWStream* o, Op op) { |
174 | o->writeText(name(op)); |
175 | } |
176 | static void write(SkWStream* o, Arg a) { |
177 | write(o, "arg(" ); |
178 | o->writeDecAsText(a.ix); |
179 | write(o, ")" ); |
180 | } |
181 | static void write(SkWStream* o, V v) { |
182 | write(o, "v" ); |
183 | o->writeDecAsText(v.id); |
184 | } |
185 | static void write(SkWStream* o, R r) { |
186 | write(o, "r" ); |
187 | o->writeDecAsText(r.id); |
188 | } |
189 | static void write(SkWStream* o, Shift s) { |
190 | o->writeDecAsText(s.bits); |
191 | } |
192 | static void write(SkWStream* o, Splat s) { |
193 | float f; |
194 | memcpy(&f, &s.bits, 4); |
195 | o->writeHexAsText(s.bits); |
196 | write(o, " (" ); |
197 | o->writeScalarAsText(f); |
198 | write(o, ")" ); |
199 | } |
200 | static void write(SkWStream* o, Hex h) { |
201 | o->writeHexAsText(h.bits); |
202 | } |
203 | [[maybe_unused]] static void write(SkWStream* o, Attr a) { |
204 | write(o, a.label); |
205 | write(o, " " ); |
206 | o->writeDecAsText(a.v); |
207 | } |
208 | |
209 | template <typename T, typename... Ts> |
210 | static void write(SkWStream* o, T first, Ts... rest) { |
211 | write(o, first); |
212 | write(o, " " ); |
213 | write(o, rest...); |
214 | } |
215 | } // namespace |
216 | |
217 | void Builder::dot(SkWStream* o) const { |
218 | SkDebugfStream debug; |
219 | if (!o) { o = &debug; } |
220 | |
221 | std::vector<OptimizedInstruction> optimized = this->optimize(); |
222 | |
223 | o->writeText("digraph {\n" ); |
224 | for (Val id = 0; id < (Val)optimized.size(); id++) { |
225 | auto [op, x,y,z, immy,immz, death,can_hoist] = optimized[id]; |
226 | |
227 | switch (op) { |
228 | default: |
229 | write(o, "\t" , V{id}, " [label = \"" , V{id}, op); |
230 | // Not a perfect heuristic; sometimes y/z == NA and there is no immy/z. |
231 | // On the other hand, sometimes immy/z=0 is meaningful and should be printed. |
232 | if (y == NA) { write(o, "" , Hex{immy}); } |
233 | if (z == NA) { write(o, "" , Hex{immz}); } |
234 | write(o, "\"]\n" ); |
235 | |
236 | write(o, "\t" , V{id}, " -> {" ); |
237 | // In contrast to the heuristic imm labels, these dependences are exact. |
238 | if (x != NA) { write(o, "" , V{x}); } |
239 | if (y != NA) { write(o, "" , V{y}); } |
240 | if (z != NA) { write(o, "" , V{z}); } |
241 | write(o, " }\n" ); |
242 | |
243 | break; |
244 | |
245 | // That default: impl works pretty well for most instructions, |
246 | // but some are nicer to see with a specialized label. |
247 | |
248 | case Op::splat: |
249 | write(o, "\t" , V{id}, " [label = \"" , V{id}, op, Splat{immy}, "\"]\n" ); |
250 | break; |
251 | } |
252 | } |
253 | o->writeText("}\n" ); |
254 | } |
255 | |
256 | template <typename I, typename... Fs> |
257 | static void write_one_instruction(Val id, const I& inst, SkWStream* o, Fs... fs) { |
258 | Op op = inst.op; |
259 | Val x = inst.x, |
260 | y = inst.y, |
261 | z = inst.z; |
262 | int immy = inst.immy, |
263 | immz = inst.immz; |
264 | switch (op) { |
265 | case Op::assert_true: write(o, op, V{x}, V{y}, fs(id)...); break; |
266 | |
267 | case Op::store8: write(o, op, Arg{immy} , V{x}, fs(id)...); break; |
268 | case Op::store16: write(o, op, Arg{immy} , V{x}, fs(id)...); break; |
269 | case Op::store32: write(o, op, Arg{immy} , V{x}, fs(id)...); break; |
270 | case Op::store64: write(o, op, Arg{immz} , V{x},V{y}, fs(id)...); break; |
271 | case Op::store128: write(o, op, Arg{immz>>1}, V{x},V{y},Hex{immz&1}, fs(id)...); break; |
272 | |
273 | case Op::index: write(o, V{id}, "=" , op, fs(id)...); break; |
274 | |
275 | case Op::load8: write(o, V{id}, "=" , op, Arg{immy}, fs(id)...); break; |
276 | case Op::load16: write(o, V{id}, "=" , op, Arg{immy}, fs(id)...); break; |
277 | case Op::load32: write(o, V{id}, "=" , op, Arg{immy}, fs(id)...); break; |
278 | case Op::load64: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, fs(id)...); break; |
279 | case Op::load128: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, fs(id)...); break; |
280 | |
281 | case Op::gather8: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, V{x}, fs(id)...); break; |
282 | case Op::gather16: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, V{x}, fs(id)...); break; |
283 | case Op::gather32: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, V{x}, fs(id)...); break; |
284 | |
285 | case Op::uniform8: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, fs(id)...); break; |
286 | case Op::uniform16: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, fs(id)...); break; |
287 | case Op::uniform32: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, fs(id)...); break; |
288 | |
289 | case Op::splat: write(o, V{id}, "=" , op, Splat{immy}, fs(id)...); break; |
290 | |
291 | case Op::add_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
292 | case Op::sub_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
293 | case Op::mul_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
294 | case Op::div_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
295 | case Op::min_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
296 | case Op::max_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
297 | case Op::fma_f32: write(o, V{id}, "=" , op, V{x}, V{y}, V{z}, fs(id)...); break; |
298 | case Op::fms_f32: write(o, V{id}, "=" , op, V{x}, V{y}, V{z}, fs(id)...); break; |
299 | case Op::fnma_f32: write(o, V{id}, "=" , op, V{x}, V{y}, V{z}, fs(id)...); break; |
300 | |
301 | |
302 | case Op::sqrt_f32: write(o, V{id}, "=" , op, V{x}, fs(id)...); break; |
303 | |
304 | case Op:: eq_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)...); break; |
305 | case Op::neq_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)...); break; |
306 | case Op:: gt_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)...); break; |
307 | case Op::gte_f32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)...); break; |
308 | |
309 | |
310 | case Op::add_i32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)...); break; |
311 | case Op::sub_i32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)...); break; |
312 | case Op::mul_i32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)...); break; |
313 | |
314 | case Op::shl_i32: write(o, V{id}, "=" , op, V{x}, Shift{immy}, fs(id)...); break; |
315 | case Op::shr_i32: write(o, V{id}, "=" , op, V{x}, Shift{immy}, fs(id)...); break; |
316 | case Op::sra_i32: write(o, V{id}, "=" , op, V{x}, Shift{immy}, fs(id)...); break; |
317 | |
318 | case Op:: eq_i32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)...); break; |
319 | case Op:: gt_i32: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)...); break; |
320 | |
321 | case Op::bit_and : write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
322 | case Op::bit_or : write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
323 | case Op::bit_xor : write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
324 | case Op::bit_clear: write(o, V{id}, "=" , op, V{x}, V{y}, fs(id)... ); break; |
325 | |
326 | case Op::select: write(o, V{id}, "=" , op, V{x}, V{y}, V{z}, fs(id)...); break; |
327 | case Op::pack: write(o, V{id}, "=" , op, V{x}, V{y}, Shift{immz}, fs(id)...); break; |
328 | |
329 | case Op::ceil: write(o, V{id}, "=" , op, V{x}, fs(id)...); break; |
330 | case Op::floor: write(o, V{id}, "=" , op, V{x}, fs(id)...); break; |
331 | case Op::to_f32: write(o, V{id}, "=" , op, V{x}, fs(id)...); break; |
332 | case Op::to_half: write(o, V{id}, "=" , op, V{x}, fs(id)...); break; |
333 | case Op::from_half: write(o, V{id}, "=" , op, V{x}, fs(id)...); break; |
334 | case Op::trunc: write(o, V{id}, "=" , op, V{x}, fs(id)...); break; |
335 | case Op::round: write(o, V{id}, "=" , op, V{x}, fs(id)...); break; |
336 | } |
337 | |
338 | write(o, "\n" ); |
339 | } |
340 | |
341 | void Builder::dump(SkWStream* o) const { |
342 | SkDebugfStream debug; |
343 | if (!o) { o = &debug; } |
344 | |
345 | std::vector<OptimizedInstruction> optimized = this->optimize(); |
346 | o->writeDecAsText(optimized.size()); |
347 | o->writeText(" values (originally " ); |
348 | o->writeDecAsText(fProgram.size()); |
349 | o->writeText("):\n" ); |
350 | for (Val id = 0; id < (Val)optimized.size(); id++) { |
351 | const OptimizedInstruction& inst = optimized[id]; |
352 | write(o, inst.can_hoist ? "↑ " : " " ); |
353 | write_one_instruction(id, inst, o); |
354 | } |
355 | } |
356 | |
357 | template <typename... Fs> |
358 | void dump_instructions(const std::vector<Instruction>& instructions, SkWStream* o, Fs... fs) { |
359 | SkDebugfStream debug; |
360 | if (o == nullptr) { |
361 | o = &debug; |
362 | } |
363 | write(o, Attr{"Instruction count:" , (int)instructions.size()}); |
364 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
365 | write_one_instruction(id, instructions[id], o, std::forward<Fs>(fs)...); |
366 | } |
367 | } |
368 | |
369 | void Program::dump(SkWStream* o) const { |
370 | SkDebugfStream debug; |
371 | if (!o) { o = &debug; } |
372 | |
373 | o->writeDecAsText(fImpl->regs); |
374 | o->writeText(" registers, " ); |
375 | o->writeDecAsText(fImpl->instructions.size()); |
376 | o->writeText(" instructions:\n" ); |
377 | for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) { |
378 | if (i == fImpl->loop) { write(o, "loop:\n" ); } |
379 | o->writeDecAsText(i); |
380 | o->writeText("\t" ); |
381 | if (i >= fImpl->loop) { write(o, " " ); } |
382 | const InterpreterInstruction& inst = fImpl->instructions[i]; |
383 | Op op = inst.op; |
384 | Reg d = inst.d, |
385 | x = inst.x, |
386 | y = inst.y, |
387 | z = inst.z; |
388 | int immy = inst.immy, |
389 | immz = inst.immz; |
390 | switch (op) { |
391 | case Op::assert_true: write(o, op, R{x}, R{y}); break; |
392 | |
393 | case Op::store8: write(o, op, Arg{immy} , R{x} ); break; |
394 | case Op::store16: write(o, op, Arg{immy} , R{x} ); break; |
395 | case Op::store32: write(o, op, Arg{immy} , R{x} ); break; |
396 | case Op::store64: write(o, op, Arg{immz} , R{x}, R{y} ); break; |
397 | case Op::store128: write(o, op, Arg{immz>>1}, R{x}, R{y}, Hex{immz&1}); break; |
398 | |
399 | case Op::index: write(o, R{d}, "=" , op); break; |
400 | |
401 | case Op::load8: write(o, R{d}, "=" , op, Arg{immy}); break; |
402 | case Op::load16: write(o, R{d}, "=" , op, Arg{immy}); break; |
403 | case Op::load32: write(o, R{d}, "=" , op, Arg{immy}); break; |
404 | case Op::load64: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}); break; |
405 | case Op::load128: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}); break; |
406 | |
407 | case Op::gather8: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}, R{x}); break; |
408 | case Op::gather16: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}, R{x}); break; |
409 | case Op::gather32: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}, R{x}); break; |
410 | |
411 | case Op::uniform8: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}); break; |
412 | case Op::uniform16: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}); break; |
413 | case Op::uniform32: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}); break; |
414 | |
415 | case Op::splat: write(o, R{d}, "=" , op, Splat{immy}); break; |
416 | |
417 | |
418 | case Op::add_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
419 | case Op::sub_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
420 | case Op::mul_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
421 | case Op::div_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
422 | case Op::min_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
423 | case Op::max_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
424 | case Op::fma_f32: write(o, R{d}, "=" , op, R{x}, R{y}, R{z}); break; |
425 | case Op::fms_f32: write(o, R{d}, "=" , op, R{x}, R{y}, R{z}); break; |
426 | case Op::fnma_f32: write(o, R{d}, "=" , op, R{x}, R{y}, R{z}); break; |
427 | |
428 | case Op::sqrt_f32: write(o, R{d}, "=" , op, R{x}); break; |
429 | |
430 | case Op:: eq_f32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
431 | case Op::neq_f32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
432 | case Op:: gt_f32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
433 | case Op::gte_f32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
434 | |
435 | |
436 | case Op::add_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
437 | case Op::sub_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
438 | case Op::mul_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
439 | |
440 | case Op::shl_i32: write(o, R{d}, "=" , op, R{x}, Shift{immy}); break; |
441 | case Op::shr_i32: write(o, R{d}, "=" , op, R{x}, Shift{immy}); break; |
442 | case Op::sra_i32: write(o, R{d}, "=" , op, R{x}, Shift{immy}); break; |
443 | |
444 | case Op:: eq_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
445 | case Op:: gt_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
446 | |
447 | case Op::bit_and : write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
448 | case Op::bit_or : write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
449 | case Op::bit_xor : write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
450 | case Op::bit_clear: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
451 | |
452 | case Op::select: write(o, R{d}, "=" , op, R{x}, R{y}, R{z}); break; |
453 | case Op::pack: write(o, R{d}, "=" , op, R{x}, R{y}, Shift{immz}); break; |
454 | |
455 | case Op::ceil: write(o, R{d}, "=" , op, R{x}); break; |
456 | case Op::floor: write(o, R{d}, "=" , op, R{x}); break; |
457 | case Op::to_f32: write(o, R{d}, "=" , op, R{x}); break; |
458 | case Op::to_half: write(o, R{d}, "=" , op, R{x}); break; |
459 | case Op::from_half: write(o, R{d}, "=" , op, R{x}); break; |
460 | case Op::trunc: write(o, R{d}, "=" , op, R{x}); break; |
461 | case Op::round: write(o, R{d}, "=" , op, R{x}); break; |
462 | } |
463 | write(o, "\n" ); |
464 | } |
465 | } |
466 | |
467 | std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) { |
468 | // Determine which Instructions are live by working back from side effects. |
469 | std::vector<bool> live(program.size(), false); |
470 | auto mark_live = [&](Val id, auto& recurse) -> void { |
471 | if (live[id] == false) { |
472 | live[id] = true; |
473 | Instruction inst = program[id]; |
474 | for (Val arg : {inst.x, inst.y, inst.z}) { |
475 | if (arg != NA) { recurse(arg, recurse); } |
476 | } |
477 | } |
478 | }; |
479 | for (Val id = 0; id < (Val)program.size(); id++) { |
480 | if (has_side_effect(program[id].op)) { |
481 | mark_live(id, mark_live); |
482 | } |
483 | } |
484 | |
485 | // Rewrite the program with only live Instructions: |
486 | // - remap IDs in live Instructions to what they'll be once dead Instructions are removed; |
487 | // - then actually remove the dead Instructions. |
488 | std::vector<Val> new_id(program.size(), NA); |
489 | for (Val id = 0, next = 0; id < (Val)program.size(); id++) { |
490 | if (live[id]) { |
491 | Instruction& inst = program[id]; |
492 | for (Val* arg : {&inst.x, &inst.y, &inst.z}) { |
493 | if (*arg != NA) { |
494 | *arg = new_id[*arg]; |
495 | SkASSERT(*arg != NA); |
496 | } |
497 | } |
498 | new_id[id] = next++; |
499 | } |
500 | } |
501 | auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) { |
502 | Val id = (Val)(&inst - program.data()); |
503 | return !live[id]; |
504 | }); |
505 | program.erase(it, program.end()); |
506 | |
507 | return program; |
508 | } |
509 | |
510 | // Impose a deterministic scheduling of Instructions based on data flow alone, |
511 | // eliminating any influence from original program order. We'll schedule back-to-front, |
512 | // starting at the end of the program with Instructions that have side effects and |
513 | // recursing through arguments to Instructions that issue earlier in the program. |
514 | // We schedule each argument once all its users have been scheduled, which means it |
515 | // issues just before its first use. We arbitrarily schedule x, then y, then z, and so |
516 | // issue z, then y, then x. |
517 | std::vector<Instruction> schedule(std::vector<Instruction> program) { |
518 | |
519 | std::vector<int> uses(program.size()); |
520 | for (const Instruction& inst : program) { |
521 | for (Val arg : {inst.x, inst.y, inst.z}) { |
522 | if (arg != NA) { uses[arg]++; } |
523 | } |
524 | } |
525 | |
526 | std::vector<Val> new_id(program.size(), NA); |
527 | Val next = (Val)program.size(); |
528 | auto reorder = [&](Val id, auto& recurse) -> void { |
529 | new_id[id] = --next; |
530 | const Instruction& inst = program[id]; |
531 | for (Val arg : {inst.x, inst.y, inst.z}) { |
532 | if (arg != NA && --uses[arg] == 0) { |
533 | recurse(arg, recurse); |
534 | } |
535 | } |
536 | }; |
537 | |
538 | for (Val id = 0; id < (Val)program.size(); id++) { |
539 | if (has_side_effect(program[id].op)) { |
540 | reorder(id, reorder); |
541 | } |
542 | } |
543 | |
544 | // Remap each Instruction's arguments to their new IDs. |
545 | for (Instruction& inst : program) { |
546 | for (Val* arg : {&inst.x, &inst.y, &inst.z}) { |
547 | if (*arg != NA) { |
548 | *arg = new_id[*arg]; |
549 | SkASSERT(*arg != NA); |
550 | } |
551 | } |
552 | } |
553 | |
        // Finally, reorder the Instructions themselves according to the new schedule.
        // This is O(N): we apply the permutation in place by chasing its cycles,
        // and each swap settles at least one Instruction into its final slot.
556 | for (Val id = 0; id < (Val)program.size(); id++) { |
557 | while (id != new_id[id]) { |
558 | std::swap(program[id], program[new_id[id]]); |
559 | std::swap( new_id[id], new_id[new_id[id]]); |
560 | } |
561 | } |
562 | |
563 | return program; |
564 | } |
565 | |
566 | std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) { |
567 | std::vector<OptimizedInstruction> optimized(program.size()); |
568 | for (Val id = 0; id < (Val)program.size(); id++) { |
569 | Instruction inst = program[id]; |
570 | optimized[id] = {inst.op, inst.x,inst.y,inst.z, inst.immy,inst.immz, |
571 | /*death=*/id, /*can_hoist=*/true}; |
572 | } |
573 | |
574 | // Each Instruction's inputs need to live at least until that Instruction issues. |
575 | for (Val id = 0; id < (Val)optimized.size(); id++) { |
576 | OptimizedInstruction& inst = optimized[id]; |
577 | for (Val arg : {inst.x, inst.y, inst.z}) { |
578 | // (We're walking in order, so this is the same as max()ing with the existing Val.) |
579 | if (arg != NA) { optimized[arg].death = id; } |
580 | } |
581 | } |
582 | |
583 | // Mark which values don't depend on the loop and can be hoisted. |
584 | for (OptimizedInstruction& inst : optimized) { |
585 | // Varying loads (and gathers) and stores cannot be hoisted out of the loop. |
586 | if (is_always_varying(inst.op)) { |
587 | inst.can_hoist = false; |
588 | } |
589 | |
590 | // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself. |
591 | if (inst.can_hoist) { |
592 | for (Val arg : {inst.x, inst.y, inst.z}) { |
593 | if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; } |
594 | } |
595 | } |
596 | } |
597 | |
598 | // Extend the lifetime of any hoisted value that's used in the loop to infinity. |
599 | for (OptimizedInstruction& inst : optimized) { |
600 | if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) { |
601 | for (Val arg : {inst.x, inst.y, inst.z}) { |
602 | if (arg != NA && optimized[arg].can_hoist) { |
603 | optimized[arg].death = (Val)program.size(); |
604 | } |
605 | } |
606 | } |
607 | } |
608 | |
609 | return optimized; |
610 | } |
611 | |
612 | std::vector<OptimizedInstruction> Builder::optimize() const { |
613 | std::vector<Instruction> program = this->program(); |
614 | program = eliminate_dead_code(std::move(program)); |
615 | program = schedule (std::move(program)); |
616 | return finalize (std::move(program)); |
617 | } |
618 | |
619 | Program Builder::done(const char* debug_name) const { |
620 | char buf[64] = "skvm-jit-" ; |
621 | if (!debug_name) { |
622 | *SkStrAppendU32(buf+9, this->hash()) = '\0'; |
623 | debug_name = buf; |
624 | } |
625 | |
626 | return {this->optimize(), fStrides, debug_name}; |
627 | } |
628 | |
629 | uint64_t Builder::hash() const { |
630 | uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0), |
631 | hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1); |
632 | return (uint64_t)lo | (uint64_t)hi << 32; |
633 | } |
634 | |
635 | bool operator==(const Instruction& a, const Instruction& b) { |
636 | return a.op == b.op |
637 | && a.x == b.x |
638 | && a.y == b.y |
639 | && a.z == b.z |
640 | && a.immy == b.immy |
641 | && a.immz == b.immz; |
642 | } |
643 | |
644 | uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const { |
645 | return SkOpts::hash(&inst, sizeof(inst), seed); |
646 | } |
647 | |
648 | |
649 | // Most instructions produce a value and return it by ID, |
650 | // the value-producing instruction's own index in the program vector. |
651 | Val Builder::push(Instruction inst) { |
652 | // Basic common subexpression elimination: |
653 | // if we've already seen this exact Instruction, use it instead of creating a new one. |
654 | if (Val* id = fIndex.find(inst)) { |
655 | return *id; |
656 | } |
657 | Val id = static_cast<Val>(fProgram.size()); |
658 | fProgram.push_back(inst); |
659 | fIndex.set(inst, id); |
660 | return id; |
661 | } |
662 | |
663 | bool Builder::allImm() const { return true; } |
664 | |
665 | template <typename T, typename... Rest> |
666 | bool Builder::allImm(Val id, T* imm, Rest... rest) const { |
667 | if (fProgram[id].op == Op::splat) { |
668 | static_assert(sizeof(T) == 4); |
669 | memcpy(imm, &fProgram[id].immy, 4); |
670 | return this->allImm(rest...); |
671 | } |
672 | return false; |
673 | } |
674 | |
675 | Arg Builder::arg(int stride) { |
676 | int ix = (int)fStrides.size(); |
677 | fStrides.push_back(stride); |
678 | return {ix}; |
679 | } |
680 | |
681 | void Builder::assert_true(I32 cond, I32 debug) { |
682 | #ifdef SK_DEBUG |
683 | int imm; |
684 | if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; } |
685 | (void)push(Op::assert_true, cond.id,debug.id,NA); |
686 | #endif |
687 | } |
688 | |
689 | void Builder::store8 (Arg ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA, ptr.ix); } |
690 | void Builder::store16(Arg ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA, ptr.ix); } |
691 | void Builder::store32(Arg ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA, ptr.ix); } |
692 | void Builder::store64(Arg ptr, I32 lo, I32 hi) { |
693 | (void)push(Op::store64, lo.id,hi.id,NA, NA,ptr.ix); |
694 | } |
695 | void Builder::store128(Arg ptr, I32 lo, I32 hi, int lane) { |
696 | (void)push(Op::store128, lo.id,hi.id,NA, NA,(ptr.ix<<1)|(lane&1)); |
697 | } |
698 | |
699 | I32 Builder::index() { return {this, push(Op::index , NA,NA,NA,0) }; } |
700 | |
701 | I32 Builder::load8 (Arg ptr) { return {this, push(Op::load8 , NA,NA,NA, ptr.ix) }; } |
702 | I32 Builder::load16(Arg ptr) { return {this, push(Op::load16, NA,NA,NA, ptr.ix) }; } |
703 | I32 Builder::load32(Arg ptr) { return {this, push(Op::load32, NA,NA,NA, ptr.ix) }; } |
704 | I32 Builder::load64(Arg ptr, int lane) { |
705 | return {this, push(Op::load64 , NA,NA,NA, ptr.ix,lane) }; |
706 | } |
707 | I32 Builder::load128(Arg ptr, int lane) { |
708 | return {this, push(Op::load128, NA,NA,NA, ptr.ix,lane) }; |
709 | } |
710 | |
711 | I32 Builder::gather8 (Arg ptr, int offset, I32 index) { |
712 | return {this, push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)}; |
713 | } |
714 | I32 Builder::gather16(Arg ptr, int offset, I32 index) { |
715 | return {this, push(Op::gather16, index.id,NA,NA, ptr.ix,offset)}; |
716 | } |
717 | I32 Builder::gather32(Arg ptr, int offset, I32 index) { |
718 | return {this, push(Op::gather32, index.id,NA,NA, ptr.ix,offset)}; |
719 | } |
720 | |
721 | I32 Builder::uniform8(Arg ptr, int offset) { |
722 | return {this, push(Op::uniform8, NA,NA,NA, ptr.ix, offset)}; |
723 | } |
724 | I32 Builder::uniform16(Arg ptr, int offset) { |
725 | return {this, push(Op::uniform16, NA,NA,NA, ptr.ix, offset)}; |
726 | } |
727 | I32 Builder::uniform32(Arg ptr, int offset) { |
728 | return {this, push(Op::uniform32, NA,NA,NA, ptr.ix, offset)}; |
729 | } |
730 | |
731 | // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern. |
732 | I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA, n) }; } |
733 | F32 Builder::splat(float f) { |
734 | int bits; |
735 | memcpy(&bits, &f, 4); |
736 | return {this, push(Op::splat, NA,NA,NA, bits)}; |
737 | } |
738 | |
739 | bool fma_supported() { |
740 | static const bool supported = |
741 | #if defined(SK_CPU_X86) |
742 | SkCpu::Supports(SkCpu::HSW); |
743 | #elif defined(SK_CPU_ARM64) |
744 | true; |
745 | #else |
746 | false; |
747 | #endif |
748 | return supported; |
749 | } |
750 | |
751 | // Be careful peepholing float math! Transformations you might expect to |
752 | // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0. |
753 | // Float peepholes must pass this equivalence test for all ~4B floats: |
754 | // |
755 | // bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); } |
756 | // |
757 | // unsigned bits = 0; |
758 | // do { |
759 | // float f; |
760 | // memcpy(&f, &bits, 4); |
761 | // if (!equiv(f, ...)) { |
762 | // abort(); |
763 | // } |
764 | // } while (++bits != 0); |
765 | |
766 | F32 Builder::add(F32 x, F32 y) { |
767 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); } |
768 | if (this->isImm(y.id, 0.0f)) { return x; } // x+0 == x |
769 | if (this->isImm(x.id, 0.0f)) { return y; } // 0+y == y |
770 | |
771 | if (fma_supported()) { |
772 | if (fProgram[x.id].op == Op::mul_f32) { |
773 | return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; |
774 | } |
775 | if (fProgram[y.id].op == Op::mul_f32) { |
776 | return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; |
777 | } |
778 | } |
779 | return {this, this->push(Op::add_f32, x.id, y.id)}; |
780 | } |
781 | |
782 | F32 Builder::sub(F32 x, F32 y) { |
783 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); } |
784 | if (this->isImm(y.id, 0.0f)) { return x; } // x-0 == x |
785 | if (fma_supported()) { |
786 | if (fProgram[x.id].op == Op::mul_f32) { |
787 | return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; |
788 | } |
789 | if (fProgram[y.id].op == Op::mul_f32) { |
790 | return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; |
791 | } |
792 | } |
793 | return {this, this->push(Op::sub_f32, x.id, y.id)}; |
794 | } |
795 | |
796 | F32 Builder::mul(F32 x, F32 y) { |
797 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); } |
798 | if (this->isImm(y.id, 1.0f)) { return x; } // x*1 == x |
799 | if (this->isImm(x.id, 1.0f)) { return y; } // 1*y == y |
800 | return {this, this->push(Op::mul_f32, x.id, y.id)}; |
801 | } |
802 | |
803 | F32 Builder::div(F32 x, F32 y) { |
804 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X/Y); } |
805 | if (this->isImm(y.id, 1.0f)) { return x; } // x/1 == x |
806 | return {this, this->push(Op::div_f32, x.id, y.id)}; |
807 | } |
808 | |
809 | F32 Builder::sqrt(F32 x) { |
810 | if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); } |
811 | return {this, this->push(Op::sqrt_f32, x.id,NA,NA)}; |
812 | } |
813 | |
814 | // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. |
815 | F32 Builder::approx_log2(F32 x) { |
816 | // e - 127 is a fair approximation of log2(x) in its own right... |
817 | F32 e = mul(to_f32(bit_cast(x)), splat(1.0f / (1<<23))); |
818 | |
819 | // ... but using the mantissa to refine its error is _much_ better. |
820 | F32 m = bit_cast(bit_or(bit_and(bit_cast(x), 0x007fffff), |
821 | 0x3f000000)); |
822 | F32 approx = sub(e, 124.225514990f); |
823 | approx = sub(approx, mul(1.498030302f, m)); |
824 | approx = sub(approx, div(1.725879990f, add(0.3520887068f, m))); |
825 | |
826 | return approx; |
827 | } |
828 | |
829 | F32 Builder::approx_pow2(F32 x) { |
830 | F32 f = fract(x); |
831 | F32 approx = add(x, 121.274057500f); |
832 | approx = sub(approx, mul( 1.490129070f, f)); |
833 | approx = add(approx, div(27.728023300f, sub(4.84252568f, f))); |
834 | |
835 | return bit_cast(round(mul(1.0f * (1<<23), approx))); |
836 | } |
837 | |
838 | F32 Builder::approx_powf(F32 x, F32 y) { |
839 | // TODO: assert this instead? Sometimes x is very slightly negative. See skia:10210. |
840 | x = max(0.0f, x); |
841 | |
842 | auto is_x = bit_or(eq(x, 0.0f), |
843 | eq(x, 1.0f)); |
844 | return select(is_x, x, approx_pow2(mul(approx_log2(x), y))); |
845 | } |
846 | |
    // Bhaskara I's sine approximation
    // 16x(pi - x) / (5*pi^2 - 4x(pi - x))
    // ... divide numerator and denominator by 4
    // 4x(pi - x) / (5*pi^2/4 - x(pi - x))
851 | // |
852 | // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get |
853 | // radians into that range first. |
854 | // |
855 | F32 Builder::approx_sin(F32 radians) { |
856 | constexpr float Pi = SK_ScalarPI; |
857 | // x = radians mod 2pi |
858 | F32 x = fract(radians * (0.5f/Pi)) * (2*Pi); |
        I32 neg = x > Pi;   // when pi < x < 2pi, we need to negate the result
860 | x = select(neg, x - Pi, x); |
861 | |
862 | F32 pair = x * (Pi - x); |
863 | x = 4.0f * pair / ((5*Pi*Pi/4) - pair); |
864 | x = select(neg, -x, x); |
865 | return x; |
866 | } |
867 | |
868 | /* "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION" |
869 | https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf |
870 | |
871 | approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9 |
872 | |
873 | Some simplifications: |
       1. tan(x) is periodic with period PI, so we only need to handle -PI/2 < x < PI/2
875 | 2. tan(x) is odd, so tan(-x) = -tan(x) |
876 | 3. Our polynomial approximation is best near zero, so we use the following identity |
877 | tan(x) + tan(y) |
878 | tan(x + y) = ----------------- |
879 | 1 - tan(x)*tan(y) |
880 | tan(PI/4) = 1 |
881 | |
882 | So for x > PI/8, we do the following refactor: |
883 | x' = x - PI/4 |
884 | |
885 | 1 + tan(x') |
886 | tan(x) = ------------ |
887 | 1 - tan(x') |
888 | */ |
889 | F32 Builder::approx_tan(F32 x) { |
890 | constexpr float Pi = SK_ScalarPI; |
891 | // periodic between -pi/2 ... pi/2 |
892 | // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back |
893 | x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2); |
894 | |
895 | I32 neg = (x < 0.0f); |
896 | x = select(neg, -x, x); |
897 | |
898 | // minimize total error by shifting if x > pi/8 |
899 | I32 use_quotient = (x > (Pi/8)); |
900 | x = select(use_quotient, x - (Pi/4), x); |
901 | |
902 | // 9th order poly = 4th order(x^2) * x |
903 | x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x; |
904 | x = select(use_quotient, (1+x)/(1-x), x); |
905 | x = select(neg, -x, x); |
906 | return x; |
907 | } |
908 | |
909 | // http://mathforum.org/library/drmath/view/54137.html |
910 | // referencing Handbook of Mathematical Functions, |
911 | // by Milton Abramowitz and Irene Stegun |
912 | F32 Builder::approx_asin(F32 x) { |
913 | I32 neg = (x < 0.0f); |
914 | x = select(neg, -x, x); |
915 | x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f); |
916 | x = select(neg, -x, x); |
917 | return x; |
918 | } |
919 | |
920 | /* Use 4th order polynomial approximation from https://arachnoid.com/polysolve/ |
921 | * with 129 values of x,atan(x) for x:[0...1] |
922 | * This only works for 0 <= x <= 1 |
923 | */ |
924 | static F32 approx_atan_unit(F32 x) { |
925 | // for now we might be given NaN... let that through |
926 | x->assert_true((x != x) | ((x >= 0) & (x <= 1))); |
927 | return poly(x, 0.14130025741326729f, |
928 | -0.34312835980675116f, |
929 | -0.016172900528248768f, |
930 | 1.0037696976200385f, |
931 | -0.00014758242182738969f); |
932 | } |
933 | |
934 | /* Use identity atan(x) = pi/2 - atan(1/x) for x > 1 |
935 | */ |
936 | F32 Builder::approx_atan(F32 x) { |
937 | I32 neg = (x < 0.0f); |
938 | x = select(neg, -x, x); |
939 | I32 flip = (x > 1.0f); |
940 | x = select(flip, 1/x, x); |
941 | x = approx_atan_unit(x); |
942 | x = select(flip, SK_ScalarPI/2 - x, x); |
943 | x = select(neg, -x, x); |
944 | return x; |
945 | } |
946 | |
947 | /* Use identity atan(x) = pi/2 - atan(1/x) for x > 1 |
948 | * By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit() |
949 | * which avoids a 2nd divide instruction if we had instead called atan(). |
950 | */ |
951 | F32 Builder::approx_atan2(F32 y0, F32 x0) { |
952 | |
953 | I32 flip = (abs(y0) > abs(x0)); |
954 | F32 y = select(flip, x0, y0); |
955 | F32 x = select(flip, y0, x0); |
956 | F32 arg = y/x; |
957 | |
958 | I32 neg = (arg < 0.0f); |
959 | arg = select(neg, -arg, arg); |
960 | |
961 | F32 r = approx_atan_unit(arg); |
962 | r = select(flip, SK_ScalarPI/2 - r, r); |
963 | r = select(neg, -r, r); |
964 | |
965 | // handle quadrant distinctions |
966 | r = select((y0 >= 0) & (x0 < 0), r + SK_ScalarPI, r); |
967 | r = select((y0 < 0) & (x0 <= 0), r - SK_ScalarPI, r); |
968 | // Note: we don't try to handle 0,0 or infinities (yet) |
969 | return r; |
970 | } |
971 | |
972 | F32 Builder::min(F32 x, F32 y) { |
973 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); } |
974 | return {this, this->push(Op::min_f32, x.id, y.id)}; |
975 | } |
976 | F32 Builder::max(F32 x, F32 y) { |
977 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); } |
978 | return {this, this->push(Op::max_f32, x.id, y.id)}; |
979 | } |
980 | |
981 | I32 Builder::add(I32 x, I32 y) { |
982 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); } |
983 | if (this->isImm(x.id, 0)) { return y; } |
984 | if (this->isImm(y.id, 0)) { return x; } |
985 | return {this, this->push(Op::add_i32, x.id, y.id)}; |
986 | } |
987 | I32 Builder::sub(I32 x, I32 y) { |
988 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); } |
989 | if (this->isImm(y.id, 0)) { return x; } |
990 | return {this, this->push(Op::sub_i32, x.id, y.id)}; |
991 | } |
992 | I32 Builder::mul(I32 x, I32 y) { |
993 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); } |
994 | if (this->isImm(x.id, 0)) { return splat(0); } |
995 | if (this->isImm(y.id, 0)) { return splat(0); } |
996 | if (this->isImm(x.id, 1)) { return y; } |
997 | if (this->isImm(y.id, 1)) { return x; } |
998 | return {this, this->push(Op::mul_i32, x.id, y.id)}; |
999 | } |
1000 | |
1001 | I32 Builder::shl(I32 x, int bits) { |
1002 | if (bits == 0) { return x; } |
1003 | if (int X; this->allImm(x.id,&X)) { return splat(X << bits); } |
1004 | return {this, this->push(Op::shl_i32, x.id,NA,NA, bits)}; |
1005 | } |
1006 | I32 Builder::shr(I32 x, int bits) { |
1007 | if (bits == 0) { return x; } |
1008 | if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); } |
1009 | return {this, this->push(Op::shr_i32, x.id,NA,NA, bits)}; |
1010 | } |
1011 | I32 Builder::sra(I32 x, int bits) { |
1012 | if (bits == 0) { return x; } |
1013 | if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); } |
1014 | return {this, this->push(Op::sra_i32, x.id,NA,NA, bits)}; |
1015 | } |
1016 | |
1017 | I32 Builder:: eq(F32 x, F32 y) { |
1018 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); } |
1019 | return {this, this->push(Op::eq_f32, x.id, y.id)}; |
1020 | } |
1021 | I32 Builder::neq(F32 x, F32 y) { |
1022 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); } |
1023 | return {this, this->push(Op::neq_f32, x.id, y.id)}; |
1024 | } |
1025 | I32 Builder::lt(F32 x, F32 y) { |
1026 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); } |
1027 | return {this, this->push(Op::gt_f32, y.id, x.id)}; |
1028 | } |
1029 | I32 Builder::lte(F32 x, F32 y) { |
1030 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); } |
1031 | return {this, this->push(Op::gte_f32, y.id, x.id)}; |
1032 | } |
1033 | I32 Builder::gt(F32 x, F32 y) { |
1034 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); } |
1035 | return {this, this->push(Op::gt_f32, x.id, y.id)}; |
1036 | } |
1037 | I32 Builder::gte(F32 x, F32 y) { |
1038 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); } |
1039 | return {this, this->push(Op::gte_f32, x.id, y.id)}; |
1040 | } |
1041 | |
1042 | I32 Builder:: eq(I32 x, I32 y) { |
1043 | if (x.id == y.id) { return splat(~0); } |
1044 | return {this, this->push(Op:: eq_i32, x.id, y.id)}; |
1045 | } |
1046 | I32 Builder::neq(I32 x, I32 y) { |
1047 | return ~(x == y); |
1048 | } |
1049 | I32 Builder:: gt(I32 x, I32 y) { |
1050 | return {this, this->push(Op:: gt_i32, x.id, y.id)}; |
1051 | } |
1052 | I32 Builder::gte(I32 x, I32 y) { |
1053 | if (x.id == y.id) { return splat(~0); } |
1054 | return ~(x < y); |
1055 | } |
1056 | I32 Builder:: lt(I32 x, I32 y) { return y>x; } |
1057 | I32 Builder::lte(I32 x, I32 y) { return y>=x; } |
1058 | |
1059 | I32 Builder::bit_and(I32 x, I32 y) { |
1060 | if (x.id == y.id) { return x; } |
1061 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); } |
1062 | if (this->isImm(y.id, 0)) { return splat(0); } // (x & false) == false |
1063 | if (this->isImm(x.id, 0)) { return splat(0); } // (false & y) == false |
1064 | if (this->isImm(y.id,~0)) { return x; } // (x & true) == x |
1065 | if (this->isImm(x.id,~0)) { return y; } // (true & y) == y |
1066 | return {this, this->push(Op::bit_and, x.id, y.id)}; |
1067 | } |
1068 | I32 Builder::bit_or(I32 x, I32 y) { |
1069 | if (x.id == y.id) { return x; } |
1070 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); } |
1071 | if (this->isImm(y.id, 0)) { return x; } // (x | false) == x |
1072 | if (this->isImm(x.id, 0)) { return y; } // (false | y) == y |
1073 | if (this->isImm(y.id,~0)) { return splat(~0); } // (x | true) == true |
1074 | if (this->isImm(x.id,~0)) { return splat(~0); } // (true | y) == true |
1075 | return {this, this->push(Op::bit_or, x.id, y.id)}; |
1076 | } |
1077 | I32 Builder::bit_xor(I32 x, I32 y) { |
1078 | if (x.id == y.id) { return splat(0); } |
1079 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); } |
1080 | if (this->isImm(y.id, 0)) { return x; } // (x ^ false) == x |
1081 | if (this->isImm(x.id, 0)) { return y; } // (false ^ y) == y |
1082 | return {this, this->push(Op::bit_xor, x.id, y.id)}; |
1083 | } |
1084 | |
1085 | I32 Builder::bit_clear(I32 x, I32 y) { |
1086 | if (x.id == y.id) { return splat(0); } |
1087 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); } |
1088 | if (this->isImm(y.id, 0)) { return x; } // (x & ~false) == x |
1089 | if (this->isImm(y.id,~0)) { return splat(0); } // (x & ~true) == false |
1090 | if (this->isImm(x.id, 0)) { return splat(0); } // (false & ~y) == false |
1091 | return {this, this->push(Op::bit_clear, x.id, y.id)}; |
1092 | } |
1093 | |
1094 | I32 Builder::select(I32 x, I32 y, I32 z) { |
1095 | if (y.id == z.id) { return y; } |
1096 | if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); } |
1097 | if (this->isImm(x.id,~0)) { return y; } // true ? y : z == y |
1098 | if (this->isImm(x.id, 0)) { return z; } // false ? y : z == z |
1099 | if (this->isImm(y.id, 0)) { return bit_clear(z,x); } // x ? 0 : z == ~x&z |
1100 | if (this->isImm(z.id, 0)) { return bit_and (y,x); } // x ? y : 0 == x&y |
1101 | return {this, this->push(Op::select, x.id, y.id, z.id)}; |
1102 | } |
1103 | |
    I32 Builder::extract(I32 x, int bits, I32 z) {
1105 | if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); } |
1106 | return bit_and(z, shr(x, bits)); |
1107 | } |
1108 | |
1109 | I32 Builder::pack(I32 x, I32 y, int bits) { |
1110 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|(Y<<bits)); } |
1111 | return {this, this->push(Op::pack, x.id,y.id,NA, 0,bits)}; |
1112 | } |
1113 | |
1114 | F32 Builder::ceil(F32 x) { |
1115 | if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); } |
1116 | return {this, this->push(Op::ceil, x.id)}; |
1117 | } |
1118 | F32 Builder::floor(F32 x) { |
1119 | if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); } |
1120 | return {this, this->push(Op::floor, x.id)}; |
1121 | } |
1122 | F32 Builder::to_f32(I32 x) { |
1123 | if (int X; this->allImm(x.id,&X)) { return splat((float)X); } |
1124 | return {this, this->push(Op::to_f32, x.id)}; |
1125 | } |
1126 | I32 Builder::trunc(F32 x) { |
1127 | if (float X; this->allImm(x.id,&X)) { return splat((int)X); } |
1128 | return {this, this->push(Op::trunc, x.id)}; |
1129 | } |
1130 | I32 Builder::round(F32 x) { |
1131 | if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); } |
1132 | return {this, this->push(Op::round, x.id)}; |
1133 | } |
1134 | |
1135 | I32 Builder::to_half(F32 x) { |
1136 | if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); } |
1137 | return {this, this->push(Op::to_half, x.id)}; |
1138 | } |
1139 | F32 Builder::from_half(I32 x) { |
1140 | if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); } |
1141 | return {this, this->push(Op::from_half, x.id)}; |
1142 | } |
1143 | |
1144 | F32 Builder::from_unorm(int bits, I32 x) { |
1145 | F32 limit = splat(1 / ((1<<bits)-1.0f)); |
1146 | return mul(to_f32(x), limit); |
1147 | } |
1148 | I32 Builder::to_unorm(int bits, F32 x) { |
1149 | F32 limit = splat((1<<bits)-1.0f); |
1150 | return round(mul(x, limit)); |
1151 | } |
1152 | |
1153 | bool SkColorType_to_PixelFormat(SkColorType ct, PixelFormat* f) { |
1154 | auto UNORM = PixelFormat::UNORM, |
1155 | FLOAT = PixelFormat::FLOAT; |
1156 | switch (ct) { |
1157 | case kUnknown_SkColorType: SkASSERT(false); return false; |
1158 | |
1159 | case kRGBA_F32_SkColorType: *f = {FLOAT,32,32,32,32, 0,32,64,96}; return true; |
1160 | |
1161 | case kRGBA_F16Norm_SkColorType: *f = {FLOAT,16,16,16,16, 0,16,32,48}; return true; |
1162 | case kRGBA_F16_SkColorType: *f = {FLOAT,16,16,16,16, 0,16,32,48}; return true; |
1163 | case kR16G16B16A16_unorm_SkColorType: *f = {UNORM,16,16,16,16, 0,16,32,48}; return true; |
1164 | |
1165 | case kA16_float_SkColorType: *f = {FLOAT, 0, 0,0,16, 0, 0,0,0}; return true; |
1166 | case kR16G16_float_SkColorType: *f = {FLOAT, 16,16,0, 0, 0,16,0,0}; return true; |
1167 | |
1168 | case kAlpha_8_SkColorType: *f = {UNORM, 0,0,0,8, 0,0,0,0}; return true; |
            case kGray_8_SkColorType: *f = {UNORM, 8,8,8,0, 0,0,0,0}; return true; // Subtle: r,g,b all read the same byte.
1170 | |
1171 | case kRGB_565_SkColorType: *f = {UNORM, 5,6,5,0, 11,5,0,0}; return true; // (BGR) |
1172 | case kARGB_4444_SkColorType: *f = {UNORM, 4,4,4,4, 12,8,4,0}; return true; // (ABGR) |
1173 | |
1174 | case kRGBA_8888_SkColorType: *f = {UNORM, 8,8,8,8, 0,8,16,24}; return true; |
            case kRGB_888x_SkColorType: *f = {UNORM, 8,8,8,0, 0,8,16,32}; return true; // 4 bytes: a_shift=32 pads over the unused x byte.
1176 | case kBGRA_8888_SkColorType: *f = {UNORM, 8,8,8,8, 16,8, 0,24}; return true; |
1177 | |
1178 | case kRGBA_1010102_SkColorType: *f = {UNORM, 10,10,10,2, 0,10,20,30}; return true; |
1179 | case kBGRA_1010102_SkColorType: *f = {UNORM, 10,10,10,2, 20,10, 0,30}; return true; |
1180 | case kRGB_101010x_SkColorType: *f = {UNORM, 10,10,10,0, 0,10,20, 0}; return true; |
1181 | case kBGR_101010x_SkColorType: *f = {UNORM, 10,10,10,0, 20,10, 0, 0}; return true; |
1182 | |
1183 | case kR8G8_unorm_SkColorType: *f = {UNORM, 8, 8,0, 0, 0, 8,0,0}; return true; |
1184 | case kR16G16_unorm_SkColorType: *f = {UNORM, 16,16,0, 0, 0,16,0,0}; return true; |
1185 | case kA16_unorm_SkColorType: *f = {UNORM, 0, 0,0,16, 0, 0,0,0}; return true; |
1186 | } |
1187 | return false; |
1188 | } |
1189 | |
1190 | static int byte_size(PixelFormat f) { |
1191 | // What's the highest bit we read? |
1192 | int bits = std::max(f.r_bits + f.r_shift, |
1193 | std::max(f.g_bits + f.g_shift, |
1194 | std::max(f.b_bits + f.b_shift, |
1195 | f.a_bits + f.a_shift))); |
1196 | // Round up to bytes. |
1197 | return (bits + 7) / 8; |
1198 | } |
1199 | |
1200 | static Color unpack(PixelFormat f, I32 x) { |
1201 | SkASSERT(byte_size(f) <= 4); |
1202 | auto unpack_channel = [=](int bits, int shift) { |
1203 | I32 channel = extract(x, shift, (1<<bits)-1); |
1204 | switch (f.encoding) { |
1205 | case PixelFormat::UNORM: return from_unorm(bits, channel); |
1206 | case PixelFormat::FLOAT: return from_half ( channel); |
1207 | } |
1208 | SkUNREACHABLE; |
1209 | }; |
1210 | return { |
1211 | f.r_bits ? unpack_channel(f.r_bits, f.r_shift) : x->splat(0.0f), |
1212 | f.g_bits ? unpack_channel(f.g_bits, f.g_shift) : x->splat(0.0f), |
1213 | f.b_bits ? unpack_channel(f.b_bits, f.b_shift) : x->splat(0.0f), |
1214 | f.a_bits ? unpack_channel(f.a_bits, f.a_shift) : x->splat(1.0f), |
1215 | }; |
1216 | } |
1217 | |
1218 | static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) { |
1219 | SkASSERT(byte_size(f) == 8); |
1220 | // We assume some of the channels are in the low 32 bits, some in the high 32 bits. |
1221 | // The assert on byte_size(lo) will trigger if this assumption is violated. |
1222 | *lo = f; |
1223 | if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; } |
1224 | if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; } |
1225 | if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; } |
1226 | if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; } |
1227 | SkASSERT(byte_size(*lo) == 4); |
1228 | |
1229 | *hi = f; |
1230 | if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; } |
1231 | if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; } |
1232 | if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; } |
1233 | if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; } |
1234 | SkASSERT(byte_size(*hi) == 4); |
1235 | } |
1236 | |
1237 | // The only 16-byte format we support today is RGBA F32, |
1238 | // though, TODO, we could generalize that to any swizzle, and to allow UNORM too. |
1239 | static void assert_16byte_is_rgba_f32(PixelFormat f) { |
1240 | #if defined(SK_DEBUG) |
1241 | SkASSERT(byte_size(f) == 16); |
1242 | PixelFormat rgba_f32; |
1243 | SkAssertResult(SkColorType_to_PixelFormat(kRGBA_F32_SkColorType, &rgba_f32)); |
1244 | |
1245 | SkASSERT(f.encoding == rgba_f32.encoding); |
1246 | |
1247 | SkASSERT(f.r_bits == rgba_f32.r_bits); |
1248 | SkASSERT(f.g_bits == rgba_f32.g_bits); |
1249 | SkASSERT(f.b_bits == rgba_f32.b_bits); |
1250 | SkASSERT(f.a_bits == rgba_f32.a_bits); |
1251 | |
1252 | SkASSERT(f.r_shift == rgba_f32.r_shift); |
1253 | SkASSERT(f.g_shift == rgba_f32.g_shift); |
1254 | SkASSERT(f.b_shift == rgba_f32.b_shift); |
1255 | SkASSERT(f.a_shift == rgba_f32.a_shift); |
1256 | #endif |
1257 | } |
1258 | |
1259 | Color Builder::load(PixelFormat f, Arg ptr) { |
1260 | switch (byte_size(f)) { |
1261 | case 1: return unpack(f, load8 (ptr)); |
1262 | case 2: return unpack(f, load16(ptr)); |
1263 | case 4: return unpack(f, load32(ptr)); |
1264 | case 8: { |
1265 | PixelFormat lo,hi; |
1266 | split_disjoint_8byte_format(f, &lo,&hi); |
1267 | Color l = unpack(lo, load64(ptr, 0)), |
1268 | h = unpack(hi, load64(ptr, 1)); |
1269 | return { |
1270 | lo.r_bits ? l.r : h.r, |
1271 | lo.g_bits ? l.g : h.g, |
1272 | lo.b_bits ? l.b : h.b, |
1273 | lo.a_bits ? l.a : h.a, |
1274 | }; |
1275 | } |
1276 | case 16: { |
1277 | assert_16byte_is_rgba_f32(f); |
1278 | return { |
1279 | bit_cast(load128(ptr, 0)), |
1280 | bit_cast(load128(ptr, 1)), |
1281 | bit_cast(load128(ptr, 2)), |
1282 | bit_cast(load128(ptr, 3)), |
1283 | }; |
1284 | } |
1285 | default: SkUNREACHABLE; |
1286 | } |
1287 | return {}; |
1288 | } |
1289 | |
1290 | Color Builder::gather(PixelFormat f, Arg ptr, int offset, I32 index) { |
1291 | switch (byte_size(f)) { |
1292 | case 1: return unpack(f, gather8 (ptr, offset, index)); |
1293 | case 2: return unpack(f, gather16(ptr, offset, index)); |
1294 | case 4: return unpack(f, gather32(ptr, offset, index)); |
1295 | case 8: { |
1296 | PixelFormat lo,hi; |
1297 | split_disjoint_8byte_format(f, &lo,&hi); |
1298 | Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)), |
1299 | h = unpack(hi, gather32(ptr, offset, (index<<1)+1)); |
1300 | return { |
1301 | lo.r_bits ? l.r : h.r, |
1302 | lo.g_bits ? l.g : h.g, |
1303 | lo.b_bits ? l.b : h.b, |
1304 | lo.a_bits ? l.a : h.a, |
1305 | }; |
1306 | } |
1307 | case 16: { |
1308 | assert_16byte_is_rgba_f32(f); |
1309 | return { |
1310 | gatherF(ptr, offset, (index<<2)+0), |
1311 | gatherF(ptr, offset, (index<<2)+1), |
1312 | gatherF(ptr, offset, (index<<2)+2), |
1313 | gatherF(ptr, offset, (index<<2)+3), |
1314 | }; |
1315 | } |
1316 | default: SkUNREACHABLE; |
1317 | } |
1318 | return {}; |
1319 | } |
1320 | |
1321 | static I32 pack32(PixelFormat f, Color c) { |
1322 | SkASSERT(byte_size(f) <= 4); |
1323 | I32 packed = c->splat(0); |
1324 | auto pack_channel = [&](F32 channel, int bits, int shift) { |
1325 | I32 encoded; |
1326 | switch (f.encoding) { |
1327 | case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break; |
1328 | case PixelFormat::FLOAT: encoded = to_half ( channel); break; |
1329 | } |
1330 | packed = pack(packed, encoded, shift); |
1331 | }; |
1332 | if (f.r_bits) { pack_channel(c.r, f.r_bits, f.r_shift); } |
1333 | if (f.g_bits) { pack_channel(c.g, f.g_bits, f.g_shift); } |
1334 | if (f.b_bits) { pack_channel(c.b, f.b_bits, f.b_shift); } |
1335 | if (f.a_bits) { pack_channel(c.a, f.a_bits, f.a_shift); } |
1336 | return packed; |
1337 | } |
1338 | |
1339 | bool Builder::store(PixelFormat f, Arg ptr, Color c) { |
1340 | // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal. |
1341 | if (f.r_bits == f.g_bits && f.g_bits == f.b_bits && |
1342 | f.r_shift == f.g_shift && f.g_shift == f.b_shift) { |
1343 | |
1344 | // TODO: pull these coefficients from an SkColorSpace? This is sRGB luma/luminance. |
1345 | c.r = c.r * 0.2126f |
1346 | + c.g * 0.7152f |
1347 | + c.b * 0.0722f; |
1348 | f.g_bits = f.b_bits = 0; |
1349 | } |
1350 | |
1351 | switch (byte_size(f)) { |
1352 | case 1: store8 (ptr, pack32(f,c)); return true; |
1353 | case 2: store16(ptr, pack32(f,c)); return true; |
1354 | case 4: store32(ptr, pack32(f,c)); return true; |
1355 | case 8: { |
1356 | PixelFormat lo,hi; |
1357 | split_disjoint_8byte_format(f, &lo,&hi); |
1358 | store64(ptr, pack32(lo,c) |
1359 | , pack32(hi,c)); |
1360 | return true; |
1361 | } |
1362 | case 16: { |
1363 | assert_16byte_is_rgba_f32(f); |
1364 | store128(ptr, bit_cast(c.r), bit_cast(c.g), 0); |
1365 | store128(ptr, bit_cast(c.b), bit_cast(c.a), 1); |
1366 | return true; |
1367 | } |
1368 | default: SkUNREACHABLE; |
1369 | } |
1370 | return false; |
1371 | } |
1372 | |
1373 | void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) { |
1374 | skvm::F32 invA = 1.0f / a, |
1375 | inf = bit_cast(splat(0x7f800000)); |
        // If a is 0, so are *r,*g,*b; set invA to 0 so we compute 0*0 = 0 instead of 0*inf = NaN.
1377 | invA = select(invA < inf, invA |
1378 | , 0.0f); |
1379 | *r *= invA; |
1380 | *g *= invA; |
1381 | *b *= invA; |
1382 | } |
1383 | |
1384 | void Builder::premul(F32* r, F32* g, F32* b, F32 a) { |
1385 | *r *= a; |
1386 | *g *= a; |
1387 | *b *= a; |
1388 | } |
1389 | |
1390 | Color Builder::uniformPremul(SkColor4f color, SkColorSpace* src, |
1391 | Uniforms* uniforms, SkColorSpace* dst) { |
1392 | SkColorSpaceXformSteps(src, kUnpremul_SkAlphaType, |
1393 | dst, kPremul_SkAlphaType).apply(color.vec()); |
1394 | return { |
1395 | uniformF(uniforms->pushF(color.fR)), |
1396 | uniformF(uniforms->pushF(color.fG)), |
1397 | uniformF(uniforms->pushF(color.fB)), |
1398 | uniformF(uniforms->pushF(color.fA)), |
1399 | }; |
1400 | } |
1401 | |
1402 | F32 Builder::lerp(F32 lo, F32 hi, F32 t) { |
1403 | if (this->isImm(t.id, 0.0f)) { return lo; } |
1404 | if (this->isImm(t.id, 1.0f)) { return hi; } |
1405 | return mad(sub(hi, lo), t, lo); |
1406 | } |
1407 | |
1408 | Color Builder::lerp(Color lo, Color hi, F32 t) { |
1409 | return { |
1410 | lerp(lo.r, hi.r, t), |
1411 | lerp(lo.g, hi.g, t), |
1412 | lerp(lo.b, hi.b, t), |
1413 | lerp(lo.a, hi.a, t), |
1414 | }; |
1415 | } |
1416 | |
1417 | HSLA Builder::to_hsla(Color c) { |
1418 | F32 mx = max(max(c.r,c.g),c.b), |
1419 | mn = min(min(c.r,c.g),c.b), |
1420 | d = mx - mn, |
1421 | invd = 1.0f / d, |
1422 | g_lt_b = select(c.g < c.b, splat(6.0f) |
1423 | , splat(0.0f)); |
1424 | |
1425 | F32 h = (1/6.0f) * select(mx == mn, 0.0f, |
1426 | select(mx == c.r, invd * (c.g - c.b) + g_lt_b, |
1427 | select(mx == c.g, invd * (c.b - c.r) + 2.0f |
1428 | , invd * (c.r - c.g) + 4.0f))); |
1429 | |
1430 | F32 sum = mx + mn, |
1431 | l = sum * 0.5f, |
1432 | s = select(mx == mn, 0.0f |
1433 | , d / select(l > 0.5f, 2.0f - sum |
1434 | , sum)); |
1435 | return {h, s, l, c.a}; |
1436 | } |
1437 | |
1438 | Color Builder::to_rgba(HSLA c) { |
1439 | // See GrRGBToHSLFilterEffect.fp |
1440 | |
1441 | auto [h,s,l,a] = c; |
1442 | F32 x = s * (1.0f - abs(l + l - 1.0f)); |
1443 | |
1444 | auto hue_to_rgb = [&,l=l](auto hue) { |
1445 | auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f; |
1446 | return x * (clamp01(q) - 0.5f) + l; |
1447 | }; |
1448 | |
1449 | return { |
1450 | hue_to_rgb(h + 0/3.0f), |
1451 | hue_to_rgb(h + 2/3.0f), |
1452 | hue_to_rgb(h + 1/3.0f), |
        a,
1454 | }; |
1455 | } |
1456 | |
// We're basing our implementation of non-separable blend modes on
// https://www.w3.org/TR/compositing-1/#blendingnonseparable
// and
// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
// They're equivalent, but the ES spec's math is more fully simplified.
1462 | // |
1463 | // Anything extra we add beyond that is to make the math work with premul inputs. |
1464 | |
1465 | static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) { |
1466 | return max(r, max(g, b)) |
1467 | - min(r, min(g, b)); |
1468 | } |
1469 | |
1470 | static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) { |
1471 | return r*0.30f + g*0.59f + b*0.11f; |
1472 | } |
1473 | |
1474 | static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) { |
1475 | F32 mn = min(*r, min(*g, *b)), |
1476 | mx = max(*r, max(*g, *b)), |
1477 | sat = mx - mn; |
1478 | |
1479 | // Map min channel to 0, max channel to s, and scale the middle proportionally. |
1480 | auto scale = [&](skvm::F32 c) { |
1481 | auto scaled = ((c - mn) * s) / sat; |
1482 | return select(is_finite(scaled), scaled, 0.0f); |
1483 | }; |
1484 | *r = scale(*r); |
1485 | *g = scale(*g); |
1486 | *b = scale(*b); |
1487 | } |
1488 | |
1489 | static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) { |
1490 | auto diff = lu - luminance(*r, *g, *b); |
1491 | *r += diff; |
1492 | *g += diff; |
1493 | *b += diff; |
1494 | } |
1495 | |
1496 | static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) { |
1497 | F32 mn = min(*r, min(*g, *b)), |
1498 | mx = max(*r, max(*g, *b)), |
1499 | lu = luminance(*r, *g, *b); |
1500 | |
1501 | auto clip = [&](auto c) { |
1502 | c = select(mn >= 0, c |
1503 | , lu + ((c-lu)*( lu)) / (lu-mn)); |
1504 | c = select(mx > a, lu + ((c-lu)*(a-lu)) / (mx-lu) |
1505 | , c); |
1506 | return clamp01(c); // May be a little negative, or worse, NaN. |
1507 | }; |
1508 | *r = clip(*r); |
1509 | *g = clip(*g); |
1510 | *b = clip(*b); |
1511 | } |
1512 | |
1513 | Color Builder::blend(SkBlendMode mode, Color src, Color dst) { |
1514 | auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) { |
1515 | return x*y + z*w; |
1516 | }; |
1517 | |
1518 | auto two = [](skvm::F32 x) { return x+x; }; |
1519 | |
1520 | auto apply_rgba = [&](auto fn) { |
1521 | return Color { |
1522 | fn(src.r, dst.r), |
1523 | fn(src.g, dst.g), |
1524 | fn(src.b, dst.b), |
1525 | fn(src.a, dst.a), |
1526 | }; |
1527 | }; |
1528 | |
1529 | auto apply_rgb_srcover_a = [&](auto fn) { |
1530 | return Color { |
1531 | fn(src.r, dst.r), |
1532 | fn(src.g, dst.g), |
1533 | fn(src.b, dst.b), |
1534 | mad(dst.a, 1-src.a, src.a), // srcover for alpha |
1535 | }; |
1536 | }; |
1537 | |
1538 | auto non_sep = [&](auto R, auto G, auto B) { |
1539 | return Color{ |
1540 | R + mma(src.r, 1-dst.a, dst.r, 1-src.a), |
1541 | G + mma(src.g, 1-dst.a, dst.g, 1-src.a), |
1542 | B + mma(src.b, 1-dst.a, dst.b, 1-src.a), |
1543 | mad(dst.a, 1-src.a, src.a), // srcover for alpha |
1544 | }; |
1545 | }; |
1546 | |
1547 | switch (mode) { |
1548 | default: |
1549 | SkASSERT(false); |
            [[fallthrough]]; // SkASSERT is a no-op in release builds, so for safety fall through to kClear.
1551 | |
1552 | case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) }; |
1553 | |
1554 | case SkBlendMode::kSrc: return src; |
1555 | case SkBlendMode::kDst: return dst; |
1556 | |
1557 | case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]]; |
1558 | case SkBlendMode::kSrcOver: |
1559 | return apply_rgba([&](auto s, auto d) { |
1560 | return mad(d,1-src.a, s); |
1561 | }); |
1562 | |
1563 | case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]]; |
1564 | case SkBlendMode::kSrcIn: |
1565 | return apply_rgba([&](auto s, auto d) { |
1566 | return s * dst.a; |
1567 | }); |
1568 | |
1569 | case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]]; |
1570 | |
1571 | case SkBlendMode::kSrcOut: |
1572 | return apply_rgba([&](auto s, auto d) { |
1573 | return s * (1-dst.a); |
1574 | }); |
1575 | |
1576 | case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]]; |
1577 | case SkBlendMode::kSrcATop: |
1578 | return apply_rgba([&](auto s, auto d) { |
1579 | return mma(s, dst.a, d, 1-src.a); |
1580 | }); |
1581 | |
1582 | case SkBlendMode::kXor: |
1583 | return apply_rgba([&](auto s, auto d) { |
1584 | return mma(s, 1-dst.a, d, 1-src.a); |
1585 | }); |
1586 | |
1587 | case SkBlendMode::kPlus: |
1588 | return apply_rgba([&](auto s, auto d) { |
1589 | return min(s+d, 1.0f); |
1590 | }); |
1591 | |
1592 | case SkBlendMode::kModulate: |
1593 | return apply_rgba([&](auto s, auto d) { |
1594 | return s * d; |
1595 | }); |
1596 | |
1597 | case SkBlendMode::kScreen: |
1598 | // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts. |
            // It's kind of plausible that s + (d - s*d) keeps more precision?
1600 | return apply_rgba([&](auto s, auto d) { |
1601 | return s + (d - s*d); |
1602 | }); |
1603 | |
1604 | case SkBlendMode::kDarken: |
1605 | return apply_rgb_srcover_a([&](auto s, auto d) { |
1606 | return s + (d - max(s * dst.a, |
1607 | d * src.a)); |
1608 | }); |
1609 | |
1610 | case SkBlendMode::kLighten: |
1611 | return apply_rgb_srcover_a([&](auto s, auto d) { |
1612 | return s + (d - min(s * dst.a, |
1613 | d * src.a)); |
1614 | }); |
1615 | |
1616 | case SkBlendMode::kDifference: |
1617 | return apply_rgb_srcover_a([&](auto s, auto d) { |
1618 | return s + (d - two(min(s * dst.a, |
1619 | d * src.a))); |
1620 | }); |
1621 | |
1622 | case SkBlendMode::kExclusion: |
1623 | return apply_rgb_srcover_a([&](auto s, auto d) { |
1624 | return s + (d - two(s * d)); |
1625 | }); |
1626 | |
1627 | case SkBlendMode::kColorBurn: |
1628 | return apply_rgb_srcover_a([&](auto s, auto d) { |
1629 | auto mn = min(dst.a, |
1630 | src.a * (dst.a - d) / s), |
1631 | burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a); |
1632 | return select(d == dst.a , s * (1-dst.a) + d, |
1633 | select(is_finite(burn), burn |
1634 | , d * (1-src.a) + s)); |
1635 | }); |
1636 | |
1637 | case SkBlendMode::kColorDodge: |
1638 | return apply_rgb_srcover_a([&](auto s, auto d) { |
1639 | auto dodge = src.a * min(dst.a, |
1640 | d * src.a / (src.a - s)) |
1641 | + mma(s, 1-dst.a, d, 1-src.a); |
1642 | return select(d == 0.0f , s * (1-dst.a) + d, |
1643 | select(is_finite(dodge), dodge |
1644 | , d * (1-src.a) + s)); |
1645 | }); |
1646 | |
1647 | case SkBlendMode::kHardLight: |
1648 | return apply_rgb_srcover_a([&](auto s, auto d) { |
1649 | return mma(s, 1-dst.a, d, 1-src.a) + |
1650 | select(two(s) <= src.a, |
1651 | two(s * d), |
1652 | src.a * dst.a - two((dst.a - d) * (src.a - s))); |
1653 | }); |
1654 | |
1655 | case SkBlendMode::kOverlay: |
1656 | return apply_rgb_srcover_a([&](auto s, auto d) { |
1657 | return mma(s, 1-dst.a, d, 1-src.a) + |
1658 | select(two(d) <= dst.a, |
1659 | two(s * d), |
1660 | src.a * dst.a - two((dst.a - d) * (src.a - s))); |
1661 | }); |
1662 | |
1663 | case SkBlendMode::kMultiply: |
1664 | return apply_rgba([&](auto s, auto d) { |
1665 | return mma(s, 1-dst.a, d, 1-src.a) + s * d; |
1666 | }); |
1667 | |
1668 | case SkBlendMode::kSoftLight: |
1669 | return apply_rgb_srcover_a([&](auto s, auto d) { |
1670 | auto m = select(dst.a > 0.0f, d / dst.a |
1671 | , 0.0f), |
1672 | s2 = two(s), |
1673 | m4 = 4*m; |
1674 | |
1675 | // The logic forks three ways: |
1676 | // 1. dark src? |
1677 | // 2. light src, dark dst? |
1678 | // 3. light src, light dst? |
1679 | |
1680 | // Used in case 1 |
1681 | auto darkSrc = d * ((s2-src.a) * (1-m) + src.a), |
1682 | // Used in case 2 |
1683 | darkDst = (m4 * m4 + m4) * (m-1) + 7*m, |
1684 | // Used in case 3. |
1685 | liteDst = sqrt(m) - m, |
                // Used in cases 2 and 3.
1687 | liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst |
1688 | , liteDst) |
1689 | + d * src.a; |
1690 | return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc |
1691 | , liteSrc); |
1692 | }); |
1693 | |
1694 | case SkBlendMode::kHue: { |
1695 | skvm::F32 R = src.r * src.a, |
1696 | G = src.g * src.a, |
1697 | B = src.b * src.a; |
1698 | |
1699 | set_sat (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b)); |
1700 | set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b)); |
1701 | clip_color(&R, &G, &B, src.a * dst.a); |
1702 | |
1703 | return non_sep(R, G, B); |
1704 | } |
1705 | |
1706 | case SkBlendMode::kSaturation: { |
1707 | skvm::F32 R = dst.r * src.a, |
1708 | G = dst.g * src.a, |
1709 | B = dst.b * src.a; |
1710 | |
1711 | set_sat (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b)); |
1712 | set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b)); |
1713 | clip_color(&R, &G, &B, src.a * dst.a); |
1714 | |
1715 | return non_sep(R, G, B); |
1716 | } |
1717 | |
1718 | case SkBlendMode::kColor: { |
1719 | skvm::F32 R = src.r * dst.a, |
1720 | G = src.g * dst.a, |
1721 | B = src.b * dst.a; |
1722 | |
1723 | set_lum (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b)); |
1724 | clip_color(&R, &G, &B, src.a * dst.a); |
1725 | |
1726 | return non_sep(R, G, B); |
1727 | } |
1728 | |
1729 | case SkBlendMode::kLuminosity: { |
1730 | skvm::F32 R = dst.r * src.a, |
1731 | G = dst.g * src.a, |
1732 | B = dst.b * src.a; |
1733 | |
1734 | set_lum (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b)); |
1735 | clip_color(&R, &G, &B, dst.a * src.a); |
1736 | |
1737 | return non_sep(R, G, B); |
1738 | } |
1739 | } |
1740 | } |
1741 | |
1742 | // For a given program we'll store each Instruction's users contiguously in a table, |
1743 | // and track where each Instruction's span of users starts and ends in another index. |
1744 | // Here's a simple program that loads x and stores kx+k: |
1745 | // |
1746 | // v0 = splat(k) |
1747 | // v1 = load(...) |
1748 | // v2 = mul(v1, v0) |
1749 | // v3 = add(v2, v0) |
1750 | // v4 = store(..., v3) |
1751 | // |
1752 | // This program has 5 instructions v0-v4. |
1753 | // - v0 is used by v2 and v3 |
1754 | // - v1 is used by v2 |
1755 | // - v2 is used by v3 |
1756 | // - v3 is used by v4 |
1757 | // - v4 has a side-effect |
1758 | // |
1759 | // For this program we fill out these two arrays: |
1760 | // table: [v2,v3, v2, v3, v4] |
//    index: [0, 2, 3, 4, 5, 5]   (the extra trailing entry marks the end of v4's span)
1762 | // |
// The table is just those "is used by ..." lists from above, written out in order,
1764 | // and the index tracks where an Instruction's span of users starts, table[index[id]]. |
1765 | // The span continues up until the start of the next Instruction, table[index[id+1]]. |
1766 | SkSpan<const Val> Usage::operator[](Val id) const { |
1767 | int begin = fIndex[id]; |
1768 | int end = fIndex[id + 1]; |
1769 | return SkMakeSpan(fTable.data() + begin, end - begin); |
1770 | } |
1771 | |
1772 | Usage::Usage(const std::vector<Instruction>& program) { |
1773 | // uses[id] counts the number of times each Instruction is used. |
1774 | std::vector<int> uses(program.size(), 0); |
1775 | for (Val id = 0; id < (Val)program.size(); id++) { |
1776 | Instruction inst = program[id]; |
1777 | if (inst.x != NA) { ++uses[inst.x]; } |
1778 | if (inst.y != NA) { ++uses[inst.y]; } |
1779 | if (inst.z != NA) { ++uses[inst.z]; } |
1780 | } |
1781 | |
1782 | // Build our index into fTable, with an extra entry marking the final Instruction's end. |
1783 | fIndex.reserve(program.size() + 1); |
1784 | int total_uses = 0; |
1785 | for (int n : uses) { |
1786 | fIndex.push_back(total_uses); |
1787 | total_uses += n; |
1788 | } |
1789 | fIndex.push_back(total_uses); |
1790 | |
1791 | // Tick down each Instruction's uses to fill in fTable. |
1792 | fTable.resize(total_uses, NA); |
1793 | for (Val id = (Val)program.size(); id --> 0; ) { |
1794 | Instruction inst = program[id]; |
1795 | if (inst.x != NA) { fTable[fIndex[inst.x] + --uses[inst.x]] = id; } |
1796 | if (inst.y != NA) { fTable[fIndex[inst.y] + --uses[inst.y]] = id; } |
1797 | if (inst.z != NA) { fTable[fIndex[inst.z] + --uses[inst.z]] = id; } |
1798 | } |
1799 | for (int n : uses ) { (void)n; SkASSERT(n == 0 ); } |
1800 | for (Val id : fTable) { (void)id; SkASSERT(id != NA); } |
1801 | } |
1802 | |
1803 | // ~~~~ Program::eval() and co. ~~~~ // |
1804 | |
1805 | // Handy references for x86-64 instruction encoding: |
1806 | // https://wiki.osdev.org/X86-64_Instruction_Encoding |
1807 | // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm |
1808 | // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm |
1809 | // http://ref.x86asm.net/coder64.html |
1810 | |
1811 | // Used for ModRM / immediate instruction encoding. |
1812 | static uint8_t _233(int a, int b, int c) { |
1813 | return (a & 3) << 6 |
1814 | | (b & 7) << 3 |
1815 | | (c & 7) << 0; |
1816 | } |
1817 | |
1818 | // ModRM byte encodes the arguments of an opcode. |
1819 | enum class Mod { Indirect, OneByteImm, FourByteImm, Direct }; |
1820 | static uint8_t mod_rm(Mod mod, int reg, int rm) { |
1821 | return _233((int)mod, reg, rm); |
1822 | } |
1823 | |
1824 | static Mod mod(int imm) { |
1825 | if (imm == 0) { return Mod::Indirect; } |
1826 | if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; } |
1827 | return Mod::FourByteImm; |
1828 | } |
1829 | |
1830 | static int imm_bytes(Mod mod) { |
1831 | switch (mod) { |
1832 | case Mod::Indirect: return 0; |
1833 | case Mod::OneByteImm: return 1; |
1834 | case Mod::FourByteImm: return 4; |
1835 | case Mod::Direct: SkUNREACHABLE; |
1836 | } |
1837 | SkUNREACHABLE; |
1838 | } |
1839 | |
1840 | // SIB byte encodes a memory address, base + (index * scale). |
1841 | static uint8_t sib(Assembler::Scale scale, int index, int base) { |
1842 | return _233((int)scale, index, base); |
1843 | } |
1844 | |
1845 | // The REX prefix is used to extend most old 32-bit instructions to 64-bit. |
1846 | static uint8_t rex(bool W, // If set, operation is 64-bit, otherwise default, usually 32-bit. |
1847 | bool R, // Extra top bit to select ModRM reg, registers 8-15. |
1848 | bool X, // Extra top bit for SIB index register. |
1849 | bool B) { // Extra top bit for SIB base or ModRM rm register. |
1850 | return 0b01000000 // Fixed 0100 for top four bits. |
1851 | | (W << 3) |
1852 | | (R << 2) |
1853 | | (X << 1) |
1854 | | (B << 0); |
1855 | } |
1856 | |
1857 | |
1858 | // The VEX prefix extends SSE operations to AVX. Used generally, even with XMM. |
1859 | struct VEX { |
1860 | int len; |
1861 | uint8_t bytes[3]; |
1862 | }; |
1863 | |
1864 | static VEX vex(bool WE, // Like REX W for int operations, or opcode extension for float? |
1865 | bool R, // Same as REX R. Pass high bit of dst register, dst>>3. |
1866 | bool X, // Same as REX X. |
1867 | bool B, // Same as REX B. Pass y>>3 for 3-arg ops, x>>3 for 2-arg. |
1868 | int map, // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f. |
1869 | int vvvv, // 4-bit second operand register. Pass our x for 3-arg ops. |
1870 | bool L, // Set for 256-bit ymm operations, off for 128-bit xmm. |
1871 | int pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none. |
1872 | |
1873 | // Pack x86 opcode map selector to 5-bit VEX encoding. |
1874 | map = [map]{ |
1875 | switch (map) { |
1876 | case 0x0f: return 0b00001; |
1877 | case 0x380f: return 0b00010; |
1878 | case 0x3a0f: return 0b00011; |
1879 | // Several more cases only used by XOP / TBM. |
1880 | } |
1881 | SkUNREACHABLE; |
1882 | }(); |
1883 | |
1884 | // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding. |
1885 | pp = [pp]{ |
1886 | switch (pp) { |
1887 | case 0x66: return 0b01; |
1888 | case 0xf3: return 0b10; |
1889 | case 0xf2: return 0b11; |
1890 | } |
1891 | return 0b00; |
1892 | }(); |
1893 | |
1894 | VEX vex = {0, {0,0,0}}; |
1895 | if (X == 0 && B == 0 && WE == 0 && map == 0b00001) { |
1896 | // With these conditions met, we can optionally compress VEX to 2-byte. |
1897 | vex.len = 2; |
1898 | vex.bytes[0] = 0xc5; |
1899 | vex.bytes[1] = (pp & 3) << 0 |
1900 | | (L & 1) << 2 |
1901 | | (~vvvv & 15) << 3 |
1902 | | (~(int)R & 1) << 7; |
1903 | } else { |
1904 | // We could use this 3-byte VEX prefix all the time if we like. |
1905 | vex.len = 3; |
1906 | vex.bytes[0] = 0xc4; |
1907 | vex.bytes[1] = (map & 31) << 0 |
1908 | | (~(int)B & 1) << 5 |
1909 | | (~(int)X & 1) << 6 |
1910 | | (~(int)R & 1) << 7; |
1911 | vex.bytes[2] = (pp & 3) << 0 |
1912 | | (L & 1) << 2 |
1913 | | (~vvvv & 15) << 3 |
1914 | | (WE & 1) << 7; |
1915 | } |
1916 | return vex; |
1917 | } |
1918 | |
1919 | Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {} |
1920 | |
1921 | size_t Assembler::size() const { return fSize; } |
1922 | |
1923 | void Assembler::bytes(const void* p, int n) { |
1924 | if (fCurr) { |
1925 | memcpy(fCurr, p, n); |
1926 | fCurr += n; |
1927 | } |
1928 | fSize += n; |
1929 | } |
1930 | |
1931 | void Assembler::byte(uint8_t b) { this->bytes(&b, 1); } |
1932 | void Assembler::word(uint32_t w) { this->bytes(&w, 4); } |
1933 | |
1934 | void Assembler::align(int mod) { |
1935 | while (this->size() % mod) { |
1936 | this->byte(0x00); |
1937 | } |
1938 | } |
1939 | |
1940 | void Assembler::int3() { |
1941 | this->byte(0xcc); |
1942 | } |
1943 | |
1944 | void Assembler::vzeroupper() { |
1945 | this->byte(0xc5); |
1946 | this->byte(0xf8); |
1947 | this->byte(0x77); |
1948 | } |
1949 | void Assembler::ret() { this->byte(0xc3); } |
1950 | |
1951 | void Assembler::op(int opcode, Operand dst, GP64 x) { |
1952 | if (dst.kind == Operand::REG) { |
1953 | this->byte(rex(W1,x>>3,0,dst.reg>>3)); |
1954 | this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2); |
1955 | this->byte(mod_rm(Mod::Direct, x, dst.reg&7)); |
1956 | } else { |
1957 | SkASSERT(dst.kind == Operand::MEM); |
1958 | const Mem& m = dst.mem; |
1959 | const bool need_SIB = (m.base&7) == rsp |
1960 | || m.index != rsp; |
1961 | |
1962 | this->byte(rex(W1,x>>3,m.index>>3,m.base>>3)); |
1963 | this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2); |
1964 | this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7)); |
1965 | if (need_SIB) { |
1966 | this->byte(sib(m.scale, m.index&7, m.base&7)); |
1967 | } |
1968 | this->bytes(&m.disp, imm_bytes(mod(m.disp))); |
1969 | } |
1970 | } |
1971 | |
1972 | void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) { |
1973 | opcode |= 0b1000'0000; // top bit set for instructions with any immediate |
1974 | |
1975 | int imm_bytes = 4; |
1976 | if (SkTFitsIn<int8_t>(imm)) { |
1977 | imm_bytes = 1; |
1978 | opcode |= 0b0000'0010; // second bit set for 8-bit immediate, else 32-bit. |
1979 | } |
1980 | |
1981 | this->op(opcode, dst, (GP64)opcode_ext); |
1982 | this->bytes(&imm, imm_bytes); |
1983 | } |
1984 | |
1985 | void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); } |
1986 | void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); } |
1987 | void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); } |
1988 | |
1989 | // These don't work quite like the other instructions with immediates: |
1990 | // these immediates are always fixed size at 4 bytes or 1 byte. |
1991 | void Assembler::mov(Operand dst, int imm) { |
1992 | this->op(0xC7,dst,(GP64)0b000); |
1993 | this->word(imm); |
1994 | } |
1995 | void Assembler::movb(Operand dst, int imm) { |
1996 | this->op(0xC6,dst,(GP64)0b000); |
1997 | this->byte(imm); |
1998 | } |
1999 | |
2000 | void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); } |
2001 | void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); } |
2002 | void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); } |
2003 | void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); } |
2004 | void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); } |
2005 | |
2006 | void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); } |
2007 | void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); } |
2008 | void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); } |
2009 | void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); } |
2010 | void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); } |
2011 | |
2012 | void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); } |
2013 | void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); } |
2014 | |
2015 | void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfe, dst,x,y); } |
2016 | void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0xfa, dst,x,y); } |
2017 | void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); } |
2018 | |
2019 | void Assembler::vpsubw (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xf9, dst,x,y); } |
2020 | void Assembler::vpmullw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xd5, dst,x,y); } |
2021 | |
2022 | void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); } |
2023 | void Assembler::vpor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); } |
2024 | void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); } |
2025 | void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); } |
2026 | |
2027 | void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); } |
2028 | void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); } |
2029 | void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); } |
2030 | void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); } |
2031 | void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); } |
2032 | void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); } |
2033 | |
2034 | void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); } |
2035 | void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); } |
2036 | void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); } |
2037 | |
2038 | void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); } |
2039 | void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); } |
2040 | void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); } |
2041 | |
2042 | void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); } |
2043 | void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); } |
2044 | void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); } |
2045 | |
2046 | void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); } |
2047 | void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66, 0x0f,0x67, dst,x,y); } |
2048 | |
2049 | void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); } |
2050 | void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); } |
2051 | |
2052 | void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); } |
2053 | void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); } |
2054 | |
2055 | |
2056 | void Assembler::imm_byte_after_operand(const Operand& operand, int imm) { |
    // When we've embedded a label displacement in the middle of an instruction,
    // that displacement was measured to the end of the 4 displacement bytes, but
    // the CPU resolves it relative to the end of the whole instruction, which is
    // one immediate byte later.  So knock the stored displacement down by one.
2060 | if (operand.kind == Operand::LABEL && fCode) { |
2061 | int disp; |
2062 | memcpy(&disp, fCurr-4, 4); |
2063 | disp--; |
2064 | memcpy(fCurr-4, &disp, 4); |
2065 | } |
2066 | this->byte(imm); |
2067 | } |
2068 | |
2069 | void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) { |
2070 | this->op(0,0x0f,0xc2, dst,x,y); |
2071 | this->imm_byte_after_operand(y, imm); |
2072 | } |
2073 | |
2074 | void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) { |
2075 | this->op(0x66,0x3a0f,0x4c, dst,x,y); |
2076 | this->imm_byte_after_operand(y, z << 4); |
2077 | } |
2078 | |
2079 | // Shift instructions encode their opcode extension as "dst", dst as x, and x as y. |
2080 | void Assembler::vpslld(Ymm dst, Ymm x, int imm) { |
2081 | this->op(0x66,0x0f,0x72,(Ymm)6, dst,x); |
2082 | this->byte(imm); |
2083 | } |
2084 | void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { |
2085 | this->op(0x66,0x0f,0x72,(Ymm)2, dst,x); |
2086 | this->byte(imm); |
2087 | } |
2088 | void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { |
2089 | this->op(0x66,0x0f,0x72,(Ymm)4, dst,x); |
2090 | this->byte(imm); |
2091 | } |
2092 | void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { |
2093 | this->op(0x66,0x0f,0x71,(Ymm)2, dst,x); |
2094 | this->byte(imm); |
2095 | } |
2096 | |
2097 | void Assembler::vpermq(Ymm dst, Operand x, int imm) { |
    // A bit unusual among the instructions we use, this is a 64-bit operation, so we set W.
2099 | this->op(0x66,0x3a0f,0x00, dst,x,W1); |
2100 | this->imm_byte_after_operand(x, imm); |
2101 | } |
2102 | |
2103 | void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) { |
2104 | this->op(0x66,0x3a0f,0x06, dst,x,y); |
2105 | this->imm_byte_after_operand(y, imm); |
2106 | } |
2107 | |
2108 | void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) { |
2109 | this->op(0x66,0x380f,0x16, dst,ix,src); |
2110 | } |
2111 | |
2112 | void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) { |
2113 | this->op(0x66,0x3a0f,0x08, dst,x); |
2114 | this->imm_byte_after_operand(x, imm); |
2115 | } |
2116 | |
2117 | void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); } |
2118 | void Assembler::vmovups(Ymm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); } |
2119 | void Assembler::vmovups(Xmm dst, Operand src) { this->op( 0,0x0f,0x10, dst,src); } |
2120 | void Assembler::vmovups(Operand dst, Ymm src) { this->op( 0,0x0f,0x11, src,dst); } |
2121 | void Assembler::vmovups(Operand dst, Xmm src) { this->op( 0,0x0f,0x11, src,dst); } |
2122 | |
2123 | void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op( 0,0x0f,0x5b, dst,x); } |
2124 | void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); } |
2125 | void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); } |
2126 | void Assembler::vsqrtps (Ymm dst, Operand x) { this->op( 0,0x0f,0x51, dst,x); } |
2127 | |
2128 | void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) { |
2129 | this->op(0x66,0x3a0f,0x1d, x,dst); |
2130 | this->imm_byte_after_operand(dst, imm); |
2131 | } |
2132 | void Assembler::vcvtph2ps(Ymm dst, Operand x) { |
2133 | this->op(0x66,0x380f,0x13, dst,x); |
2134 | } |
2135 | |
2136 | int Assembler::disp19(Label* l) { |
2137 | SkASSERT(l->kind == Label::NotYetSet || |
2138 | l->kind == Label::ARMDisp19); |
2139 | int here = (int)this->size(); |
2140 | l->kind = Label::ARMDisp19; |
2141 | l->references.push_back(here); |
2142 | // ARM 19-bit instruction count, from the beginning of this instruction. |
2143 | return (l->offset - here) / 4; |
2144 | } |
2145 | |
2146 | int Assembler::disp32(Label* l) { |
2147 | SkASSERT(l->kind == Label::NotYetSet || |
2148 | l->kind == Label::X86Disp32); |
2149 | int here = (int)this->size(); |
2150 | l->kind = Label::X86Disp32; |
2151 | l->references.push_back(here); |
2152 | // x86 32-bit byte count, from the end of this instruction. |
2153 | return l->offset - (here + 4); |
2154 | } |
2155 | |
2156 | void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) { |
2157 | switch (y.kind) { |
2158 | case Operand::REG: { |
2159 | VEX v = vex(w, dst>>3, 0, y.reg>>3, |
2160 | map, x, l, prefix); |
2161 | this->bytes(v.bytes, v.len); |
2162 | this->byte(opcode); |
2163 | this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7)); |
2164 | } return; |
2165 | |
2166 | case Operand::MEM: { |
2167 | // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows; |
2168 | // without an SIB byte, that's where the base register would usually go. |
2169 | // This means we have to use an SIB byte if we want to use rsp as a base register. |
2170 | const Mem& m = y.mem; |
2171 | const bool need_SIB = m.base == rsp |
2172 | || m.index != rsp; |
2173 | |
2174 | VEX v = vex(w, dst>>3, m.index>>3, m.base>>3, |
2175 | map, x, l, prefix); |
2176 | this->bytes(v.bytes, v.len); |
2177 | this->byte(opcode); |
2178 | this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7)); |
2179 | if (need_SIB) { |
2180 | this->byte(sib(m.scale, m.index&7, m.base&7)); |
2181 | } |
2182 | this->bytes(&m.disp, imm_bytes(mod(m.disp))); |
2183 | } return; |
2184 | |
2185 | case Operand::LABEL: { |
2186 | // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13. |
2187 | const int rip = rbp; |
2188 | |
2189 | VEX v = vex(w, dst>>3, 0, rip>>3, |
2190 | map, x, l, prefix); |
2191 | this->bytes(v.bytes, v.len); |
2192 | this->byte(opcode); |
2193 | this->byte(mod_rm(Mod::Indirect, dst&7, rip&7)); |
2194 | this->word(this->disp32(y.label)); |
2195 | } return; |
2196 | } |
2197 | } |
2198 | |
2199 | void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); } |
2200 | |
2201 | void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); } |
2202 | |
2203 | void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); } |
2204 | |
2205 | void Assembler::jump(uint8_t condition, Label* l) { |
2206 | // These conditional jumps can be either 2 bytes (short) or 6 bytes (near): |
2207 | // 7? one-byte-disp |
2208 | // 0F 8? four-byte-disp |
2209 | // We always use the near displacement to make updating labels simpler (no resizing). |
2210 | this->byte(0x0f); |
2211 | this->byte(condition); |
2212 | this->word(this->disp32(l)); |
2213 | } |
2214 | void Assembler::je (Label* l) { this->jump(0x84, l); } |
2215 | void Assembler::jne(Label* l) { this->jump(0x85, l); } |
2216 | void Assembler::jl (Label* l) { this->jump(0x8c, l); } |
2217 | void Assembler::jc (Label* l) { this->jump(0x82, l); } |
2218 | |
2219 | void Assembler::jmp(Label* l) { |
    // As in jump() above, we could use an 8-bit displacement here, but we always use 32-bit.
2221 | this->byte(0xe9); |
2222 | this->word(this->disp32(l)); |
2223 | } |
2224 | |
2225 | void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); } |
2226 | void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); } |
2227 | |
2228 | void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); } |
2229 | |
2230 | void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); } |
2231 | void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); } |
2232 | |
2233 | void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) { |
2234 | this->op(0x66,0x3a0f,0x22, dst,src,y); |
2235 | this->imm_byte_after_operand(y, imm); |
2236 | } |
2237 | void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) { |
2238 | this->op(0x66,0x0f,0xc4, dst,src,y); |
2239 | this->imm_byte_after_operand(y, imm); |
2240 | } |
2241 | void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) { |
2242 | this->op(0x66,0x3a0f,0x20, dst,src,y); |
2243 | this->imm_byte_after_operand(y, imm); |
2244 | } |
2245 | |
2246 | void Assembler::vextracti128(Operand dst, Ymm src, int imm) { |
2247 | this->op(0x66,0x3a0f,0x39, src,dst); |
2248 | SkASSERT(dst.kind != Operand::LABEL); |
2249 | this->byte(imm); |
2250 | } |
2251 | void Assembler::vpextrd(Operand dst, Xmm src, int imm) { |
2252 | this->op(0x66,0x3a0f,0x16, src,dst); |
2253 | SkASSERT(dst.kind != Operand::LABEL); |
2254 | this->byte(imm); |
2255 | } |
2256 | void Assembler::vpextrw(Operand dst, Xmm src, int imm) { |
2257 | this->op(0x66,0x3a0f,0x15, src,dst); |
2258 | SkASSERT(dst.kind != Operand::LABEL); |
2259 | this->byte(imm); |
2260 | } |
2261 | void Assembler::vpextrb(Operand dst, Xmm src, int imm) { |
2262 | this->op(0x66,0x3a0f,0x14, src,dst); |
2263 | SkASSERT(dst.kind != Operand::LABEL); |
2264 | this->byte(imm); |
2265 | } |
2266 | |
2267 | void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) { |
2268 | // Unlike most instructions, no aliasing is permitted here. |
2269 | SkASSERT(dst != ix); |
2270 | SkASSERT(dst != mask); |
2271 | SkASSERT(mask != ix); |
2272 | |
2273 | int prefix = 0x66, |
2274 | map = 0x380f, |
2275 | opcode = 0x92; |
2276 | VEX v = vex(0, dst>>3, ix>>3, base>>3, |
2277 | map, mask, /*ymm?*/1, prefix); |
2278 | this->bytes(v.bytes, v.len); |
2279 | this->byte(opcode); |
2280 | this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/)); |
2281 | this->byte(sib(scale, ix&7, base&7)); |
2282 | } |
2283 | |
2284 | // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf |
2285 | |
2286 | static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; } |
2287 | |
2288 | void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) { |
2289 | this->word( (hi & 11_mask) << 21 |
2290 | | (m & 5_mask) << 16 |
2291 | | (lo & 6_mask) << 10 |
2292 | | (n & 5_mask) << 5 |
2293 | | (d & 5_mask) << 0); |
2294 | } |
2295 | void Assembler::op(uint32_t op22, V n, V d, int imm) { |
2296 | this->word( (op22 & 22_mask) << 10 |
2297 | | imm // size and location depends on the instruction |
2298 | | (n & 5_mask) << 5 |
2299 | | (d & 5_mask) << 0); |
2300 | } |
2301 | |
2302 | void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); } |
2303 | void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); } |
2304 | void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); } |
2305 | void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); } |
2306 | void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); } |
2307 | void Assembler::not16b(V d, V n) { this->op(0b0'1'1'01110'00'10000'00101'10, n, d); } |
2308 | |
2309 | void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); } |
2310 | void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); } |
2311 | void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); } |
2312 | |
2313 | void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); } |
2314 | void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); } |
2315 | |
2316 | void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); } |
2317 | void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); } |
2318 | |
2319 | void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); } |
2320 | void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); } |
2321 | void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); } |
2322 | void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); } |
2323 | void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); } |
2324 | void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); } |
2325 | void Assembler::fneg4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n, d); } |
2326 | |
2327 | void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); } |
2328 | void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); } |
2329 | void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); } |
2330 | |
2331 | void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); } |
2332 | void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); } |
2333 | |
2334 | void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); } |
2335 | |
2336 | void Assembler::sli4s(V d, V n, int imm5) { |
2337 | this->op(0b0'1'1'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16); |
2338 | } |
2339 | void Assembler::shl4s(V d, V n, int imm5) { |
2340 | this->op(0b0'1'0'011110'0100'000'01010'1, n, d, ( imm5 & 5_mask)<<16); |
2341 | } |
2342 | void Assembler::sshr4s(V d, V n, int imm5) { |
2343 | this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16); |
2344 | } |
2345 | void Assembler::ushr4s(V d, V n, int imm5) { |
2346 | this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16); |
2347 | } |
2348 | void Assembler::ushr8h(V d, V n, int imm4) { |
2349 | this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16); |
2350 | } |
2351 | |
2352 | void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); } |
2353 | void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); } |
2354 | void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); } |
2355 | |
2356 | void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); } |
2357 | void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); } |
2358 | |
2359 | void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); } |
2360 | void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); } |
2361 | |
2362 | void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); } |
2363 | |
2364 | void Assembler::brk(int imm16) { |
2365 | this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5); |
2366 | } |
2367 | |
2368 | void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); } |
2369 | |
2370 | void Assembler::add(X d, X n, int imm12) { |
2371 | this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10); |
2372 | } |
2373 | void Assembler::sub(X d, X n, int imm12) { |
2374 | this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10); |
2375 | } |
2376 | void Assembler::subs(X d, X n, int imm12) { |
2377 | this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10); |
2378 | } |
2379 | |
2380 | void Assembler::b(Condition cond, Label* l) { |
2381 | const int imm19 = this->disp19(l); |
2382 | this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5); |
2383 | } |
2384 | void Assembler::cbz(X t, Label* l) { |
2385 | const int imm19 = this->disp19(l); |
2386 | this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5); |
2387 | } |
2388 | void Assembler::cbnz(X t, Label* l) { |
2389 | const int imm19 = this->disp19(l); |
2390 | this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5); |
2391 | } |
2392 | |
2393 | void Assembler::ldrq(V dst, X src, int imm12) { |
2394 | this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10); |
2395 | } |
2396 | void Assembler::ldrs(V dst, X src, int imm12) { |
2397 | this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); |
2398 | } |
2399 | void Assembler::ldrb(V dst, X src, int imm12) { |
2400 | this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10); |
2401 | } |
2402 | |
2403 | void Assembler::strq(V src, X dst, int imm12) { |
2404 | this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10); |
2405 | } |
2406 | void Assembler::strs(V src, X dst, int imm12) { |
2407 | this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); |
2408 | } |
2409 | void Assembler::strb(V src, X dst, int imm12) { |
2410 | this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10); |
2411 | } |
2412 | |
2413 | void Assembler::fmovs(X dst, V src) { |
2414 | this->op(0b0'0'0'11110'00'1'00'110'000000, src, dst); |
2415 | } |
2416 | |
2417 | void Assembler::ldrq(V dst, Label* l) { |
2418 | const int imm19 = this->disp19(l); |
2419 | this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5); |
2420 | } |
2421 | |
2422 | void Assembler::label(Label* l) { |
2423 | if (fCode) { |
2424 | // The instructions all currently point to l->offset. |
2425 | // We'll want to add a delta to point them to here. |
2426 | int here = (int)this->size(); |
2427 | int delta = here - l->offset; |
2428 | l->offset = here; |
2429 | |
2430 | if (l->kind == Label::ARMDisp19) { |
2431 | for (int ref : l->references) { |
2432 | // ref points to a 32-bit instruction with 19-bit displacement in instructions. |
2433 | uint32_t inst; |
2434 | memcpy(&inst, fCode + ref, 4); |
2435 | |
2436 | // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ] |
2437 | int disp = (int)(inst << 8) >> 13; |
2438 | |
2439 | disp += delta/4; // delta is in bytes, we want instructions. |
2440 | |
2441 | // Put it all back together, preserving the high 8 bits and low 5. |
2442 | inst = ((disp << 5) & (19_mask << 5)) |
2443 | | ((inst ) & ~(19_mask << 5)); |
2444 | |
2445 | memcpy(fCode + ref, &inst, 4); |
2446 | } |
2447 | } |
2448 | |
2449 | if (l->kind == Label::X86Disp32) { |
2450 | for (int ref : l->references) { |
2451 | // ref points to a 32-bit displacement in bytes. |
2452 | int disp; |
2453 | memcpy(&disp, fCode + ref, 4); |
2454 | |
2455 | disp += delta; |
2456 | |
2457 | memcpy(fCode + ref, &disp, 4); |
2458 | } |
2459 | } |
2460 | } |
2461 | } |
2462 | |
2463 | void Program::eval(int n, void* args[]) const { |
2464 | #define SKVM_JIT_STATS 0 |
2465 | #if SKVM_JIT_STATS |
2466 | static std::atomic<int64_t> calls{0}, jits{0}, |
2467 | pixels{0}, fast{0}; |
2468 | pixels += n; |
2469 | if (0 == calls++) { |
2470 | atexit([]{ |
2471 | int64_t num = jits .load(), |
2472 | den = calls.load(); |
2473 | SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n" , (100.0 * num)/den, den); |
2474 | num = fast .load(); |
2475 | den = pixels.load(); |
2476 | SkDebugf("%.3g%% of %lld pixels went through JIT.\n" , (100.0 * num)/den, den); |
2477 | }); |
2478 | } |
2479 | #endif |
2480 | |
2481 | #if !defined(SKVM_JIT_BUT_IGNORE_IT) |
2482 | const void* jit_entry = fImpl->jit_entry.load(); |
2483 | // jit_entry may be null either simply because we can't JIT, or when using LLVM |
2484 | // if the work represented by fImpl->llvm_compiling hasn't finished yet. |
2485 | // |
2486 | // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it |
2487 | // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off, |
2488 | // due to timing or program caching. |
2489 | if (jit_entry != nullptr && gSkVMAllowJIT) { |
2490 | #if SKVM_JIT_STATS |
2491 | jits++; |
2492 | fast += n; |
2493 | #endif |
2494 | void** a = args; |
2495 | switch (fImpl->strides.size()) { |
2496 | case 0: return ((void(*)(int ))jit_entry)(n ); |
2497 | case 1: return ((void(*)(int,void* ))jit_entry)(n,a[0] ); |
2498 | case 2: return ((void(*)(int,void*,void* ))jit_entry)(n,a[0],a[1] ); |
2499 | case 3: return ((void(*)(int,void*,void*,void* ))jit_entry)(n,a[0],a[1],a[2]); |
2500 | case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry) |
2501 | (n,a[0],a[1],a[2],a[3]); |
2502 | case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry) |
2503 | (n,a[0],a[1],a[2],a[3],a[4]); |
2504 | case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry) |
2505 | (n,a[0],a[1],a[2],a[3],a[4],a[5]); |
2506 | default: SkASSERT(false); // TODO: >6 args? |
2507 | } |
2508 | } |
2509 | #endif |
2510 | |
2511 | // So we'll sometimes use the interpreter here even if later calls will use the JIT. |
2512 | SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(), |
2513 | this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(), |
2514 | n, args); |
2515 | } |
2516 | |
2517 | #if defined(SKVM_LLVM) |
2518 | void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions, |
2519 | const char* debug_name) { |
2520 | auto ctx = std::make_unique<llvm::LLVMContext>(); |
2521 | |
2522 | auto mod = std::make_unique<llvm::Module>("" , *ctx); |
2523 | // All the scary bare pointers from here on are owned by ctx or mod, I think. |
2524 | |
2525 | // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines. |
2526 | const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4; |
2527 | |
2528 | llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(), |
2529 | *i32 = llvm::Type::getInt32Ty(*ctx); |
2530 | |
2531 | std::vector<llvm::Type*> arg_types = { i32 }; |
2532 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2533 | arg_types.push_back(ptr); |
2534 | } |
2535 | |
2536 | llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx), |
2537 | arg_types, /*vararg?=*/false); |
2538 | llvm::Function* fn |
2539 | = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod); |
2540 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2541 | fn->addParamAttr(i+1, llvm::Attribute::NoAlias); |
2542 | } |
2543 | |
2544 | llvm::BasicBlock *enter = llvm::BasicBlock::Create(*ctx, "enter" , fn), |
2545 | *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK" , fn), |
2546 | *testK = llvm::BasicBlock::Create(*ctx, "testK" , fn), |
2547 | *loopK = llvm::BasicBlock::Create(*ctx, "loopK" , fn), |
2548 | *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1" , fn), |
2549 | *test1 = llvm::BasicBlock::Create(*ctx, "test1" , fn), |
2550 | *loop1 = llvm::BasicBlock::Create(*ctx, "loop1" , fn), |
2551 | *leave = llvm::BasicBlock::Create(*ctx, "leave" , fn); |
2552 | |
2553 | using IRBuilder = llvm::IRBuilder<>; |
2554 | |
2555 | llvm::PHINode* n; |
2556 | std::vector<llvm::PHINode*> args; |
2557 | std::vector<llvm::Value*> vals(instructions.size()); |
2558 | |
2559 | auto emit = [&](size_t i, bool scalar, IRBuilder* b) { |
2560 | auto [op, x,y,z, immy,immz, death,can_hoist] = instructions[i]; |
2561 | |
2562 | llvm::Type *i1 = llvm::Type::getInt1Ty (*ctx), |
2563 | *i8 = llvm::Type::getInt8Ty (*ctx), |
2564 | *i16 = llvm::Type::getInt16Ty(*ctx), |
2565 | *f32 = llvm::Type::getFloatTy(*ctx), |
2566 | *I1 = scalar ? i1 : llvm::VectorType::get(i1 , K ), |
2567 | *I8 = scalar ? i8 : llvm::VectorType::get(i8 , K ), |
2568 | *I16 = scalar ? i16 : llvm::VectorType::get(i16, K ), |
2569 | *I32 = scalar ? i32 : llvm::VectorType::get(i32, K ), |
2570 | *F32 = scalar ? f32 : llvm::VectorType::get(f32, K ); |
2571 | |
2572 | auto I = [&](llvm::Value* v) { return b->CreateBitCast(v, I32 ); }; |
2573 | auto F = [&](llvm::Value* v) { return b->CreateBitCast(v, F32 ); }; |
2574 | |
2575 | auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); }; |
2576 | |
2577 | switch (llvm::Type* t = nullptr; op) { |
2578 | default: |
2579 | SkDebugf("can't llvm %s (%d)\n" , name(op), op); |
2580 | return false; |
2581 | |
2582 | case Op::assert_true: /*TODO*/ break; |
2583 | |
2584 | case Op::index: |
2585 | if (I32->isVectorTy()) { |
2586 | std::vector<llvm::Constant*> iota(K); |
2587 | for (int j = 0; j < K; j++) { |
2588 | iota[j] = b->getInt32(j); |
2589 | } |
2590 | vals[i] = b->CreateSub(b->CreateVectorSplat(K, n), |
2591 | llvm::ConstantVector::get(iota)); |
2592 | } else { |
2593 | vals[i] = n; |
2594 | } break; |
2595 | |
2596 | case Op::load8: t = I8 ; goto load; |
2597 | case Op::load16: t = I16; goto load; |
2598 | case Op::load32: t = I32; goto load; |
2599 | load: { |
2600 | llvm::Value* ptr = b->CreateBitCast(args[immy], t->getPointerTo()); |
2601 | vals[i] = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), I32); |
2602 | } break; |
2603 | |
2604 | |
2605 | case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immy); break; |
2606 | |
2607 | case Op::uniform8: t = i8 ; goto uniform; |
2608 | case Op::uniform16: t = i16; goto uniform; |
2609 | case Op::uniform32: t = i32; goto uniform; |
2610 | uniform: { |
2611 | llvm::Value* ptr = b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr, |
2612 | args[immy], |
2613 | immz), |
2614 | t->getPointerTo()); |
2615 | llvm::Value* val = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), i32); |
2616 | vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val) |
2617 | : val; |
2618 | } break; |
2619 | |
2620 | case Op::gather8: t = i8 ; goto gather; |
2621 | case Op::gather16: t = i16; goto gather; |
2622 | case Op::gather32: t = i32; goto gather; |
2623 | gather: { |
2624 | // Our gather base pointer is immz bytes off of uniform immy. |
2625 | llvm::Value* base = |
2626 | b->CreateLoad(b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr, |
2627 | args[immy], |
2628 | immz), |
2629 | t->getPointerTo()->getPointerTo())); |
2630 | |
2631 | llvm::Value* ptr = b->CreateInBoundsGEP(nullptr, base, vals[x]); |
2632 | llvm::Value* gathered; |
2633 | if (ptr->getType()->isVectorTy()) { |
2634 | gathered = b->CreateMaskedGather(ptr, 1); |
2635 | } else { |
2636 | gathered = b->CreateAlignedLoad(ptr, 1); |
2637 | } |
2638 | vals[i] = b->CreateZExt(gathered, I32); |
2639 | } break; |
2640 | |
2641 | case Op::store8: t = I8 ; goto store; |
2642 | case Op::store16: t = I16; goto store; |
2643 | case Op::store32: t = I32; goto store; |
2644 | store: { |
2645 | llvm::Value* val = b->CreateTrunc(vals[x], t); |
2646 | llvm::Value* ptr = b->CreateBitCast(args[immy], |
2647 | val->getType()->getPointerTo()); |
2648 | vals[i] = b->CreateAlignedStore(val, ptr, 1); |
2649 | } break; |
2650 | |
2651 | case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break; |
2652 | case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break; |
2653 | case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break; |
2654 | case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break; |
2655 | |
2656 | case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break; |
2657 | |
2658 | case Op::select: |
2659 | vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]); |
2660 | break; |
2661 | |
2662 | case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break; |
2663 | case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break; |
2664 | case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break; |
2665 | |
2666 | case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immy); break; |
2667 | case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immy); break; |
2668 | case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immy); break; |
2669 | |
2670 | case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break; |
2671 | case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break; |
2672 | |
2673 | case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break; |
2674 | case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break; |
2675 | case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break; |
2676 | case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break; |
2677 | |
2678 | case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break; |
2679 | case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break; |
2680 | case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break; |
2681 | case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break; |
2682 | |
2683 | case Op::fma_f32: |
2684 | vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, |
2685 | {F(vals[x]), F(vals[y]), F(vals[z])})); |
2686 | break; |
2687 | |
2688 | case Op::fms_f32: |
2689 | vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, |
2690 | {F(vals[x]), F(vals[y]), |
2691 | b->CreateFNeg(F(vals[z]))})); |
2692 | break; |
2693 | |
2694 | case Op::fnma_f32: |
2695 | vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, |
2696 | {b->CreateFNeg(F(vals[x])), F(vals[y]), |
2697 | F(vals[z])})); |
2698 | break; |
2699 | |
2700 | case Op::ceil: |
2701 | vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x]))); |
2702 | break; |
2703 | case Op::floor: |
2704 | vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x]))); |
2705 | break; |
2706 | |
2707 | case Op::max_f32: |
2708 | vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])), |
2709 | F(vals[y]), F(vals[x]))); |
2710 | break; |
2711 | case Op::min_f32: |
2712 | vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])), |
2713 | F(vals[y]), F(vals[x]))); |
2714 | break; |
2715 | |
2716 | case Op::sqrt_f32: |
2717 | vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x]))); |
2718 | break; |
2719 | |
2720 | case Op::to_f32: vals[i] = I(b->CreateSIToFP( vals[x] , F32)); break; |
2721 | case Op::trunc : vals[i] = b->CreateFPToSI(F(vals[x]), I32) ; break; |
2722 | case Op::round : { |
2723 | // Basic impl when we can't use cvtps2dq and co. |
2724 | auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x])); |
2725 | vals[i] = b->CreateFPToSI(round, I32); |
2726 | |
2727 | #if 1 && defined(SK_CPU_X86) |
2728 | // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling. |
2729 | if (scalar) { |
2730 | // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3. ¯\_(ツ)_/¯ |
2731 | llvm::Value* v = llvm::UndefValue::get(llvm::VectorType::get(f32, 4)); |
2732 | v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0); |
2733 | vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v}); |
2734 | } else { |
2735 | SkASSERT(K == 4 || K == 8); |
2736 | auto intr = K == 4 ? llvm::Intrinsic::x86_sse2_cvtps2dq : |
2737 | /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256; |
2738 | vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])}); |
2739 | } |
2740 | #endif |
2741 | } break; |
2742 | |
2743 | } |
2744 | return true; |
2745 | }; |
2746 | |
2747 | { |
2748 | IRBuilder b(enter); |
2749 | b.CreateBr(hoistK); |
2750 | } |
2751 | |
2752 | // hoistK: emit each hoistable vector instruction; goto testK; |
2753 | // LLVM can do this sort of thing itself, but we've got the information cheap, |
2754 | // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe. |
2755 | { |
2756 | IRBuilder b(hoistK); |
2757 | |
2758 | // Hoisted instructions will need args (think, uniforms), so set that up now. |
2759 | // These phi nodes are degenerate... they'll always be the passed-in args from enter. |
2760 | // Later on when we start looping the phi nodes will start looking useful. |
2761 | llvm::Argument* arg = fn->arg_begin(); |
2762 | (void)arg++; // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction. |
2763 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2764 | args.push_back(b.CreatePHI(arg->getType(), 1)); |
2765 | args.back()->addIncoming(arg++, enter); |
2766 | } |
2767 | |
2768 | for (size_t i = 0; i < instructions.size(); i++) { |
2769 | if (instructions[i].can_hoist && !emit(i, false, &b)) { |
2770 | return; |
2771 | } |
2772 | } |
2773 | |
2774 | b.CreateBr(testK); |
2775 | } |
2776 | |
2777 | // testK: if (N >= K) goto loopK; else goto hoist1; |
2778 | { |
2779 | IRBuilder b(testK); |
2780 | |
2781 | // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK. |
2782 | // These also start as the initial function arguments; hoistK can't have changed them. |
2783 | llvm::Argument* arg = fn->arg_begin(); |
2784 | |
2785 | n = b.CreatePHI(arg->getType(), 2); |
2786 | n->addIncoming(arg++, hoistK); |
2787 | |
2788 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2789 | args[i] = b.CreatePHI(arg->getType(), 2); |
2790 | args[i]->addIncoming(arg++, hoistK); |
2791 | } |
2792 | |
2793 | b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1); |
2794 | } |
2795 | |
2796 | // loopK: ... insts on K x T vectors; N -= K, args += K*stride; goto testK; |
2797 | { |
2798 | IRBuilder b(loopK); |
2799 | for (size_t i = 0; i < instructions.size(); i++) { |
2800 | if (!instructions[i].can_hoist && !emit(i, false, &b)) { |
2801 | return; |
2802 | } |
2803 | } |
2804 | |
2805 | // n -= K |
2806 | llvm::Value* n_next = b.CreateSub(n, b.getInt32(K)); |
2807 | n->addIncoming(n_next, loopK); |
2808 | |
// Each arg ptr += K*stride
2810 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2811 | llvm::Value* arg_next |
2812 | = b.CreateConstInBoundsGEP1_32(nullptr, args[i], K*fImpl->strides[i]); |
2813 | args[i]->addIncoming(arg_next, loopK); |
2814 | } |
2815 | b.CreateBr(testK); |
2816 | } |
2817 | |
2818 | // hoist1: emit each hoistable scalar instruction; goto test1; |
2819 | { |
2820 | IRBuilder b(hoist1); |
2821 | for (size_t i = 0; i < instructions.size(); i++) { |
2822 | if (instructions[i].can_hoist && !emit(i, true, &b)) { |
2823 | return; |
2824 | } |
2825 | } |
2826 | b.CreateBr(test1); |
2827 | } |
2828 | |
2829 | // test1: if (N >= 1) goto loop1; else goto leave; |
2830 | { |
2831 | IRBuilder b(test1); |
2832 | |
2833 | // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1. |
2834 | llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2); |
2835 | n_new->addIncoming(n, hoist1); |
2836 | n = n_new; |
2837 | |
2838 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2839 | llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2); |
2840 | arg_new->addIncoming(args[i], hoist1); |
2841 | args[i] = arg_new; |
2842 | } |
2843 | |
2844 | b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave); |
2845 | } |
2846 | |
2847 | // loop1: ... insts on scalars; N -= 1, args += stride; goto test1; |
2848 | { |
2849 | IRBuilder b(loop1); |
2850 | for (size_t i = 0; i < instructions.size(); i++) { |
2851 | if (!instructions[i].can_hoist && !emit(i, true, &b)) { |
2852 | return; |
2853 | } |
2854 | } |
2855 | |
2856 | // n -= 1 |
2857 | llvm::Value* n_next = b.CreateSub(n, b.getInt32(1)); |
2858 | n->addIncoming(n_next, loop1); |
2859 | |
// Each arg ptr += stride
2861 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2862 | llvm::Value* arg_next |
2863 | = b.CreateConstInBoundsGEP1_32(nullptr, args[i], fImpl->strides[i]); |
2864 | args[i]->addIncoming(arg_next, loop1); |
2865 | } |
2866 | b.CreateBr(test1); |
2867 | } |
2868 | |
2869 | // leave: ret |
2870 | { |
2871 | IRBuilder b(leave); |
2872 | b.CreateRetVoid(); |
2873 | } |
2874 | |
2875 | SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs())); |
2876 | |
2877 | if (true) { |
2878 | SkString path = SkStringPrintf("/tmp/%s.bc" , debug_name); |
2879 | std::error_code err; |
2880 | llvm::raw_fd_ostream os(path.c_str(), err); |
2881 | if (err) { |
2882 | return; |
2883 | } |
2884 | llvm::WriteBitcodeToFile(*mod, os); |
2885 | } |
2886 | |
2887 | static SkOnce once; |
2888 | once([]{ |
2889 | SkAssertResult(false == llvm::InitializeNativeTarget()); |
2890 | SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter()); |
2891 | }); |
2892 | |
2893 | if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod)) |
2894 | .setEngineKind(llvm::EngineKind::JIT) |
2895 | .setMCPU(llvm::sys::getHostCPUName()) |
2896 | .create()) { |
2897 | fImpl->llvm_ctx = std::move(ctx); |
2898 | fImpl->llvm_ee.reset(ee); |
2899 | |
2900 | // We have to be careful here about what we close over and how, in case fImpl moves. |
2901 | // fImpl itself may change, but its pointee fields won't, so close over them by value. |
2902 | // Also, debug_name will almost certainly leave scope, so copy it. |
2903 | fImpl->llvm_compiling = std::async(std::launch::async, [dst = &fImpl->jit_entry, |
2904 | ee = fImpl->llvm_ee.get(), |
2905 | name = std::string(debug_name)]{ |
2906 | // std::atomic<void*>* dst; |
2907 | // llvm::ExecutionEngine* ee; |
2908 | // std::string name; |
2909 | dst->store( (void*)ee->getFunctionAddress(name.c_str()) ); |
2910 | }); |
2911 | } |
2912 | } |
2913 | #endif |
2914 | |
2915 | void Program::waitForLLVM() const { |
2916 | #if defined(SKVM_LLVM) |
2917 | if (fImpl->llvm_compiling.valid()) { |
2918 | fImpl->llvm_compiling.wait(); |
2919 | } |
2920 | #endif |
2921 | } |
2922 | |
2923 | bool Program::hasJIT() const { |
2924 | // Program::hasJIT() is really just a debugging / test aid, |
2925 | // so we don't mind adding a sync point here to wait for compilation. |
2926 | this->waitForLLVM(); |
2927 | |
2928 | return fImpl->jit_entry.load() != nullptr; |
2929 | } |
2930 | |
2931 | void Program::dropJIT() { |
2932 | #if defined(SKVM_LLVM) |
2933 | this->waitForLLVM(); |
2934 | fImpl->llvm_ee .reset(nullptr); |
2935 | fImpl->llvm_ctx.reset(nullptr); |
2936 | #elif defined(SKVM_JIT) |
2937 | if (fImpl->dylib) { |
2938 | close_dylib(fImpl->dylib); |
2939 | } else if (auto jit_entry = fImpl->jit_entry.load()) { |
2940 | unmap_jit_buffer(jit_entry, fImpl->jit_size); |
2941 | } |
2942 | #else |
2943 | SkASSERT(!this->hasJIT()); |
2944 | #endif |
2945 | |
2946 | fImpl->jit_entry.store(nullptr); |
2947 | fImpl->jit_size = 0; |
2948 | fImpl->dylib = nullptr; |
2949 | } |
2950 | |
2951 | Program::Program() : fImpl(std::make_unique<Impl>()) {} |
2952 | |
2953 | Program::~Program() { |
2954 | // Moved-from Programs may have fImpl == nullptr. |
2955 | if (fImpl) { |
2956 | this->dropJIT(); |
2957 | } |
2958 | } |
2959 | |
2960 | Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {} |
2961 | |
2962 | Program& Program::operator=(Program&& other) { |
2963 | fImpl = std::move(other.fImpl); |
2964 | return *this; |
2965 | } |
2966 | |
2967 | Program::Program(const std::vector<OptimizedInstruction>& instructions, |
2968 | const std::vector<int>& strides, |
2969 | const char* debug_name) : Program() { |
2970 | fImpl->strides = strides; |
2971 | if (gSkVMAllowJIT) { |
2972 | #if 1 && defined(SKVM_LLVM) |
2973 | this->setupLLVM(instructions, debug_name); |
2974 | #elif 1 && defined(SKVM_JIT) |
2975 | this->setupJIT(instructions, debug_name); |
2976 | #endif |
2977 | } |
2978 | |
2979 | // Might as well do this after setupLLVM() to get a little more time to compile. |
2980 | this->setupInterpreter(instructions); |
2981 | } |
2982 | |
2983 | std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; } |
2984 | int Program::nargs() const { return (int)fImpl->strides.size(); } |
2985 | int Program::nregs() const { return fImpl->regs; } |
2986 | int Program::loop () const { return fImpl->loop; } |
2987 | bool Program::empty() const { return fImpl->instructions.empty(); } |
2988 | |
2989 | // Translate OptimizedInstructions to InterpreterInstructions. |
2990 | void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) { |
// The register each instruction is assigned to.
2992 | std::vector<Reg> reg(instructions.size()); |
2993 | |
2994 | // This next bit is a bit more complicated than strictly necessary; |
2995 | // we could just assign every instruction to its own register. |
2996 | // |
2997 | // But recycling registers is fairly cheap, and good practice for the |
2998 | // JITs where minimizing register pressure really is important. |
2999 | // |
3000 | // Since we have effectively infinite registers, we hoist any value we can. |
3001 | // (The JIT may choose a more complex policy to reduce register pressure.) |
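// A sketch of the recycling: in
//     v0 = load32 ...
//     v1 = add_f32 v0, v0   (v0 dies here)
//     store32 ..., v1       (v1 dies here; stores get no register)
// v0's register frees up just as v1 needs one, so this whole program
// runs in a single register.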
3002 | |
3003 | fImpl->regs = 0; |
3004 | std::vector<Reg> avail; |
3005 | |
3006 | // Assign this value to a register, recycling them where we can. |
3007 | auto assign_register = [&](Val id) { |
3008 | const OptimizedInstruction& inst = instructions[id]; |
3009 | |
// If this is a real input and its lifetime ends at this instruction,
3011 | // we can recycle the register it's occupying. |
3012 | auto maybe_recycle_register = [&](Val input) { |
3013 | if (input != NA && instructions[input].death == id) { |
3014 | avail.push_back(reg[input]); |
3015 | } |
3016 | }; |
3017 | |
3018 | // Take care to not recycle the same register twice. |
3019 | if (true ) { maybe_recycle_register(inst.x); } |
3020 | if (inst.y != inst.x ) { maybe_recycle_register(inst.y); } |
3021 | if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); } |
3022 | |
3023 | // Instructions that die at themselves (stores) don't need a register. |
3024 | if (inst.death != id) { |
3025 | // Allocate a register if we have to, preferring to reuse anything available. |
3026 | if (avail.empty()) { |
3027 | reg[id] = fImpl->regs++; |
3028 | } else { |
3029 | reg[id] = avail.back(); |
3030 | avail.pop_back(); |
3031 | } |
3032 | } |
3033 | }; |
3034 | |
3035 | // Assign a register to each hoisted instruction, then each non-hoisted loop instruction. |
3036 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3037 | if ( instructions[id].can_hoist) { assign_register(id); } |
3038 | } |
3039 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3040 | if (!instructions[id].can_hoist) { assign_register(id); } |
3041 | } |
3042 | |
// Translate OptimizedInstructions to InterpreterInstructions by mapping values to
3044 | // registers. This will be two passes, first hoisted instructions, then inside the loop. |
3045 | |
3046 | // The loop begins at the fImpl->loop'th Instruction. |
3047 | fImpl->loop = 0; |
3048 | fImpl->instructions.reserve(instructions.size()); |
3049 | |
// Add a dummy mapping for the N/A sentinel Val to an arbitrary register
3051 | // so lookups don't have to know which arguments are used by which Ops. |
3052 | auto lookup_register = [&](Val id) { |
3053 | return id == NA ? (Reg)0 |
3054 | : reg[id]; |
3055 | }; |
3056 | |
3057 | auto push_instruction = [&](Val id, const OptimizedInstruction& inst) { |
3058 | InterpreterInstruction pinst{ |
3059 | inst.op, |
3060 | lookup_register(id), |
3061 | lookup_register(inst.x), |
3062 | {lookup_register(inst.y)}, |
3063 | {lookup_register(inst.z)}, |
3064 | }; |
3065 | if (inst.y == NA) { pinst.immy = inst.immy; } |
3066 | if (inst.z == NA) { pinst.immz = inst.immz; } |
3067 | fImpl->instructions.push_back(pinst); |
3068 | }; |
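// For example (a sketch): Op::shl_i32 keeps its shift count in immy with
// y == NA, so its InterpreterInstruction carries the dummy Reg 0 in the
// y slot and the shift count in immy.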
3069 | |
3070 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3071 | const OptimizedInstruction& inst = instructions[id]; |
3072 | if (inst.can_hoist) { |
3073 | push_instruction(id, inst); |
3074 | fImpl->loop++; |
3075 | } |
3076 | } |
3077 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3078 | const OptimizedInstruction& inst = instructions[id]; |
3079 | if (!inst.can_hoist) { |
3080 | push_instruction(id, inst); |
3081 | } |
3082 | } |
3083 | } |
3084 | |
3085 | #if defined(SKVM_JIT) |
3086 | |
3087 | bool Program::jit(const std::vector<OptimizedInstruction>& instructions, |
3088 | int* stack_hint, |
3089 | uint32_t* registers_used, |
3090 | Assembler* a) const { |
3091 | using A = Assembler; |
3092 | |
3093 | SkTHashMap<int, A::Label> constants; // Constants (mostly splats) share the same pool. |
3094 | A::Label iota; // Varies per lane, for Op::index. |
3095 | A::Label load64_index; // Used to load low or high half of 64-bit lanes. |
3096 | |
3097 | // The `regs` array tracks everything we know about each register's state: |
3098 | // - NA: empty |
3099 | // - RES: reserved by ABI |
3100 | // - TMP: holding a temporary |
3101 | // - id: holding Val id |
3102 | constexpr Val RES = NA-1, |
3103 | TMP = RES-1; |
3104 | |
3105 | // Map val -> stack slot. |
3106 | std::vector<int> stack_slot(instructions.size(), NA); |
3107 | int next_stack_slot = 0; |
3108 | |
3109 | const int nstack_slots = *stack_hint >= 0 ? *stack_hint |
3110 | : stack_slot.size(); |
3111 | |
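// Each backend below defines the same toolkit: K (lanes per vector loop
// iteration), a vector register type Reg, the GP register N holding the
// remaining loop count, the incoming pointer registers arg[], enter()/exit()
// for prologue and epilogue, and load_from_memory()/store_to_stack() for
// reloading and spilling values (splat constants reload from the constant
// pool instead). Roughly, the JIT'd function is called as
// void(int n, void* arg0, void* arg1, ...).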
3112 | #if defined(__x86_64__) || defined(_M_X64) |
3113 | if (!SkCpu::Supports(SkCpu::HSW)) { |
3114 | return false; |
3115 | } |
3116 | const int K = 8; |
3117 | using Reg = A::Ymm; |
3118 | #if defined(_M_X64) // Important to check this first; clang-cl defines both. |
3119 | const A::GP64 N = A::rcx, |
3120 | GP0 = A::rax, |
3121 | GP1 = A::r11, |
3122 | arg[] = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi }; |
3123 | |
// xmm6-15 are callee-saved.
3125 | std::array<Val,16> regs = { |
3126 | NA, NA, NA, NA, NA, NA,RES,RES, |
3127 | RES,RES,RES,RES, RES,RES,RES,RES, |
3128 | }; |
3129 | const uint32_t incoming_registers_used = *registers_used; |
3130 | |
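// A sketch of the incoming Win64 frame enter() reads from (matching the
// loads below): n arrives in rcx and the first three pointers in rdx/r8/r9,
// so remaining pointers are stack-passed after the return address and the
// caller-provided 32-byte shadow area:
//   [rsp+ 0] return address
//   [rsp+ 8]..[rsp+32] shadow area (borrowed below to save rdi/rsi)
//   [rsp+40] arg[3]   [rsp+48] arg[4]   [rsp+56] arg[5]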
3131 | auto enter = [&]{ |
3132 | // rcx,rdx,r8,r9 are all already holding their correct values. |
3133 | // Load caller-saved r10 from rsp+40 if there's a fourth arg. |
3134 | if (fImpl->strides.size() >= 4) { |
3135 | a->mov(A::r10, A::Mem{A::rsp, 40}); |
3136 | } |
3137 | // Load callee-saved rdi from rsp+48 if there's a fifth arg, |
3138 | // first saving it to ABI reserved shadow area rsp+8. |
3139 | if (fImpl->strides.size() >= 5) { |
3140 | a->mov(A::Mem{A::rsp, 8}, A::rdi); |
3141 | a->mov(A::rdi, A::Mem{A::rsp, 48}); |
3142 | } |
3143 | // Load callee-saved rsi from rsp+56 if there's a sixth arg, |
3144 | // first saving it to ABI reserved shadow area rsp+16. |
3145 | if (fImpl->strides.size() >= 6) { |
3146 | a->mov(A::Mem{A::rsp, 16}, A::rsi); |
3147 | a->mov(A::rsi, A::Mem{A::rsp, 56}); |
3148 | } |
3149 | |
3150 | // Allocate stack for our values and callee-saved xmm6-15. |
3151 | int stack_needed = nstack_slots*K*4; |
3152 | for (int r = 6; r < 16; r++) { |
3153 | if (incoming_registers_used & (1<<r)) { |
3154 | stack_needed += 16; |
3155 | } |
3156 | } |
3157 | if (stack_needed) { a->sub(A::rsp, stack_needed); } |
3158 | |
3159 | int next_saved_xmm = nstack_slots*K*4; |
3160 | for (int r = 6; r < 16; r++) { |
3161 | if (incoming_registers_used & (1<<r)) { |
3162 | a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r); |
3163 | next_saved_xmm += 16; |
3164 | regs[r] = NA; |
3165 | } |
3166 | } |
3167 | }; |
3168 | auto exit = [&]{ |
3169 | // The second pass of jit() shouldn't use any register it didn't in the first pass. |
3170 | SkASSERT((*registers_used & incoming_registers_used) == *registers_used); |
3171 | |
3172 | // Restore callee-saved xmm6-15 and the stack pointer. |
3173 | int stack_used = nstack_slots*K*4; |
3174 | for (int r = 6; r < 16; r++) { |
3175 | if (incoming_registers_used & (1<<r)) { |
3176 | a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used}); |
3177 | stack_used += 16; |
3178 | } |
3179 | } |
3180 | if (stack_used) { a->add(A::rsp, stack_used); } |
3181 | |
3182 | // Restore callee-saved rdi/rsi if we used them. |
3183 | if (fImpl->strides.size() >= 5) { |
3184 | a->mov(A::rdi, A::Mem{A::rsp, 8}); |
3185 | } |
3186 | if (fImpl->strides.size() >= 6) { |
3187 | a->mov(A::rsi, A::Mem{A::rsp, 16}); |
3188 | } |
3189 | |
3190 | a->vzeroupper(); |
3191 | a->ret(); |
3192 | }; |
3193 | #elif defined(__x86_64__) |
3194 | const A::GP64 N = A::rdi, |
3195 | GP0 = A::rax, |
3196 | GP1 = A::r11, |
3197 | arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 }; |
3198 | |
3199 | // All 16 ymm registers are available to use. |
3200 | std::array<Val,16> regs = { |
3201 | NA,NA,NA,NA, NA,NA,NA,NA, |
3202 | NA,NA,NA,NA, NA,NA,NA,NA, |
3203 | }; |
3204 | |
3205 | auto enter = [&]{ |
3206 | // Load caller-saved r10 from rsp+8 if there's a sixth arg. |
3207 | if (fImpl->strides.size() >= 6) { |
3208 | a->mov(A::r10, A::Mem{A::rsp, 8}); |
3209 | } |
3210 | if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); } |
3211 | }; |
3212 | auto exit = [&]{ |
3213 | if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); } |
3214 | a->vzeroupper(); |
3215 | a->ret(); |
3216 | }; |
3217 | #endif |
3218 | |
3219 | auto load_from_memory = [&](Reg r, Val v) { |
3220 | if (instructions[v].op == Op::splat) { |
3221 | if (instructions[v].immy == 0) { |
3222 | a->vpxor(r,r,r); |
3223 | } else { |
3224 | a->vmovups(r, constants.find(instructions[v].immy)); |
3225 | } |
3226 | } else { |
3227 | SkASSERT(stack_slot[v] != NA); |
3228 | a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4}); |
3229 | } |
3230 | }; |
3231 | auto store_to_stack = [&](Reg r, Val v) { |
3232 | SkASSERT(next_stack_slot < nstack_slots); |
3233 | stack_slot[v] = next_stack_slot++; |
3234 | a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r); |
3235 | }; |
3236 | #elif defined(__aarch64__) |
3237 | const int K = 4; |
3238 | using Reg = A::V; |
3239 | const A::X N = A::x0, |
3240 | GP0 = A::x8, |
3241 | arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 }; |
3242 | |
3243 | // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit. |
3244 | std::array<Val,32> regs = { |
3245 | NA, NA, NA, NA, NA, NA, NA, NA, |
3246 | RES,RES,RES,RES, RES,RES,RES,RES, |
3247 | NA, NA, NA, NA, NA, NA, NA, NA, |
3248 | NA, NA, NA, NA, NA, NA, NA, NA, |
3249 | }; |
3250 | |
3251 | auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } }; |
3252 | auto exit = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); } |
3253 | a->ret(A::x30); }; |
3254 | |
3255 | auto load_from_memory = [&](Reg r, Val v) { |
3256 | if (instructions[v].op == Op::splat) { |
3257 | if (instructions[v].immy == 0) { |
3258 | a->eor16b(r,r,r); |
3259 | } else { |
3260 | a->ldrq(r, constants.find(instructions[v].immy)); |
3261 | } |
3262 | } else { |
3263 | SkASSERT(stack_slot[v] != NA); |
3264 | a->ldrq(r, A::sp, stack_slot[v]); |
3265 | } |
3266 | }; |
3267 | auto store_to_stack = [&](Reg r, Val v) { |
3268 | SkASSERT(next_stack_slot < nstack_slots); |
3269 | stack_slot[v] = next_stack_slot++; |
3270 | a->strq(r, A::sp, stack_slot[v]); |
3271 | }; |
3272 | #endif |
3273 | |
3274 | *registers_used = 0; // We'll update this as we go. |
3275 | |
3276 | if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) { |
3277 | return false; |
3278 | } |
3279 | |
3280 | auto emit = [&](Val id, bool scalar) { |
3281 | const OptimizedInstruction& inst = instructions[id]; |
3282 | const Op op = inst.op; |
3283 | const Val x = inst.x, |
3284 | y = inst.y, |
3285 | z = inst.z; |
3286 | const int immy = inst.immy, |
3287 | immz = inst.immz; |
3288 | |
3289 | // alloc_tmp() returns a temporary register, freed manually with free_tmp(). |
3290 | auto alloc_tmp = [&]() -> Reg { |
3291 | // Find an available register, or spill an occupied one if nothing's available. |
3292 | auto avail = std::find_if(regs.begin(), regs.end(), [](Val v) { return v == NA; }); |
3293 | if (avail == regs.end()) { |
3294 | auto score_spills = [&](Val v) -> int { |
3295 | // We cannot spill REServed registers, |
3296 | // nor any registers we need for this instruction. |
3297 | if (v == RES || |
3298 | v == TMP || v == id || v == x || v == y || v == z) { |
3299 | return 0x7fff'ffff; |
3300 | } |
// At this point spilling is arbitrary, so we're in the realm of heuristics.
// Here, spill the oldest value: Val ids grow in program order, so returning
// v itself scores the oldest value lowest. This is nice because,
// A) it's very predictable, even in assembly, and
// B) it's as cheap as you can get.
3305 | return v; |
3306 | }; |
3307 | avail = std::min_element(regs.begin(), regs.end(), [&](Val a, Val b) { |
3308 | return score_spills(a) < score_spills(b); |
3309 | }); |
3310 | } |
3311 | SkASSERT(avail != regs.end()); |
3312 | |
3313 | Reg r = (Reg)std::distance(regs.begin(), avail); |
3314 | Val& v = regs[r]; |
3315 | *registers_used |= (1<<r); |
3316 | |
3317 | SkASSERT(v == NA || v >= 0); |
3318 | if (v >= 0) { |
3319 | if (stack_slot[v] == NA && instructions[v].op != Op::splat) { |
3320 | store_to_stack(r, v); |
3321 | } |
3322 | v = NA; |
3323 | } |
3324 | SkASSERT(v == NA); |
3325 | |
3326 | v = TMP; |
3327 | return r; |
3328 | }; |
3329 | |
3330 | #if defined(__x86_64__) || defined(_M_X64) // Nothing special... just unused on ARM. |
3331 | auto free_tmp = [&](Reg r) { |
3332 | SkASSERT(regs[r] == TMP); |
3333 | regs[r] = NA; |
3334 | }; |
3335 | #endif |
3336 | |
3337 | // Which register holds dst,x,y,z for this instruction? NA if none does yet. |
3338 | int rd = NA, |
3339 | rx = NA, |
3340 | ry = NA, |
3341 | rz = NA; |
3342 | |
3343 | auto update_regs = [&](Reg r, Val v) { |
3344 | if (v == id) { rd = r; } |
3345 | if (v == x) { rx = r; } |
3346 | if (v == y) { ry = r; } |
3347 | if (v == z) { rz = r; } |
3348 | return r; |
3349 | }; |
3350 | |
3351 | auto find_existing_reg = [&](Val v) -> int { |
3352 | // Quick-check our working registers. |
3353 | if (v == id && rd != NA) { return rd; } |
3354 | if (v == x && rx != NA) { return rx; } |
3355 | if (v == y && ry != NA) { return ry; } |
3356 | if (v == z && rz != NA) { return rz; } |
3357 | |
3358 | // Search inter-instruction register map. |
3359 | for (auto [r,val] : SkMakeEnumerate(regs)) { |
3360 | if (val == v) { |
3361 | return update_regs((Reg)r, v); |
3362 | } |
3363 | } |
3364 | return NA; |
3365 | }; |
3366 | |
3367 | // Return a register for Val, holding that value if it already exists. |
3368 | // During this instruction all calls to r(v) will return the same register. |
3369 | auto r = [&](Val v) -> Reg { |
3370 | SkASSERT(v >= 0); |
3371 | |
3372 | if (int found = find_existing_reg(v); found != NA) { |
3373 | return (Reg)found; |
3374 | } |
3375 | |
3376 | Reg r = alloc_tmp(); |
3377 | SkASSERT(regs[r] == TMP); |
3378 | |
3379 | SkASSERT(v <= id); |
3380 | if (v < id) { |
3381 | // If v < id, we're loading one of this instruction's inputs. |
3382 | // If v == id we're just allocating its destination register. |
3383 | load_from_memory(r, v); |
3384 | } |
3385 | regs[r] = v; |
3386 | return update_regs(r, v); |
3387 | }; |
3388 | |
3389 | auto dies_here = [&](Val v) -> bool { |
3390 | SkASSERT(v >= 0); |
3391 | return instructions[v].death == id; |
3392 | }; |
3393 | |
3394 | // Alias dst() to r(v) if dies_here(v). |
3395 | auto try_alias = [&](Val v) -> bool { |
3396 | SkASSERT(v == x || v == y || v == z); |
3397 | if (dies_here(v)) { |
3398 | rd = r(v); // Vals v and id share a register for this instruction. |
3399 | regs[rd] = id; // Next instruction, Val id will be in the register, not Val v. |
3400 | return true; |
3401 | } |
3402 | return false; |
3403 | }; |
3404 | |
3405 | // Generally r(id), |
3406 | // but with a hint, try to alias dst() to r(v) if dies_here(v). |
3407 | auto dst = [&](Val hint = NA) -> Reg { |
3408 | if (hint != NA) { |
3409 | (void)try_alias(hint); |
3410 | } |
3411 | return r(id); |
3412 | }; |
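// For example, on x86 a->vaddps(dst(x), r(x), any(y)) writes its result
// over x's register whenever x dies at this instruction, saving a register
// and a move; when x lives on, dst(x) simply falls back to r(id).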
3413 | |
3414 | #if defined(__x86_64__) || defined(_M_X64) |
3415 | // On x86 we can work with many values directly from the stack or program constant pool. |
3416 | auto any = [&](Val v) -> A::Operand { |
3417 | SkASSERT(v >= 0); |
3418 | SkASSERT(v < id); |
3419 | |
3420 | if (int found = find_existing_reg(v); found != NA) { |
3421 | return (Reg)found; |
3422 | } |
3423 | if (instructions[v].op == Op::splat) { |
3424 | return constants.find(instructions[v].immy); |
3425 | } |
3426 | return A::Mem{A::rsp, stack_slot[v]*K*4}; |
3427 | }; |
3428 | |
// This is never really worth asking except when any() might be used;
// if we need this value on ARM, we might as well just call r(v) to get it into a register.
3431 | auto in_reg = [&](Val v) -> bool { |
3432 | return find_existing_reg(v) != NA; |
3433 | }; |
3434 | #endif |
3435 | |
3436 | switch (op) { |
3437 | case Op::splat: |
3438 | // Make sure splat constants can be found by load_from_memory() or any(). |
3439 | (void)constants[immy]; |
3440 | break; |
3441 | |
3442 | #if defined(__x86_64__) || defined(_M_X64) |
3443 | case Op::assert_true: { |
3444 | a->vptest (r(x), &constants[0xffffffff]); |
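// vptest sets CF iff (~x & mask) == 0, i.e. iff every bit of x is set,
// so the jc below skips the int3 trap exactly when all lanes are true.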
3445 | A::Label all_true; |
3446 | a->jc(&all_true); |
3447 | a->int3(); |
3448 | a->label(&all_true); |
3449 | } break; |
3450 | |
3451 | case Op::store8: |
3452 | if (scalar) { |
3453 | a->vpextrb(A::Mem{arg[immy]}, (A::Xmm)r(x), 0); |
3454 | } else { |
3455 | a->vpackusdw(dst(x), r(x), r(x)); |
3456 | a->vpermq (dst(), dst(), 0xd8); |
3457 | a->vpackuswb(dst(), dst(), dst()); |
3458 | a->vmovq (A::Mem{arg[immy]}, (A::Xmm)dst()); |
3459 | } break; |
3460 | |
3461 | case Op::store16: |
3462 | if (scalar) { |
3463 | a->vpextrw(A::Mem{arg[immy]}, (A::Xmm)r(x), 0); |
3464 | } else { |
3465 | a->vpackusdw(dst(x), r(x), r(x)); |
3466 | a->vpermq (dst(), dst(), 0xd8); |
3467 | a->vmovups (A::Mem{arg[immy]}, (A::Xmm)dst()); |
3468 | } break; |
3469 | |
3470 | case Op::store32: if (scalar) { a->vmovd (A::Mem{arg[immy]}, (A::Xmm)r(x)); } |
3471 | else { a->vmovups(A::Mem{arg[immy]}, r(x)); } |
3472 | break; |
3473 | |
3474 | case Op::store64: if (scalar) { |
3475 | a->vmovd(A::Mem{arg[immz],0}, (A::Xmm)r(x)); |
3476 | a->vmovd(A::Mem{arg[immz],4}, (A::Xmm)r(y)); |
3477 | } else { |
3478 | // r(x) = {a,b,c,d|e,f,g,h} |
3479 | // r(y) = {i,j,k,l|m,n,o,p} |
3480 | // We want to write a,i,b,j,c,k,d,l,e,m... |
3481 | A::Ymm L = alloc_tmp(), |
3482 | H = alloc_tmp(); |
3483 | a->vpunpckldq(L, r(x), any(y)); // L = {a,i,b,j|e,m,f,n} |
3484 | a->vpunpckhdq(H, r(x), any(y)); // H = {c,k,d,l|g,o,h,p} |
3485 | a->vperm2f128(dst(), L,H, 0x20); // = {a,i,b,j|c,k,d,l} |
3486 | a->vmovups(A::Mem{arg[immz], 0}, dst()); |
3487 | a->vperm2f128(dst(), L,H, 0x31); // = {e,m,f,n|g,o,h,p} |
3488 | a->vmovups(A::Mem{arg[immz],32}, dst()); |
3489 | free_tmp(L); |
3490 | free_tmp(H); |
3491 | } break; |
3492 | |
3493 | case Op::store128: { |
3494 | // TODO: 8 64-bit stores instead of 16 32-bit stores? |
3495 | int ptr = immz>>1, |
3496 | lane = immz&1; |
3497 | a->vmovd (A::Mem{arg[ptr], 0*16 + 8*lane + 0}, (A::Xmm)r(x) ); |
3498 | a->vmovd (A::Mem{arg[ptr], 0*16 + 8*lane + 4}, (A::Xmm)r(y) ); |
3499 | if (scalar) { break; } |
3500 | a->vpextrd(A::Mem{arg[ptr], 1*16 + 8*lane + 0}, (A::Xmm)r(x), 1); |
3501 | a->vpextrd(A::Mem{arg[ptr], 1*16 + 8*lane + 4}, (A::Xmm)r(y), 1); |
3502 | a->vpextrd(A::Mem{arg[ptr], 2*16 + 8*lane + 0}, (A::Xmm)r(x), 2); |
3503 | a->vpextrd(A::Mem{arg[ptr], 2*16 + 8*lane + 4}, (A::Xmm)r(y), 2); |
3504 | a->vpextrd(A::Mem{arg[ptr], 3*16 + 8*lane + 0}, (A::Xmm)r(x), 3); |
3505 | a->vpextrd(A::Mem{arg[ptr], 3*16 + 8*lane + 4}, (A::Xmm)r(y), 3); |
3506 | // Now we need to store the upper 128 bits of x and y. |
3507 | // Storing x then y rather than interlacing minimizes temporaries. |
3508 | a->vextracti128(dst(), r(x), 1); |
3509 | a->vmovd (A::Mem{arg[ptr], 4*16 + 8*lane + 0}, (A::Xmm)dst() ); |
3510 | a->vpextrd(A::Mem{arg[ptr], 5*16 + 8*lane + 0}, (A::Xmm)dst(), 1); |
3511 | a->vpextrd(A::Mem{arg[ptr], 6*16 + 8*lane + 0}, (A::Xmm)dst(), 2); |
3512 | a->vpextrd(A::Mem{arg[ptr], 7*16 + 8*lane + 0}, (A::Xmm)dst(), 3); |
3513 | a->vextracti128(dst(), r(y), 1); |
3514 | a->vmovd (A::Mem{arg[ptr], 4*16 + 8*lane + 4}, (A::Xmm)dst() ); |
3515 | a->vpextrd(A::Mem{arg[ptr], 5*16 + 8*lane + 4}, (A::Xmm)dst(), 1); |
3516 | a->vpextrd(A::Mem{arg[ptr], 6*16 + 8*lane + 4}, (A::Xmm)dst(), 2); |
3517 | a->vpextrd(A::Mem{arg[ptr], 7*16 + 8*lane + 4}, (A::Xmm)dst(), 3); |
3518 | } break; |
3519 | |
3520 | case Op::load8: if (scalar) { |
3521 | a->vpxor (dst(), dst(), dst()); |
3522 | a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immy]}, 0); |
3523 | } else { |
3524 | a->vpmovzxbd(dst(), A::Mem{arg[immy]}); |
3525 | } break; |
3526 | |
3527 | case Op::load16: if (scalar) { |
3528 | a->vpxor (dst(), dst(), dst()); |
3529 | a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immy]}, 0); |
3530 | } else { |
3531 | a->vpmovzxwd(dst(), A::Mem{arg[immy]}); |
3532 | } break; |
3533 | |
3534 | case Op::load32: if (scalar) { a->vmovd ((A::Xmm)dst(), A::Mem{arg[immy]}); } |
3535 | else { a->vmovups( dst(), A::Mem{arg[immy]}); } |
3536 | break; |
3537 | |
3538 | case Op::load64: if (scalar) { |
3539 | a->vmovd((A::Xmm)dst(), A::Mem{arg[immy], 4*immz}); |
3540 | } else { |
3541 | A::Ymm tmp = alloc_tmp(); |
3542 | a->vmovups(tmp, &load64_index); |
3543 | a->vpermps(dst(), tmp, A::Mem{arg[immy], 0}); |
3544 | a->vpermps( tmp, tmp, A::Mem{arg[immy], 32}); |
// Low 128 bits hold the immz=0 lanes, high 128 bits the immz=1 lanes.
3546 | a->vperm2f128(dst(), dst(),tmp, immz ? 0x31 : 0x20); |
3547 | free_tmp(tmp); |
3548 | } break; |
3549 | |
3550 | case Op::load128: if (scalar) { |
3551 | a->vmovd((A::Xmm)dst(), A::Mem{arg[immy], 4*immz}); |
3552 | } else { |
3553 | // Load 4 low values into xmm tmp, |
3554 | A::Ymm tmp = alloc_tmp(); |
3555 | A::Xmm t = (A::Xmm)tmp; |
3556 | a->vmovd (t, A::Mem{arg[immy], 0*16 + 4*immz} ); |
3557 | a->vpinsrd(t,t, A::Mem{arg[immy], 1*16 + 4*immz}, 1); |
3558 | a->vpinsrd(t,t, A::Mem{arg[immy], 2*16 + 4*immz}, 2); |
3559 | a->vpinsrd(t,t, A::Mem{arg[immy], 3*16 + 4*immz}, 3); |
3560 | |
3561 | // Load 4 high values into xmm dst(), |
3562 | A::Xmm d = (A::Xmm)dst(); |
3563 | a->vmovd (d, A::Mem{arg[immy], 4*16 + 4*immz} ); |
3564 | a->vpinsrd(d,d, A::Mem{arg[immy], 5*16 + 4*immz}, 1); |
3565 | a->vpinsrd(d,d, A::Mem{arg[immy], 6*16 + 4*immz}, 2); |
3566 | a->vpinsrd(d,d, A::Mem{arg[immy], 7*16 + 4*immz}, 3); |
3567 | |
3568 | // Merge the two, ymm dst() = {xmm tmp|xmm dst()} |
3569 | a->vperm2f128(dst(), tmp,dst(), 0x20); |
3570 | free_tmp(tmp); |
3571 | } break; |
3572 | |
3573 | case Op::gather8: { |
3574 | // As usual, the gather base pointer is immz bytes off of uniform immy. |
3575 | a->mov(GP0, A::Mem{arg[immy], immz}); |
3576 | |
3577 | A::Ymm tmp = alloc_tmp(); |
3578 | a->vmovups(tmp, any(x)); |
3579 | |
3580 | for (int i = 0; i < (scalar ? 1 : 8); i++) { |
3581 | if (i == 4) { |
3582 | // vpextrd can only pluck indices out from an Xmm register, |
3583 | // so we manually swap over to the top when we're halfway through. |
3584 | a->vextracti128((A::Xmm)tmp, tmp, 1); |
3585 | } |
3586 | a->vpextrd(GP1, (A::Xmm)tmp, i%4); |
3587 | a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i); |
3588 | } |
3589 | a->vpmovzxbd(dst(), dst()); |
3590 | free_tmp(tmp); |
3591 | } break; |
3592 | |
3593 | case Op::gather16: { |
3594 | // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd. |
3595 | a->mov(GP0, A::Mem{arg[immy], immz}); |
3596 | |
3597 | A::Ymm tmp = alloc_tmp(); |
3598 | a->vmovups(tmp, any(x)); |
3599 | |
3600 | for (int i = 0; i < (scalar ? 1 : 8); i++) { |
3601 | if (i == 4) { |
3602 | a->vextracti128((A::Xmm)tmp, tmp, 1); |
3603 | } |
3604 | a->vpextrd(GP1, (A::Xmm)tmp, i%4); |
3605 | a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i); |
3606 | } |
3607 | a->vpmovzxwd(dst(), dst()); |
3608 | free_tmp(tmp); |
3609 | } break; |
3610 | |
3611 | case Op::gather32: |
3612 | if (scalar) { |
3613 | // Our gather base pointer is immz bytes off of uniform immy. |
3614 | a->mov(GP0, A::Mem{arg[immy], immz}); |
3615 | |
3616 | // Grab our index from lane 0 of the index argument. |
3617 | a->vmovd(GP1, (A::Xmm)r(x)); |
3618 | |
3619 | // dst = *(base + 4*index) |
3620 | a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR}); |
3621 | } else { |
3622 | a->mov(GP0, A::Mem{arg[immy], immz}); |
3623 | |
3624 | A::Ymm mask = alloc_tmp(); |
3625 | a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.) |
3626 | |
3627 | a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask); |
3628 | free_tmp(mask); |
3629 | } |
3630 | break; |
3631 | |
3632 | case Op::uniform8: a->movzbq(GP0, A::Mem{arg[immy], immz}); |
3633 | a->vmovd((A::Xmm)dst(), GP0); |
3634 | a->vbroadcastss(dst(), dst()); |
3635 | break; |
3636 | |
3637 | case Op::uniform16: a->movzwq(GP0, A::Mem{arg[immy], immz}); |
3638 | a->vmovd((A::Xmm)dst(), GP0); |
3639 | a->vbroadcastss(dst(), dst()); |
3640 | break; |
3641 | |
3642 | case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immy], immz}); |
3643 | break; |
3644 | |
3645 | case Op::index: a->vmovd((A::Xmm)dst(), N); |
3646 | a->vbroadcastss(dst(), dst()); |
3647 | a->vpsubd(dst(), dst(), &iota); |
3648 | break; |
3649 | |
3650 | // We can swap the arguments of symmetric instructions to make better use of any(). |
3651 | case Op::add_f32: |
3652 | if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); } |
3653 | else { a->vaddps(dst(y), r(y), any(x)); } |
3654 | break; |
3655 | |
3656 | case Op::mul_f32: |
3657 | if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); } |
3658 | else { a->vmulps(dst(y), r(y), any(x)); } |
3659 | break; |
3660 | |
3661 | case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break; |
3662 | case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break; |
3663 | case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break; // Order matters, |
3664 | case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break; // see test SkVM_min_max. |
3665 | |
3666 | case Op::fma_f32: |
3667 | if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else |
3668 | if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else |
3669 | if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else |
3670 | { a->vmovups (dst(), any(x)); |
3671 | a->vfmadd132ps(dst(), r(z), any(y)); } |
3672 | break; |
3673 | |
3674 | case Op::fms_f32: |
3675 | if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else |
3676 | if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else |
3677 | if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else |
3678 | { a->vmovups (dst(), any(x)); |
3679 | a->vfmsub132ps(dst(), r(z), any(y)); } |
3680 | break; |
3681 | |
3682 | case Op::fnma_f32: |
3683 | if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else |
3684 | if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else |
3685 | if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else |
3686 | { a->vmovups (dst(), any(x)); |
3687 | a->vfnmadd132ps(dst(), r(z), any(y)); } |
3688 | break; |
3689 | |
// In situations like this we want to try aliasing dst(x) when x is
// already in a register, but not if we'd have to load it from the stack
// just to alias it; in that case it's better to load directly into the new register.
3693 | case Op::sqrt_f32: |
3694 | if (in_reg(x)) { a->vsqrtps(dst(x), r(x)); } |
3695 | else { a->vsqrtps(dst(), any(x)); } |
3696 | break; |
3697 | |
3698 | case Op::add_i32: |
3699 | if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); } |
3700 | else { a->vpaddd(dst(y), r(y), any(x)); } |
3701 | break; |
3702 | case Op::mul_i32: |
3703 | if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); } |
3704 | else { a->vpmulld(dst(y), r(y), any(x)); } |
3705 | break; |
3706 | |
3707 | case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break; |
3708 | |
3709 | case Op::bit_and: |
3710 | if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); } |
3711 | else { a->vpand(dst(y), r(y), any(x)); } |
3712 | break; |
3713 | case Op::bit_or: |
3714 | if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); } |
3715 | else { a->vpor(dst(y), r(y), any(x)); } |
3716 | break; |
3717 | case Op::bit_xor: |
3718 | if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); } |
3719 | else { a->vpxor(dst(y), r(y), any(x)); } |
3720 | break; |
3721 | |
3722 | case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x. |
3723 | |
3724 | case Op::select: |
3725 | if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); } |
3726 | else { a->vpblendvb(dst(x), r(z), any(y), r(x)); } |
3727 | break; |
3728 | |
3729 | case Op::shl_i32: a->vpslld(dst(x), r(x), immy); break; |
3730 | case Op::shr_i32: a->vpsrld(dst(x), r(x), immy); break; |
3731 | case Op::sra_i32: a->vpsrad(dst(x), r(x), immy); break; |
3732 | |
3733 | case Op::eq_i32: |
3734 | if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); } |
3735 | else { a->vpcmpeqd(dst(y), r(y), any(x)); } |
3736 | break; |
3737 | |
3738 | case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break; |
3739 | |
3740 | case Op::eq_f32: |
3741 | if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); } |
3742 | else { a->vcmpeqps(dst(y), r(y), any(x)); } |
3743 | break; |
3744 | case Op::neq_f32: |
3745 | if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); } |
3746 | else { a->vcmpneqps(dst(y), r(y), any(x)); } |
3747 | break; |
3748 | |
3749 | case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break; |
3750 | case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break; |
3751 | |
3752 | // It's safe to alias dst(y) only when y != x. Otherwise we'd overwrite x! |
3753 | case Op::pack: a->vpslld(dst(y != x ? y : NA), r(y), immz); |
3754 | a->vpor (dst(), dst(), any(x)); |
3755 | break; |
3756 | |
3757 | case Op::ceil: |
3758 | if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::CEIL); } |
3759 | else { a->vroundps(dst(), any(x), Assembler::CEIL); } |
3760 | break; |
3761 | |
3762 | case Op::floor: |
3763 | if (in_reg(x)) { a->vroundps(dst(x), r(x), Assembler::FLOOR); } |
3764 | else { a->vroundps(dst(), any(x), Assembler::FLOOR); } |
3765 | break; |
3766 | |
3767 | case Op::to_f32: |
3768 | if (in_reg(x)) { a->vcvtdq2ps(dst(x), r(x)); } |
3769 | else { a->vcvtdq2ps(dst(), any(x)); } |
3770 | break; |
3771 | |
3772 | case Op::trunc: |
3773 | if (in_reg(x)) { a->vcvttps2dq(dst(x), r(x)); } |
3774 | else { a->vcvttps2dq(dst(), any(x)); } |
3775 | break; |
3776 | |
3777 | case Op::round: |
3778 | if (in_reg(x)) { a->vcvtps2dq(dst(x), r(x)); } |
3779 | else { a->vcvtps2dq(dst(), any(x)); } |
3780 | break; |
3781 | |
3782 | case Op::to_half: |
3783 | a->vcvtps2ph(dst(x), r(x), A::CURRENT); // f32 ymm -> f16 xmm |
3784 | a->vpmovzxwd(dst(), dst()); // f16 xmm -> f16 ymm |
3785 | break; |
3786 | |
3787 | case Op::from_half: |
3788 | a->vpackusdw(dst(x), r(x), r(x)); // f16 ymm -> f16 xmm |
3789 | a->vpermq (dst(), dst(), 0xd8); // swap middle two 64-bit lanes |
3790 | a->vcvtph2ps(dst(), dst()); // f16 xmm -> f32 ymm |
3791 | break; |
3792 | |
3793 | #elif defined(__aarch64__) |
3794 | default: // TODO |
3795 | if (false) { |
3796 | SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n" , name(op), op); |
3797 | } |
3798 | return false; |
3799 | |
3800 | case Op::assert_true: { |
3801 | a->uminv4s(dst(), r(x)); // uminv acts like an all() across the vector. |
3802 | a->fmovs(GP0, dst()); |
3803 | A::Label all_true; |
3804 | a->cbnz(GP0, &all_true); |
3805 | a->brk(0); |
3806 | a->label(&all_true); |
3807 | } break; |
3808 | |
3809 | case Op::store8: a->xtns2h(dst(), r(x)); |
3810 | a->xtnh2b(dst(), dst()); |
3811 | if (scalar) { a->strb (dst(), arg[immy]); } |
3812 | else { a->strs (dst(), arg[immy]); } |
3813 | break; |
3814 | |
3815 | case Op::store32: if (scalar) { a->strs(r(x), arg[immy]); } |
3816 | else { a->strq(r(x), arg[immy]); } |
3817 | break; |
3818 | |
3819 | case Op::load8: if (scalar) { a->ldrb(dst(), arg[immy]); } |
3820 | else { a->ldrs(dst(), arg[immy]); } |
3821 | a->uxtlb2h(dst(), dst()); |
3822 | a->uxtlh2s(dst(), dst()); |
3823 | break; |
3824 | |
3825 | case Op::load32: if (scalar) { a->ldrs(dst(), arg[immy]); } |
3826 | else { a->ldrq(dst(), arg[immy]); } |
3827 | break; |
3828 | |
3829 | case Op::add_f32: a->fadd4s(dst(), r(x), r(y)); break; |
3830 | case Op::sub_f32: a->fsub4s(dst(), r(x), r(y)); break; |
3831 | case Op::mul_f32: a->fmul4s(dst(), r(x), r(y)); break; |
3832 | case Op::div_f32: a->fdiv4s(dst(), r(x), r(y)); break; |
3833 | |
3834 | case Op::fma_f32: // fmla.4s is z += x*y |
3835 | if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); } |
3836 | else { a->orr16b(dst(), r(z), r(z)); |
3837 | a->fmla4s(dst(), r(x), r(y)); } |
3838 | break; |
3839 | |
3840 | case Op::fnma_f32: // fmls.4s is z -= x*y |
3841 | if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); } |
3842 | else { a->orr16b(dst(), r(z), r(z)); |
3843 | a->fmls4s(dst(), r(x), r(y)); } |
3844 | break; |
3845 | |
3846 | case Op::fms_f32: // calculate z - xy, then negate to xy - z |
3847 | if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); } |
3848 | else { a->orr16b(dst(), r(z), r(z)); |
3849 | a->fmls4s(dst(), r(x), r(y)); } |
3850 | a->fneg4s(dst(), dst()); |
3851 | break; |
3852 | |
3853 | case Op:: gt_f32: a->fcmgt4s (dst(), r(x), r(y)); break; |
3854 | case Op::gte_f32: a->fcmge4s (dst(), r(x), r(y)); break; |
3855 | case Op:: eq_f32: a->fcmeq4s (dst(), r(x), r(y)); break; |
3856 | case Op::neq_f32: a->fcmeq4s (dst(), r(x), r(y)); |
3857 | a->not16b (dst(), dst()); break; |
3858 | |
3859 | |
3860 | case Op::add_i32: a->add4s(dst(), r(x), r(y)); break; |
3861 | case Op::sub_i32: a->sub4s(dst(), r(x), r(y)); break; |
3862 | case Op::mul_i32: a->mul4s(dst(), r(x), r(y)); break; |
3863 | |
3864 | case Op::bit_and : a->and16b(dst(), r(x), r(y)); break; |
3865 | case Op::bit_or : a->orr16b(dst(), r(x), r(y)); break; |
3866 | case Op::bit_xor : a->eor16b(dst(), r(x), r(y)); break; |
3867 | case Op::bit_clear: a->bic16b(dst(), r(x), r(y)); break; |
3868 | |
3869 | case Op::select: // bsl16b is x = x ? y : z |
3870 | if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); } |
3871 | else { a->orr16b(dst(), r(x), r(x)); |
3872 | a->bsl16b(dst(), r(y), r(z)); } |
3873 | break; |
3874 | |
3875 | // fmin4s and fmax4s don't work the way we want with NaN, |
3876 | // so we write them the long way: |
3877 | case Op::min_f32: // min(x,y) = y<x ? y : x |
3878 | a->fcmgt4s(dst(), r(x), r(y)); |
3879 | a->bsl16b (dst(), r(y), r(x)); |
3880 | break; |
3881 | |
3882 | case Op::max_f32: // max(x,y) = x<y ? y : x |
3883 | a->fcmgt4s(dst(), r(y), r(x)); |
3884 | a->bsl16b (dst(), r(y), r(x)); |
3885 | break; |
3886 | |
3887 | case Op::shl_i32: a-> shl4s(dst(), r(x), immy); break; |
3888 | case Op::shr_i32: a->ushr4s(dst(), r(x), immy); break; |
3889 | case Op::sra_i32: a->sshr4s(dst(), r(x), immy); break; |
3890 | |
3891 | case Op::eq_i32: a->cmeq4s(dst(), r(x), r(y)); break; |
3892 | case Op::gt_i32: a->cmgt4s(dst(), r(x), r(y)); break; |
3893 | |
3894 | case Op::pack: |
3895 | if (try_alias(x)) { a->sli4s ( r(x), r(y), immz); } |
3896 | else { a->shl4s (dst(), r(y), immz); |
3897 | a->orr16b(dst(), dst(), r(x)); } |
3898 | break; |
3899 | |
3900 | case Op::to_f32: a->scvtf4s (dst(), r(x)); break; |
3901 | case Op::trunc: a->fcvtzs4s(dst(), r(x)); break; |
3902 | case Op::round: a->fcvtns4s(dst(), r(x)); break; |
3903 | // TODO: fcvtns.4s rounds to nearest even. |
3904 | // I think we actually want frintx -> fcvtzs to round to current mode. |
3905 | #endif |
3906 | } |
3907 | |
3908 | // Proactively free the registers holding any value that dies here. |
3909 | if (rd != NA && dies_here(regs[rd])) { regs[rd] = NA; } |
3910 | if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; } |
3911 | if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; } |
3912 | if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; } |
3913 | return true; |
3914 | }; |
3915 | |
3916 | #if defined(__x86_64__) || defined(_M_X64) |
3917 | auto jump_if_less = [&](A::Label* l) { a->jl (l); }; |
3918 | auto jump = [&](A::Label* l) { a->jmp(l); }; |
3919 | |
3920 | auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); }; |
3921 | auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); }; |
3922 | #elif defined(__aarch64__) |
3923 | auto jump_if_less = [&](A::Label* l) { a->blt(l); }; |
3924 | auto jump = [&](A::Label* l) { a->b (l); }; |
3925 | |
3926 | auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); }; |
3927 | auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); }; |
3928 | #endif |
3929 | |
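// The emitted code has this overall shape (a sketch of what enter()/exit()
// and the two loops below produce):
//
//       enter()                          ; prologue
//       ...hoisted (loop-invariant) values...
// body: if (N < K) goto tail
//       ...the program, K lanes at a time...
//       args += K*stride; N -= K; goto body
// tail: if (N < 1) goto done
//       ...the same program as scalars...
//       args += stride; N -= 1; goto tail
// done: exit()                           ; epilogue, ret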
3930 | A::Label body, |
3931 | tail, |
3932 | done; |
3933 | |
3934 | enter(); |
3935 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3936 | if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) { |
3937 | return false; |
3938 | } |
3939 | } |
3940 | |
3941 | // This point marks a kind of canonical fixed point for register contents: if loop |
3942 | // code is generated as if these registers are holding these values, the next time |
3943 | // the loop comes around we'd better find those same registers holding those same values. |
3944 | auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot, |
3945 | saved_next_stack_slot=next_stack_slot]{ |
3946 | for (int r = 0; r < (int)regs.size(); r++) { |
3947 | if (regs[r] != incoming[r]) { |
3948 | regs[r] = incoming[r]; |
3949 | if (regs[r] >= 0) { |
3950 | load_from_memory((Reg)r, regs[r]); |
3951 | } |
3952 | } |
3953 | } |
3954 | *stack_hint = std::max(*stack_hint, next_stack_slot); |
3955 | stack_slot = saved_stack_slot; |
3956 | next_stack_slot = saved_next_stack_slot; |
3957 | }; |
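// A hypothetical example: if the loop body spilled hoisted value v3 out of
// register 2 to make room for a temporary, restore_incoming_regs() reloads
// v3 into register 2 before the back-edge, so each iteration starts from
// the same register state the first one saw.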
3958 | |
3959 | a->label(&body); |
3960 | { |
3961 | a->cmp(N, K); |
3962 | jump_if_less(&tail); |
3963 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3964 | if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) { |
3965 | return false; |
3966 | } |
3967 | } |
3968 | restore_incoming_regs(); |
3969 | for (int i = 0; i < (int)fImpl->strides.size(); i++) { |
3970 | if (fImpl->strides[i]) { |
3971 | add(arg[i], K*fImpl->strides[i]); |
3972 | } |
3973 | } |
3974 | sub(N, K); |
3975 | jump(&body); |
3976 | } |
3977 | |
3978 | a->label(&tail); |
3979 | { |
3980 | a->cmp(N, 1); |
3981 | jump_if_less(&done); |
3982 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3983 | if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) { |
3984 | return false; |
3985 | } |
3986 | } |
3987 | restore_incoming_regs(); |
3988 | for (int i = 0; i < (int)fImpl->strides.size(); i++) { |
3989 | if (fImpl->strides[i]) { |
3990 | add(arg[i], 1*fImpl->strides[i]); |
3991 | } |
3992 | } |
3993 | sub(N, 1); |
3994 | jump(&tail); |
3995 | } |
3996 | |
3997 | a->label(&done); |
3998 | { |
3999 | exit(); |
4000 | } |
4001 | |
// Except for explicit aligned load and store instructions, AVX allows
// memory operands to be unaligned. So even though we're creating 16-byte
// patterns on ARM or 32-byte patterns on x86, we only need to align to
// 4 bytes, the element size and alignment requirement.
4006 | |
4007 | constants.foreach([&](int imm, A::Label* label) { |
4008 | a->align(4); |
4009 | a->label(label); |
4010 | for (int i = 0; i < K; i++) { |
4011 | a->word(imm); |
4012 | } |
4013 | }); |
4014 | |
4015 | if (!iota.references.empty()) { |
4016 | a->align(4); |
4017 | a->label(&iota); // 0,1,2,3,4,... |
4018 | for (int i = 0; i < K; i++) { |
4019 | a->word(i); |
4020 | } |
4021 | } |
4022 | |
4023 | if (!load64_index.references.empty()) { |
4024 | a->align(4); |
4025 | a->label(&load64_index); // {0,2,4,6|1,3,5,7} |
4026 | a->word(0); a->word(2); a->word(4); a->word(6); |
4027 | a->word(1); a->word(3); a->word(5); a->word(7); |
4028 | } |
4029 | |
4030 | return true; |
4031 | } |
4032 | |
4033 | void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions, |
4034 | const char* debug_name) { |
4035 | // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble) |
4036 | // and stack_hint/registers_used to feed forward into the next jit() call. |
4037 | Assembler a{nullptr}; |
4038 | int stack_hint = -1; |
4039 | uint32_t registers_used = 0xffff'ffff; // Start conservatively with all. |
4040 | if (!this->jit(instructions, &stack_hint, ®isters_used, &a)) { |
4041 | return; |
4042 | } |
4043 | |
4044 | fImpl->jit_size = a.size(); |
4045 | void* jit_entry = alloc_jit_buffer(&fImpl->jit_size); |
4046 | fImpl->jit_entry.store(jit_entry); |
4047 | |
4048 | // Assemble the program for real with stack_hint/registers_used as feedback from first call. |
4049 | a = Assembler{jit_entry}; |
4050 | SkAssertResult(this->jit(instructions, &stack_hint, ®isters_used, &a)); |
4051 | SkASSERT(a.size() <= fImpl->jit_size); |
4052 | |
4053 | // Remap as executable, and flush caches on platforms that need that. |
4054 | remap_as_executable(jit_entry, fImpl->jit_size); |
4055 | |
4056 | notify_vtune(debug_name, jit_entry, fImpl->jit_size); |
4057 | |
4058 | #if !defined(SK_BUILD_FOR_WIN) |
4059 | // For profiling and debugging, it's helpful to have this code loaded |
// dynamically rather than just jumping into fImpl->jit_entry.
4061 | if (gSkVMJITViaDylib) { |
4062 | // Dump the raw program binary. |
4063 | SkString path = SkStringPrintf("/tmp/%s.XXXXXX" , debug_name); |
4064 | int fd = mkstemp(path.writable_str()); |
4065 | ::write(fd, jit_entry, a.size()); |
4066 | close(fd); |
4067 | |
4068 | this->dropJIT(); // (unmap and null out fImpl->jit_entry.) |
4069 | |
4070 | // Convert it in-place to a dynamic library with a single symbol "skvm_jit": |
4071 | SkString cmd = SkStringPrintf( |
4072 | "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'" |
4073 | " | clang -x assembler -shared - -o %s" , |
4074 | path.c_str(), path.c_str()); |
4075 | system(cmd.c_str()); |
4076 | |
4077 | // Load that dynamic library and look up skvm_jit(). |
4078 | fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL); |
4079 | void* sym = nullptr; |
4080 | for (const char* name : {"skvm_jit" , "_skvm_jit" } ) { |
4081 | if (!sym) { sym = dlsym(fImpl->dylib, name); } |
4082 | } |
4083 | fImpl->jit_entry.store(sym); |
4084 | } |
4085 | #endif |
4086 | } |
4087 | #endif |
4088 | |
4089 | } // namespace skvm |
4090 | |