1 | /* |
2 | * Copyright 2019 Google LLC |
3 | * |
4 | * Use of this source code is governed by a BSD-style license that can be |
5 | * found in the LICENSE file. |
6 | */ |
7 | |
8 | #include "include/core/SkStream.h" |
9 | #include "include/core/SkString.h" |
10 | #include "include/private/SkChecksum.h" |
11 | #include "include/private/SkSpinlock.h" |
12 | #include "include/private/SkTFitsIn.h" |
13 | #include "include/private/SkThreadID.h" |
14 | #include "include/private/SkVx.h" |
15 | #include "src/core/SkColorSpaceXformSteps.h" |
16 | #include "src/core/SkCpu.h" |
17 | #include "src/core/SkOpts.h" |
18 | #include "src/core/SkVM.h" |
19 | #include <algorithm> |
20 | #include <atomic> |
21 | #include <queue> |
22 | |
23 | #if defined(SKVM_LLVM) |
24 | #include <future> |
25 | #include <llvm/Bitcode/BitcodeWriter.h> |
26 | #include <llvm/ExecutionEngine/ExecutionEngine.h> |
27 | #include <llvm/IR/IRBuilder.h> |
28 | #include <llvm/IR/Verifier.h> |
29 | #include <llvm/Support/TargetSelect.h> |
30 | |
31 | // Platform-specific intrinsics got their own files in LLVM 10. |
32 | #if __has_include(<llvm/IR/IntrinsicsX86.h>) |
33 | #include <llvm/IR/IntrinsicsX86.h> |
34 | #endif |
35 | #endif |
36 | |
// Global flag, false by default.  Nothing in this part of the file reads it.
// NOTE(review): the name suggests it routes JIT code through a dynamic library
// (compare Program::Impl::dylib) -- confirm against the code that consumes it.
bool gSkVMJITViaDylib{false};
38 | |
39 | // JIT code isn't MSAN-instrumented, so we won't see when it uses |
40 | // uninitialized memory, and we'll not see the writes it makes as properly |
41 | // initializing memory. Instead force the interpreter, which should let |
42 | // MSAN see everything our programs do properly. |
43 | // |
44 | // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter. |
45 | #if defined(__has_feature) |
46 | #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer) |
47 | #undef SKVM_JIT |
48 | #endif |
49 | #endif |
50 | |
51 | #if defined(SKVM_JIT) |
52 | #include <dlfcn.h> // dlopen, dlsym |
53 | #include <sys/mman.h> // mmap, mprotect |
54 | #endif |
55 | |
56 | namespace skvm { |
57 | |
// State held by a Program behind its fImpl pointer.
struct Program::Impl {
    // The program as a list of interpreter instructions; Program::dump() prints these.
    std::vector<InterpreterInstruction> instructions;
    // Printed by Program::dump() as the number of registers.
    int regs = 0;
    // Program::dump() prints the "loop:" label just before the instruction at this index.
    int loop = 0;
    // NOTE(review): presumably one stride per Arg, mirroring Builder::fStrides -- confirm.
    std::vector<int> strides;

    // NOTE(review): the fields below look like JIT bookkeeping (entry point, size of the
    // JIT'd code, dlopen() handle); their users are outside this part of the file -- confirm.
    std::atomic<void*> jit_entry{nullptr}; // TODO: minimal std::memory_orders
    size_t jit_size = 0;
    void* dylib = nullptr;

#if defined(SKVM_LLVM)
    // LLVM objects, only present when building with SKVM_LLVM.
    std::unique_ptr<llvm::LLVMContext> llvm_ctx;
    std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
    std::future<void> llvm_compiling;
#endif
};
74 | |
75 | // Debugging tools, mostly for printing various data structures out to a stream. |
76 | |
77 | namespace { |
78 | class SkDebugfStream final : public SkWStream { |
79 | size_t fBytesWritten = 0; |
80 | |
81 | bool write(const void* buffer, size_t size) override { |
82 | SkDebugf("%.*s" , size, buffer); |
83 | fBytesWritten += size; |
84 | return true; |
85 | } |
86 | |
87 | size_t bytesWritten() const override { |
88 | return fBytesWritten; |
89 | } |
90 | }; |
91 | |
92 | struct V { Val id; }; |
93 | struct R { Reg id; }; |
94 | struct Shift { int bits; }; |
95 | struct Splat { int bits; }; |
96 | struct Hex { int bits; }; |
97 | |
98 | static void write(SkWStream* o, const char* s) { |
99 | o->writeText(s); |
100 | } |
101 | |
102 | static const char* name(Op op) { |
103 | switch (op) { |
104 | #define M(x) case Op::x: return #x; |
105 | SKVM_OPS(M) |
106 | #undef M |
107 | } |
108 | return "unknown op" ; |
109 | } |
110 | |
111 | static void write(SkWStream* o, Op op) { |
112 | const char* raw = name(op); |
113 | if (const char* found = strstr(raw, "_imm" )) { |
114 | o->write(raw, found-raw); |
115 | } else { |
116 | o->writeText(raw); |
117 | } |
118 | } |
119 | static void write(SkWStream* o, Arg a) { |
120 | write(o, "arg(" ); |
121 | o->writeDecAsText(a.ix); |
122 | write(o, ")" ); |
123 | } |
124 | static void write(SkWStream* o, V v) { |
125 | write(o, "v" ); |
126 | o->writeDecAsText(v.id); |
127 | } |
128 | static void write(SkWStream* o, R r) { |
129 | write(o, "r" ); |
130 | o->writeDecAsText(r.id); |
131 | } |
132 | static void write(SkWStream* o, Shift s) { |
133 | o->writeDecAsText(s.bits); |
134 | } |
135 | static void write(SkWStream* o, Splat s) { |
136 | float f; |
137 | memcpy(&f, &s.bits, 4); |
138 | o->writeHexAsText(s.bits); |
139 | write(o, " (" ); |
140 | o->writeScalarAsText(f); |
141 | write(o, ")" ); |
142 | } |
143 | static void write(SkWStream* o, Hex h) { |
144 | o->writeHexAsText(h.bits); |
145 | } |
146 | |
147 | template <typename T, typename... Ts> |
148 | static void write(SkWStream* o, T first, Ts... rest) { |
149 | write(o, first); |
150 | write(o, " " ); |
151 | write(o, rest...); |
152 | } |
153 | } |
154 | |
155 | void Builder::dot(SkWStream* o, bool for_jit) const { |
156 | SkDebugfStream debug; |
157 | if (!o) { o = &debug; } |
158 | |
159 | std::vector<OptimizedInstruction> optimized = this->optimize(for_jit); |
160 | |
161 | o->writeText("digraph {\n" ); |
162 | for (Val id = 0; id < (Val)optimized.size(); id++) { |
163 | auto [op, x,y,z, immy,immz, death,can_hoist,used_in_loop] = optimized[id]; |
164 | |
165 | switch (op) { |
166 | default: |
167 | write(o, "\t" , V{id}, " [label = \"" , V{id}, op); |
168 | // Not a perfect heuristic; sometimes y/z == NA and there is no immy/z. |
169 | // On the other hand, sometimes immy/z=0 is meaningful and should be printed. |
170 | if (y == NA) { write(o, "" , Hex{immy}); } |
171 | if (z == NA) { write(o, "" , Hex{immz}); } |
172 | write(o, "\"]\n" ); |
173 | |
174 | write(o, "\t" , V{id}, " -> {" ); |
175 | // In contrast to the heuristic imm labels, these dependences are exact. |
176 | if (x != NA) { write(o, "" , V{x}); } |
177 | if (y != NA) { write(o, "" , V{y}); } |
178 | if (z != NA) { write(o, "" , V{z}); } |
179 | write(o, " }\n" ); |
180 | |
181 | break; |
182 | |
183 | // That default: impl works pretty well for most instructions, |
184 | // but some are nicer to see with a specialized label. |
185 | |
186 | case Op::splat: |
187 | write(o, "\t" , V{id}, " [label = \"" , V{id}, op, Splat{immy}, "\"]\n" ); |
188 | break; |
189 | } |
190 | } |
191 | o->writeText("}\n" ); |
192 | } |
193 | |
194 | void Builder::dump(SkWStream* o) const { |
195 | SkDebugfStream debug; |
196 | if (!o) { o = &debug; } |
197 | |
198 | std::vector<OptimizedInstruction> optimized = this->optimize(); |
199 | o->writeDecAsText(optimized.size()); |
200 | o->writeText(" values (originally " ); |
201 | o->writeDecAsText(fProgram.size()); |
202 | o->writeText("):\n" ); |
203 | for (Val id = 0; id < (Val)optimized.size(); id++) { |
204 | const OptimizedInstruction& inst = optimized[id]; |
205 | Op op = inst.op; |
206 | Val x = inst.x, |
207 | y = inst.y, |
208 | z = inst.z; |
209 | int immy = inst.immy, |
210 | immz = inst.immz; |
211 | write(o, !inst.can_hoist ? " " : |
212 | inst.used_in_loop ? "↑ " : |
213 | "↟ " ); |
214 | switch (op) { |
215 | case Op::assert_true: write(o, op, V{x}, V{y}); break; |
216 | |
217 | case Op::store8: write(o, op, Arg{immy}, V{x}); break; |
218 | case Op::store16: write(o, op, Arg{immy}, V{x}); break; |
219 | case Op::store32: write(o, op, Arg{immy}, V{x}); break; |
220 | |
221 | case Op::index: write(o, V{id}, "=" , op); break; |
222 | |
223 | case Op::load8: write(o, V{id}, "=" , op, Arg{immy}); break; |
224 | case Op::load16: write(o, V{id}, "=" , op, Arg{immy}); break; |
225 | case Op::load32: write(o, V{id}, "=" , op, Arg{immy}); break; |
226 | |
227 | case Op::gather8: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, V{x}); break; |
228 | case Op::gather16: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, V{x}); break; |
229 | case Op::gather32: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}, V{x}); break; |
230 | |
231 | case Op::uniform8: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}); break; |
232 | case Op::uniform16: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}); break; |
233 | case Op::uniform32: write(o, V{id}, "=" , op, Arg{immy}, Hex{immz}); break; |
234 | |
235 | case Op::splat: write(o, V{id}, "=" , op, Splat{immy}); break; |
236 | |
237 | |
238 | case Op::add_f32: write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
239 | case Op::sub_f32: write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
240 | case Op::mul_f32: write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
241 | case Op::div_f32: write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
242 | case Op::min_f32: write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
243 | case Op::max_f32: write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
244 | case Op::fma_f32: write(o, V{id}, "=" , op, V{x}, V{y}, V{z}); break; |
245 | case Op::fms_f32: write(o, V{id}, "=" , op, V{x}, V{y}, V{z}); break; |
246 | case Op::fnma_f32: write(o, V{id}, "=" , op, V{x}, V{y}, V{z}); break; |
247 | |
248 | |
249 | case Op::sqrt_f32: write(o, V{id}, "=" , op, V{x}); break; |
250 | |
251 | case Op::add_f32_imm: write(o, V{id}, "=" , op, V{x}, Splat{immy}); break; |
252 | case Op::sub_f32_imm: write(o, V{id}, "=" , op, V{x}, Splat{immy}); break; |
253 | case Op::mul_f32_imm: write(o, V{id}, "=" , op, V{x}, Splat{immy}); break; |
254 | case Op::min_f32_imm: write(o, V{id}, "=" , op, V{x}, Splat{immy}); break; |
255 | case Op::max_f32_imm: write(o, V{id}, "=" , op, V{x}, Splat{immy}); break; |
256 | |
257 | case Op:: eq_f32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
258 | case Op::neq_f32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
259 | case Op:: gt_f32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
260 | case Op::gte_f32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
261 | |
262 | |
263 | case Op::add_i32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
264 | case Op::sub_i32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
265 | case Op::mul_i32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
266 | |
267 | case Op::shl_i32: write(o, V{id}, "=" , op, V{x}, Shift{immy}); break; |
268 | case Op::shr_i32: write(o, V{id}, "=" , op, V{x}, Shift{immy}); break; |
269 | case Op::sra_i32: write(o, V{id}, "=" , op, V{x}, Shift{immy}); break; |
270 | |
271 | case Op:: eq_i32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
272 | case Op::neq_i32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
273 | case Op:: gt_i32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
274 | case Op::gte_i32: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
275 | |
276 | case Op::add_i16x2: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
277 | case Op::sub_i16x2: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
278 | case Op::mul_i16x2: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
279 | |
280 | case Op::shl_i16x2: write(o, V{id}, "=" , op, V{x}, Shift{immy}); break; |
281 | case Op::shr_i16x2: write(o, V{id}, "=" , op, V{x}, Shift{immy}); break; |
282 | case Op::sra_i16x2: write(o, V{id}, "=" , op, V{x}, Shift{immy}); break; |
283 | |
284 | case Op:: eq_i16x2: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
285 | case Op::neq_i16x2: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
286 | case Op:: gt_i16x2: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
287 | case Op::gte_i16x2: write(o, V{id}, "=" , op, V{x}, V{y}); break; |
288 | |
289 | case Op::bit_and : write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
290 | case Op::bit_or : write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
291 | case Op::bit_xor : write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
292 | case Op::bit_clear: write(o, V{id}, "=" , op, V{x}, V{y} ); break; |
293 | |
294 | case Op::bit_and_imm: write(o, V{id}, "=" , op, V{x}, Hex{immy}); break; |
295 | case Op::bit_or_imm : write(o, V{id}, "=" , op, V{x}, Hex{immy}); break; |
296 | case Op::bit_xor_imm: write(o, V{id}, "=" , op, V{x}, Hex{immy}); break; |
297 | |
298 | case Op::select: write(o, V{id}, "=" , op, V{x}, V{y}, V{z}); break; |
299 | case Op::pack: write(o, V{id}, "=" , op, V{x}, V{y}, Shift{immz}); break; |
300 | |
301 | case Op::floor: write(o, V{id}, "=" , op, V{x}); break; |
302 | case Op::to_f32: write(o, V{id}, "=" , op, V{x}); break; |
303 | case Op::trunc: write(o, V{id}, "=" , op, V{x}); break; |
304 | case Op::round: write(o, V{id}, "=" , op, V{x}); break; |
305 | } |
306 | |
307 | write(o, "\n" ); |
308 | } |
309 | } |
310 | |
311 | void Program::dump(SkWStream* o) const { |
312 | SkDebugfStream debug; |
313 | if (!o) { o = &debug; } |
314 | |
315 | o->writeDecAsText(fImpl->regs); |
316 | o->writeText(" registers, " ); |
317 | o->writeDecAsText(fImpl->instructions.size()); |
318 | o->writeText(" instructions:\n" ); |
319 | for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) { |
320 | if (i == fImpl->loop) { write(o, "loop:\n" ); } |
321 | o->writeDecAsText(i); |
322 | o->writeText("\t" ); |
323 | if (i >= fImpl->loop) { write(o, " " ); } |
324 | const InterpreterInstruction& inst = fImpl->instructions[i]; |
325 | Op op = inst.op; |
326 | Reg d = inst.d, |
327 | x = inst.x, |
328 | y = inst.y, |
329 | z = inst.z; |
330 | int immy = inst.immy, |
331 | immz = inst.immz; |
332 | switch (op) { |
333 | case Op::assert_true: write(o, op, R{x}, R{y}); break; |
334 | |
335 | case Op::store8: write(o, op, Arg{immy}, R{x}); break; |
336 | case Op::store16: write(o, op, Arg{immy}, R{x}); break; |
337 | case Op::store32: write(o, op, Arg{immy}, R{x}); break; |
338 | |
339 | case Op::index: write(o, R{d}, "=" , op); break; |
340 | |
341 | case Op::load8: write(o, R{d}, "=" , op, Arg{immy}); break; |
342 | case Op::load16: write(o, R{d}, "=" , op, Arg{immy}); break; |
343 | case Op::load32: write(o, R{d}, "=" , op, Arg{immy}); break; |
344 | |
345 | case Op::gather8: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}, R{x}); break; |
346 | case Op::gather16: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}, R{x}); break; |
347 | case Op::gather32: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}, R{x}); break; |
348 | |
349 | case Op::uniform8: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}); break; |
350 | case Op::uniform16: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}); break; |
351 | case Op::uniform32: write(o, R{d}, "=" , op, Arg{immy}, Hex{immz}); break; |
352 | |
353 | case Op::splat: write(o, R{d}, "=" , op, Splat{immy}); break; |
354 | |
355 | |
356 | case Op::add_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
357 | case Op::sub_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
358 | case Op::mul_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
359 | case Op::div_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
360 | case Op::min_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
361 | case Op::max_f32: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
362 | case Op::fma_f32: write(o, R{d}, "=" , op, R{x}, R{y}, R{z}); break; |
363 | case Op::fms_f32: write(o, R{d}, "=" , op, R{x}, R{y}, R{z}); break; |
364 | case Op::fnma_f32: write(o, R{d}, "=" , op, R{x}, R{y}, R{z}); break; |
365 | |
366 | case Op::sqrt_f32: write(o, R{d}, "=" , op, R{x}); break; |
367 | |
368 | case Op::add_f32_imm: write(o, R{d}, "=" , op, R{x}, Splat{immy}); break; |
369 | case Op::sub_f32_imm: write(o, R{d}, "=" , op, R{x}, Splat{immy}); break; |
370 | case Op::mul_f32_imm: write(o, R{d}, "=" , op, R{x}, Splat{immy}); break; |
371 | case Op::min_f32_imm: write(o, R{d}, "=" , op, R{x}, Splat{immy}); break; |
372 | case Op::max_f32_imm: write(o, R{d}, "=" , op, R{x}, Splat{immy}); break; |
373 | |
374 | case Op:: eq_f32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
375 | case Op::neq_f32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
376 | case Op:: gt_f32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
377 | case Op::gte_f32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
378 | |
379 | |
380 | case Op::add_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
381 | case Op::sub_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
382 | case Op::mul_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
383 | |
384 | case Op::shl_i32: write(o, R{d}, "=" , op, R{x}, Shift{immy}); break; |
385 | case Op::shr_i32: write(o, R{d}, "=" , op, R{x}, Shift{immy}); break; |
386 | case Op::sra_i32: write(o, R{d}, "=" , op, R{x}, Shift{immy}); break; |
387 | |
388 | case Op:: eq_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
389 | case Op::neq_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
390 | case Op:: gt_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
391 | case Op::gte_i32: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
392 | |
393 | |
394 | case Op::add_i16x2: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
395 | case Op::sub_i16x2: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
396 | case Op::mul_i16x2: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
397 | |
398 | case Op::shl_i16x2: write(o, R{d}, "=" , op, R{x}, Shift{immy}); break; |
399 | case Op::shr_i16x2: write(o, R{d}, "=" , op, R{x}, Shift{immy}); break; |
400 | case Op::sra_i16x2: write(o, R{d}, "=" , op, R{x}, Shift{immy}); break; |
401 | |
402 | case Op:: eq_i16x2: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
403 | case Op::neq_i16x2: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
404 | case Op:: gt_i16x2: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
405 | case Op::gte_i16x2: write(o, R{d}, "=" , op, R{x}, R{y}); break; |
406 | |
407 | |
408 | case Op::bit_and : write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
409 | case Op::bit_or : write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
410 | case Op::bit_xor : write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
411 | case Op::bit_clear: write(o, R{d}, "=" , op, R{x}, R{y} ); break; |
412 | |
413 | case Op::bit_and_imm: write(o, R{d}, "=" , op, R{x}, Hex{immy}); break; |
414 | case Op::bit_or_imm : write(o, R{d}, "=" , op, R{x}, Hex{immy}); break; |
415 | case Op::bit_xor_imm: write(o, R{d}, "=" , op, R{x}, Hex{immy}); break; |
416 | |
417 | case Op::select: write(o, R{d}, "=" , op, R{x}, R{y}, R{z}); break; |
418 | case Op::pack: write(o, R{d}, "=" , op, R{x}, R{y}, Shift{immz}); break; |
419 | |
420 | case Op::floor: write(o, R{d}, "=" , op, R{x}); break; |
421 | case Op::to_f32: write(o, R{d}, "=" , op, R{x}); break; |
422 | case Op::trunc: write(o, R{d}, "=" , op, R{x}); break; |
423 | case Op::round: write(o, R{d}, "=" , op, R{x}); break; |
424 | } |
425 | write(o, "\n" ); |
426 | } |
427 | } |
428 | |
429 | std::vector<Instruction> specialize_for_jit(std::vector<Instruction> program) { |
430 | // We could use a temporary Builder to let new Instructions participate in common |
431 | // sub-expression elimination, but we'll never hit anything valuable with the |
432 | // specializations we've got today. Worth keeping in mind for the future though. |
433 | for (Val i = 0; i < (Val)program.size(); i++) { |
434 | #if defined(SK_CPU_X86) |
435 | Instruction& inst = program[i]; |
436 | |
437 | auto is_imm = [&](Val id, int* bits) { |
438 | *bits = program[id].immy; |
439 | return program[id].op == Op::splat; |
440 | }; |
441 | |
442 | switch (Op imm_op; inst.op) { |
443 | default: break; |
444 | |
445 | case Op::add_f32: imm_op = Op::add_f32_imm; goto try_imm_x_and_y; |
446 | case Op::mul_f32: imm_op = Op::mul_f32_imm; goto try_imm_x_and_y; |
447 | case Op::bit_and: imm_op = Op::bit_and_imm; goto try_imm_x_and_y; |
448 | case Op::bit_or: imm_op = Op::bit_or_imm ; goto try_imm_x_and_y; |
449 | case Op::bit_xor: imm_op = Op::bit_xor_imm; goto try_imm_x_and_y; |
450 | case Op::min_f32: imm_op = Op::min_f32_imm; goto try_imm_x; |
451 | case Op::max_f32: imm_op = Op::max_f32_imm; goto try_imm_x; |
452 | case Op::sub_f32: imm_op = Op::sub_f32_imm; goto try_imm_y; |
453 | |
454 | try_imm_x_and_y: |
455 | if (int bits; is_imm(inst.x, &bits)) { |
456 | inst.op = imm_op; |
457 | inst.x = inst.y; |
458 | inst.y = NA; |
459 | inst.immy = bits; |
460 | } else if (int bits; is_imm(inst.y, &bits)) { |
461 | inst.op = imm_op; |
462 | inst.y = NA; |
463 | inst.immy = bits; |
464 | } break; |
465 | |
466 | try_imm_x: |
467 | if (int bits; is_imm(inst.x, &bits)) { |
468 | inst.op = imm_op; |
469 | inst.x = inst.y; |
470 | inst.y = NA; |
471 | inst.immy = bits; |
472 | } break; |
473 | |
474 | try_imm_y: |
475 | if (int bits; is_imm(inst.y, &bits)) { |
476 | inst.op = imm_op; |
477 | inst.y = NA; |
478 | inst.immy = bits; |
479 | } break; |
480 | |
481 | case Op::bit_clear: |
482 | if (int bits; is_imm(inst.y, &bits)) { |
483 | inst.op = Op::bit_and_imm; |
484 | inst.y = NA; |
485 | inst.immy = ~bits; |
486 | } break; |
487 | } |
488 | #endif |
489 | } |
490 | return program; |
491 | } |
492 | |
493 | std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) { |
494 | // Determine which Instructions are live by working back from side effects. |
495 | std::vector<bool> live(program.size(), false); |
496 | auto mark_live = [&](Val id, auto& recurse) -> void { |
497 | if (live[id] == false) { |
498 | live[id] = true; |
499 | Instruction inst = program[id]; |
500 | for (Val arg : {inst.x, inst.y, inst.z}) { |
501 | if (arg != NA) { recurse(arg, recurse); } |
502 | } |
503 | } |
504 | }; |
505 | for (Val id = 0; id < (Val)program.size(); id++) { |
506 | if (has_side_effect(program[id].op)) { |
507 | mark_live(id, mark_live); |
508 | } |
509 | } |
510 | |
511 | // Rewrite the program with only live Instructions: |
512 | // - remap IDs in live Instructions to what they'll be once dead Instructions are removed; |
513 | // - then actually remove the dead Instructions. |
514 | std::vector<Val> new_id(program.size(), NA); |
515 | for (Val id = 0, next = 0; id < (Val)program.size(); id++) { |
516 | if (live[id]) { |
517 | Instruction& inst = program[id]; |
518 | for (Val* arg : {&inst.x, &inst.y, &inst.z}) { |
519 | if (*arg != NA) { |
520 | *arg = new_id[*arg]; |
521 | SkASSERT(*arg != NA); |
522 | } |
523 | } |
524 | new_id[id] = next++; |
525 | } |
526 | } |
527 | auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) { |
528 | Val id = (Val)(&inst - program.data()); |
529 | return !live[id]; |
530 | }); |
531 | program.erase(it, program.end()); |
532 | |
533 | return program; |
534 | } |
535 | |
// Returns program reordered by a greedy heuristic aimed at lowering register pressure,
// with every x/y/z argument rewritten to the new IDs.
//
// Slots are filled from the last to the first.  The frontier holds the Instructions whose
// remaining-use count is zero; it starts as the Instructions with side effects, and an
// argument joins it once its last remaining user has been placed.
std::vector<Instruction> schedule(std::vector<Instruction> program) {
    Usage usage{program};

    // uses[id] starts as the size of usage[id] and is decremented as users are scheduled.
    std::vector<int> uses(program.size());
    for (Val id = 0; id < (Val)program.size(); id++) {
        uses[id] = (int)usage[id].size();
    }

    // Estimated change in the number of live registers from issuing Instruction id,
    // based on the current (mutable) uses[] counts.
    auto pressure_change = [&](Val id) -> int {
        Instruction inst = program[id];

        // If this Instruction is not a sink, its result needs a register.
        int change = has_side_effect(inst.op) ? 0 : 1;

        // If this is the final user of an argument, the argument's register becomes free.
        for (Val arg : {inst.x, inst.y, inst.z}) {
            if (arg != NA && uses[arg] == 1) { change -= 1; }
        }
        return change;
    };

    auto compare = [&](Val lhs, Val rhs) {
        SkASSERT(lhs != rhs);
        int lhs_change = pressure_change(lhs);
        int rhs_change = pressure_change(rhs);

        // This comparison operator orders instructions from least (likely negative) register
        // pressure to most register pressure, breaking ties arbitrarily using original
        // program order comparing the instruction index itself.
        //
        // We'll use this operator with std::{make,push,pop}_heap() to maintain a max heap
        // frontier of instructions that are ready to schedule. We iterate backwards through
        // the program, scheduling later instruction slots before earlier ones, and that means
        // an instruction becomes ready to schedule once all instructions using its result have
        // been scheduled (in later slots).
        //
        // All together that means we'll be issuing the instructions that hurt register pressure
        // as late as possible, and issuing the instructions that help register pressure as soon
        // as possible.
        //
        // This heuristic of greedily issuing the instruction that most immediately decreases
        // register pressure approximates a more expensive search to find a schedule that
        // minimizes the high-water maximum register pressure, the number of registers we'll
        // need to run this program.
        //
        // The tie-breaker heuristic was found through experimentation.
        return lhs_change < rhs_change || (lhs_change == rhs_change && lhs > rhs);
    };

    // An Instruction may be placed once none of its users remain unscheduled.
    auto ready_to_schedule = [&](Val id) { return uses[id] == 0; };

    std::vector<Val> frontier;
    for (Val id = 0; id < (Val)program.size(); id++) {
        Instruction inst = program[id];
        if (has_side_effect(inst.op)) {
            frontier.push_back(id);
        }
        // Having eliminated dead code, the only Instructions that should start
        // with no users remaining to schedule are those with side effects.
        SkASSERT(has_side_effect(inst.op) == (uses[id] == 0));
    }
    std::make_heap(frontier.begin(), frontier.end(), compare);

    // Figure out our new Instruction schedule.
    // new_id[old] is the slot assigned to old; n counts down from the last slot to slot 0.
    std::vector<Val> new_id(program.size(), NA);
    for (Val n = (Val)program.size(); n --> 0;) {
        SkASSERT(!frontier.empty());
        std::pop_heap(frontier.begin(), frontier.end(), compare);
        Val id = frontier.back();
        frontier.pop_back();

        SkASSERT(ready_to_schedule(id));

        Instruction inst = program[id];
        new_id[id] = n;

        // This user is now placed; any argument left with no remaining users becomes ready.
        for (Val arg : {inst.x, inst.y, inst.z}) {
            if (arg != NA) {
                uses[arg]--;
                if (ready_to_schedule(arg)) {
                    frontier.push_back(arg);
                    std::push_heap(frontier.begin(), frontier.end(), compare);
                }
            }
        }
    }
    SkASSERT(frontier.empty());

    // Remap each Instruction's arguments to their new IDs.
    for (Val id = 0; id < (Val)program.size(); id++) {
        Instruction& inst = program[id];
        for (Val* arg : {&inst.x, &inst.y, &inst.z}) {
            if (*arg != NA) {
                *arg = new_id[*arg];
                SkASSERT(*arg != NA);
            }
        }
    }

    // Finally, reorder the Instructions themselves according to the new schedule.
    // This is O(N)... wish I had a good reference link breaking it down.
    // Each pair of swaps sends the Instruction currently at `id` to its final slot and
    // carries its new_id entry along, repeating until slot `id` holds its own Instruction.
    for (Val id = 0; id < (Val)program.size(); id++) {
        while (id != new_id[id]) {
            std::swap(program[id], program[new_id[id]]);
            std::swap( new_id[id], new_id[new_id[id]]);
        }
    }
    return program;
}
645 | |
646 | std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) { |
647 | std::vector<OptimizedInstruction> optimized(program.size()); |
648 | for (Val id = 0; id < (Val)program.size(); id++) { |
649 | Instruction inst = program[id]; |
650 | optimized[id] = {inst.op, inst.x,inst.y,inst.z, inst.immy,inst.immz, |
651 | /*death=*/id, /*can_hoist=*/true, /*used_in_loop=*/false}; |
652 | } |
653 | |
654 | // Each Instruction's inputs need to live at least until that Instruction issues. |
655 | for (Val id = 0; id < (Val)optimized.size(); id++) { |
656 | OptimizedInstruction& inst = optimized[id]; |
657 | for (Val arg : {inst.x, inst.y, inst.z}) { |
658 | // (We're walking in order, so this is the same as max()ing with the existing Val.) |
659 | if (arg != NA) { optimized[arg].death = id; } |
660 | } |
661 | } |
662 | |
663 | // Mark which values don't depend on the loop and can be hoisted. |
664 | for (Val id = 0; id < (Val)optimized.size(); id++) { |
665 | OptimizedInstruction& inst = optimized[id]; |
666 | |
667 | // Varying loads (and gathers) and stores cannot be hoisted out of the loop. |
668 | if (is_always_varying(inst.op)) { |
669 | inst.can_hoist = false; |
670 | } |
671 | |
672 | // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself. |
673 | if (inst.can_hoist) { |
674 | for (Val arg : {inst.x, inst.y, inst.z}) { |
675 | if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; } |
676 | } |
677 | } |
678 | |
679 | // We'll want to know if hoisted values are used in the loop; |
680 | // if not, we can recycle their registers like we do loop values. |
681 | if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used_in_loop*/) { |
682 | for (Val arg : {inst.x, inst.y, inst.z}) { |
683 | if (arg != NA) { optimized[arg].used_in_loop = true; } |
684 | } |
685 | } |
686 | } |
687 | |
688 | return optimized; |
689 | } |
690 | |
691 | std::vector<OptimizedInstruction> Builder::optimize(bool for_jit) const { |
692 | std::vector<Instruction> program = this->program(); |
693 | if (for_jit) { |
694 | program = specialize_for_jit(std::move(program)); |
695 | } |
696 | program = eliminate_dead_code(std::move(program)); |
697 | program = schedule (std::move(program)); |
698 | return finalize (std::move(program)); |
699 | } |
700 | |
701 | Program Builder::done(const char* debug_name) const { |
702 | char buf[64] = "skvm-jit-" ; |
703 | if (!debug_name) { |
704 | *SkStrAppendU32(buf+9, this->hash()) = '\0'; |
705 | debug_name = buf; |
706 | } |
707 | |
708 | #if defined(SKVM_LLVM) || defined(SKVM_JIT) |
709 | return {this->optimize(false), this->optimize(true), fStrides, debug_name}; |
710 | #else |
711 | return {this->optimize(false), fStrides}; |
712 | #endif |
713 | } |
714 | |
715 | uint64_t Builder::hash() const { |
716 | uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0), |
717 | hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1); |
718 | return (uint64_t)lo | (uint64_t)hi << 32; |
719 | } |
720 | |
721 | bool operator==(const Instruction& a, const Instruction& b) { |
722 | return a.op == b.op |
723 | && a.x == b.x |
724 | && a.y == b.y |
725 | && a.z == b.z |
726 | && a.immy == b.immy |
727 | && a.immz == b.immz; |
728 | } |
729 | |
730 | uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const { |
731 | return SkOpts::hash(&inst, sizeof(inst), seed); |
732 | } |
733 | |
734 | |
735 | // Most instructions produce a value and return it by ID, |
736 | // the value-producing instruction's own index in the program vector. |
737 | Val Builder::push(Instruction inst) { |
738 | // Basic common subexpression elimination: |
739 | // if we've already seen this exact Instruction, use it instead of creating a new one. |
740 | if (Val* id = fIndex.find(inst)) { |
741 | return *id; |
742 | } |
743 | Val id = static_cast<Val>(fProgram.size()); |
744 | fProgram.push_back(inst); |
745 | fIndex.set(inst, id); |
746 | return id; |
747 | } |
748 | |
749 | bool Builder::allImm() const { return true; } |
750 | |
751 | template <typename T, typename... Rest> |
752 | bool Builder::allImm(Val id, T* imm, Rest... rest) const { |
753 | if (fProgram[id].op == Op::splat) { |
754 | static_assert(sizeof(T) == 4); |
755 | memcpy(imm, &fProgram[id].immy, 4); |
756 | return this->allImm(rest...); |
757 | } |
758 | return false; |
759 | } |
760 | |
761 | Arg Builder::arg(int stride) { |
762 | int ix = (int)fStrides.size(); |
763 | fStrides.push_back(stride); |
764 | return {ix}; |
765 | } |
766 | |
767 | void Builder::assert_true(I32 cond, I32 debug) { |
768 | #ifdef SK_DEBUG |
769 | int imm; |
770 | if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; } |
771 | (void)push(Op::assert_true, cond.id,debug.id,NA); |
772 | #endif |
773 | } |
774 | |
775 | void Builder::store8 (Arg ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA, ptr.ix); } |
776 | void Builder::store16(Arg ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA, ptr.ix); } |
777 | void Builder::store32(Arg ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA, ptr.ix); } |
778 | |
779 | I32 Builder::index() { return {this, push(Op::index , NA,NA,NA,0) }; } |
780 | |
781 | I32 Builder::load8 (Arg ptr) { return {this, push(Op::load8 , NA,NA,NA, ptr.ix) }; } |
782 | I32 Builder::load16(Arg ptr) { return {this, push(Op::load16, NA,NA,NA, ptr.ix) }; } |
783 | I32 Builder::load32(Arg ptr) { return {this, push(Op::load32, NA,NA,NA, ptr.ix) }; } |
784 | |
785 | I32 Builder::gather8 (Arg ptr, int offset, I32 index) { |
786 | return {this, push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)}; |
787 | } |
788 | I32 Builder::gather16(Arg ptr, int offset, I32 index) { |
789 | return {this, push(Op::gather16, index.id,NA,NA, ptr.ix,offset)}; |
790 | } |
791 | I32 Builder::gather32(Arg ptr, int offset, I32 index) { |
792 | return {this, push(Op::gather32, index.id,NA,NA, ptr.ix,offset)}; |
793 | } |
794 | |
795 | I32 Builder::uniform8(Arg ptr, int offset) { |
796 | return {this, push(Op::uniform8, NA,NA,NA, ptr.ix, offset)}; |
797 | } |
798 | I32 Builder::uniform16(Arg ptr, int offset) { |
799 | return {this, push(Op::uniform16, NA,NA,NA, ptr.ix, offset)}; |
800 | } |
801 | I32 Builder::uniform32(Arg ptr, int offset) { |
802 | return {this, push(Op::uniform32, NA,NA,NA, ptr.ix, offset)}; |
803 | } |
804 | |
805 | // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern. |
806 | I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA, n) }; } |
807 | F32 Builder::splat(float f) { |
808 | int bits; |
809 | memcpy(&bits, &f, 4); |
810 | return {this, push(Op::splat, NA,NA,NA, bits)}; |
811 | } |
812 | |
// Reports whether the float add()/sub() peepholes should emit fused multiply-add ops.
// The answer is computed once, on first call, and cached in a function-local static:
//   - x86:    only when SkCpu reports HSW support,
//   - ARM64:  always,
//   - others: never.
static bool fma_supported() {
#if defined(SK_CPU_X86)
    static const bool supported = SkCpu::Supports(SkCpu::HSW);
#elif defined(SK_CPU_ARM64)
    static const bool supported = true;
#else
    static const bool supported = false;
#endif
    return supported;
}
824 | |
825 | // Be careful peepholing float math! Transformations you might expect to |
826 | // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0. |
827 | // Float peepholes must pass this equivalence test for all ~4B floats: |
828 | // |
829 | // bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); } |
830 | // |
831 | // unsigned bits = 0; |
832 | // do { |
833 | // float f; |
834 | // memcpy(&f, &bits, 4); |
835 | // if (!equiv(f, ...)) { |
836 | // abort(); |
837 | // } |
838 | // } while (++bits != 0); |
839 | |
840 | F32 Builder::add(F32 x, F32 y) { |
841 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); } |
842 | if (this->isImm(y.id, 0.0f)) { return x; } // x+0 == x |
843 | if (this->isImm(x.id, 0.0f)) { return y; } // 0+y == y |
844 | |
845 | if (fma_supported()) { |
846 | if (fProgram[x.id].op == Op::mul_f32) { |
847 | return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; |
848 | } |
849 | if (fProgram[y.id].op == Op::mul_f32) { |
850 | return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; |
851 | } |
852 | } |
853 | return {this, this->push(Op::add_f32, x.id, y.id)}; |
854 | } |
855 | |
856 | F32 Builder::sub(F32 x, F32 y) { |
857 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); } |
858 | if (this->isImm(y.id, 0.0f)) { return x; } // x-0 == x |
859 | if (fma_supported()) { |
860 | if (fProgram[x.id].op == Op::mul_f32) { |
861 | return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; |
862 | } |
863 | if (fProgram[y.id].op == Op::mul_f32) { |
864 | return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; |
865 | } |
866 | } |
867 | return {this, this->push(Op::sub_f32, x.id, y.id)}; |
868 | } |
869 | |
870 | F32 Builder::mul(F32 x, F32 y) { |
871 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); } |
872 | if (this->isImm(y.id, 1.0f)) { return x; } // x*1 == x |
873 | if (this->isImm(x.id, 1.0f)) { return y; } // 1*y == y |
874 | return {this, this->push(Op::mul_f32, x.id, y.id)}; |
875 | } |
876 | |
877 | F32 Builder::div(F32 x, F32 y) { |
878 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X/Y); } |
879 | if (this->isImm(y.id, 1.0f)) { return x; } // x/1 == x |
880 | return {this, this->push(Op::div_f32, x.id, y.id)}; |
881 | } |
882 | |
883 | F32 Builder::sqrt(F32 x) { |
884 | if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); } |
885 | return {this, this->push(Op::sqrt_f32, x.id,NA,NA)}; |
886 | } |
887 | |
888 | // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. |
889 | F32 Builder::approx_log2(F32 x) { |
890 | // e - 127 is a fair approximation of log2(x) in its own right... |
891 | F32 e = mul(to_f32(bit_cast(x)), splat(1.0f / (1<<23))); |
892 | |
893 | // ... but using the mantissa to refine its error is _much_ better. |
894 | F32 m = bit_cast(bit_or(bit_and(bit_cast(x), 0x007fffff), |
895 | 0x3f000000)); |
896 | F32 approx = sub(e, 124.225514990f); |
897 | approx = sub(approx, mul(1.498030302f, m)); |
898 | approx = sub(approx, div(1.725879990f, add(0.3520887068f, m))); |
899 | |
900 | return approx; |
901 | } |
902 | |
903 | F32 Builder::approx_pow2(F32 x) { |
904 | F32 f = fract(x); |
905 | F32 approx = add(x, 121.274057500f); |
906 | approx = sub(approx, mul( 1.490129070f, f)); |
907 | approx = add(approx, div(27.728023300f, sub(4.84252568f, f))); |
908 | |
909 | return bit_cast(round(mul(1.0f * (1<<23), approx))); |
910 | } |
911 | |
912 | F32 Builder::approx_powf(F32 x, F32 y) { |
913 | auto is_x = bit_or(eq(x, 0.0f), |
914 | eq(x, 1.0f)); |
915 | return select(is_x, x, approx_pow2(mul(approx_log2(x), y))); |
916 | } |
917 | |
918 | // Bhaskara I's sine approximation |
// 16x(pi - x) / (5*pi^2 - 4x(pi - x))
// ... divide numerator and denominator by 4
// 4x(pi - x) / (5*pi^2/4 - x(pi - x))
922 | // |
923 | // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get |
924 | // radians into that range first. |
925 | // |
926 | F32 Builder::approx_sin(F32 radians) { |
927 | constexpr float Pi = SK_ScalarPI; |
928 | // x = radians mod 2pi |
929 | F32 x = fract(radians * (0.5f/Pi)) * (2*Pi); |
930 | I32 neg = x > Pi; // are we pi < x < 2pi --> need to negate result |
931 | x = select(neg, x - Pi, x); |
932 | |
933 | F32 pair = x * (Pi - x); |
934 | x = 4.0f * pair / ((5*Pi*Pi/4) - pair); |
935 | x = select(neg, -x, x); |
936 | return x; |
937 | } |
938 | |
939 | /* "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION" |
940 | https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf |
941 | |
942 | approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9 |
943 | |
944 | Some simplifications: |
945 | 1. tan(x) is periodic, -PI/2 < x < PI/2 |
946 | 2. tan(x) is odd, so tan(-x) = -tan(x) |
947 | 3. Our polynomial approximation is best near zero, so we use the following identity |
948 | tan(x) + tan(y) |
949 | tan(x + y) = ----------------- |
950 | 1 - tan(x)*tan(y) |
951 | tan(PI/4) = 1 |
952 | |
953 | So for x > PI/8, we do the following refactor: |
954 | x' = x - PI/4 |
955 | |
956 | 1 + tan(x') |
957 | tan(x) = ------------ |
958 | 1 - tan(x') |
959 | */ |
960 | F32 Builder::approx_tan(F32 x) { |
961 | constexpr float Pi = SK_ScalarPI; |
962 | // periodic between -pi/2 ... pi/2 |
963 | // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back |
964 | x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2); |
965 | |
966 | I32 neg = (x < 0.0f); |
967 | x = select(neg, -x, x); |
968 | |
969 | // minimize total error by shifting if x > pi/8 |
970 | I32 use_quotient = (x > (Pi/8)); |
971 | x = select(use_quotient, x - (Pi/4), x); |
972 | |
973 | // 9th order poly = 4th order(x^2) * x |
974 | x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x; |
975 | x = select(use_quotient, (1+x)/(1-x), x); |
976 | x = select(neg, -x, x); |
977 | return x; |
978 | } |
979 | |
980 | F32 Builder::min(F32 x, F32 y) { |
981 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); } |
982 | return {this, this->push(Op::min_f32, x.id, y.id)}; |
983 | } |
984 | F32 Builder::max(F32 x, F32 y) { |
985 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); } |
986 | return {this, this->push(Op::max_f32, x.id, y.id)}; |
987 | } |
988 | |
989 | I32 Builder::add(I32 x, I32 y) { |
990 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); } |
991 | if (this->isImm(x.id, 0)) { return y; } |
992 | if (this->isImm(y.id, 0)) { return x; } |
993 | return {this, this->push(Op::add_i32, x.id, y.id)}; |
994 | } |
995 | I32 Builder::sub(I32 x, I32 y) { |
996 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); } |
997 | if (this->isImm(y.id, 0)) { return x; } |
998 | return {this, this->push(Op::sub_i32, x.id, y.id)}; |
999 | } |
1000 | I32 Builder::mul(I32 x, I32 y) { |
1001 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); } |
1002 | if (this->isImm(x.id, 0)) { return splat(0); } |
1003 | if (this->isImm(y.id, 0)) { return splat(0); } |
1004 | if (this->isImm(x.id, 1)) { return y; } |
1005 | if (this->isImm(y.id, 1)) { return x; } |
1006 | return {this, this->push(Op::mul_i32, x.id, y.id)}; |
1007 | } |
1008 | |
1009 | I32 Builder::add_16x2(I32 x, I32 y) { return {this, this->push(Op::add_i16x2, x.id, y.id)}; } |
1010 | I32 Builder::sub_16x2(I32 x, I32 y) { return {this, this->push(Op::sub_i16x2, x.id, y.id)}; } |
1011 | I32 Builder::mul_16x2(I32 x, I32 y) { return {this, this->push(Op::mul_i16x2, x.id, y.id)}; } |
1012 | |
1013 | I32 Builder::shl(I32 x, int bits) { |
1014 | if (bits == 0) { return x; } |
1015 | if (int X; this->allImm(x.id,&X)) { return splat(X << bits); } |
1016 | return {this, this->push(Op::shl_i32, x.id,NA,NA, bits)}; |
1017 | } |
1018 | I32 Builder::shr(I32 x, int bits) { |
1019 | if (bits == 0) { return x; } |
1020 | if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); } |
1021 | return {this, this->push(Op::shr_i32, x.id,NA,NA, bits)}; |
1022 | } |
1023 | I32 Builder::sra(I32 x, int bits) { |
1024 | if (bits == 0) { return x; } |
1025 | if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); } |
1026 | return {this, this->push(Op::sra_i32, x.id,NA,NA, bits)}; |
1027 | } |
1028 | |
1029 | I32 Builder::shl_16x2(I32 x, int k) { return {this, this->push(Op::shl_i16x2, x.id,NA,NA, k)}; } |
1030 | I32 Builder::shr_16x2(I32 x, int k) { return {this, this->push(Op::shr_i16x2, x.id,NA,NA, k)}; } |
1031 | I32 Builder::sra_16x2(I32 x, int k) { return {this, this->push(Op::sra_i16x2, x.id,NA,NA, k)}; } |
1032 | |
1033 | I32 Builder:: eq(F32 x, F32 y) { |
1034 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); } |
1035 | return {this, this->push(Op::eq_f32, x.id, y.id)}; |
1036 | } |
1037 | I32 Builder::neq(F32 x, F32 y) { |
1038 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); } |
1039 | return {this, this->push(Op::neq_f32, x.id, y.id)}; |
1040 | } |
1041 | I32 Builder::lt(F32 x, F32 y) { |
1042 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); } |
1043 | return {this, this->push(Op::gt_f32, y.id, x.id)}; |
1044 | } |
1045 | I32 Builder::lte(F32 x, F32 y) { |
1046 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); } |
1047 | return {this, this->push(Op::gte_f32, y.id, x.id)}; |
1048 | } |
1049 | I32 Builder::gt(F32 x, F32 y) { |
1050 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); } |
1051 | return {this, this->push(Op::gt_f32, x.id, y.id)}; |
1052 | } |
1053 | I32 Builder::gte(F32 x, F32 y) { |
1054 | if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); } |
1055 | return {this, this->push(Op::gte_f32, x.id, y.id)}; |
1056 | } |
1057 | |
1058 | I32 Builder:: eq(I32 x, I32 y) { |
1059 | if (x.id == y.id) { return splat(~0); } |
1060 | return {this, this->push(Op:: eq_i32, x.id, y.id)}; |
1061 | } |
1062 | I32 Builder::neq(I32 x, I32 y) { |
1063 | return {this, this->push(Op::neq_i32, x.id, y.id)}; |
1064 | } |
1065 | I32 Builder:: gt(I32 x, I32 y) { |
1066 | return {this, this->push(Op:: gt_i32, x.id, y.id)}; |
1067 | } |
1068 | I32 Builder::gte(I32 x, I32 y) { |
1069 | if (x.id == y.id) { return splat(~0); } |
1070 | return {this, this->push(Op::gte_i32, x.id, y.id)}; |
1071 | } |
1072 | I32 Builder:: lt(I32 x, I32 y) { return y>x; } |
1073 | I32 Builder::lte(I32 x, I32 y) { return y>=x; } |
1074 | |
1075 | I32 Builder:: eq_16x2(I32 x, I32 y) { return {this, this->push(Op:: eq_i16x2, x.id, y.id)}; } |
1076 | I32 Builder::neq_16x2(I32 x, I32 y) { return {this, this->push(Op::neq_i16x2, x.id, y.id)}; } |
1077 | I32 Builder:: lt_16x2(I32 x, I32 y) { return {this, this->push(Op:: gt_i16x2, y.id, x.id)}; } |
1078 | I32 Builder::lte_16x2(I32 x, I32 y) { return {this, this->push(Op::gte_i16x2, y.id, x.id)}; } |
1079 | I32 Builder:: gt_16x2(I32 x, I32 y) { return {this, this->push(Op:: gt_i16x2, x.id, y.id)}; } |
1080 | I32 Builder::gte_16x2(I32 x, I32 y) { return {this, this->push(Op::gte_i16x2, x.id, y.id)}; } |
1081 | |
1082 | I32 Builder::bit_and(I32 x, I32 y) { |
1083 | if (x.id == y.id) { return x; } |
1084 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); } |
1085 | if (this->isImm(y.id, 0)) { return splat(0); } // (x & false) == false |
1086 | if (this->isImm(x.id, 0)) { return splat(0); } // (false & y) == false |
1087 | if (this->isImm(y.id,~0)) { return x; } // (x & true) == x |
1088 | if (this->isImm(x.id,~0)) { return y; } // (true & y) == y |
1089 | return {this, this->push(Op::bit_and, x.id, y.id)}; |
1090 | } |
1091 | I32 Builder::bit_or(I32 x, I32 y) { |
1092 | if (x.id == y.id) { return x; } |
1093 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); } |
1094 | if (this->isImm(y.id, 0)) { return x; } // (x | false) == x |
1095 | if (this->isImm(x.id, 0)) { return y; } // (false | y) == y |
1096 | if (this->isImm(y.id,~0)) { return splat(~0); } // (x | true) == true |
1097 | if (this->isImm(x.id,~0)) { return splat(~0); } // (true | y) == true |
1098 | return {this, this->push(Op::bit_or, x.id, y.id)}; |
1099 | } |
1100 | I32 Builder::bit_xor(I32 x, I32 y) { |
1101 | if (x.id == y.id) { return splat(0); } |
1102 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); } |
1103 | if (this->isImm(y.id, 0)) { return x; } // (x ^ false) == x |
1104 | if (this->isImm(x.id, 0)) { return y; } // (false ^ y) == y |
1105 | return {this, this->push(Op::bit_xor, x.id, y.id)}; |
1106 | } |
1107 | |
1108 | I32 Builder::bit_clear(I32 x, I32 y) { |
1109 | if (x.id == y.id) { return splat(0); } |
1110 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); } |
1111 | if (this->isImm(y.id, 0)) { return x; } // (x & ~false) == x |
1112 | if (this->isImm(y.id,~0)) { return splat(0); } // (x & ~true) == false |
1113 | if (this->isImm(x.id, 0)) { return splat(0); } // (false & ~y) == false |
1114 | return {this, this->push(Op::bit_clear, x.id, y.id)}; |
1115 | } |
1116 | |
1117 | I32 Builder::select(I32 x, I32 y, I32 z) { |
1118 | if (y.id == z.id) { return y; } |
1119 | if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); } |
1120 | if (this->isImm(x.id,~0)) { return y; } // true ? y : z == y |
1121 | if (this->isImm(x.id, 0)) { return z; } // false ? y : z == z |
1122 | if (this->isImm(y.id, 0)) { return bit_clear(z,x); } // x ? 0 : z == ~x&z |
1123 | if (this->isImm(z.id, 0)) { return bit_and (y,x); } // x ? y : 0 == x&y |
1124 | return {this, this->push(Op::select, x.id, y.id, z.id)}; |
1125 | } |
1126 | |
1127 | I32 Builder::(I32 x, int bits, I32 z) { |
1128 | if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); } |
1129 | return bit_and(z, shr(x, bits)); |
1130 | } |
1131 | |
1132 | I32 Builder::pack(I32 x, I32 y, int bits) { |
1133 | if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|(Y<<bits)); } |
1134 | return {this, this->push(Op::pack, x.id,y.id,NA, 0,bits)}; |
1135 | } |
1136 | |
1137 | F32 Builder::floor(F32 x) { |
1138 | if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); } |
1139 | return {this, this->push(Op::floor, x.id)}; |
1140 | } |
1141 | F32 Builder::to_f32(I32 x) { |
1142 | if (int X; this->allImm(x.id,&X)) { return splat((float)X); } |
1143 | return {this, this->push(Op::to_f32, x.id)}; |
1144 | } |
1145 | I32 Builder::trunc(F32 x) { |
1146 | if (float X; this->allImm(x.id,&X)) { return splat((int)X); } |
1147 | return {this, this->push(Op::trunc, x.id)}; |
1148 | } |
1149 | I32 Builder::round(F32 x) { |
1150 | if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); } |
1151 | return {this, this->push(Op::round, x.id)}; |
1152 | } |
1153 | |
1154 | F32 Builder::from_unorm(int bits, I32 x) { |
1155 | F32 limit = splat(1 / ((1<<bits)-1.0f)); |
1156 | return mul(to_f32(x), limit); |
1157 | } |
1158 | I32 Builder::to_unorm(int bits, F32 x) { |
1159 | F32 limit = splat((1<<bits)-1.0f); |
1160 | return round(mul(x, limit)); |
1161 | } |
1162 | |
1163 | Color Builder::unpack_1010102(I32 rgba) { |
1164 | return { |
1165 | from_unorm(10, extract(rgba, 0, 0x3ff)), |
1166 | from_unorm(10, extract(rgba, 10, 0x3ff)), |
1167 | from_unorm(10, extract(rgba, 20, 0x3ff)), |
1168 | from_unorm( 2, extract(rgba, 30, 0x3 )), |
1169 | }; |
1170 | } |
1171 | Color Builder::unpack_8888(I32 rgba) { |
1172 | return { |
1173 | from_unorm(8, extract(rgba, 0, 0xff)), |
1174 | from_unorm(8, extract(rgba, 8, 0xff)), |
1175 | from_unorm(8, extract(rgba, 16, 0xff)), |
1176 | from_unorm(8, extract(rgba, 24, 0xff)), |
1177 | }; |
1178 | } |
1179 | Color Builder::unpack_565(I32 bgr) { |
1180 | return { |
1181 | from_unorm(5, extract(bgr, 11, 0b011'111)), |
1182 | from_unorm(6, extract(bgr, 5, 0b111'111)), |
1183 | from_unorm(5, extract(bgr, 0, 0b011'111)), |
1184 | splat(1.0f), |
1185 | }; |
1186 | } |
1187 | |
1188 | void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) { |
1189 | skvm::F32 invA = 1.0f / a, |
1190 | inf = bit_cast(splat(0x7f800000)); |
1191 | // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0). |
1192 | invA = select(invA < inf, invA |
1193 | , 0.0f); |
1194 | *r *= invA; |
1195 | *g *= invA; |
1196 | *b *= invA; |
1197 | } |
1198 | |
1199 | void Builder::premul(F32* r, F32* g, F32* b, F32 a) { |
1200 | *r *= a; |
1201 | *g *= a; |
1202 | *b *= a; |
1203 | } |
1204 | |
1205 | Color Builder::uniformPremul(SkColor4f color, SkColorSpace* src, |
1206 | Uniforms* uniforms, SkColorSpace* dst) { |
1207 | SkColorSpaceXformSteps(src, kUnpremul_SkAlphaType, |
1208 | dst, kPremul_SkAlphaType).apply(color.vec()); |
1209 | return { |
1210 | uniformF(uniforms->pushF(color.fR)), |
1211 | uniformF(uniforms->pushF(color.fG)), |
1212 | uniformF(uniforms->pushF(color.fB)), |
1213 | uniformF(uniforms->pushF(color.fA)), |
1214 | }; |
1215 | } |
1216 | |
1217 | Color Builder::lerp(Color lo, Color hi, F32 t) { |
1218 | return { |
1219 | lerp(lo.r, hi.r, t), |
1220 | lerp(lo.g, hi.g, t), |
1221 | lerp(lo.b, hi.b, t), |
1222 | lerp(lo.a, hi.a, t), |
1223 | }; |
1224 | } |
1225 | |
1226 | HSLA Builder::to_hsla(Color c) { |
1227 | F32 mx = max(max(c.r,c.g),c.b), |
1228 | mn = min(min(c.r,c.g),c.b), |
1229 | d = mx - mn, |
1230 | invd = 1.0f / d, |
1231 | g_lt_b = select(c.g < c.b, splat(6.0f) |
1232 | , splat(0.0f)); |
1233 | |
1234 | F32 h = (1/6.0f) * select(mx == mn, 0.0f, |
1235 | select(mx == c.r, invd * (c.g - c.b) + g_lt_b, |
1236 | select(mx == c.g, invd * (c.b - c.r) + 2.0f |
1237 | , invd * (c.r - c.g) + 4.0f))); |
1238 | |
1239 | F32 sum = mx + mn, |
1240 | l = sum * 0.5f, |
1241 | s = select(mx == mn, 0.0f |
1242 | , d / select(l > 0.5f, 2.0f - sum |
1243 | , sum)); |
1244 | return {h, s, l, c.a}; |
1245 | } |
1246 | |
1247 | Color Builder::to_rgba(HSLA c) { |
1248 | // See GrRGBToHSLFilterEffect.fp |
1249 | |
1250 | auto [h,s,l,a] = c; |
1251 | F32 x = s * (1.0f - abs(l + l - 1.0f)); |
1252 | |
1253 | auto hue_to_rgb = [&,l=l](auto hue) { |
1254 | auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f; |
1255 | return x * (clamp01(q) - 0.5f) + l; |
1256 | }; |
1257 | |
1258 | return { |
1259 | hue_to_rgb(h + 0/3.0f), |
1260 | hue_to_rgb(h + 2/3.0f), |
1261 | hue_to_rgb(h + 1/3.0f), |
1262 | c.a, |
1263 | }; |
1264 | } |
1265 | |
1266 | // We're basing our implementation of non-separable blend modes on |
1267 | // https://www.w3.org/TR/compositing-1/#blendingnonseparable. |
1268 | // and |
1269 | // https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf |
1270 | // They're equivalent, but ES' math has been better simplified. |
1271 | // |
1272 | // Anything extra we add beyond that is to make the math work with premul inputs. |
1273 | |
// Returns the spread between the largest and smallest of r, g and b:
// max(r,g,b) - min(r,g,b).
static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
    return max(r, max(g, b))
         - min(r, min(g, b));
}
1278 | |
// Returns the weighted sum 0.30*r + 0.59*g + 0.11*b.
static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
    return r*0.30f + g*0.59f + b*0.11f;
}
1282 | |
1283 | static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) { |
1284 | F32 mn = min(*r, min(*g, *b)), |
1285 | mx = max(*r, max(*g, *b)), |
1286 | sat = mx - mn; |
1287 | |
1288 | // Map min channel to 0, max channel to s, and scale the middle proportionally. |
1289 | auto scale = [&](auto c) { |
1290 | // TODO: better to divide and check for non-finite result? |
1291 | return select(sat == 0.0f, 0.0f |
1292 | , ((c - mn) * s) / sat); |
1293 | }; |
1294 | *r = scale(*r); |
1295 | *g = scale(*g); |
1296 | *b = scale(*b); |
1297 | } |
1298 | |
1299 | static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) { |
1300 | auto diff = lu - luminance(*r, *g, *b); |
1301 | *r += diff; |
1302 | *g += diff; |
1303 | *b += diff; |
1304 | } |
1305 | |
1306 | static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) { |
1307 | F32 mn = min(*r, min(*g, *b)), |
1308 | mx = max(*r, max(*g, *b)), |
1309 | lu = luminance(*r, *g, *b); |
1310 | |
1311 | auto clip = [&](auto c) { |
1312 | c = select(mn >= 0, c |
1313 | , lu + ((c-lu)*( lu)) / (lu-mn)); |
1314 | c = select(mx > a, lu + ((c-lu)*(a-lu)) / (mx-lu) |
1315 | , c); |
1316 | return clamp01(c); // May be a little negative, or worse, NaN. |
1317 | }; |
1318 | *r = clip(*r); |
1319 | *g = clip(*g); |
1320 | *b = clip(*b); |
1321 | } |
1322 | |
// Builds the instructions that blend src with dst under the given SkBlendMode and returns
// the blended Color.
//
// src and dst are taken by value and captured by reference by the helper lambdas below, so
// the std::swap(src, dst) calls in the switch change what those lambdas read.  Each Dst*
// mode is implemented by swapping the two colors and falling through to its Src* twin.
//
// An unrecognized mode asserts in debug builds and otherwise falls through to kClear.
Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
    // Sum of two products: x*y + z*w.
    auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
        return x*y + z*w;
    };

    // Doubles x with an add.
    auto two = [](skvm::F32 x) { return x+x; };

    // Applies fn(srcChannel, dstChannel) to all four channels, alpha included.
    auto apply_rgba = [&](auto fn) {
        return Color {
            fn(src.r, dst.r),
            fn(src.g, dst.g),
            fn(src.b, dst.b),
            fn(src.a, dst.a),
        };
    };

    // Applies fn(srcChannel, dstChannel) to r, g and b only; alpha is always srcover.
    auto apply_rgb_srcover_a = [&](auto fn) {
        return Color {
            fn(src.r, dst.r),
            fn(src.g, dst.g),
            fn(src.b, dst.b),
            mad(dst.a, 1-src.a, src.a), // srcover for alpha
        };
    };

    // Finishes a non-separable mode: to each precomputed R, G, B adds
    // src*(1-dst.a) + dst*(1-src.a) for that channel; alpha is srcover.
    auto non_sep = [&](auto R, auto G, auto B) {
        return Color{
            R + mma(src.r, 1-dst.a, dst.r, 1-src.a),
            G + mma(src.g, 1-dst.a, dst.g, 1-src.a),
            B + mma(src.b, 1-dst.a, dst.b, 1-src.a),
            mad(dst.a, 1-src.a, src.a), // srcover for alpha
        };
    };

    switch (mode) {
        default: SkASSERT(false); /*but also, for safety, fallthrough*/

        case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };

        case SkBlendMode::kSrc: return src;
        case SkBlendMode::kDst: return dst;

        // In the swapped (Dst*) cases below, src.a / dst.a inside the lambdas refer to the
        // colors after the swap.
        case SkBlendMode::kDstOver: std::swap(src, dst); // fall-through
        case SkBlendMode::kSrcOver:
            return apply_rgba([&](auto s, auto d) {
                return mad(d,1-src.a, s);
            });

        case SkBlendMode::kDstIn: std::swap(src, dst); // fall-through
        case SkBlendMode::kSrcIn:
            return apply_rgba([&](auto s, auto d) {
                return s * dst.a;
            });

        case SkBlendMode::kDstOut: std::swap(src, dst); // fall-through
        case SkBlendMode::kSrcOut:
            return apply_rgba([&](auto s, auto d) {
                return s * (1-dst.a);
            });

        case SkBlendMode::kDstATop: std::swap(src, dst); // fall-through
        case SkBlendMode::kSrcATop:
            return apply_rgba([&](auto s, auto d) {
                return mma(s, dst.a, d, 1-src.a);
            });

        case SkBlendMode::kXor:
            return apply_rgba([&](auto s, auto d) {
                return mma(s, 1-dst.a, d, 1-src.a);
            });

        case SkBlendMode::kPlus:
            return apply_rgba([&](auto s, auto d) {
                return min(s+d, 1.0f);
            });

        case SkBlendMode::kModulate:
            return apply_rgba([&](auto s, auto d) {
                return s * d;
            });

        case SkBlendMode::kScreen:
            // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
            // It's kind of plausible that s + (d - sd) keeps more precision?
            return apply_rgba([&](auto s, auto d) {
                return s + (d - s*d);
            });

        // Darken, Lighten and Difference share the pattern s + d - f(s*dst.a, d*src.a),
        // with f being max, min, and twice the min respectively.
        case SkBlendMode::kDarken:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return s + (d - max(s * dst.a,
                                    d * src.a));
            });

        case SkBlendMode::kLighten:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return s + (d - min(s * dst.a,
                                    d * src.a));
            });

        case SkBlendMode::kDifference:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return s + (d - two(min(s * dst.a,
                                        d * src.a)));
            });

        case SkBlendMode::kExclusion:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return s + (d - two(s * d));
            });

        case SkBlendMode::kColorBurn:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                // TODO: divide and check for non-finite result instead of checking for s == 0.
                auto mn = min(dst.a,
                              src.a * (dst.a - d) / s),
                     burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
                // The two special cases (d == dst.a, then s == 0) are selected ahead of the
                // general burn value, which divides by s.
                return select(d == dst.a, s * (1-dst.a) + d,
                       select(s == 0.0f , d * (1-src.a)
                                        , burn));
            });

        case SkBlendMode::kColorDodge:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                // TODO: divide and check for non-finite result instead of checking for s == sa.
                auto dodge = src.a * min(dst.a,
                                         d * src.a / (src.a - s))
                                   + mma(s, 1-dst.a, d, 1-src.a);
                // The two special cases (d == 0, then s == src.a) are selected ahead of the
                // general dodge value, which divides by (src.a - s).
                return select(d == 0.0f , s * (1-dst.a),
                       select(s == src.a, d * (1-src.a) + s
                                        , dodge));
            });

        // HardLight and Overlay use the same two formulas; they differ only in the test that
        // picks between them (2s <= src.a for HardLight, 2d <= dst.a for Overlay).
        case SkBlendMode::kHardLight:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return mma(s, 1-dst.a, d, 1-src.a) +
                       select(two(s) <= src.a,
                              two(s * d),
                              src.a * dst.a - two((dst.a - d) * (src.a - s)));
            });

        case SkBlendMode::kOverlay:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                return mma(s, 1-dst.a, d, 1-src.a) +
                       select(two(d) <= dst.a,
                              two(s * d),
                              src.a * dst.a - two((dst.a - d) * (src.a - s)));
            });

        case SkBlendMode::kMultiply:
            return apply_rgba([&](auto s, auto d) {
                return mma(s, 1-dst.a, d, 1-src.a) + s * d;
            });

        case SkBlendMode::kSoftLight:
            return apply_rgb_srcover_a([&](auto s, auto d) {
                // m is d divided by dst.a, or 0 when dst.a is not positive.
                auto m = select(dst.a > 0.0f, d / dst.a
                                            , 0.0f),
                     s2 = two(s),
                     m4 = 4*m;

                // The logic forks three ways:
                // 1. dark src?
                // 2. light src, dark dst?
                // 3. light src, light dst?

                // Used in case 1
                auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
                     // Used in case 2
                     darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
                     // Used in case 3.
                     liteDst = sqrt(m) - m,
                     // Used in 2 or 3?
                     liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
                                                                         , liteDst)
                               + d * src.a;
                return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
                                                                         , liteSrc);
            });

        // The four non-separable modes below each start from one color scaled by the other's
        // alpha, adjust it with set_sat() and/or set_lum(), clip it with clip_color(), and
        // finish with non_sep().
        case SkBlendMode::kHue: {
            skvm::F32 R = src.r * src.a,
                      G = src.g * src.a,
                      B = src.b * src.a;

            set_sat (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
            set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
            clip_color(&R, &G, &B, src.a * dst.a);

            return non_sep(R, G, B);
        }

        case SkBlendMode::kSaturation: {
            skvm::F32 R = dst.r * src.a,
                      G = dst.g * src.a,
                      B = dst.b * src.a;

            set_sat (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
            set_lum (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
            clip_color(&R, &G, &B, src.a * dst.a);

            return non_sep(R, G, B);
        }

        case SkBlendMode::kColor: {
            skvm::F32 R = src.r * dst.a,
                      G = src.g * dst.a,
                      B = src.b * dst.a;

            set_lum (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
            clip_color(&R, &G, &B, src.a * dst.a);

            return non_sep(R, G, B);
        }

        case SkBlendMode::kLuminosity: {
            skvm::F32 R = dst.r * src.a,
                      G = dst.g * src.a,
                      B = dst.b * src.a;

            set_lum (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
            clip_color(&R, &G, &B, dst.a * src.a);

            return non_sep(R, G, B);
        }
    }
}
1550 | |
1551 | // For a given program we'll store each Instruction's users contiguously in a table, |
1552 | // and track where each Instruction's span of users starts and ends in another index. |
1553 | // Here's a simple program that loads x and stores kx+k: |
1554 | // |
1555 | // v0 = splat(k) |
1556 | // v1 = load(...) |
1557 | // v2 = mul(v1, v0) |
1558 | // v3 = add(v2, v0) |
1559 | // v4 = store(..., v3) |
1560 | // |
1561 | // This program has 5 instructions v0-v4. |
1562 | // - v0 is used by v2 and v3 |
1563 | // - v1 is used by v2 |
1564 | // - v2 is used by v3 |
1565 | // - v3 is used by v4 |
1566 | // - v4 has a side-effect |
1567 | // |
1568 | // For this program we fill out these two arrays: |
1569 | // table: [v2,v3, v2, v3, v4] |
1570 | // index: [0, 2, 3, 4, 5] |
1571 | // |
1572 | // The table is just those "is used by ..." I wrote out above in order, |
1573 | // and the index tracks where an Instruction's span of users starts, table[index[id]]. |
1574 | // The span continues up until the start of the next Instruction, table[index[id+1]]. |
1575 | SkSpan<const Val> Usage::operator[](Val id) const { |
1576 | int begin = fIndex[id]; |
1577 | int end = fIndex[id + 1]; |
1578 | return SkMakeSpan(fTable.data() + begin, end - begin); |
1579 | } |
1580 | |
1581 | Usage::Usage(const std::vector<Instruction>& program) { |
1582 | // uses[id] counts the number of times each Instruction is used. |
1583 | std::vector<int> uses(program.size(), 0); |
1584 | for (Val id = 0; id < (Val)program.size(); id++) { |
1585 | Instruction inst = program[id]; |
1586 | if (inst.x != NA) { ++uses[inst.x]; } |
1587 | if (inst.y != NA) { ++uses[inst.y]; } |
1588 | if (inst.z != NA) { ++uses[inst.z]; } |
1589 | } |
1590 | |
1591 | // Build our index into fTable, with an extra entry marking the final Instruction's end. |
1592 | fIndex.reserve(program.size() + 1); |
1593 | int total_uses = 0; |
1594 | for (int n : uses) { |
1595 | fIndex.push_back(total_uses); |
1596 | total_uses += n; |
1597 | } |
1598 | fIndex.push_back(total_uses); |
1599 | |
1600 | // Tick down each Instruction's uses to fill in fTable. |
1601 | fTable.resize(total_uses, NA); |
1602 | for (Val id = (Val)program.size(); id --> 0; ) { |
1603 | Instruction inst = program[id]; |
1604 | if (inst.x != NA) { fTable[fIndex[inst.x] + --uses[inst.x]] = id; } |
1605 | if (inst.y != NA) { fTable[fIndex[inst.y] + --uses[inst.y]] = id; } |
1606 | if (inst.z != NA) { fTable[fIndex[inst.z] + --uses[inst.z]] = id; } |
1607 | } |
1608 | for (int n : uses ) { (void)n; SkASSERT(n == 0 ); } |
1609 | for (Val id : fTable) { (void)id; SkASSERT(id != NA); } |
1610 | } |
1611 | |
1612 | // ~~~~ Program::eval() and co. ~~~~ // |
1613 | |
1614 | // Handy references for x86-64 instruction encoding: |
1615 | // https://wiki.osdev.org/X86-64_Instruction_Encoding |
1616 | // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm |
1617 | // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm |
1618 | // http://ref.x86asm.net/coder64.html |
1619 | |
// Packs three fields into one byte laid out as 2 bits : 3 bits : 3 bits (hence the name).
// Both mod_rm() and sib() build their bytes with it.
static uint8_t _233(int a, int b, int c) {
    const int top    = (a & 3) << 6,
              middle = (b & 7) << 3,
              bottom = (c & 7) << 0;
    return top | middle | bottom;
}
1626 | |
1627 | // ModRM byte encodes the arguments of an opcode. |
1628 | enum class Mod { Indirect, OneByteImm, FourByteImm, Direct }; |
1629 | static uint8_t mod_rm(Mod mod, int reg, int rm) { |
1630 | return _233((int)mod, reg, rm); |
1631 | } |
1632 | |
1633 | static Mod mod(int imm) { |
1634 | if (imm == 0) { return Mod::Indirect; } |
1635 | if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; } |
1636 | return Mod::FourByteImm; |
1637 | } |
1638 | |
1639 | static int imm_bytes(Mod mod) { |
1640 | switch (mod) { |
1641 | case Mod::Indirect: return 0; |
1642 | case Mod::OneByteImm: return 1; |
1643 | case Mod::FourByteImm: return 4; |
1644 | case Mod::Direct: SkUNREACHABLE; |
1645 | } |
1646 | SkUNREACHABLE; |
1647 | } |
1648 | |
1649 | // SIB byte encodes a memory address, base + (index * scale). |
1650 | static uint8_t sib(Assembler::Scale scale, int index, int base) { |
1651 | return _233((int)scale, index, base); |
1652 | } |
1653 | |
// The REX prefix is used to extend most old 32-bit instructions to 64-bit.
// Its top four bits are always 0100; the low four are the W, R, X, B flags in that order.
static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
                   bool R,   // Extra top bit to select ModRM reg, registers 8-15.
                   bool X,   // Extra top bit for SIB index register.
                   bool B) { // Extra top bit for SIB base or ModRM rm register.
    int prefix = 0b0100'0000;
    if (W) { prefix |= 1 << 3; }
    if (R) { prefix |= 1 << 2; }
    if (X) { prefix |= 1 << 1; }
    if (B) { prefix |= 1 << 0; }
    return prefix;
}
1665 | |
1666 | |
1667 | // The VEX prefix extends SSE operations to AVX. Used generally, even with XMM. |
1668 | struct VEX { |
1669 | int len; |
1670 | uint8_t bytes[3]; |
1671 | }; |
1672 | |
1673 | static VEX vex(bool WE, // Like REX W for int operations, or opcode extension for float? |
1674 | bool R, // Same as REX R. Pass high bit of dst register, dst>>3. |
1675 | bool X, // Same as REX X. |
1676 | bool B, // Same as REX B. Pass y>>3 for 3-arg ops, x>>3 for 2-arg. |
1677 | int map, // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f. |
1678 | int vvvv, // 4-bit second operand register. Pass our x for 3-arg ops. |
1679 | bool L, // Set for 256-bit ymm operations, off for 128-bit xmm. |
1680 | int pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none. |
1681 | |
1682 | // Pack x86 opcode map selector to 5-bit VEX encoding. |
1683 | map = [map]{ |
1684 | switch (map) { |
1685 | case 0x0f: return 0b00001; |
1686 | case 0x380f: return 0b00010; |
1687 | case 0x3a0f: return 0b00011; |
1688 | // Several more cases only used by XOP / TBM. |
1689 | } |
1690 | SkUNREACHABLE; |
1691 | }(); |
1692 | |
1693 | // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding. |
1694 | pp = [pp]{ |
1695 | switch (pp) { |
1696 | case 0x66: return 0b01; |
1697 | case 0xf3: return 0b10; |
1698 | case 0xf2: return 0b11; |
1699 | } |
1700 | return 0b00; |
1701 | }(); |
1702 | |
1703 | VEX vex = {0, {0,0,0}}; |
1704 | if (X == 0 && B == 0 && WE == 0 && map == 0b00001) { |
1705 | // With these conditions met, we can optionally compress VEX to 2-byte. |
1706 | vex.len = 2; |
1707 | vex.bytes[0] = 0xc5; |
1708 | vex.bytes[1] = (pp & 3) << 0 |
1709 | | (L & 1) << 2 |
1710 | | (~vvvv & 15) << 3 |
1711 | | (~(int)R & 1) << 7; |
1712 | } else { |
1713 | // We could use this 3-byte VEX prefix all the time if we like. |
1714 | vex.len = 3; |
1715 | vex.bytes[0] = 0xc4; |
1716 | vex.bytes[1] = (map & 31) << 0 |
1717 | | (~(int)B & 1) << 5 |
1718 | | (~(int)X & 1) << 6 |
1719 | | (~(int)R & 1) << 7; |
1720 | vex.bytes[2] = (pp & 3) << 0 |
1721 | | (L & 1) << 2 |
1722 | | (~vvvv & 15) << 3 |
1723 | | (WE & 1) << 7; |
1724 | } |
1725 | return vex; |
1726 | } |
1727 | |
1728 | Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {} |
1729 | |
1730 | size_t Assembler::size() const { return fSize; } |
1731 | |
1732 | void Assembler::bytes(const void* p, int n) { |
1733 | if (fCurr) { |
1734 | memcpy(fCurr, p, n); |
1735 | fCurr += n; |
1736 | } |
1737 | fSize += n; |
1738 | } |
1739 | |
1740 | void Assembler::byte(uint8_t b) { this->bytes(&b, 1); } |
1741 | void Assembler::word(uint32_t w) { this->bytes(&w, 4); } |
1742 | |
1743 | void Assembler::align(int mod) { |
1744 | while (this->size() % mod) { |
1745 | this->byte(0x00); |
1746 | } |
1747 | } |
1748 | |
1749 | void Assembler::int3() { |
1750 | this->byte(0xcc); |
1751 | } |
1752 | |
1753 | void Assembler::vzeroupper() { |
1754 | this->byte(0xc5); |
1755 | this->byte(0xf8); |
1756 | this->byte(0x77); |
1757 | } |
1758 | void Assembler::ret() { this->byte(0xc3); } |
1759 | |
1760 | // Common instruction building for 64-bit opcodes with an immediate argument. |
1761 | void Assembler::op(int opcode, int opcode_ext, GP64 dst, int imm) { |
1762 | opcode |= 0b0000'0001; // low bit set for 64-bit operands |
1763 | opcode |= 0b1000'0000; // top bit set for instructions with any immediate |
1764 | |
1765 | int imm_bytes = 4; |
1766 | if (SkTFitsIn<int8_t>(imm)) { |
1767 | imm_bytes = 1; |
1768 | opcode |= 0b0000'0010; // second bit set for 8-bit immediate, else 32-bit. |
1769 | } |
1770 | |
1771 | this->byte(rex(1,0,0,dst>>3)); |
1772 | this->byte(opcode); |
1773 | this->byte(mod_rm(Mod::Direct, opcode_ext, dst&7)); |
1774 | this->bytes(&imm, imm_bytes); |
1775 | } |
1776 | |
// 64-bit register-with-immediate ops.  All three share the encoding built by op() above and
// differ only in the 3-bit opcode extension passed as its second argument.
void Assembler::add(GP64 dst, int imm) { this->op(0,0b000, dst,imm); }
void Assembler::sub(GP64 dst, int imm) { this->op(0,0b101, dst,imm); }
void Assembler::cmp(GP64 reg, int imm) { this->op(0,0b111, reg,imm); }
1780 | |
1781 | void Assembler::movq(GP64 dst, GP64 src, int off) { |
1782 | this->byte(rex(1,dst>>3,0,src>>3)); |
1783 | this->byte(0x8b); |
1784 | this->byte(mod_rm(mod(off), dst&7, src&7)); |
1785 | this->bytes(&off, imm_bytes(mod(off))); |
1786 | } |
1787 | |
1788 | void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W/*=false*/) { |
1789 | VEX v = vex(W, dst>>3, 0, y>>3, |
1790 | map, x, 1/*ymm, not xmm*/, prefix); |
1791 | this->bytes(v.bytes, v.len); |
1792 | this->byte(opcode); |
1793 | this->byte(mod_rm(Mod::Direct, dst&7, y&7)); |
1794 | } |
1795 | |
// Three-operand ymm instructions.  Each is just its (mandatory prefix, opcode map, opcode)
// triple handed to op(); the names follow the instruction mnemonics.
// Those taking a YmmOrLabel go through the op() overload that accepts either a register
// or a Label* for the last operand.

// Prefix 0x66: 32-bit lane ops (d suffix).
void Assembler::vpaddd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66, 0x0f,0xfe, dst,x,y); }
void Assembler::vpsubd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66, 0x0f,0xfa, dst,x,y); }
void Assembler::vpmulld(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x40, dst,x,y); }

// Prefix 0x66: 16-bit lane ops (w suffix).
void Assembler::vpsubw (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
void Assembler::vpmullw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xd5, dst,x,y); }

// Prefix 0x66: bitwise ops.
void Assembler::vpand (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
void Assembler::vpor (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
void Assembler::vpxor (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xef, dst,x,y); }
void Assembler::vpandn(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdf, dst,x,y); }

// No mandatory prefix: float ops (ps suffix).
void Assembler::vaddps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x58, dst,x,y); }
void Assembler::vsubps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5c, dst,x,y); }
void Assembler::vmulps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x59, dst,x,y); }
void Assembler::vdivps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5e, dst,x,y); }
void Assembler::vminps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5d, dst,x,y); }
void Assembler::vmaxps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5f, dst,x,y); }

// Fused multiply-add family, all in map 0x380f.  Within each group the 132/213/231
// variants sit 0x10 apart; the fmsub and fnmadd opcodes are the fmadd ones plus 2 and 4.
void Assembler::vfmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x98, dst,x,y); }
void Assembler::vfmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
void Assembler::vfmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

void Assembler::vfmsub132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
void Assembler::vfmsub213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
void Assembler::vfmsub231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xba, dst,x,y); }

void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xac, dst,x,y); }
void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xbc, dst,x,y); }

// Packs.  Note the two live in different opcode maps.
void Assembler::vpackusdw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
void Assembler::vpackuswb(Ymm dst, Ymm x, Ymm y) { this->op(0x66, 0x0f,0x67, dst,x,y); }

// 32-bit lane compares.
void Assembler::vpcmpeqd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x76, dst,x,y); }
void Assembler::vpcmpgtd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x66, dst,x,y); }
1832 | |
1833 | void Assembler::vcmpps(Ymm dst, Ymm x, Ymm y, int imm) { |
1834 | this->op(0,0x0f,0xc2, dst,x,y); |
1835 | this->byte(imm); |
1836 | } |
1837 | |
1838 | void Assembler::vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z) { |
1839 | int prefix = 0x66, |
1840 | map = 0x3a0f, |
1841 | opcode = 0x4c; |
1842 | VEX v = vex(0, dst>>3, 0, y>>3, |
1843 | map, x, /*ymm?*/1, prefix); |
1844 | this->bytes(v.bytes, v.len); |
1845 | this->byte(opcode); |
1846 | this->byte(mod_rm(Mod::Direct, dst&7, y&7)); |
1847 | this->byte(z << 4); |
1848 | } |
1849 | |
1850 | // dst = x op /opcode_ext imm |
1851 | void Assembler::op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm) { |
1852 | // This is a little weird, but if we pass the opcode_ext as if it were the dst register, |
1853 | // the dst register as if x, and the x register as if y, all the bits end up where we want. |
1854 | this->op(prefix, map, opcode, (Ymm)opcode_ext,dst,x); |
1855 | this->byte(imm); |
1856 | } |
1857 | |
// Shifts by an immediate, all encoded through the opcode-extension form of op() above.
// The d variants share opcode 0x72 and are told apart by the extension (6, 2, 4).
void Assembler::vpslld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,6, dst,x,imm); }
void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,2, dst,x,imm); }
void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,4, dst,x,imm); }

// The w variant uses opcode 0x71 with the same extension as vpsrld.
void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x71,2, dst,x,imm); }
1863 | |
1864 | |
1865 | void Assembler::vpermq(Ymm dst, Ymm x, int imm) { |
1866 | // A bit unusual among the instructions we use, this is 64-bit operation, so we set W. |
1867 | bool W = true; |
1868 | this->op(0x66,0x3a0f,0x00, dst,x,W); |
1869 | this->byte(imm); |
1870 | } |
1871 | |
1872 | void Assembler::vroundps(Ymm dst, Ymm x, int imm) { |
1873 | this->op(0x66,0x3a0f,0x08, dst,x); |
1874 | this->byte(imm); |
1875 | } |
1876 | |
// Two-operand ymm instructions.  These call an op() overload taking only dst and one
// source register; it is not defined in this part of the file (presumably in the header).
void Assembler::vmovdqa(Ymm dst, Ymm src) { this->op(0x66,0x0f,0x6f, dst,src); }

// The three conversions share opcode 0x5b and are distinguished only by mandatory prefix.
void Assembler::vcvtdq2ps (Ymm dst, Ymm x) { this->op( 0,0x0f,0x5b, dst,x); }
void Assembler::vcvttps2dq(Ymm dst, Ymm x) { this->op(0xf3,0x0f,0x5b, dst,x); }
void Assembler::vcvtps2dq (Ymm dst, Ymm x) { this->op(0x66,0x0f,0x5b, dst,x); }
void Assembler::vsqrtps (Ymm dst, Ymm x) { this->op( 0,0x0f,0x51, dst,x); }
1883 | |
1884 | Assembler::Label Assembler::here() { |
1885 | return { (int)this->size(), Label::NotYetSet, {} }; |
1886 | } |
1887 | |
1888 | int Assembler::disp19(Label* l) { |
1889 | SkASSERT(l->kind == Label::NotYetSet || |
1890 | l->kind == Label::ARMDisp19); |
1891 | l->kind = Label::ARMDisp19; |
1892 | l->references.push_back(here().offset); |
1893 | // ARM 19-bit instruction count, from the beginning of this instruction. |
1894 | return (l->offset - here().offset) / 4; |
1895 | } |
1896 | |
1897 | int Assembler::disp32(Label* l) { |
1898 | SkASSERT(l->kind == Label::NotYetSet || |
1899 | l->kind == Label::X86Disp32); |
1900 | l->kind = Label::X86Disp32; |
1901 | l->references.push_back(here().offset); |
1902 | // x86 32-bit byte count, from the end of this instruction. |
1903 | return l->offset - (here().offset + 4); |
1904 | } |
1905 | |
1906 | void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l) { |
1907 | // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13. |
1908 | const int rip = rbp; |
1909 | |
1910 | VEX v = vex(0, dst>>3, 0, rip>>3, |
1911 | map, x, /*ymm?*/1, prefix); |
1912 | this->bytes(v.bytes, v.len); |
1913 | this->byte(opcode); |
1914 | this->byte(mod_rm(Mod::Indirect, dst&7, rip&7)); |
1915 | this->word(this->disp32(l)); |
1916 | } |
1917 | |
1918 | void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, YmmOrLabel y) { |
1919 | y.label ? this->op(prefix,map,opcode,dst,x, y.label) |
1920 | : this->op(prefix,map,opcode,dst,x, y.ymm ); |
1921 | } |
1922 | |
// Instructions whose last operand comes from a Label, via the Label* form of op().
// vptest and vbroadcastss have no middle operand, so they pass (Ymm)0 for x.
void Assembler::vpshufb(Ymm dst, Ymm x, Label* l) { this->op(0x66,0x380f,0x00, dst,x,l); }
void Assembler::vptest(Ymm dst, Label* l) { this->op(0x66, 0x380f, 0x17, dst, (Ymm)0, l); }

void Assembler::vbroadcastss(Ymm dst, Label* l) { this->op(0x66,0x380f,0x18, dst, (Ymm)0, l); }
// Register-source form: the Xmm is cast to Ymm and passed as the only source operand.
void Assembler::vbroadcastss(Ymm dst, Xmm src) { this->op(0x66,0x380f,0x18, dst, (Ymm)src); }
1928 | void Assembler::vbroadcastss(Ymm dst, GP64 ptr, int off) { |
1929 | int prefix = 0x66, |
1930 | map = 0x380f, |
1931 | opcode = 0x18; |
1932 | VEX v = vex(0, dst>>3, 0, ptr>>3, |
1933 | map, 0, /*ymm?*/1, prefix); |
1934 | this->bytes(v.bytes, v.len); |
1935 | this->byte(opcode); |
1936 | |
1937 | this->byte(mod_rm(mod(off), dst&7, ptr&7)); |
1938 | this->bytes(&off, imm_bytes(mod(off))); |
1939 | } |
1940 | |
1941 | void Assembler::jump(uint8_t condition, Label* l) { |
1942 | // These conditional jumps can be either 2 bytes (short) or 6 bytes (near): |
1943 | // 7? one-byte-disp |
1944 | // 0F 8? four-byte-disp |
1945 | // We always use the near displacement to make updating labels simpler (no resizing). |
1946 | this->byte(0x0f); |
1947 | this->byte(condition); |
1948 | this->word(this->disp32(l)); |
1949 | } |
1950 | void Assembler::je (Label* l) { this->jump(0x84, l); } |
1951 | void Assembler::jne(Label* l) { this->jump(0x85, l); } |
1952 | void Assembler::jl (Label* l) { this->jump(0x8c, l); } |
1953 | void Assembler::jc (Label* l) { this->jump(0x82, l); } |
1954 | |
1955 | void Assembler::jmp(Label* l) { |
1956 | // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit. |
1957 | this->byte(0xe9); |
1958 | this->word(this->disp32(l)); |
1959 | } |
1960 | |
1961 | void Assembler::load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr) { |
1962 | VEX v = vex(0, ymm>>3, 0, ptr>>3, |
1963 | map, 0, /*ymm?*/1, prefix); |
1964 | this->bytes(v.bytes, v.len); |
1965 | this->byte(opcode); |
1966 | this->byte(mod_rm(Mod::Indirect, ymm&7, ptr&7)); |
1967 | } |
1968 | |
// Loads through a pointer register: load_store() takes the ymm register first, then the pointer.
void Assembler::vmovups (Ymm dst, GP64 src) { this->load_store(0 , 0x0f,0x10, dst,src); }
void Assembler::vpmovzxwd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x33, dst,src); }
void Assembler::vpmovzxbd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x31, dst,src); }

// The store is opcode 0x11 rather than 0x10, and passes its arguments swapped so the ymm
// register still comes first.
void Assembler::vmovups (GP64 dst, Ymm src) { this->load_store(0 , 0x0f,0x11, src,dst); }
1974 | void Assembler::vmovups (GP64 dst, Xmm src) { |
1975 | // Same as vmovups(GP64,YMM) and load_store() except ymm? is 0. |
1976 | int prefix = 0, |
1977 | map = 0x0f, |
1978 | opcode = 0x11; |
1979 | VEX v = vex(0, src>>3, 0, dst>>3, |
1980 | map, 0, /*ymm?*/0, prefix); |
1981 | this->bytes(v.bytes, v.len); |
1982 | this->byte(opcode); |
1983 | this->byte(mod_rm(Mod::Indirect, src&7, dst&7)); |
1984 | } |
1985 | |
1986 | void Assembler::stack_load_store(int prefix, int map, int opcode, Ymm ymm, int off) { |
1987 | VEX v = vex(0, ymm>>3, 0, rsp>>3/*i.e. 0*/, |
1988 | map, 0, /*ymm?*/1, prefix); |
1989 | this->bytes(v.bytes, v.len); |
1990 | this->byte(opcode); |
1991 | this->byte(mod_rm(mod(off), ymm&7, rsp/*use SIB*/)); |
1992 | this->byte(sib(ONE, rsp/*no index*/, rsp)); |
1993 | this->bytes(&off, imm_bytes(mod(off))); |
1994 | } |
1995 | void Assembler::vmovups(Ymm dst, int off) { this->stack_load_store(0, 0x0f, 0x10, dst,off); } |
1996 | void Assembler::vmovups(int off, Ymm src) { this->stack_load_store(0, 0x0f, 0x11, src,off); } |
1997 | |
1998 | void Assembler::vmovq(GP64 dst, Xmm src) { |
1999 | int prefix = 0x66, |
2000 | map = 0x0f, |
2001 | opcode = 0xd6; |
2002 | VEX v = vex(0, src>>3, 0, dst>>3, |
2003 | map, 0, /*ymm?*/0, prefix); |
2004 | this->bytes(v.bytes, v.len); |
2005 | this->byte(opcode); |
2006 | this->byte(mod_rm(Mod::Indirect, src&7, dst&7)); |
2007 | } |
2008 | |
2009 | void Assembler::vmovd(GP64 dst, Xmm src) { |
2010 | int prefix = 0x66, |
2011 | map = 0x0f, |
2012 | opcode = 0x7e; |
2013 | VEX v = vex(0, src>>3, 0, dst>>3, |
2014 | map, 0, /*ymm?*/0, prefix); |
2015 | this->bytes(v.bytes, v.len); |
2016 | this->byte(opcode); |
2017 | this->byte(mod_rm(Mod::Indirect, src&7, dst&7)); |
2018 | } |
2019 | |
2020 | void Assembler::vmovd_direct(GP64 dst, Xmm src) { |
2021 | int prefix = 0x66, |
2022 | map = 0x0f, |
2023 | opcode = 0x7e; |
2024 | VEX v = vex(0, src>>3, 0, dst>>3, |
2025 | map, 0, /*ymm?*/0, prefix); |
2026 | this->bytes(v.bytes, v.len); |
2027 | this->byte(opcode); |
2028 | this->byte(mod_rm(Mod::Direct, src&7, dst&7)); |
2029 | } |
2030 | |
2031 | void Assembler::vmovd(Xmm dst, GP64 src) { |
2032 | int prefix = 0x66, |
2033 | map = 0x0f, |
2034 | opcode = 0x6e; |
2035 | VEX v = vex(0, dst>>3, 0, src>>3, |
2036 | map, 0, /*ymm?*/0, prefix); |
2037 | this->bytes(v.bytes, v.len); |
2038 | this->byte(opcode); |
2039 | this->byte(mod_rm(Mod::Indirect, dst&7, src&7)); |
2040 | } |
2041 | |
2042 | void Assembler::vmovd(Xmm dst, Scale scale, GP64 index, GP64 base) { |
2043 | int prefix = 0x66, |
2044 | map = 0x0f, |
2045 | opcode = 0x6e; |
2046 | VEX v = vex(0, dst>>3, index>>3, base>>3, |
2047 | map, 0, /*ymm?*/0, prefix); |
2048 | this->bytes(v.bytes, v.len); |
2049 | this->byte(opcode); |
2050 | this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/)); |
2051 | this->byte(sib(scale, index&7, base&7)); |
2052 | } |
2053 | |
2054 | void Assembler::vmovd_direct(Xmm dst, GP64 src) { |
2055 | int prefix = 0x66, |
2056 | map = 0x0f, |
2057 | opcode = 0x6e; |
2058 | VEX v = vex(0, dst>>3, 0, src>>3, |
2059 | map, 0, /*ymm?*/0, prefix); |
2060 | this->bytes(v.bytes, v.len); |
2061 | this->byte(opcode); |
2062 | this->byte(mod_rm(Mod::Direct, dst&7, src&7)); |
2063 | } |
2064 | |
2065 | void Assembler::movzbl(GP64 dst, GP64 src, int off) { |
2066 | if ((dst>>3) || (src>>3)) { |
2067 | this->byte(rex(0,dst>>3,0,src>>3)); |
2068 | } |
2069 | this->byte(0x0f); |
2070 | this->byte(0xb6); |
2071 | this->byte(mod_rm(mod(off), dst&7, src&7)); |
2072 | this->bytes(&off, imm_bytes(mod(off))); |
2073 | } |
2074 | |
2075 | |
2076 | void Assembler::movb(GP64 dst, GP64 src) { |
2077 | if ((dst>>3) || (src>>3)) { |
2078 | this->byte(rex(0,src>>3,0,dst>>3)); |
2079 | } |
2080 | this->byte(0x88); |
2081 | this->byte(mod_rm(Mod::Indirect, src&7, dst&7)); |
2082 | } |
2083 | |
2084 | void Assembler::vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm) { |
2085 | int prefix = 0x66, |
2086 | map = 0x0f, |
2087 | opcode = 0xc4; |
2088 | VEX v = vex(0, dst>>3, 0, ptr>>3, |
2089 | map, src, /*ymm?*/0, prefix); |
2090 | this->bytes(v.bytes, v.len); |
2091 | this->byte(opcode); |
2092 | this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7)); |
2093 | this->byte(imm); |
2094 | } |
2095 | |
2096 | void Assembler::vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm) { |
2097 | int prefix = 0x66, |
2098 | map = 0x3a0f, |
2099 | opcode = 0x20; |
2100 | VEX v = vex(0, dst>>3, 0, ptr>>3, |
2101 | map, src, /*ymm?*/0, prefix); |
2102 | this->bytes(v.bytes, v.len); |
2103 | this->byte(opcode); |
2104 | this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7)); |
2105 | this->byte(imm); |
2106 | } |
2107 | |
2108 | void Assembler::vpextrw(GP64 ptr, Xmm src, int imm) { |
2109 | int prefix = 0x66, |
2110 | map = 0x3a0f, |
2111 | opcode = 0x15; |
2112 | |
2113 | VEX v = vex(0, src>>3, 0, ptr>>3, |
2114 | map, 0, /*ymm?*/0, prefix); |
2115 | this->bytes(v.bytes, v.len); |
2116 | this->byte(opcode); |
2117 | this->byte(mod_rm(Mod::Indirect, src&7, ptr&7)); |
2118 | this->byte(imm); |
2119 | } |
2120 | void Assembler::vpextrb(GP64 ptr, Xmm src, int imm) { |
2121 | int prefix = 0x66, |
2122 | map = 0x3a0f, |
2123 | opcode = 0x14; |
2124 | |
2125 | VEX v = vex(0, src>>3, 0, ptr>>3, |
2126 | map, 0, /*ymm?*/0, prefix); |
2127 | this->bytes(v.bytes, v.len); |
2128 | this->byte(opcode); |
2129 | this->byte(mod_rm(Mod::Indirect, src&7, ptr&7)); |
2130 | this->byte(imm); |
2131 | } |
2132 | |
2133 | void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) { |
2134 | // Unlike most instructions, no aliasing is permitted here. |
2135 | SkASSERT(dst != ix); |
2136 | SkASSERT(dst != mask); |
2137 | SkASSERT(mask != ix); |
2138 | |
2139 | int prefix = 0x66, |
2140 | map = 0x380f, |
2141 | opcode = 0x92; |
2142 | VEX v = vex(0, dst>>3, ix>>3, base>>3, |
2143 | map, mask, /*ymm?*/1, prefix); |
2144 | this->bytes(v.bytes, v.len); |
2145 | this->byte(opcode); |
2146 | this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/)); |
2147 | this->byte(sib(scale, ix&7, base&7)); |
2148 | } |
2149 | |
2150 | // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf |
2151 | |
// User-defined literal: N_mask is an int with its low N bits set, e.g. 5_mask == 0b11111.
//
// The mask is computed in unsigned arithmetic so that 31_mask is well defined
// (a signed (1<<31)-1 overflows), and a width of 32 or more yields all ones (-1)
// rather than an out-of-range shift.  constexpr lets every use fold at compile time.
static constexpr int operator""_mask(unsigned long long bits) {
    return bits >= 32 ? -1
                      : (int)((1u << bits) - 1u);
}
2153 | |
// Emits one 32-bit instruction word assembled from five fields, from the top down:
//    hi: 11 bits at bit 21,  m: 5 bits at bit 16,  lo: 6 bits at bit 10,
//    n:   5 bits at bit 5,   d: 5 bits at bit 0.
// Each field is masked to its width before being shifted into place.
void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
    this->word( (hi & 11_mask) << 21
              | (m & 5_mask) << 16
              | (lo & 6_mask) << 10
              | (n & 5_mask) << 5
              | (d & 5_mask) << 0);
}

// Three-register instructions (d, n, m), each just its hi and lo bit patterns handed to
// op() above.  The digit separators in the literals mark the encoding's sub-fields.
// not16b, like fneg4s below, takes two registers and uses a different op() overload,
// one taking a single wider bit pattern plus n and d; that overload is not defined
// in this part of the file.
void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
void Assembler::not16b(V d, V n) { this->op(0b0'1'1'01110'00'10000'00101'10, n, d); }

void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }

void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }

void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }
void Assembler::fneg4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n, d); }

void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }

void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }

void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
2195 | |
// Emits one 32-bit instruction word from a 22-bit pattern placed at bit 10, an immediate
// OR'd in at bit 16, and 5-bit n and d register fields at bits 5 and 0.
// Unlike the other fields, imm is not masked here; callers must pass only as many bits
// as their instruction leaves room for.
void Assembler::op(uint32_t op22, int imm, V n, V d) {
    this->word( (op22 & 22_mask) << 10
              | imm << 16 // imm is embedded inside op, bit size depends on op
              | (n & 5_mask) << 5
              | (d & 5_mask) << 0);
}

// Shifts by an immediate.  The left shifts (sli, shl) pass imm masked to 5 bits.
// The right shifts pass the negated amount masked to the lane's bit count
// (-imm & 31 for 4s, -imm & 15 for 8h).
void Assembler::sli4s(V d, V n, int imm) {
    this->op(0b0'1'1'011110'0100'000'01010'1, ( imm&31), n, d);
}
void Assembler::shl4s(V d, V n, int imm) {
    this->op(0b0'1'0'011110'0100'000'01010'1, ( imm&31), n, d);
}
void Assembler::sshr4s(V d, V n, int imm) {
    this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
}
void Assembler::ushr4s(V d, V n, int imm) {
    this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
}
void Assembler::ushr8h(V d, V n, int imm) {
    this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
}

// Two-register instructions.  These call an op() overload taking just the bit pattern,
// n and d; it is not defined in this part of the file (presumably in the header).
void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }

void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }

void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }

void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
2230 | |
// Emits a fixed 32-bit pattern with the low 16 bits of imm16 placed at bit 5.
void Assembler::brk(int imm16) {
    this->word(0b11010100'001'0000000000000000'000'00
              | (imm16 & 16_mask) << 5);
}

// Emits a fixed pattern with the 5-bit register n at bit 5.
void Assembler::ret(X n) {
    this->word(0b1101011'0'0'10'11111'0000'0'0 << 10
              | (n & 5_mask) << 5);
}

// add, sub and subs on X registers share one layout: a 10-bit pattern at bit 22, a 12-bit
// immediate at bit 10, then 5-bit n and d fields at bits 5 and 0.  Only the second and
// third bits of the pattern differ between them.  imm12 is silently truncated to 12 bits.
void Assembler::add(X d, X n, int imm12) {
    this->word(0b1'0'0'10001'00 << 22
              | (imm12 & 12_mask) << 10
              | (n & 5_mask) << 5
              | (d & 5_mask) << 0);
}
void Assembler::sub(X d, X n, int imm12) {
    this->word( 0b1'1'0'10001'00 << 22
              | (imm12 & 12_mask) << 10
              | (n & 5_mask) << 5
              | (d & 5_mask) << 0);
}
void Assembler::subs(X d, X n, int imm12) {
    this->word( 0b1'1'1'10001'00 << 22
              | (imm12 & 12_mask) << 10
              | (n & 5_mask) << 5
              | (d & 5_mask) << 0);
}
2259 | |
// Branches to a Label.  Each asks disp19() for the displacement first, which also records
// this instruction as a reference to l, and then emits an 8-bit pattern at bit 24, the
// displacement masked to 19 bits at bit 5, and a condition or register in the low bits.
void Assembler::b(Condition cond, Label* l) {
    const int imm19 = this->disp19(l);
    this->word( 0b0101010'0 << 24
              | (imm19 & 19_mask) << 5
              | ((int)cond & 4_mask) << 0);
}
void Assembler::cbz(X t, Label* l) {
    const int imm19 = this->disp19(l);
    this->word( 0b1'011010'0 << 24
              | (imm19 & 19_mask) << 5
              | (t & 5_mask) << 0);
}
void Assembler::cbnz(X t, Label* l) {
    const int imm19 = this->disp19(l);
    this->word( 0b1'011010'1 << 24
              | (imm19 & 19_mask) << 5
              | (t & 5_mask) << 0);
}

// Loads and stores between a V register and the address in an X register.  All pass a
// 22-bit pattern ending in twelve zero bits, then the X register, then the V register.
// They call an op() overload taking (pattern, X, V) that is not defined in this part of
// the file (presumably in the header).
void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }
void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }
void Assembler::ldrb(V dst, X src) { this->op(0b00'111'1'01'01'000000000000, src, dst); }

void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }
void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }
void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); }

// Emits a fixed pattern at bit 10 with the V register src at bit 5 and the X register dst
// at bit 0.
void Assembler::fmovs(X dst, V src) {
    this->word(0b0'0'0'11110'00'1'00'110'000000 << 10
              | (src & 5_mask) << 5
              | (dst & 5_mask) << 0);
}

// Label-relative load: same displacement handling as the branches above, with the
// destination V register in the low 5 bits.
void Assembler::ldrq(V dst, Label* l) {
    const int imm19 = this->disp19(l);
    this->word( 0b10'011'1'00 << 24
              | (imm19 & 19_mask) << 5
              | (dst & 5_mask) << 0);
}
2299 | |
2300 | void Assembler::label(Label* l) { |
2301 | if (fCode) { |
2302 | // The instructions all currently point to l->offset. |
2303 | // We'll want to add a delta to point them to here(). |
2304 | int delta = here().offset - l->offset; |
2305 | l->offset = here().offset; |
2306 | |
2307 | if (l->kind == Label::ARMDisp19) { |
2308 | for (int ref : l->references) { |
2309 | // ref points to a 32-bit instruction with 19-bit displacement in instructions. |
2310 | uint32_t inst; |
2311 | memcpy(&inst, fCode + ref, 4); |
2312 | |
2313 | // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ] |
2314 | int disp = (int)(inst << 8) >> 13; |
2315 | |
2316 | disp += delta/4; // delta is in bytes, we want instructions. |
2317 | |
2318 | // Put it all back together, preserving the high 8 bits and low 5. |
2319 | inst = ((disp << 5) & (19_mask << 5)) |
2320 | | ((inst ) & ~(19_mask << 5)); |
2321 | |
2322 | memcpy(fCode + ref, &inst, 4); |
2323 | } |
2324 | } |
2325 | |
2326 | if (l->kind == Label::X86Disp32) { |
2327 | for (int ref : l->references) { |
2328 | // ref points to a 32-bit displacement in bytes. |
2329 | int disp; |
2330 | memcpy(&disp, fCode + ref, 4); |
2331 | |
2332 | disp += delta; |
2333 | |
2334 | memcpy(fCode + ref, &disp, 4); |
2335 | } |
2336 | } |
2337 | } |
2338 | } |
2339 | |
2340 | void Program::eval(int n, void* args[]) const { |
2341 | #define SKVM_JIT_STATS 0 |
2342 | #if SKVM_JIT_STATS |
2343 | static std::atomic<int64_t> calls{0}, jits{0}, |
2344 | pixels{0}, fast{0}; |
2345 | pixels += n; |
2346 | if (0 == calls++) { |
2347 | atexit([]{ |
2348 | int64_t num = jits .load(), |
2349 | den = calls.load(); |
2350 | SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n" , (100.0 * num)/den, den); |
2351 | num = fast .load(); |
2352 | den = pixels.load(); |
2353 | SkDebugf("%.3g%% of %lld pixels went through JIT.\n" , (100.0 * num)/den, den); |
2354 | }); |
2355 | } |
2356 | #endif |
2357 | // This may fail either simply because we can't JIT, or when using LLVM, |
2358 | // because the work represented by fImpl->llvm_compiling hasn't finished yet. |
2359 | if (const void* b = fImpl->jit_entry.load()) { |
2360 | #if SKVM_JIT_STATS |
2361 | jits++; |
2362 | fast += n; |
2363 | #endif |
2364 | void** a = args; |
2365 | switch (fImpl->strides.size()) { |
2366 | case 0: return ((void(*)(int ))b)(n ); |
2367 | case 1: return ((void(*)(int,void* ))b)(n,a[0] ); |
2368 | case 2: return ((void(*)(int,void*,void* ))b)(n,a[0],a[1] ); |
2369 | case 3: return ((void(*)(int,void*,void*,void* ))b)(n,a[0],a[1],a[2] ); |
2370 | case 4: return ((void(*)(int,void*,void*,void*,void*))b)(n,a[0],a[1],a[2],a[3]); |
2371 | case 5: return ((void(*)(int,void*,void*,void*,void*,void*))b) |
2372 | (n,a[0],a[1],a[2],a[3],a[4]); |
2373 | default: SkUNREACHABLE; // TODO |
2374 | } |
2375 | } |
2376 | |
2377 | // So we'll sometimes use the interpreter here even if later calls will use the JIT. |
2378 | SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(), |
2379 | this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(), |
2380 | n, args); |
2381 | } |
2382 | |
2383 | #if defined(SKVM_LLVM) |
2384 | void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions, |
2385 | const char* debug_name) { |
2386 | auto ctx = std::make_unique<llvm::LLVMContext>(); |
2387 | |
2388 | auto mod = std::make_unique<llvm::Module>("" , *ctx); |
2389 | // All the scary bare pointers from here on are owned by ctx or mod, I think. |
2390 | |
2391 | // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines. |
2392 | const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4; |
2393 | |
2394 | llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(), |
2395 | *i32 = llvm::Type::getInt32Ty(*ctx); |
2396 | |
2397 | std::vector<llvm::Type*> arg_types = { i32 }; |
2398 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2399 | arg_types.push_back(ptr); |
2400 | } |
2401 | |
2402 | llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx), |
2403 | arg_types, /*vararg?=*/false); |
2404 | llvm::Function* fn |
2405 | = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod); |
2406 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2407 | fn->addParamAttr(i+1, llvm::Attribute::NoAlias); |
2408 | } |
2409 | |
2410 | llvm::BasicBlock *enter = llvm::BasicBlock::Create(*ctx, "enter" , fn), |
2411 | *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK" , fn), |
2412 | *testK = llvm::BasicBlock::Create(*ctx, "testK" , fn), |
2413 | *loopK = llvm::BasicBlock::Create(*ctx, "loopK" , fn), |
2414 | *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1" , fn), |
2415 | *test1 = llvm::BasicBlock::Create(*ctx, "test1" , fn), |
2416 | *loop1 = llvm::BasicBlock::Create(*ctx, "loop1" , fn), |
2417 | *leave = llvm::BasicBlock::Create(*ctx, "leave" , fn); |
2418 | |
2419 | using IRBuilder = llvm::IRBuilder<>; |
2420 | |
2421 | llvm::PHINode* n; |
2422 | std::vector<llvm::PHINode*> args; |
2423 | std::vector<llvm::Value*> vals(instructions.size()); |
2424 | |
2425 | auto emit = [&](size_t i, bool scalar, IRBuilder* b) { |
2426 | auto [op, x,y,z, immy,immz, death,can_hoist,used_in_loop] = instructions[i]; |
2427 | |
2428 | llvm::Type *i1 = llvm::Type::getInt1Ty (*ctx), |
2429 | *i8 = llvm::Type::getInt8Ty (*ctx), |
2430 | *i16 = llvm::Type::getInt16Ty(*ctx), |
2431 | *i16x2 = llvm::VectorType::get(i16, 2), |
2432 | *f32 = llvm::Type::getFloatTy(*ctx), |
2433 | *I1 = scalar ? i1 : llvm::VectorType::get(i1 , K ), |
2434 | *I8 = scalar ? i8 : llvm::VectorType::get(i8 , K ), |
2435 | *I16 = scalar ? i16 : llvm::VectorType::get(i16, K ), |
2436 | *I16x2 = scalar ? i16x2 : llvm::VectorType::get(i16, K*2), |
2437 | *I32 = scalar ? i32 : llvm::VectorType::get(i32, K ), |
2438 | *F32 = scalar ? f32 : llvm::VectorType::get(f32, K ); |
2439 | |
2440 | auto I = [&](llvm::Value* v) { return b->CreateBitCast(v, I32 ); }; |
2441 | auto F = [&](llvm::Value* v) { return b->CreateBitCast(v, F32 ); }; |
2442 | auto x2 = [&](llvm::Value* v) { return b->CreateBitCast(v, I16x2); }; |
2443 | |
2444 | auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); }; |
2445 | |
2446 | switch (llvm::Type* t = nullptr; op) { |
2447 | default: |
2448 | SkDebugf("can't llvm %s (%d)\n" , name(op), op); |
2449 | return false; |
2450 | |
2451 | case Op::assert_true: /*TODO*/ break; |
2452 | |
2453 | case Op::index: |
2454 | if (I32->isVectorTy()) { |
2455 | std::vector<llvm::Constant*> iota(K); |
2456 | for (int j = 0; j < K; j++) { |
2457 | iota[j] = b->getInt32(j); |
2458 | } |
2459 | vals[i] = b->CreateSub(b->CreateVectorSplat(K, n), |
2460 | llvm::ConstantVector::get(iota)); |
2461 | } else { |
2462 | vals[i] = n; |
2463 | } break; |
2464 | |
2465 | case Op::load8: t = I8 ; goto load; |
2466 | case Op::load16: t = I16; goto load; |
2467 | case Op::load32: t = I32; goto load; |
2468 | load: { |
2469 | llvm::Value* ptr = b->CreateBitCast(args[immy], t->getPointerTo()); |
2470 | vals[i] = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), I32); |
2471 | } break; |
2472 | |
2473 | |
2474 | case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immy); break; |
2475 | |
2476 | case Op::uniform8: t = i8 ; goto uniform; |
2477 | case Op::uniform16: t = i16; goto uniform; |
2478 | case Op::uniform32: t = i32; goto uniform; |
2479 | uniform: { |
2480 | llvm::Value* ptr = b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr, |
2481 | args[immy], |
2482 | immz), |
2483 | t->getPointerTo()); |
2484 | llvm::Value* val = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), i32); |
2485 | vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val) |
2486 | : val; |
2487 | } break; |
2488 | |
2489 | case Op::gather8: t = i8 ; goto gather; |
2490 | case Op::gather16: t = i16; goto gather; |
2491 | case Op::gather32: t = i32; goto gather; |
2492 | gather: { |
2493 | // Our gather base pointer is immz bytes off of uniform immy. |
2494 | llvm::Value* base = |
2495 | b->CreateLoad(b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr, |
2496 | args[immy], |
2497 | immz), |
2498 | t->getPointerTo()->getPointerTo())); |
2499 | |
2500 | llvm::Value* ptr = b->CreateInBoundsGEP(nullptr, base, vals[x]); |
2501 | llvm::Value* gathered; |
2502 | if (ptr->getType()->isVectorTy()) { |
2503 | gathered = b->CreateMaskedGather(ptr, 1); |
2504 | } else { |
2505 | gathered = b->CreateAlignedLoad(ptr, 1); |
2506 | } |
2507 | vals[i] = b->CreateZExt(gathered, I32); |
2508 | } break; |
2509 | |
2510 | case Op::store8: t = I8 ; goto store; |
2511 | case Op::store16: t = I16; goto store; |
2512 | case Op::store32: t = I32; goto store; |
2513 | store: { |
2514 | llvm::Value* val = b->CreateTrunc(vals[x], t); |
2515 | llvm::Value* ptr = b->CreateBitCast(args[immy], |
2516 | val->getType()->getPointerTo()); |
2517 | vals[i] = b->CreateAlignedStore(val, ptr, 1); |
2518 | } break; |
2519 | |
2520 | case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break; |
2521 | case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break; |
2522 | case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break; |
2523 | case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break; |
2524 | |
2525 | case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break; |
2526 | |
2527 | case Op::select: |
2528 | vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]); |
2529 | break; |
2530 | |
2531 | case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break; |
2532 | case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break; |
2533 | case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break; |
2534 | |
2535 | case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immy); break; |
2536 | case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immy); break; |
2537 | case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immy); break; |
2538 | |
2539 | case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break; |
2540 | case Op::neq_i32: vals[i] = S(I32, b->CreateICmpNE (vals[x], vals[y])); break; |
2541 | case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break; |
2542 | case Op::gte_i32: vals[i] = S(I32, b->CreateICmpSGE(vals[x], vals[y])); break; |
2543 | |
2544 | case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break; |
2545 | case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break; |
2546 | case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break; |
2547 | case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break; |
2548 | |
2549 | case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break; |
2550 | case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break; |
2551 | case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break; |
2552 | case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break; |
2553 | |
2554 | case Op::fma_f32: |
2555 | vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, |
2556 | {F(vals[x]), F(vals[y]), F(vals[z])})); |
2557 | break; |
2558 | |
2559 | case Op::fms_f32: |
2560 | vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, |
2561 | {F(vals[x]), F(vals[y]), |
2562 | b->CreateFNeg(F(vals[z]))})); |
2563 | break; |
2564 | |
2565 | case Op::fnma_f32: |
2566 | vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, |
2567 | {b->CreateFNeg(F(vals[x])), F(vals[y]), |
2568 | F(vals[z])})); |
2569 | break; |
2570 | |
2571 | case Op::floor: |
2572 | vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x]))); |
2573 | break; |
2574 | |
2575 | case Op::max_f32: |
2576 | vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])), |
2577 | F(vals[y]), F(vals[x]))); |
2578 | break; |
2579 | case Op::min_f32: |
2580 | vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])), |
2581 | F(vals[y]), F(vals[x]))); |
2582 | break; |
2583 | |
2584 | case Op::sqrt_f32: |
2585 | vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x]))); |
2586 | break; |
2587 | |
2588 | case Op::to_f32: vals[i] = I(b->CreateSIToFP( vals[x] , F32)); break; |
2589 | case Op::trunc : vals[i] = b->CreateFPToSI(F(vals[x]), I32) ; break; |
2590 | case Op::round : { |
2591 | // Basic impl when we can't use cvtps2dq and co. |
2592 | auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x])); |
2593 | vals[i] = b->CreateFPToSI(round, I32); |
2594 | |
2595 | #if 1 && defined(SK_CPU_X86) |
2596 | // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling. |
2597 | if (scalar) { |
2598 | // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3. ¯\_(ツ)_/¯ |
2599 | llvm::Value* v = llvm::UndefValue::get(llvm::VectorType::get(f32, 4)); |
2600 | v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0); |
2601 | vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v}); |
2602 | } else { |
2603 | SkASSERT(K == 4 || K == 8); |
2604 | auto intr = K == 4 ? llvm::Intrinsic::x86_sse2_cvtps2dq : |
2605 | /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256; |
2606 | vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])}); |
2607 | } |
2608 | #endif |
2609 | } break; |
2610 | |
2611 | case Op::add_i16x2: vals[i] = I(b->CreateAdd(x2(vals[x]), x2(vals[y]))); break; |
2612 | case Op::sub_i16x2: vals[i] = I(b->CreateSub(x2(vals[x]), x2(vals[y]))); break; |
2613 | case Op::mul_i16x2: vals[i] = I(b->CreateMul(x2(vals[x]), x2(vals[y]))); break; |
2614 | |
2615 | case Op::shl_i16x2: vals[i] = I(b->CreateShl (x2(vals[x]), immy)); break; |
2616 | case Op::sra_i16x2: vals[i] = I(b->CreateAShr(x2(vals[x]), immy)); break; |
2617 | case Op::shr_i16x2: vals[i] = I(b->CreateLShr(x2(vals[x]), immy)); break; |
2618 | |
2619 | case Op:: eq_i16x2: |
2620 | vals[i] = I(S(I16x2, b->CreateICmpEQ (x2(vals[x]), x2(vals[y])))); |
2621 | break; |
2622 | case Op::neq_i16x2: |
2623 | vals[i] = I(S(I16x2, b->CreateICmpNE (x2(vals[x]), x2(vals[y])))); |
2624 | break; |
2625 | case Op:: gt_i16x2: |
2626 | vals[i] = I(S(I16x2, b->CreateICmpSGT(x2(vals[x]), x2(vals[y])))); |
2627 | break; |
2628 | case Op::gte_i16x2: |
2629 | vals[i] = I(S(I16x2, b->CreateICmpSGE(x2(vals[x]), x2(vals[y])))); |
2630 | break; |
2631 | } |
2632 | return true; |
2633 | }; |
2634 | |
2635 | { |
2636 | IRBuilder b(enter); |
2637 | b.CreateBr(hoistK); |
2638 | } |
2639 | |
2640 | // hoistK: emit each hoistable vector instruction; goto testK; |
2641 | // LLVM can do this sort of thing itself, but we've got the information cheap, |
2642 | // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe. |
2643 | { |
2644 | IRBuilder b(hoistK); |
2645 | |
2646 | // Hoisted instructions will need args (think, uniforms), so set that up now. |
2647 | // These phi nodes are degenerate... they'll always be the passed-in args from enter. |
2648 | // Later on when we start looping the phi nodes will start looking useful. |
2649 | llvm::Argument* arg = fn->arg_begin(); |
2650 | (void)arg++; // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction. |
2651 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2652 | args.push_back(b.CreatePHI(arg->getType(), 1)); |
2653 | args.back()->addIncoming(arg++, enter); |
2654 | } |
2655 | |
2656 | for (size_t i = 0; i < instructions.size(); i++) { |
2657 | if (instructions[i].can_hoist && !emit(i, false, &b)) { |
2658 | return; |
2659 | } |
2660 | } |
2661 | |
2662 | b.CreateBr(testK); |
2663 | } |
2664 | |
2665 | // testK: if (N >= K) goto loopK; else goto hoist1; |
2666 | { |
2667 | IRBuilder b(testK); |
2668 | |
2669 | // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK. |
2670 | // These also start as the initial function arguments; hoistK can't have changed them. |
2671 | llvm::Argument* arg = fn->arg_begin(); |
2672 | |
2673 | n = b.CreatePHI(arg->getType(), 2); |
2674 | n->addIncoming(arg++, hoistK); |
2675 | |
2676 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2677 | args[i] = b.CreatePHI(arg->getType(), 2); |
2678 | args[i]->addIncoming(arg++, hoistK); |
2679 | } |
2680 | |
2681 | b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1); |
2682 | } |
2683 | |
2684 | // loopK: ... insts on K x T vectors; N -= K, args += K*stride; goto testK; |
2685 | { |
2686 | IRBuilder b(loopK); |
2687 | for (size_t i = 0; i < instructions.size(); i++) { |
2688 | if (!instructions[i].can_hoist && !emit(i, false, &b)) { |
2689 | return; |
2690 | } |
2691 | } |
2692 | |
2693 | // n -= K |
2694 | llvm::Value* n_next = b.CreateSub(n, b.getInt32(K)); |
2695 | n->addIncoming(n_next, loopK); |
2696 | |
2697 | // Each arg ptr += K |
2698 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2699 | llvm::Value* arg_next |
2700 | = b.CreateConstInBoundsGEP1_32(nullptr, args[i], K*fImpl->strides[i]); |
2701 | args[i]->addIncoming(arg_next, loopK); |
2702 | } |
2703 | b.CreateBr(testK); |
2704 | } |
2705 | |
2706 | // hoist1: emit each hoistable scalar instruction; goto test1; |
2707 | { |
2708 | IRBuilder b(hoist1); |
2709 | for (size_t i = 0; i < instructions.size(); i++) { |
2710 | if (instructions[i].can_hoist && !emit(i, true, &b)) { |
2711 | return; |
2712 | } |
2713 | } |
2714 | b.CreateBr(test1); |
2715 | } |
2716 | |
2717 | // test1: if (N >= 1) goto loop1; else goto leave; |
2718 | { |
2719 | IRBuilder b(test1); |
2720 | |
2721 | // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1. |
2722 | llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2); |
2723 | n_new->addIncoming(n, hoist1); |
2724 | n = n_new; |
2725 | |
2726 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2727 | llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2); |
2728 | arg_new->addIncoming(args[i], hoist1); |
2729 | args[i] = arg_new; |
2730 | } |
2731 | |
2732 | b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave); |
2733 | } |
2734 | |
2735 | // loop1: ... insts on scalars; N -= 1, args += stride; goto test1; |
2736 | { |
2737 | IRBuilder b(loop1); |
2738 | for (size_t i = 0; i < instructions.size(); i++) { |
2739 | if (!instructions[i].can_hoist && !emit(i, true, &b)) { |
2740 | return; |
2741 | } |
2742 | } |
2743 | |
2744 | // n -= 1 |
2745 | llvm::Value* n_next = b.CreateSub(n, b.getInt32(1)); |
2746 | n->addIncoming(n_next, loop1); |
2747 | |
2748 | // Each arg ptr += K |
2749 | for (size_t i = 0; i < fImpl->strides.size(); i++) { |
2750 | llvm::Value* arg_next |
2751 | = b.CreateConstInBoundsGEP1_32(nullptr, args[i], fImpl->strides[i]); |
2752 | args[i]->addIncoming(arg_next, loop1); |
2753 | } |
2754 | b.CreateBr(test1); |
2755 | } |
2756 | |
2757 | // leave: ret |
2758 | { |
2759 | IRBuilder b(leave); |
2760 | b.CreateRetVoid(); |
2761 | } |
2762 | |
2763 | SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs())); |
2764 | |
2765 | if (true) { |
2766 | SkString path = SkStringPrintf("/tmp/%s.bc" , debug_name); |
2767 | std::error_code err; |
2768 | llvm::raw_fd_ostream os(path.c_str(), err); |
2769 | if (err) { |
2770 | return; |
2771 | } |
2772 | llvm::WriteBitcodeToFile(*mod, os); |
2773 | } |
2774 | |
2775 | static SkOnce once; |
2776 | once([]{ |
2777 | SkAssertResult(false == llvm::InitializeNativeTarget()); |
2778 | SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter()); |
2779 | }); |
2780 | |
2781 | if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod)) |
2782 | .setEngineKind(llvm::EngineKind::JIT) |
2783 | .setMCPU(llvm::sys::getHostCPUName()) |
2784 | .create()) { |
2785 | fImpl->llvm_ctx = std::move(ctx); |
2786 | fImpl->llvm_ee.reset(ee); |
2787 | |
2788 | // We have to be careful here about what we close over and how, in case fImpl moves. |
2789 | // fImpl itself may change, but its pointee fields won't, so close over them by value. |
2790 | // Also, debug_name will almost certainly leave scope, so copy it. |
2791 | fImpl->llvm_compiling = std::async(std::launch::async, [dst = &fImpl->jit_entry, |
2792 | ee = fImpl->llvm_ee.get(), |
2793 | name = std::string(debug_name)]{ |
2794 | // std::atomic<void*>* dst; |
2795 | // llvm::ExecutionEngine* ee; |
2796 | // std::string name; |
2797 | dst->store( (void*)ee->getFunctionAddress(name.c_str()) ); |
2798 | }); |
2799 | } |
2800 | } |
2801 | #endif |
2802 | |
2803 | void Program::waitForLLVM() const { |
2804 | #if defined(SKVM_LLVM) |
2805 | if (fImpl->llvm_compiling.valid()) { |
2806 | fImpl->llvm_compiling.wait(); |
2807 | } |
2808 | #endif |
2809 | } |
2810 | |
2811 | bool Program::hasJIT() const { |
2812 | // Program::hasJIT() is really just a debugging / test aid, |
2813 | // so we don't mind adding a sync point here to wait for compilation. |
2814 | this->waitForLLVM(); |
2815 | |
2816 | return fImpl->jit_entry.load() != nullptr; |
2817 | } |
2818 | |
2819 | void Program::dropJIT() { |
2820 | #if defined(SKVM_LLVM) |
2821 | this->waitForLLVM(); |
2822 | fImpl->llvm_ee .reset(nullptr); |
2823 | fImpl->llvm_ctx.reset(nullptr); |
2824 | #elif defined(SKVM_JIT) |
2825 | if (fImpl->dylib) { |
2826 | dlclose(fImpl->dylib); |
2827 | } else if (auto jit_entry = fImpl->jit_entry.load()) { |
2828 | munmap(jit_entry, fImpl->jit_size); |
2829 | } |
2830 | #else |
2831 | SkASSERT(!this->hasJIT()); |
2832 | #endif |
2833 | |
2834 | fImpl->jit_entry.store(nullptr); |
2835 | fImpl->jit_size = 0; |
2836 | fImpl->dylib = nullptr; |
2837 | } |
2838 | |
2839 | Program::Program() : fImpl(std::make_unique<Impl>()) {} |
2840 | |
2841 | Program::~Program() { |
2842 | // Moved-from Programs may have fImpl == nullptr. |
2843 | if (fImpl) { |
2844 | this->dropJIT(); |
2845 | } |
2846 | } |
2847 | |
2848 | Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {} |
2849 | |
2850 | Program& Program::operator=(Program&& other) { |
2851 | fImpl = std::move(other.fImpl); |
2852 | return *this; |
2853 | } |
2854 | |
2855 | Program::Program(const std::vector<OptimizedInstruction>& interpreter, |
2856 | const std::vector<int>& strides) : Program() { |
2857 | fImpl->strides = strides; |
2858 | this->setupInterpreter(interpreter); |
2859 | } |
2860 | |
2861 | Program::Program(const std::vector<OptimizedInstruction>& interpreter, |
2862 | const std::vector<OptimizedInstruction>& jit, |
2863 | const std::vector<int>& strides, |
2864 | const char* debug_name) : Program() { |
2865 | fImpl->strides = strides; |
2866 | #if 1 && defined(SKVM_LLVM) |
2867 | this->setupLLVM(interpreter, debug_name); |
2868 | #elif 1 && defined(SKVM_JIT) |
2869 | this->setupJIT(jit, debug_name); |
2870 | #endif |
2871 | |
2872 | // Might as well do this after setupLLVM() to get a little more time to compile. |
2873 | this->setupInterpreter(interpreter); |
2874 | } |
2875 | |
2876 | std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; } |
2877 | int Program::nargs() const { return (int)fImpl->strides.size(); } |
2878 | int Program::nregs() const { return fImpl->regs; } |
2879 | int Program::loop () const { return fImpl->loop; } |
2880 | bool Program::empty() const { return fImpl->instructions.empty(); } |
2881 | |
2882 | // Translate OptimizedInstructions to InterpreterInstructions. |
2883 | void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) { |
2884 | // Register each instruction is assigned to. |
2885 | std::vector<Reg> reg(instructions.size()); |
2886 | |
2887 | // This next bit is a bit more complicated than strictly necessary; |
2888 | // we could just assign every instruction to its own register. |
2889 | // |
2890 | // But recycling registers is fairly cheap, and good practice for the |
2891 | // JITs where minimizing register pressure really is important. |
2892 | // |
2893 | // Since we have effectively infinite registers, we hoist any value we can. |
2894 | // (The JIT may choose a more complex policy to reduce register pressure.) |
2895 | auto hoisted = [&](Val id) { return instructions[id].can_hoist; }; |
2896 | |
2897 | fImpl->regs = 0; |
2898 | std::vector<Reg> avail; |
2899 | |
2900 | // Assign this value to a register, recycling them where we can. |
2901 | auto assign_register = [&](Val id) { |
2902 | const OptimizedInstruction& inst = instructions[id]; |
2903 | |
2904 | // If this is a real input and it's lifetime ends at this instruction, |
2905 | // we can recycle the register it's occupying. |
2906 | auto maybe_recycle_register = [&](Val input) { |
2907 | if (input != NA |
2908 | && instructions[input].death == id |
2909 | && !(hoisted(input) && instructions[input].used_in_loop)) { |
2910 | avail.push_back(reg[input]); |
2911 | } |
2912 | }; |
2913 | |
2914 | // Take care to not recycle the same register twice. |
2915 | if (true ) { maybe_recycle_register(inst.x); } |
2916 | if (inst.y != inst.x ) { maybe_recycle_register(inst.y); } |
2917 | if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); } |
2918 | |
2919 | // Instructions that die at themselves (stores) don't need a register. |
2920 | if (inst.death != id) { |
2921 | // Allocate a register if we have to, preferring to reuse anything available. |
2922 | if (avail.empty()) { |
2923 | reg[id] = fImpl->regs++; |
2924 | } else { |
2925 | reg[id] = avail.back(); |
2926 | avail.pop_back(); |
2927 | } |
2928 | } |
2929 | }; |
2930 | |
2931 | // Assign a register to each hoisted instruction, then each non-hoisted loop instruction. |
2932 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
2933 | if ( hoisted(id)) { assign_register(id); } |
2934 | } |
2935 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
2936 | if (!hoisted(id)) { assign_register(id); } |
2937 | } |
2938 | |
2939 | // Translate OptimizedInstructions to InterpreterIstructions by mapping values to |
2940 | // registers. This will be two passes, first hoisted instructions, then inside the loop. |
2941 | |
2942 | // The loop begins at the fImpl->loop'th Instruction. |
2943 | fImpl->loop = 0; |
2944 | fImpl->instructions.reserve(instructions.size()); |
2945 | |
2946 | // Add a dummy mapping for the N/A sentinel Val to any arbitrary register |
2947 | // so lookups don't have to know which arguments are used by which Ops. |
2948 | auto lookup_register = [&](Val id) { |
2949 | return id == NA ? (Reg)0 |
2950 | : reg[id]; |
2951 | }; |
2952 | |
2953 | auto push_instruction = [&](Val id, const OptimizedInstruction& inst) { |
2954 | InterpreterInstruction pinst{ |
2955 | inst.op, |
2956 | lookup_register(id), |
2957 | lookup_register(inst.x), |
2958 | {lookup_register(inst.y)}, |
2959 | {lookup_register(inst.z)}, |
2960 | }; |
2961 | if (inst.y == NA) { pinst.immy = inst.immy; } |
2962 | if (inst.z == NA) { pinst.immz = inst.immz; } |
2963 | fImpl->instructions.push_back(pinst); |
2964 | }; |
2965 | |
2966 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
2967 | const OptimizedInstruction& inst = instructions[id]; |
2968 | if (hoisted(id)) { |
2969 | push_instruction(id, inst); |
2970 | fImpl->loop++; |
2971 | } |
2972 | } |
2973 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
2974 | const OptimizedInstruction& inst = instructions[id]; |
2975 | if (!hoisted(id)) { |
2976 | push_instruction(id, inst); |
2977 | } |
2978 | } |
2979 | } |
2980 | |
2981 | #if defined(SKVM_JIT) |
2982 | |
2983 | bool Program::jit(const std::vector<OptimizedInstruction>& instructions, |
2984 | const JITMode mode, |
2985 | Assembler* a) const { |
2986 | using A = Assembler; |
2987 | const bool try_hoisting = mode != JITMode::RegisterNoHoist; |
2988 | |
2989 | auto debug_dump = [&] { |
2990 | #if 0 |
2991 | SkDebugfStream stream; |
2992 | this->dump(&stream); |
2993 | return true; |
2994 | #else |
2995 | return false; |
2996 | #endif |
2997 | }; |
2998 | |
2999 | #if defined(__x86_64__) |
3000 | if (!SkCpu::Supports(SkCpu::HSW)) { |
3001 | return false; |
3002 | } |
3003 | const int K = 8; |
3004 | const bool stack_only = mode == JITMode::Stack; |
3005 | A::GP64 N = A::rdi, |
3006 | scratch = A::rax, |
3007 | scratch2 = A::r11, |
3008 | arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 }; |
3009 | |
3010 | // All 16 ymm registers are available to use. |
3011 | using Reg = A::Ymm; |
3012 | const uint32_t all_regs = 0xffff; |
3013 | uint32_t avail = all_regs; |
3014 | |
3015 | #elif defined(__aarch64__) |
3016 | const int K = 4; |
3017 | const bool stack_only = false; // TODO |
3018 | A::X N = A::x0, |
3019 | scratch = A::x8, |
3020 | arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 }; |
3021 | |
3022 | // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15. |
3023 | using Reg = A::V; |
3024 | const uint32_t all_regs = 0xffff00ff; |
3025 | uint32_t avail = all_regs; |
3026 | #endif |
3027 | |
3028 | if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) { |
3029 | return false; |
3030 | } |
3031 | |
3032 | auto hoisted = [&](Val id) { return try_hoisting && instructions[id].can_hoist; }; |
3033 | |
3034 | std::vector<Reg> r(instructions.size()); |
3035 | |
3036 | struct LabelAndReg { |
3037 | A::Label label; |
3038 | Reg reg; |
3039 | }; |
3040 | SkTHashMap<int, LabelAndReg> constants; // All constants share the same pool. |
3041 | LabelAndReg iota; // Exists _only_ to vary per-lane. |
3042 | |
3043 | auto emit = [&](Val id, bool scalar) { |
3044 | if (stack_only) { |
3045 | SkASSERT(avail == all_regs); |
3046 | } |
3047 | |
3048 | const OptimizedInstruction& inst = instructions[id]; |
3049 | Op op = inst.op; |
3050 | Val x = inst.x, |
3051 | y = inst.y, |
3052 | z = inst.z; |
3053 | int immy = inst.immy, |
3054 | immz = inst.immz; |
3055 | |
3056 | // Most (but not all) ops create an output value and need a register to hold it, dst. |
3057 | // We track each instruction's dst in r[] so we can thread it through as an input |
3058 | // to any future instructions needing that value. |
3059 | // |
3060 | // And some ops may need a temporary register, tmp. Some need both tmp and dst. |
3061 | // |
3062 | // tmp and dst are very similar and can and will often be assigned the same register, |
3063 | // but tmp may never alias any of the instructions's inputs, while dst may when this |
3064 | // instruction consumes that input, i.e. if the input reaches its end of life here. |
3065 | // |
3066 | // We'll assign both registers lazily to keep register pressure as low as possible. |
3067 | bool tmp_is_set = false, |
3068 | dst_is_set = false; |
3069 | Reg tmp_reg = (Reg)0; // This initial value won't matter... anything legal is fine. |
3070 | |
3071 | bool ok = true; // Set to false if we need to assign a register and none's available. |
3072 | |
3073 | if (stack_only) { |
3074 | // Move each unique argument into a temporary register. |
3075 | auto load_from_stack = [&](Val arg) { |
3076 | if (int found = __builtin_ffs(avail)) { |
3077 | Reg reg = (Reg)(found - 1); |
3078 | avail ^= 1 << reg; |
3079 | r[arg] = reg; |
3080 | #if defined(__x86_64__) |
3081 | a->vmovups(r[arg], arg*K*4); |
3082 | #else |
3083 | SkASSERT(false); // TODO |
3084 | #endif |
3085 | } else { |
3086 | if (debug_dump()) { |
3087 | SkDebugf("\nCould not find temporary register for %d\n" , arg); |
3088 | } |
3089 | ok = false; |
3090 | } |
3091 | }; |
3092 | if (x != NA ) { load_from_stack(x); } |
3093 | if (y != NA && y != x ) { load_from_stack(y); } |
3094 | if (z != NA && z != x && z != y) { load_from_stack(z); } |
3095 | } |
3096 | |
3097 | // First lock in how to choose tmp if we need to based on the registers |
3098 | // available before this instruction, not including any of its input registers. |
3099 | auto tmp = [&,avail/*important, closing over avail's current value*/]{ |
3100 | if (!tmp_is_set) { |
3101 | tmp_is_set = true; |
3102 | if (int found = __builtin_ffs(avail)) { |
3103 | // This is a temporary register just for this op, |
3104 | // so we leave it marked available for future ops. |
3105 | tmp_reg = (Reg)(found - 1); |
3106 | } else { |
3107 | // We needed a tmp register but couldn't find one available. :'( |
3108 | // This will cause emit() to return false, in turn causing jit() to fail. |
3109 | if (debug_dump()) { |
3110 | SkDebugf("\nCould not find a register to hold tmp\n" ); |
3111 | } |
3112 | ok = false; |
3113 | } |
3114 | } |
3115 | return tmp_reg; |
3116 | }; |
3117 | |
3118 | // Now make available any registers that are consumed by this instruction. |
3119 | // (The register pool we can pick dst from is >= the pool for tmp, adding any of these.) |
3120 | auto maybe_recycle_register = [&](Val input) { |
3121 | if (input != NA |
3122 | && instructions[input].death == id |
3123 | && !(hoisted(input) && instructions[input].used_in_loop)) { |
3124 | avail |= 1 << r[input]; |
3125 | } |
3126 | }; |
3127 | maybe_recycle_register(x); |
3128 | maybe_recycle_register(y); |
3129 | maybe_recycle_register(z); |
3130 | // set_dst() and dst() will work read/write with this perhaps-just-updated avail. |
3131 | |
3132 | // Some ops may decide dst on their own to best fit the instruction (see Op::fma_f32). |
3133 | auto set_dst = [&](Reg reg){ |
3134 | SkASSERT(dst_is_set == false); |
3135 | dst_is_set = true; |
3136 | |
3137 | SkASSERT(avail & (1<<reg)); |
3138 | avail ^= 1<<reg; |
3139 | |
3140 | r[id] = reg; |
3141 | }; |
3142 | |
3143 | // Thanks to AVX and NEON's 3-argument instruction sets, |
3144 | // most ops can use any register as dst. |
3145 | auto dst = [&]{ |
3146 | if (!dst_is_set) { |
3147 | if (int found = __builtin_ffs(avail)) { |
3148 | set_dst((Reg)(found-1)); |
3149 | } else { |
3150 | // Same deal as with tmp... all the registers are occupied. Time to fail! |
3151 | if (debug_dump()) { |
3152 | SkDebugf("\nCould not find a register to hold value %d\n" , id); |
3153 | } |
3154 | ok = false; |
3155 | } |
3156 | } |
3157 | return r[id]; |
3158 | }; |
3159 | |
3160 | // Because we use the same logic to pick an arbitrary dst and to pick tmp, |
3161 | // and we know that tmp will never overlap any of the inputs, `dst() == tmp()` |
3162 | // is a simple idiom to check that the destination does not overlap any of the inputs. |
3163 | // Sometimes we can use this knowledge to do better instruction selection. |
3164 | |
3165 | // Ok! Keep in mind that we haven't assigned tmp or dst yet, |
3166 | // just laid out hooks for how to do so if we need them, depending on the instruction. |
3167 | // |
3168 | // Now let's actually assemble the instruction! |
3169 | switch (op) { |
3170 | default: |
3171 | if (debug_dump()) { |
3172 | SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n" , name(op), op); |
3173 | } |
3174 | return false; // TODO: many new ops |
3175 | |
3176 | #if defined(__x86_64__) |
3177 | case Op::assert_true: { |
3178 | a->vptest (r[x], &constants[0xffffffff].label); |
3179 | A::Label all_true; |
3180 | a->jc(&all_true); |
3181 | a->int3(); |
3182 | a->label(&all_true); |
3183 | } break; |
3184 | |
3185 | case Op::store8: if (scalar) { a->vpextrb (arg[immy], (A::Xmm)r[x], 0); } |
3186 | else { a->vpackusdw(tmp(), r[x], r[x]); |
3187 | a->vpermq (tmp(), tmp(), 0xd8); |
3188 | a->vpackuswb(tmp(), tmp(), tmp()); |
3189 | a->vmovq (arg[immy], (A::Xmm)tmp()); } |
3190 | break; |
3191 | |
3192 | case Op::store16: if (scalar) { a->vpextrw (arg[immy], (A::Xmm)r[x], 0); } |
3193 | else { a->vpackusdw(tmp(), r[x], r[x]); |
3194 | a->vpermq (tmp(), tmp(), 0xd8); |
3195 | a->vmovups (arg[immy], (A::Xmm)tmp()); } |
3196 | break; |
3197 | |
3198 | case Op::store32: if (scalar) { a->vmovd (arg[immy], (A::Xmm)r[x]); } |
3199 | else { a->vmovups(arg[immy], r[x]); } |
3200 | break; |
3201 | |
3202 | case Op::load8: if (scalar) { |
3203 | a->vpxor (dst(), dst(), dst()); |
3204 | a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), arg[immy], 0); |
3205 | } else { |
3206 | a->vpmovzxbd(dst(), arg[immy]); |
3207 | } break; |
3208 | |
3209 | case Op::load16: if (scalar) { |
3210 | a->vpxor (dst(), dst(), dst()); |
3211 | a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), arg[immy], 0); |
3212 | } else { |
3213 | a->vpmovzxwd(dst(), arg[immy]); |
3214 | } break; |
3215 | |
3216 | case Op::load32: if (scalar) { a->vmovd ((A::Xmm)dst(), arg[immy]); } |
3217 | else { a->vmovups( dst(), arg[immy]); } |
3218 | break; |
3219 | |
3220 | case Op::gather32: |
3221 | if (scalar) { |
3222 | auto base = scratch, |
3223 | index = scratch2; |
3224 | // Our gather base pointer is immz bytes off of uniform immy. |
3225 | a->movq(base, arg[immy], immz); |
3226 | |
3227 | // Grab our index from lane 0 of the index argument. |
3228 | a->vmovd_direct(index, (A::Xmm)r[x]); |
3229 | |
3230 | // dst = *(base + 4*index) |
3231 | a->vmovd((A::Xmm)dst(), A::FOUR, index, base); |
3232 | } else { |
3233 | // We may not let any of dst(), index, or mask use the same register, |
3234 | // so we must allocate registers manually and very carefully. |
3235 | |
3236 | // index is argument x and has already been maybe_recycle_register()'d, |
3237 | // so we explicitly ignore its availability during this op. |
3238 | A::Ymm index = r[x]; |
3239 | uint32_t avail_during_gather = avail & ~(1<<index); |
3240 | |
3241 | // Choose dst() to not overlap with index. |
3242 | if (int found = __builtin_ffs(avail_during_gather)) { |
3243 | set_dst((A::Ymm)(found-1)); |
3244 | avail_during_gather ^= (1<<dst()); |
3245 | } else { |
3246 | ok = false; |
3247 | break; |
3248 | } |
3249 | |
3250 | // Choose (temporary) mask to not overlap with dst() or index. |
3251 | A::Ymm mask; |
3252 | if (int found = __builtin_ffs(avail_during_gather)) { |
3253 | mask = (A::Ymm)(found-1); |
3254 | } else { |
3255 | ok = false; |
3256 | break; |
3257 | } |
3258 | |
3259 | // Our gather base pointer is immz bytes off of uniform immy. |
3260 | auto base = scratch; |
3261 | a->movq(base, arg[immy], immz); |
3262 | a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.) |
3263 | a->vgatherdps(dst(), A::FOUR, index, base, mask); |
3264 | } |
3265 | break; |
3266 | |
3267 | case Op::uniform8: a->movzbl(scratch, arg[immy], immz); |
3268 | a->vmovd_direct((A::Xmm)dst(), scratch); |
3269 | a->vbroadcastss(dst(), (A::Xmm)dst()); |
3270 | break; |
3271 | |
3272 | case Op::uniform32: a->vbroadcastss(dst(), arg[immy], immz); |
3273 | break; |
3274 | |
3275 | case Op::index: a->vmovd_direct((A::Xmm)tmp(), N); |
3276 | a->vbroadcastss(tmp(), (A::Xmm)tmp()); |
3277 | a->vpsubd(dst(), tmp(), &iota.label); |
3278 | break; |
3279 | |
3280 | case Op::splat: if (immy) { a->vbroadcastss(dst(), &constants[immy].label); } |
3281 | else { a->vpxor(dst(), dst(), dst()); } |
3282 | break; |
3283 | |
3284 | case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break; |
3285 | case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break; |
3286 | case Op::mul_f32: a->vmulps(dst(), r[x], r[y]); break; |
3287 | case Op::div_f32: a->vdivps(dst(), r[x], r[y]); break; |
3288 | case Op::min_f32: a->vminps(dst(), r[y], r[x]); break; // Order matters, |
3289 | case Op::max_f32: a->vmaxps(dst(), r[y], r[x]); break; // see test SkVM_min_max. |
3290 | |
3291 | case Op::fma_f32: |
3292 | if (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); } |
3293 | else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmadd213ps(r[y], r[x], r[z]); } |
3294 | else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmadd231ps(r[z], r[x], r[y]); } |
3295 | else { SkASSERT(dst() == tmp()); |
3296 | a->vmovdqa (dst(),r[x]); |
3297 | a->vfmadd132ps(dst(),r[z], r[y]); } |
3298 | break; |
3299 | |
3300 | case Op::fms_f32: |
3301 | if (avail & (1<<r[x])) { set_dst(r[x]); a->vfmsub132ps(r[x], r[z], r[y]); } |
3302 | else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmsub213ps(r[y], r[x], r[z]); } |
3303 | else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmsub231ps(r[z], r[x], r[y]); } |
3304 | else { SkASSERT(dst() == tmp()); |
3305 | a->vmovdqa (dst(),r[x]); |
3306 | a->vfmsub132ps(dst(),r[z], r[y]); } |
3307 | break; |
3308 | |
3309 | case Op::fnma_f32: |
3310 | if (avail & (1<<r[x])) { set_dst(r[x]); a->vfnmadd132ps(r[x],r[z], r[y]); } |
3311 | else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfnmadd213ps(r[y],r[x], r[z]); } |
3312 | else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfnmadd231ps(r[z],r[x], r[y]); } |
3313 | else { SkASSERT(dst() == tmp()); |
3314 | a->vmovdqa (dst(),r[x]); |
3315 | a->vfnmadd132ps(dst(),r[z],r[y]); } |
3316 | break; |
3317 | |
3318 | case Op::sqrt_f32: a->vsqrtps(dst(), r[x]); break; |
3319 | |
3320 | case Op::add_f32_imm: a->vaddps(dst(), r[x], &constants[immy].label); break; |
3321 | case Op::sub_f32_imm: a->vsubps(dst(), r[x], &constants[immy].label); break; |
3322 | case Op::mul_f32_imm: a->vmulps(dst(), r[x], &constants[immy].label); break; |
3323 | case Op::min_f32_imm: a->vminps(dst(), r[x], &constants[immy].label); break; |
3324 | case Op::max_f32_imm: a->vmaxps(dst(), r[x], &constants[immy].label); break; |
3325 | |
3326 | case Op::add_i32: a->vpaddd (dst(), r[x], r[y]); break; |
3327 | case Op::sub_i32: a->vpsubd (dst(), r[x], r[y]); break; |
3328 | case Op::mul_i32: a->vpmulld(dst(), r[x], r[y]); break; |
3329 | |
3330 | case Op::sub_i16x2: a->vpsubw (dst(), r[x], r[y]); break; |
3331 | case Op::mul_i16x2: a->vpmullw(dst(), r[x], r[y]); break; |
3332 | case Op::shr_i16x2: a->vpsrlw (dst(), r[x], immy); break; |
3333 | |
3334 | case Op::bit_and : a->vpand (dst(), r[x], r[y]); break; |
3335 | case Op::bit_or : a->vpor (dst(), r[x], r[y]); break; |
3336 | case Op::bit_xor : a->vpxor (dst(), r[x], r[y]); break; |
3337 | case Op::bit_clear: a->vpandn(dst(), r[y], r[x]); break; // Notice, y then x. |
3338 | case Op::select : a->vpblendvb(dst(), r[z], r[y], r[x]); break; |
3339 | |
3340 | case Op::bit_and_imm: a->vpand (dst(), r[x], &constants[immy].label); break; |
3341 | case Op::bit_or_imm : a->vpor (dst(), r[x], &constants[immy].label); break; |
3342 | case Op::bit_xor_imm: a->vpxor (dst(), r[x], &constants[immy].label); break; |
3343 | |
3344 | case Op::shl_i32: a->vpslld(dst(), r[x], immy); break; |
3345 | case Op::shr_i32: a->vpsrld(dst(), r[x], immy); break; |
3346 | case Op::sra_i32: a->vpsrad(dst(), r[x], immy); break; |
3347 | |
3348 | case Op::eq_i32: a->vpcmpeqd(dst(), r[x], r[y]); break; |
3349 | case Op::gt_i32: a->vpcmpgtd(dst(), r[x], r[y]); break; |
3350 | |
3351 | case Op:: eq_f32: a->vcmpeqps (dst(), r[x], r[y]); break; |
3352 | case Op::neq_f32: a->vcmpneqps(dst(), r[x], r[y]); break; |
3353 | case Op:: gt_f32: a->vcmpltps (dst(), r[y], r[x]); break; |
3354 | case Op::gte_f32: a->vcmpleps (dst(), r[y], r[x]); break; |
3355 | |
3356 | case Op::pack: a->vpslld(tmp(), r[y], immz); |
3357 | a->vpor (dst(), tmp(), r[x]); |
3358 | break; |
3359 | |
3360 | case Op::floor : a->vroundps (dst(), r[x], Assembler::FLOOR); break; |
3361 | case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break; |
3362 | case Op::trunc : a->vcvttps2dq(dst(), r[x]); break; |
3363 | case Op::round : a->vcvtps2dq (dst(), r[x]); break; |
3364 | |
3365 | #elif defined(__aarch64__) |
3366 | case Op::assert_true: { |
3367 | a->uminv4s(tmp(), r[x]); // uminv acts like an all() across the vector. |
3368 | a->fmovs(scratch, tmp()); |
3369 | A::Label all_true; |
3370 | a->cbnz(scratch, &all_true); |
3371 | a->brk(0); |
3372 | a->label(&all_true); |
3373 | } break; |
3374 | |
3375 | case Op::store8: a->xtns2h(tmp(), r[x]); |
3376 | a->xtnh2b(tmp(), tmp()); |
3377 | if (scalar) { a->strb (tmp(), arg[immy]); } |
3378 | else { a->strs (tmp(), arg[immy]); } |
3379 | break; |
3380 | // TODO: another case where it'd be okay to alias r[x] and tmp if r[x] dies here. |
3381 | |
3382 | case Op::store32: if (scalar) { a->strs(r[x], arg[immy]); } |
3383 | else { a->strq(r[x], arg[immy]); } |
3384 | break; |
3385 | |
3386 | case Op::load8: if (scalar) { a->ldrb(tmp(), arg[immy]); } |
3387 | else { a->ldrs(tmp(), arg[immy]); } |
3388 | a->uxtlb2h(tmp(), tmp()); |
3389 | a->uxtlh2s(dst(), tmp()); |
3390 | break; |
3391 | |
3392 | case Op::load32: if (scalar) { a->ldrs(dst(), arg[immy]); } |
3393 | else { a->ldrq(dst(), arg[immy]); } |
3394 | break; |
3395 | |
3396 | case Op::splat: if (immy) { a->ldrq(dst(), &constants[immy].label); } |
3397 | else { a->eor16b(dst(), dst(), dst()); } |
3398 | break; |
3399 | // TODO: If we hoist these, pack 4 values in each register |
3400 | // and use vector/lane operations, cutting the register |
3401 | // pressure cost of hoisting by 4? |
3402 | |
3403 | case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break; |
3404 | case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break; |
3405 | case Op::mul_f32: a->fmul4s(dst(), r[x], r[y]); break; |
3406 | case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break; |
3407 | |
3408 | case Op::fma_f32: // fmla.4s is z += x*y |
3409 | if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s( r[z], r[x], r[y]); } |
3410 | else { a->orr16b(tmp(), r[z], r[z]); |
3411 | a->fmla4s(tmp(), r[x], r[y]); |
3412 | if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } } |
3413 | break; |
3414 | |
3415 | case Op::fnma_f32: // fmls.4s is z -= x*y |
3416 | if (avail & (1<<r[z])) { set_dst(r[z]); a->fmls4s( r[z], r[x], r[y]); } |
3417 | else { a->orr16b(tmp(), r[z], r[z]); |
3418 | a->fmls4s(tmp(), r[x], r[y]); |
3419 | if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } } |
3420 | break; |
3421 | |
3422 | case Op::fms_f32: |
                // first dst() = z - xy as if fnma_f32 (fmls.4s is z -= x*y)
3424 | if (avail & (1<<r[z])) { set_dst(r[z]); a->fmls4s( r[z], r[x], r[y]); } |
3425 | else { a->orr16b(tmp(), r[z], r[z]); |
3426 | a->fmls4s(tmp(), r[x], r[y]); |
3427 | if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } } |
                // then dst() = -dst()  (i.e. xy - z)
3429 | a->fneg4s(dst(), dst()); |
3430 | break; |
3431 | |
3432 | // These _imm instructions are all x86/JIT only. |
3433 | case Op::add_f32_imm : |
3434 | case Op::sub_f32_imm : |
3435 | case Op::mul_f32_imm : |
3436 | case Op::min_f32_imm : |
3437 | case Op::max_f32_imm : |
3438 | case Op::bit_and_imm : |
3439 | case Op::bit_or_imm : |
3440 | case Op::bit_xor_imm : SkUNREACHABLE; break; |
3441 | |
3442 | case Op:: gt_f32: a->fcmgt4s (dst(), r[x], r[y]); break; |
3443 | case Op::gte_f32: a->fcmge4s (dst(), r[x], r[y]); break; |
3444 | case Op:: eq_f32: a->fcmeq4s (dst(), r[x], r[y]); break; |
3445 | case Op::neq_f32: a->fcmeq4s (tmp(), r[x], r[y]); |
3446 | a->not16b (dst(), tmp()); break; |
3447 | |
3448 | |
3449 | case Op::add_i32: a->add4s(dst(), r[x], r[y]); break; |
3450 | case Op::sub_i32: a->sub4s(dst(), r[x], r[y]); break; |
3451 | case Op::mul_i32: a->mul4s(dst(), r[x], r[y]); break; |
3452 | |
3453 | case Op::sub_i16x2: a->sub8h (dst(), r[x], r[y]); break; |
3454 | case Op::mul_i16x2: a->mul8h (dst(), r[x], r[y]); break; |
3455 | case Op::shr_i16x2: a->ushr8h(dst(), r[x], immy); break; |
3456 | |
3457 | case Op::bit_and : a->and16b(dst(), r[x], r[y]); break; |
3458 | case Op::bit_or : a->orr16b(dst(), r[x], r[y]); break; |
3459 | case Op::bit_xor : a->eor16b(dst(), r[x], r[y]); break; |
3460 | case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break; |
3461 | |
3462 | case Op::select: // bsl16b is x = x ? y : z |
3463 | if (avail & (1<<r[x])) { set_dst(r[x]); a->bsl16b( r[x], r[y], r[z]); } |
3464 | else { a->orr16b(tmp(), r[x], r[x]); |
3465 | a->bsl16b(tmp(), r[y], r[z]); |
3466 | if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } } |
3467 | break; |
3468 | |
3469 | // fmin4s and fmax4s don't work the way we want with NaN, |
3470 | // so we write them the long way: |
3471 | case Op::min_f32: // min(x,y) = y<x ? y : x |
3472 | a->fcmgt4s(tmp(), r[x],r[y]); |
3473 | a->bsl16b (tmp(), r[y],r[x]); |
3474 | if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } |
3475 | break; |
3476 | |
3477 | case Op::max_f32: // max(x,y) = x<y ? y : x |
3478 | a->fcmgt4s(tmp(), r[y],r[x]); |
3479 | a->bsl16b (tmp(), r[y],r[x]); |
3480 | if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } |
3481 | break; |
3482 | |
3483 | case Op::shl_i32: a-> shl4s(dst(), r[x], immy); break; |
3484 | case Op::shr_i32: a->ushr4s(dst(), r[x], immy); break; |
3485 | case Op::sra_i32: a->sshr4s(dst(), r[x], immy); break; |
3486 | |
3487 | case Op::eq_i32: a->cmeq4s(dst(), r[x], r[y]); break; |
3488 | case Op::gt_i32: a->cmgt4s(dst(), r[x], r[y]); break; |
3489 | |
3490 | case Op::pack: |
3491 | if (avail & (1<<r[x])) { set_dst(r[x]); a->sli4s ( r[x], r[y], immz); } |
3492 | else { a->shl4s (tmp(), r[y], immz); |
3493 | a->orr16b(dst(), tmp(), r[x]); } |
3494 | break; |
3495 | |
3496 | case Op::to_f32: a->scvtf4s (dst(), r[x]); break; |
3497 | case Op::trunc: a->fcvtzs4s(dst(), r[x]); break; |
3498 | case Op::round: a->fcvtns4s(dst(), r[x]); break; |
3499 | // TODO: fcvtns.4s rounds to nearest even. |
3500 | // I think we actually want frintx -> fcvtzs to round to current mode. |
3501 | #endif |
3502 | } |
3503 | |
3504 | if (stack_only) { |
3505 | if (dst_is_set) { |
3506 | #if defined(__x86_64__) |
3507 | a->vmovups(id*K*4, r[id]); |
3508 | #else |
3509 | SkASSERT(false); // TODO |
3510 | #endif |
3511 | avail |= 1 << r[id]; |
3512 | } |
3513 | for (Val arg : {x,y,z}) { |
3514 | if (arg != NA) { |
3515 | avail |= 1 << r[arg]; |
3516 | } |
3517 | } |
3518 | SkASSERT(avail == all_regs); |
3519 | } |
3520 | |
3521 | // Calls to tmp() or dst() might have flipped this false from its default true state. |
3522 | return ok; |
3523 | }; |
3524 | |
3525 | |
3526 | #if defined(__x86_64__) |
3527 | auto jump_if_less = [&](A::Label* l) { a->jl (l); }; |
3528 | auto jump = [&](A::Label* l) { a->jmp(l); }; |
3529 | |
3530 | auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); }; |
3531 | auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); }; |
3532 | |
3533 | auto enter = [&]{ a->sub(A::rsp, instructions.size()*K*4); }; |
3534 | auto exit = [&]{ a->add(A::rsp, instructions.size()*K*4); a->vzeroupper(); a->ret(); }; |
3535 | #elif defined(__aarch64__) |
3536 | auto jump_if_less = [&](A::Label* l) { a->blt(l); }; |
3537 | auto jump = [&](A::Label* l) { a->b (l); }; |
3538 | |
3539 | auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); }; |
3540 | auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); }; |
3541 | |
3542 | auto enter = [&]{}; |
3543 | auto exit = [&]{ a->ret(A::x30); }; |
3544 | #endif |
3545 | |
3546 | A::Label body, |
3547 | tail, |
3548 | done; |
3549 | |
3550 | enter(); |
3551 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3552 | if (hoisted(id) && !emit(id, /*scalar=*/false)) { |
3553 | return false; |
3554 | } |
3555 | } |
3556 | |
3557 | a->label(&body); |
3558 | { |
3559 | a->cmp(N, K); |
3560 | jump_if_less(&tail); |
3561 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3562 | if (!hoisted(id) && !emit(id, /*scalar=*/false)) { |
3563 | return false; |
3564 | } |
3565 | } |
3566 | for (int i = 0; i < (int)fImpl->strides.size(); i++) { |
3567 | if (fImpl->strides[i]) { |
3568 | add(arg[i], K*fImpl->strides[i]); |
3569 | } |
3570 | } |
3571 | sub(N, K); |
3572 | jump(&body); |
3573 | } |
3574 | |
3575 | a->label(&tail); |
3576 | { |
3577 | a->cmp(N, 1); |
3578 | jump_if_less(&done); |
3579 | for (Val id = 0; id < (Val)instructions.size(); id++) { |
3580 | if (!hoisted(id) && !emit(id, /*scalar=*/true)) { |
3581 | return false; |
3582 | } |
3583 | } |
3584 | for (int i = 0; i < (int)fImpl->strides.size(); i++) { |
3585 | if (fImpl->strides[i]) { |
3586 | add(arg[i], 1*fImpl->strides[i]); |
3587 | } |
3588 | } |
3589 | sub(N, 1); |
3590 | jump(&tail); |
3591 | } |
3592 | |
3593 | a->label(&done); |
3594 | { |
3595 | exit(); |
3596 | } |
3597 | |
3598 | // Except for explicit aligned load and store instructions, AVX allows |
3599 | // memory operands to be unaligned. So even though we're creating 16 |
3600 | // byte patterns on ARM or 32-byte patterns on x86, we only need to |
3601 | // align to 4 bytes, the element size and alignment requirement. |
3602 | |
3603 | constants.foreach([&](int imm, LabelAndReg* entry) { |
3604 | a->align(4); |
3605 | a->label(&entry->label); |
3606 | for (int i = 0; i < K; i++) { |
3607 | a->word(imm); |
3608 | } |
3609 | }); |
3610 | |
3611 | if (!iota.label.references.empty()) { |
3612 | a->align(4); |
3613 | a->label(&iota.label); |
3614 | for (int i = 0; i < K; i++) { |
3615 | a->word(i); |
3616 | } |
3617 | } |
3618 | |
3619 | return true; |
3620 | } |
3621 | |
3622 | void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions, |
3623 | const char* debug_name) { |
3624 | // Assemble with no buffer to determine a.size(), the number of bytes we'll assemble. |
3625 | Assembler a{nullptr}; |
3626 | |
3627 | // First try allowing code hoisting (faster code) |
3628 | // then again without if that fails (lower register pressure). |
3629 | JITMode mode = JITMode::Register; |
3630 | bool ok = false; |
3631 | for (JITMode m : {JITMode::Register, JITMode::RegisterNoHoist, JITMode::Stack}) { |
3632 | if (this->jit(instructions, m, &a)) { |
3633 | ok = true; |
3634 | mode = m; |
3635 | break; |
3636 | } |
3637 | } |
3638 | if (!ok) { return; } |
3639 | |
3640 | // Allocate space that we can remap as executable. |
3641 | const size_t page = sysconf(_SC_PAGESIZE); |
3642 | |
3643 | // mprotect works at page granularity. |
3644 | fImpl->jit_size = ((a.size() + page - 1) / page) * page; |
3645 | |
3646 | void* jit_entry |
3647 | = mmap(nullptr,fImpl->jit_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0); |
3648 | fImpl->jit_entry.store(jit_entry); |
3649 | |
3650 | // Assemble the program for real. |
3651 | a = Assembler{jit_entry}; |
3652 | SkAssertResult(this->jit(instructions, mode, &a)); |
3653 | SkASSERT(a.size() <= fImpl->jit_size); |
3654 | |
3655 | // Remap as executable, and flush caches on platforms that need that. |
3656 | mprotect(jit_entry, fImpl->jit_size, PROT_READ|PROT_EXEC); |
3657 | __builtin___clear_cache((char*)jit_entry, |
3658 | (char*)jit_entry + fImpl->jit_size); |
3659 | |
3660 | // For profiling and debugging, it's helpful to have this code loaded |
3661 | // dynamically rather than just jumping info fImpl->jit_entry. |
3662 | if (gSkVMJITViaDylib) { |
3663 | // Dump the raw program binary. |
3664 | SkString path = SkStringPrintf("/tmp/%s.XXXXXX" , debug_name); |
3665 | int fd = mkstemp(path.writable_str()); |
3666 | ::write(fd, jit_entry, a.size()); |
3667 | close(fd); |
3668 | |
3669 | this->dropJIT(); // (unmap and null out fImpl->jit_entry.) |
3670 | |
3671 | // Convert it in-place to a dynamic library with a single symbol "skvm_jit": |
3672 | SkString cmd = SkStringPrintf( |
3673 | "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'" |
3674 | " | clang -x assembler -shared - -o %s" , |
3675 | path.c_str(), path.c_str()); |
3676 | system(cmd.c_str()); |
3677 | |
3678 | // Load that dynamic library and look up skvm_jit(). |
3679 | fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL); |
3680 | fImpl->jit_entry.store(dlsym(fImpl->dylib, "skvm_jit" )); |
3681 | } |
3682 | } |
3683 | #endif |
3684 | |
3685 | } // namespace skvm |
3686 | |