// Copyright 2020 Google LLC.
// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.

#ifndef SkVM_opts_DEFINED
#define SkVM_opts_DEFINED

#include "include/private/SkVx.h"
#include "src/core/SkVM.h"

namespace SK_OPTS_NS {

    inline void interpret_skvm(const skvm::InterpreterInstruction insts[], const int ninsts,
                               const int nregs, const int loop,
                               const int strides[], const int nargs,
                               int n, void* args[]) {
        using namespace skvm;

        // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
        // We noticed quad-pumping is slower than single-pumping, and both are slower than
        // double-pumping, so that's what K is sized for.
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
        constexpr int K = 16;
    #else
        constexpr int K = 8;
    #endif
        using I32 = skvx::Vec<K, int>;
        using F32 = skvx::Vec<K, float>;
        using U64 = skvx::Vec<K, uint64_t>;
        using U32 = skvx::Vec<K, uint32_t>;
        using U16 = skvx::Vec<K, uint16_t>;
        using U8  = skvx::Vec<K, uint8_t>;

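        // Each virtual register holds K lanes, viewed as float, signed, or unsigned 32-bit.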
        union Slot {
            F32   f32;
            I32   i32;
            U32   u32;
        };

        Slot                    few_regs[16];
        std::unique_ptr<char[]> many_regs;

        Slot* r = few_regs;

        if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
            // Annoyingly we can't trust that malloc() or new will work with Slot because
            // the skvx::Vec types may have alignment greater than what they provide.
            // We'll overallocate one extra register so we can align manually.
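            // The round-up below can advance addr by up to a full alignof(Slot) bytes,
            // which that extra register's worth of space covers.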
            many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);

            uintptr_t addr = (uintptr_t)many_regs.get();
            addr += alignof(Slot) -
                    (addr & (alignof(Slot) - 1));
            SkASSERT((addr & (alignof(Slot) - 1)) == 0);
            r = (Slot*)addr;
        }

        // Step each argument pointer ahead by its stride a number of times.
        auto step_args = [&](int times) {
            for (int i = 0; i < nargs; i++) {
                args[i] = (void*)( (char*)args[i] + times * strides[i] );
            }
        };

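        // Each pass of the loop below covers `stride` lanes: K at a time while n >= K, then one
        // at a time for the tail. The first pass runs every instruction starting at 0; later
        // passes resume at `loop`, so the instructions before `loop` execute only once.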
        int start = 0,
            stride;
        for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
            stride = n >= K ? K : 1;

            for (int i = start; i < ninsts; i++) {
                InterpreterInstruction inst = insts[i];

                // d = op(x,y/imm,z/imm)
                Reg d = inst.d,
                    x = inst.x,
                    y = inst.y,
                    z = inst.z;
                int immy = inst.immy,
                    immz = inst.immz;

                // Ops that interact with memory need to know whether we're stride=1 or K,
                // but all non-memory ops can run the same code no matter the stride.
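                // Each op therefore has two case labels: 2*op for stride==1 and 2*op+1 for
                // stride==K.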
                switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
                    default: SkUNREACHABLE;

                #define STRIDE_1(op) case 2*(int)op
                #define STRIDE_K(op) case 2*(int)op + 1
                    STRIDE_1(Op::store8 ): memcpy(args[immy], &r[x].i32, 1); break;
                    STRIDE_1(Op::store16): memcpy(args[immy], &r[x].i32, 2); break;
                    STRIDE_1(Op::store32): memcpy(args[immy], &r[x].i32, 4); break;
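                    // store64 writes one 64-bit value per lane: low 32 bits from x, high 32 bits
                    // from y; immz is the destination argument index.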
                    STRIDE_1(Op::store64): memcpy((char*)args[immz]+0, &r[x].i32, 4);
                                           memcpy((char*)args[immz]+4, &r[y].i32, 4); break;

                    STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r[x].i32).store(args[immy]); break;
                    STRIDE_K(Op::store16): skvx::cast<uint16_t>(r[x].i32).store(args[immy]); break;
                    STRIDE_K(Op::store32):                     (r[x].i32).store(args[immy]); break;
                    STRIDE_K(Op::store64): (skvx::cast<uint64_t>(r[x].u32) <<  0 |
                                            skvx::cast<uint64_t>(r[y].u32) << 32).store(args[immz]);
                                           break;

                    STRIDE_1(Op::load8 ): r[d].i32 = 0; memcpy(&r[d].i32, args[immy], 1); break;
                    STRIDE_1(Op::load16): r[d].i32 = 0; memcpy(&r[d].i32, args[immy], 2); break;
                    STRIDE_1(Op::load32): r[d].i32 = 0; memcpy(&r[d].i32, args[immy], 4); break;
                    STRIDE_1(Op::load64):
                        r[d].i32 = 0; memcpy(&r[d].i32, (char*)args[immy] + 4*immz, 4); break;

                    STRIDE_K(Op::load8 ): r[d].i32 = skvx::cast<int>(U8 ::Load(args[immy])); break;
                    STRIDE_K(Op::load16): r[d].i32 = skvx::cast<int>(U16::Load(args[immy])); break;
                    STRIDE_K(Op::load32): r[d].i32 =                 I32::Load(args[immy]) ; break;
                    STRIDE_K(Op::load64):
                        // Low 32 bits if immz=0, or high 32 bits if immz=1.
                        r[d].i32 = skvx::cast<int>(U64::Load(args[immy]) >> (32*immz)); break;

                    // The pointer we base our gather on is loaded indirectly from a uniform:
                    //     - args[immy] is the uniform holding our gather base pointer somewhere;
                    //     - (const uint8_t*)args[immy] + immz points to the gather base pointer;
                    //     - memcpy() loads the gather base into a pointer of the right type.
                    // After all that we have an ordinary (uniform) pointer `ptr` to load from,
                    // and we then gather from it using the varying indices in r[x].
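                    // At stride 1 only lane 0 is active, so these gathers read just that lane and
                    // zero out the rest.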
                    STRIDE_1(Op::gather8):
                        for (int i = 0; i < K; i++) {
                            const uint8_t* ptr;
                            memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
                            r[d].i32[i] = (i==0) ? ptr[ r[x].i32[i] ] : 0;
                        } break;
                    STRIDE_1(Op::gather16):
                        for (int i = 0; i < K; i++) {
                            const uint16_t* ptr;
                            memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
                            r[d].i32[i] = (i==0) ? ptr[ r[x].i32[i] ] : 0;
                        } break;
                    STRIDE_1(Op::gather32):
                        for (int i = 0; i < K; i++) {
                            const int* ptr;
                            memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
                            r[d].i32[i] = (i==0) ? ptr[ r[x].i32[i] ] : 0;
                        } break;

                    STRIDE_K(Op::gather8):
                        for (int i = 0; i < K; i++) {
                            const uint8_t* ptr;
                            memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
                            r[d].i32[i] = ptr[ r[x].i32[i] ];
                        } break;
                    STRIDE_K(Op::gather16):
                        for (int i = 0; i < K; i++) {
                            const uint16_t* ptr;
                            memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
                            r[d].i32[i] = ptr[ r[x].i32[i] ];
                        } break;
                    STRIDE_K(Op::gather32):
                        for (int i = 0; i < K; i++) {
                            const int* ptr;
                            memcpy(&ptr, (const uint8_t*)args[immy] + immz, sizeof(ptr));
                            r[d].i32[i] = ptr[ r[x].i32[i] ];
                        } break;

                #undef STRIDE_1
                #undef STRIDE_K

                    // Ops that don't interact with memory should never care about the stride.
                #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1

                    // These 128-bit ops are implemented serially for simplicity.
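                    // For store128, immz packs two values: the destination argument index in
                    // immz>>1 and which 64-bit half of each 16-byte element to write in immz&1.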
                    CASE(Op::store128): {
                        int ptr = immz>>1,
                           lane = immz&1;
                        U64 src = (skvx::cast<uint64_t>(r[x].u32) <<  0 |
                                   skvx::cast<uint64_t>(r[y].u32) << 32);
                        for (int i = 0; i < stride; i++) {
                            memcpy((char*)args[ptr] + 16*i + 8*lane, &src[i], 8);
                        }
                    } break;

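                    // load128 reads 4 bytes per lane; immz picks which 32-bit quarter of each
                    // 16-byte element to load.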
                    CASE(Op::load128):
                        r[d].i32 = 0;
                        for (int i = 0; i < stride; i++) {
                            memcpy(&r[d].i32[i], (const char*)args[immy] + 16*i + 4*immz, 4);
                        } break;

                    CASE(Op::assert_true):
                    #ifdef SK_DEBUG
                        if (!all(r[x].i32)) {
                            SkDebugf("inst %d, register %d\n", i, y);
                            for (int i = 0; i < K; i++) {
                                SkDebugf("\t%2d: %08x (%g)\n", i, r[y].i32[i], r[y].f32[i]);
                            }
                        }
                        SkASSERT(all(r[x].i32));
                    #endif
                    break;

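                    // index fills lane i with n - i, counting down from the number of items that
                    // are still left to process.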
                    CASE(Op::index): {
                        const int iota[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
                                            16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
                        static_assert(K <= SK_ARRAY_COUNT(iota), "");

                        r[d].i32 = n - I32::Load(iota);
                    } break;

                    CASE(Op::uniform8):
                        r[d].i32 = *(const uint8_t* )( (const char*)args[immy] + immz );
                        break;
                    CASE(Op::uniform16):
                        r[d].i32 = *(const uint16_t*)( (const char*)args[immy] + immz );
                        break;
                    CASE(Op::uniform32):
                        r[d].i32 = *(const int*     )( (const char*)args[immy] + immz );
                        break;

                    CASE(Op::splat): r[d].i32 = immy; break;

                    CASE(Op::add_f32): r[d].f32 = r[x].f32 + r[y].f32; break;
                    CASE(Op::sub_f32): r[d].f32 = r[x].f32 - r[y].f32; break;
                    CASE(Op::mul_f32): r[d].f32 = r[x].f32 * r[y].f32; break;
                    CASE(Op::div_f32): r[d].f32 = r[x].f32 / r[y].f32; break;
                    CASE(Op::min_f32): r[d].f32 = min(r[x].f32, r[y].f32); break;
                    CASE(Op::max_f32): r[d].f32 = max(r[x].f32, r[y].f32); break;

                    CASE(Op::fma_f32):  r[d].f32 = fma( r[x].f32, r[y].f32,  r[z].f32); break;
                    CASE(Op::fms_f32):  r[d].f32 = fma( r[x].f32, r[y].f32, -r[z].f32); break;
                    CASE(Op::fnma_f32): r[d].f32 = fma(-r[x].f32, r[y].f32,  r[z].f32); break;

                    CASE(Op::sqrt_f32): r[d].f32 = sqrt(r[x].f32); break;

                    CASE(Op::add_i32): r[d].i32 = r[x].i32 + r[y].i32; break;
                    CASE(Op::sub_i32): r[d].i32 = r[x].i32 - r[y].i32; break;
                    CASE(Op::mul_i32): r[d].i32 = r[x].i32 * r[y].i32; break;

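                    // shl/sra operate on the signed view (sra sign-extends); shr uses the
                    // unsigned view, so it shifts in zeros.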
                    CASE(Op::shl_i32): r[d].i32 = r[x].i32 << immy; break;
                    CASE(Op::sra_i32): r[d].i32 = r[x].i32 >> immy; break;
                    CASE(Op::shr_i32): r[d].u32 = r[x].u32 >> immy; break;

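                    // skvx comparisons yield per-lane masks (all 1s for true, 0 for false), which
                    // is the form select/if_then_else expects below.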
                    CASE(Op:: eq_f32): r[d].i32 = r[x].f32 == r[y].f32; break;
                    CASE(Op::neq_f32): r[d].i32 = r[x].f32 != r[y].f32; break;
                    CASE(Op:: gt_f32): r[d].i32 = r[x].f32 >  r[y].f32; break;
                    CASE(Op::gte_f32): r[d].i32 = r[x].f32 >= r[y].f32; break;

                    CASE(Op:: eq_i32): r[d].i32 = r[x].i32 == r[y].i32; break;
                    CASE(Op:: gt_i32): r[d].i32 = r[x].i32 >  r[y].i32; break;

                    CASE(Op::bit_and  ): r[d].i32 = r[x].i32 &  r[y].i32; break;
                    CASE(Op::bit_or   ): r[d].i32 = r[x].i32 |  r[y].i32; break;
                    CASE(Op::bit_xor  ): r[d].i32 = r[x].i32 ^  r[y].i32; break;
                    CASE(Op::bit_clear): r[d].i32 = r[x].i32 & ~r[y].i32; break;

                    CASE(Op::select): r[d].i32 = skvx::if_then_else(r[x].i32, r[y].i32, r[z].i32);
                                      break;

                    CASE(Op::pack): r[d].u32 = r[x].u32 | (r[y].u32 << immz); break;

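                    // trunc rounds toward zero; round goes through lrint(), which rounds to
                    // nearest (ties to even under the default rounding mode).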
                    CASE(Op::ceil):   r[d].f32 = skvx::ceil (r[x].f32);                     break;
                    CASE(Op::floor):  r[d].f32 = skvx::floor(r[x].f32);                     break;
                    CASE(Op::to_f32): r[d].f32 = skvx::cast<float>(r[x].i32);               break;
                    CASE(Op::trunc):  r[d].i32 = skvx::cast<int>  (r[x].f32);               break;
                    CASE(Op::round):  r[d].i32 = skvx::cast<int>  (skvx::lrint(r[x].f32));  break;

                    CASE(Op::to_half):
                        r[d].i32 = skvx::cast<int>(skvx::to_half(r[x].f32));
                        break;
                    CASE(Op::from_half):
                        r[d].f32 = skvx::from_half(skvx::cast<uint16_t>(r[x].i32));
                        break;
                #undef CASE
                }
            }
        }
    }

}  // namespace SK_OPTS_NS

#endif//SkVM_opts_DEFINED