1 | /******************************************************************************* |
2 | * Copyright 2016-2018 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef CPU_JIT_AVX2_GENERATOR_HPP |
18 | #define CPU_JIT_AVX2_GENERATOR_HPP |
19 | |
#include <limits.h>

#include <cstring>

#include "mkldnn_thread.hpp"
#include "utils.hpp"

#include "cpu_isa_traits.hpp"
#include "jit_utils/jit_utils.hpp"
27 | |
// Portable struct alignment attribute: MSVC spells it __declspec(align),
// GCC/Clang (including MinGW on Windows) spell it __attribute__((aligned)).
#if defined(_WIN32) && !defined(__GNUC__)
#define STRUCT_ALIGN(al, ...) __declspec(align(al)) __VA_ARGS__
#else
#define STRUCT_ALIGN(al, ...) __VA_ARGS__ __attribute__((__aligned__(al)))
#endif

// Win64 ABI: 32 bytes of shadow space + return address = 0x28 offset from
// RSP to the first stack-passed argument at function entry.
#if defined(_WIN32)
#define OFFSET_SHADOWSPACE 0x28
#endif

// Boilerplate for jit_generator subclasses: implements the pure virtual
// name()/source_file() accessors used when registering generated code
// with profilers (see jit_generator::getCode below).
#define DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_name) \
    const char *name() const override { return STRINGIFY(jit_name); } \
    const char *source_file() const override { return __FILE__; }
41 | |
42 | namespace mkldnn { |
43 | namespace impl { |
44 | namespace cpu { |
45 | |
46 | // TODO: move this to jit_generator class? |
47 | namespace { |
48 | |
// Page sizes (in bytes) relevant to JIT code/data buffer placement.
enum cpu_page_size_t {
    PAGE_4K = 4096,    // regular 4 KiB page
    PAGE_2M = 2097152, // 2 MiB huge page
};
53 | |
54 | // TODO: move this somewhere else? Although this is only used by jit kernels |
55 | // (Roma) |
// Reinterprets the bits of a float as an int (bit-exact reinterpretation,
// not a numeric conversion). Used by jit kernels to embed float constants
// as integer immediates.
//
// Implementation note: the previous version read the inactive member of a
// union, which is undefined behavior in C++ (though tolerated by common
// compilers). std::memcpy is the well-defined way to type-pun and compiles
// to the same single move on optimizing compilers.
static inline int float2int(float x) {
    static_assert(sizeof(int) == sizeof(float),
            "int and float must have the same size");
    int y;
    std::memcpy(&y, &x, sizeof(x));
    return y;
}
64 | |
65 | // TODO: A GPR class that hides ABI details from the JIT kernels and allows |
66 | // numbering registers from 0 to 14 (x86_64) / 6 (x32) (gpr0, gpr1, ...) and |
67 | // stack register (sr). |
68 | // |
69 | // This will allow using syntax like this: |
70 | // |
71 | // param = gpr0; |
72 | // reg_input = gpr0; |
73 | // reg_output = gpr1; |
74 | // ... |
75 | // |
76 | // #ifndef XBYAK64 |
77 | // mov(param, ptr[sr]) |
78 | // #endif |
79 | // |
80 | // (Roma) |
81 | |
#ifdef XBYAK64
// GPRs that are callee-saved and therefore preserved by
// jit_generator::preamble()/postamble(). RDI/RSI are callee-saved on
// Win64 but caller-saved (and used for the first two arguments) on
// System V, hence the extra entries under _WIN32.
constexpr Xbyak::Operand::Code abi_save_gpr_regs[] = {
    Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
    Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15,
#ifdef _WIN32
    Xbyak::Operand::RDI, Xbyak::Operand::RSI,
#endif
};

// Integer argument-passing registers for the host calling convention.
// abi_not_param1 is a register guaranteed not to alias abi_param1, handy
// as a scratch register while param1 is still live.
#ifdef _WIN32
static const Xbyak::Reg64 abi_param1(Xbyak::Operand::RCX),
             abi_param2(Xbyak::Operand::RDX),
             abi_param3(Xbyak::Operand::R8),
             abi_param4(Xbyak::Operand::R9),
             abi_not_param1(Xbyak::Operand::RDI);
#else
static const Xbyak::Reg64 abi_param1(Xbyak::Operand::RDI),
             abi_param2(Xbyak::Operand::RSI),
             abi_param3(Xbyak::Operand::RDX),
             abi_param4(Xbyak::Operand::RCX),
             abi_param5(Xbyak::Operand::R8),
             abi_param6(Xbyak::Operand::R9),
             abi_not_param1(Xbyak::Operand::RCX);
#endif
#endif
107 | |
108 | inline unsigned int get_cache_size(int level, bool per_core = true){ |
109 | unsigned int l = level - 1; |
110 | // Currently, if XByak is not able to fetch the cache topology |
111 | // we default to 32KB of L1, 512KB of L2 and 1MB of L3 per core. |
112 | if (cpu.getDataCacheLevels() == 0){ |
113 | const int L1_cache_per_core = 32000; |
114 | const int L2_cache_per_core = 512000; |
115 | const int L3_cache_per_core = 1024000; |
116 | int num_cores = per_core ? 1 : mkldnn_get_max_threads(); |
117 | switch(l){ |
118 | case(0): return L1_cache_per_core * num_cores; |
119 | case(1): return L2_cache_per_core * num_cores; |
120 | case(2): return L3_cache_per_core * num_cores; |
121 | default: return 0; |
122 | } |
123 | } |
124 | if (l < cpu.getDataCacheLevels()) { |
125 | return cpu.getDataCacheSize(l) |
126 | / (per_core ? cpu.getCoresSharingDataCache(l) : 1); |
127 | } else |
128 | return 0; |
129 | } |
130 | |
131 | } |
132 | |
133 | class jit_generator : public Xbyak::CodeGenerator |
134 | { |
135 | private: |
136 | const size_t xmm_len = 16; |
137 | #ifdef _WIN32 |
138 | const size_t xmm_to_preserve_start = 6; |
139 | const size_t xmm_to_preserve = 10; |
140 | #else |
141 | const size_t xmm_to_preserve_start = 0; |
142 | const size_t xmm_to_preserve = 0; |
143 | #endif |
144 | |
145 | const size_t num_abi_save_gpr_regs |
146 | = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); |
147 | |
148 | const size_t size_of_abi_save_regs |
149 | = num_abi_save_gpr_regs * rax.getBit() / 8 |
150 | + xmm_to_preserve * xmm_len; |
151 | |
152 | public: |
153 | enum { |
154 | _cmp_eq_oq = 0u, |
155 | _cmp_lt_os = 1u, |
156 | _cmp_le_os = 2u, |
157 | _cmp_neq_uq = 4u, |
158 | _cmp_nlt_us = 5u, |
159 | _cmp_nle_us = 6u, |
160 | |
161 | _op_floor = 1u, |
162 | _op_mxcsr = 4u, |
163 | }; |
164 | |
165 | Xbyak::Reg64 param1 = abi_param1; |
166 | const int EVEX_max_8b_offt = 0x200; |
167 | const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp; |
168 | |
169 | inline size_t get_size_of_abi_save_regs() { |
170 | return size_of_abi_save_regs; |
171 | } |
172 | |
173 | void preamble() { |
174 | if (xmm_to_preserve) { |
175 | sub(rsp, xmm_to_preserve * xmm_len); |
176 | for (size_t i = 0; i < xmm_to_preserve; ++i) |
177 | movdqu(ptr[rsp + i * xmm_len], Xbyak::Xmm(xmm_to_preserve_start + i)); |
178 | } |
179 | for (size_t i = 0; i < num_abi_save_gpr_regs; ++i) |
180 | push(Xbyak::Reg64(abi_save_gpr_regs[i])); |
181 | if (mayiuse(avx512_common)) { |
182 | mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); |
183 | } |
184 | } |
185 | |
186 | void mic_prefetcht0(Xbyak::Address a) { |
187 | if (mayiuse(avx512_mic)) |
188 | prefetcht0(a); |
189 | } |
190 | |
191 | void mic_prefetcht1(Xbyak::Address a) { |
192 | if (mayiuse(avx512_mic)) |
193 | prefetcht1(a); |
194 | } |
195 | |
196 | void mic_prefetcht2(Xbyak::Address a) { |
197 | if (mayiuse(avx512_mic)) |
198 | prefetcht2(a); |
199 | } |
200 | |
201 | void uni_vzeroupper() { |
202 | if (mayiuse(avx) && !mayiuse(avx512_mic)) |
203 | vzeroupper(); |
204 | } |
205 | |
206 | void postamble() { |
207 | for (size_t i = 0; i < num_abi_save_gpr_regs; ++i) |
208 | pop(Xbyak::Reg64(abi_save_gpr_regs[num_abi_save_gpr_regs - 1 - i])); |
209 | if (xmm_to_preserve) { |
210 | for (size_t i = 0; i < xmm_to_preserve; ++i) |
211 | movdqu(Xbyak::Xmm(xmm_to_preserve_start + i), ptr[rsp + i * xmm_len]); |
212 | add(rsp, xmm_to_preserve * xmm_len); |
213 | } |
214 | uni_vzeroupper(); |
215 | ret(); |
216 | } |
217 | |
218 | template<typename T> |
219 | Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, |
220 | T raw_offt, bool bcast = false) |
221 | { |
222 | using Xbyak::Zmm; |
223 | using Xbyak::Reg64; |
224 | using Xbyak::Address; |
225 | using Xbyak::RegExp; |
226 | |
227 | assert(raw_offt <= INT_MAX); |
228 | auto offt = static_cast<int>(raw_offt); |
229 | |
230 | int scale = 0; |
231 | |
232 | if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { |
233 | offt = offt - 2 * EVEX_max_8b_offt; |
234 | scale = 1; |
235 | } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { |
236 | offt = offt - 4 * EVEX_max_8b_offt; |
237 | scale = 2; |
238 | } |
239 | |
240 | auto re = RegExp() + base + offt; |
241 | if (scale) |
242 | re = re + reg_EVEX_max_8b_offt * scale; |
243 | |
244 | if (bcast) |
245 | return zword_b [re]; |
246 | else |
247 | return zword [re]; |
248 | } |
249 | |
250 | Xbyak::Address make_safe_addr(const Xbyak::Reg64 ®_out, size_t offt, |
251 | const Xbyak::Reg64 &tmp_reg, bool bcast = false) { |
252 | if (offt > INT_MAX) { |
253 | mov(tmp_reg, offt); |
254 | return bcast ? ptr_b[reg_out + tmp_reg] : ptr[reg_out + tmp_reg]; |
255 | } else { |
256 | return bcast ? ptr_b[reg_out + offt] : ptr[reg_out + offt]; |
257 | } |
258 | } |
259 | |
260 | Xbyak::Address EVEX_compress_addr_safe(const Xbyak::Reg64 &base, |
261 | size_t raw_offt, const Xbyak::Reg64 ®_offt, bool bcast = false) { |
262 | if (raw_offt > INT_MAX) { |
263 | return make_safe_addr(base, raw_offt, reg_offt, bcast); |
264 | } else { |
265 | return EVEX_compress_addr(base, raw_offt, bcast); |
266 | } |
267 | } |
268 | |
269 | void safe_add(const Xbyak::Reg64 &base, size_t raw_offt, |
270 | const Xbyak::Reg64 ®_offt) { |
271 | if (raw_offt > INT_MAX) { |
272 | mov(reg_offt, raw_offt); |
273 | add(base, reg_offt); |
274 | } else { |
275 | add(base, raw_offt); |
276 | } |
277 | } |
278 | |
279 | void safe_sub(const Xbyak::Reg64 &base, size_t raw_offt, |
280 | const Xbyak::Reg64 ®_offt) { |
281 | if (raw_offt > INT_MAX) { |
282 | mov(reg_offt, raw_offt); |
283 | sub(base, reg_offt); |
284 | } else { |
285 | sub(base, raw_offt); |
286 | } |
287 | } |
288 | |
289 | // Disallow char-based labels completely |
290 | void L(const char *label) = delete; |
291 | void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } |
292 | |
293 | void uni_vpxor(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
294 | const Xbyak::Operand &op) { |
295 | assert(x1.getIdx() == x2.getIdx()); |
296 | pxor(x2, op); |
297 | } |
298 | void uni_vpxor(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, |
299 | const Xbyak::Operand &op) { |
300 | if (mayiuse(avx2)) { |
301 | vpxor(x1, x2, op); |
302 | } else { |
303 | vxorps(x1, x2, op); |
304 | } |
305 | } |
306 | void uni_vpxor(const Xbyak::Zmm &x1, const Xbyak::Zmm &x2, |
307 | const Xbyak::Operand &op) { |
308 | vpxord(x1, x2, op); |
309 | } |
310 | |
311 | void uni_vmovss(const Xbyak::Address& addr, const Xbyak::Xmm &x) { |
312 | movss(addr, x); |
313 | } |
314 | void uni_vmovss(const Xbyak::Address& addr, const Xbyak::Ymm &x) { |
315 | vmovss(addr, x); |
316 | } |
317 | void uni_vmovss(const Xbyak::Xmm &x, const Xbyak::Address& addr) { |
318 | movss(x, addr); |
319 | } |
320 | void uni_vmovss(const Xbyak::Ymm &x, const Xbyak::Address& addr) { |
321 | vmovss(x, addr); |
322 | } |
323 | |
324 | void uni_vmovsd(const Xbyak::Address& addr, const Xbyak::Xmm &x) { |
325 | movsd(addr, x); |
326 | } |
327 | void uni_vmovsd(const Xbyak::Address& addr, const Xbyak::Ymm &x) { |
328 | vmovsd(addr, x); |
329 | } |
330 | void uni_vmovsd(const Xbyak::Xmm &x, const Xbyak::Address& addr) { |
331 | movsd(x, addr); |
332 | } |
333 | void uni_vmovsd(const Xbyak::Ymm &x, const Xbyak::Address& addr) { |
334 | vmovsd(x, addr); |
335 | } |
336 | |
337 | void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Xmm &x) { |
338 | movdqu(addr, x); |
339 | } |
340 | void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Ymm &x) { |
341 | vmovdqu(addr, x); |
342 | } |
343 | void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Zmm &x) { |
344 | vmovdqu32(addr, x); |
345 | } |
346 | |
347 | void uni_vmovdqu(const Xbyak::Xmm &x, const Xbyak::Address &addr) { |
348 | movdqu(x, addr); |
349 | } |
350 | void uni_vmovdqu(const Xbyak::Ymm &x, const Xbyak::Address &addr) { |
351 | vmovdqu(x, addr); |
352 | } |
353 | void uni_vmovdqu(const Xbyak::Zmm &x, const Xbyak::Address &addr) { |
354 | vmovdqu32(x, addr); |
355 | } |
356 | |
357 | void uni_vmovups(const Xbyak::Address &addr, const Xbyak::Xmm &x) { |
358 | movups(addr, x); |
359 | } |
360 | void uni_vmovups(const Xbyak::Address &addr, const Xbyak::Ymm &x) { |
361 | vmovups(addr, x); |
362 | } |
363 | |
364 | void uni_vmovups(const Xbyak::Xmm &x, const Xbyak::Operand &op) { |
365 | movups(x, op); |
366 | } |
367 | void uni_vmovups(const Xbyak::Ymm &x, const Xbyak::Operand &op) { |
368 | vmovups(x, op); |
369 | } |
370 | |
371 | void uni_vmovntps(const Xbyak::Address &addr, const Xbyak::Xmm &x) { |
372 | movntps(addr, x); |
373 | } |
374 | void uni_vmovntps(const Xbyak::Address &addr, const Xbyak::Ymm &x) { |
375 | vmovntps(addr, x); |
376 | } |
377 | |
378 | void uni_vbroadcastss(const Xbyak::Xmm &x, const Xbyak::Operand &op) { |
379 | movss(x, op); |
380 | shufps(x, x, 0x0); |
381 | } |
382 | void uni_vbroadcastss(const Xbyak::Ymm &x, const Xbyak::Operand &op) { |
383 | if (op.isMEM() || mayiuse(avx2)) { |
384 | vbroadcastss(x, op); |
385 | } else { |
386 | Xbyak::Xmm t(x.getIdx()); |
387 | if (t.getIdx() != op.getIdx()) movss(t, op); |
388 | vinsertf128(x, x, t, 1); |
389 | vshufps(x, x, x, 0); |
390 | } |
391 | } |
392 | |
393 | void uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op) { |
394 | movsd(x, op); |
395 | pshufd(x, x, 0x0); |
396 | } |
397 | void uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op) { |
398 | if (mayiuse(avx2)) { |
399 | vpbroadcastd(x, op); |
400 | } else { |
401 | Xbyak::Xmm t(x.getIdx()); |
402 | if (t.getIdx() != op.getIdx()) movsd(t, op); |
403 | vinsertf128(x, x, t, 1); |
404 | vshufps(x, x, x, 0); |
405 | } |
406 | } |
407 | |
408 | void uni_vrcpss(const Xbyak::Xmm &x, const Xbyak::Operand &op) { |
409 | rcpss(x, op); |
410 | } |
411 | void uni_vrcpss(const Xbyak::Ymm &x1, const Xbyak::Xmm &x2) { |
412 | Xbyak::Xmm x1_(x1.getIdx()); |
413 | Xbyak::Xmm x2_(x2.getIdx()); |
414 | vrcpss(x1_, x1_, x2_); |
415 | } |
416 | void uni_vrcpss(const Xbyak::Ymm &x, const Xbyak::Address &op) { |
417 | Xbyak::Xmm x_(x.getIdx()); |
418 | vrcpss(x_, x_, op); |
419 | } |
420 | |
421 | void uni_vrcpps(const Xbyak::Xmm &x, const Xbyak::Operand &op) { |
422 | rcpps(x, op); |
423 | } |
424 | void uni_vrcpps(const Xbyak::Ymm &x, const Xbyak::Operand &op) { |
425 | vrcpps(x, op); |
426 | } |
427 | void uni_vrcpps(const Xbyak::Zmm &x, const Xbyak::Operand &op) { |
428 | vrcp14ps(x, op); |
429 | } |
430 | |
431 | void uni_vdivps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, |
432 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
433 | assert(x.getIdx() == op1.getIdx()); |
434 | divps(x, op2); |
435 | } |
436 | void uni_vdivps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, |
437 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
438 | vdivps(x, op1, op2); |
439 | } |
440 | |
441 | void uni_vdivps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, |
442 | const Xbyak::Operand &op2, const Xbyak::Xmm &buf) { |
443 | movups(buf, op1); |
444 | divps(buf, op2); |
445 | if (x.getIdx() != buf.getIdx()) { |
446 | movups(x, buf); |
447 | } |
448 | } |
449 | |
450 | void uni_vdivps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, |
451 | const Xbyak::Operand &op2, const Xbyak::Ymm &buf) { |
452 | vdivps(x, op1, op2); |
453 | } |
454 | |
455 | void uni_vaddps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, |
456 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
457 | assert(x.getIdx() == op1.getIdx()); |
458 | addps(x, op2); |
459 | } |
460 | void uni_vaddps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, |
461 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
462 | vaddps(x, op1, op2); |
463 | } |
464 | |
465 | void uni_vpsignd(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, |
466 | const Xbyak::Operand& op) { |
467 | assert(x1.getIdx() == x2.getIdx()); |
468 | psignd(x1, op); |
469 | } |
470 | void uni_vpsignd(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, |
471 | const Xbyak::Operand& op) { |
472 | vpsignd(x1, x2, op); |
473 | } |
474 | |
475 | void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, |
476 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
477 | assert(x.getIdx() == op1.getIdx()); |
478 | subps(x, op2); |
479 | } |
480 | void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, |
481 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
482 | vsubps(x, op1, op2); |
483 | } |
484 | |
485 | void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, |
486 | const Xbyak::Operand &op2, const Xbyak::Xmm &buf) { |
487 | movups(buf, op1); |
488 | subps(buf, op2); |
489 | if (x.getIdx() != buf.getIdx()) { |
490 | movups(x, buf); |
491 | } |
492 | } |
493 | |
494 | void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, |
495 | const Xbyak::Operand &op2, const Xbyak::Ymm &buf) { |
496 | vsubps(x, op1, op2); |
497 | } |
498 | |
499 | void uni_vmulps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, |
500 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
501 | assert(x.getIdx() == op1.getIdx()); |
502 | mulps(x, op2); |
503 | } |
504 | void uni_vmulps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, |
505 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
506 | vmulps(x, op1, op2); |
507 | } |
508 | |
509 | void uni_vfmadd213ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
510 | const Xbyak::Operand &op) { |
511 | mulps(x1, x2); |
512 | addps(x1, op); |
513 | } |
514 | void uni_vfmadd213ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, |
515 | const Xbyak::Operand &op) { |
516 | vfmadd213ps(x1, x2, op); |
517 | } |
518 | |
519 | void uni_vfmadd231ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
520 | const Xbyak::Operand &op) { |
521 | mulps(x2, op); |
522 | addps(x1, x2); |
523 | } |
524 | void uni_vfmadd231ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, |
525 | const Xbyak::Operand &op) { |
526 | vfmadd231ps(x1, x2, op); |
527 | } |
528 | |
529 | void uni_vfnmadd231ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
530 | const Xbyak::Operand &op) { |
531 | mulps(x2, op); |
532 | subps(x1, x2); |
533 | } |
534 | |
535 | void uni_vfnmadd231ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, |
536 | const Xbyak::Operand &op) { |
537 | vfnmadd231ps(x1, x2, op); |
538 | } |
539 | |
540 | void uni_vsqrtps(const Xbyak::Xmm &x, const Xbyak::Operand &op) { |
541 | sqrtps(x, op); |
542 | } |
543 | void uni_vsqrtps(const Xbyak::Ymm &x, const Xbyak::Operand &op) { |
544 | vsqrtps(x, op); |
545 | } |
546 | |
547 | void uni_vpaddd(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
548 | const Xbyak::Operand &op) { |
549 | assert(x1.getIdx() == x2.getIdx()); |
550 | paddd(x2, op); |
551 | } |
552 | void uni_vpaddd(const Xbyak::Ymm &x1, const Xbyak::Xmm &x2, |
553 | const Xbyak::Operand &op) { |
554 | vpaddd(x1, x2, op); |
555 | } |
556 | |
557 | void uni_vandps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
558 | const Xbyak::Operand &op = Xbyak::Operand()) { |
559 | assert(x1.getIdx() == x2.getIdx()); |
560 | andps(x1, op); |
561 | } |
562 | void uni_vandps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, |
563 | const Xbyak::Operand &op = Xbyak::Operand()) { |
564 | if (!mayiuse(avx512_common) || x1.getBit() < 512) |
565 | vandps(x1, x2, op); |
566 | else |
567 | vpandd(x1, x2, op); |
568 | } |
569 | |
570 | void uni_vorps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
571 | const Xbyak::Operand &op = Xbyak::Operand()) { |
572 | assert(x1.getIdx() == x2.getIdx()); |
573 | orps(x1, op); |
574 | } |
575 | void uni_vorps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, |
576 | const Xbyak::Operand &op = Xbyak::Operand()) { |
577 | if (!mayiuse(avx512_common) || x1.getBit() < 512) |
578 | vorps(x1, x2, op); |
579 | else |
580 | vpord(x1, x2, op); |
581 | } |
582 | |
583 | void uni_vpslld(const Xbyak::Xmm &x, const Xbyak::Operand &op, |
584 | const int imm) { |
585 | assert(x.getIdx() == op.getIdx()); |
586 | pslld(x, imm); |
587 | } |
588 | void uni_vpslld(const Xbyak::Ymm &x, const Xbyak::Operand &op, |
589 | const int imm) { |
590 | vpslld(x, op, imm); |
591 | } |
592 | |
593 | void uni_vpsrld(const Xbyak::Xmm &x, const Xbyak::Operand &op, |
594 | const int imm) { |
595 | assert(x.getIdx() == op.getIdx()); |
596 | psrld(x, imm); |
597 | } |
598 | void uni_vpsrld(const Xbyak::Ymm &x, const Xbyak::Operand &op, |
599 | const int imm) { |
600 | vpsrld(x, op, imm); |
601 | } |
602 | |
603 | void uni_vmaxps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, |
604 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
605 | assert(x.getIdx() == op1.getIdx()); |
606 | maxps(x, op2); |
607 | } |
608 | void uni_vmaxps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, |
609 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
610 | vmaxps(x, op1, op2); |
611 | } |
612 | |
613 | void uni_vminps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, |
614 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
615 | assert(x.getIdx() == op1.getIdx()); |
616 | minps(x, op2); |
617 | } |
618 | void uni_vminps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, |
619 | const Xbyak::Operand &op2 = Xbyak::Operand()) { |
620 | vminps(x, op1, op2); |
621 | } |
622 | |
623 | void uni_vcmpgtps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
624 | const Xbyak::Operand &op) { |
625 | assert(x1.getIdx() == x2.getIdx()); |
626 | cmpps(x1, op, _cmp_nle_us); |
627 | } |
628 | |
629 | void uni_vcmpgtps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, |
630 | const Xbyak::Operand &op) { |
631 | vcmpgtps(x1, x2, op); |
632 | } |
633 | |
634 | void uni_vcmpgeps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
635 | const Xbyak::Operand &op) { |
636 | assert(x1.getIdx() == x2.getIdx()); |
637 | cmpps(x1, op, _cmp_nlt_us); |
638 | } |
639 | |
640 | void uni_vcmpgeps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, |
641 | const Xbyak::Operand &op) { |
642 | vcmpps(x1, x2, op, _cmp_nlt_us); |
643 | } |
644 | |
645 | void uni_vtestps(const Xbyak::Xmm &x1, const Xbyak::Operand &op) { |
646 | ptest(x1, op); |
647 | } |
648 | |
649 | void uni_vtestps(const Xbyak::Ymm &x1, const Xbyak::Operand &op) { |
650 | assert(!(x1.isZMM() || op.isZMM())); |
651 | vtestps(x1, op); |
652 | } |
653 | |
654 | void uni_vblendvps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, |
655 | const Xbyak::Operand &op, const Xbyak::Xmm &msk) { |
656 | assert(x1.getIdx() == x2.getIdx()); |
657 | assert(msk.getIdx() == 0); |
658 | blendvps(x1, op); |
659 | } |
660 | void uni_vblendvps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, |
661 | const Xbyak::Operand &op, const Xbyak::Ymm &msk) { |
662 | vblendvps(x1, x2, op, msk); |
663 | } |
664 | |
665 | void uni_vroundps(const Xbyak::Xmm &x, const Xbyak::Operand &op, |
666 | const int imm) { |
667 | roundps(x, op, imm); |
668 | } |
669 | void uni_vroundps(const Xbyak::Ymm &x, const Xbyak::Operand &op, |
670 | const int imm) { |
671 | vroundps(x, op, imm); |
672 | } |
673 | |
674 | void uni_vcvtps2dq(const Xbyak::Xmm &x, const Xbyak::Operand &op) { |
675 | cvtps2dq(x, op); |
676 | } |
677 | void uni_vcvtps2dq(const Xbyak::Ymm &x, const Xbyak::Operand &op) { |
678 | vcvtps2dq(x, op); |
679 | } |
680 | |
681 | void uni_vcvtdq2ps(const Xbyak::Xmm &x, const Xbyak::Operand &op) { |
682 | cvtdq2ps(x, op); |
683 | } |
684 | void uni_vcvtdq2ps(const Xbyak::Ymm &x, const Xbyak::Operand &op) { |
685 | vcvtdq2ps(x, op); |
686 | } |
687 | |
688 | void uni_vmovmskps(const Xbyak::Reg &x1, const Xbyak::Xmm &x2) { |
689 | movmskps(x1.cvt64(), x2); |
690 | } |
691 | void uni_vmovmskps(const Xbyak::Reg &x1, const Xbyak::Ymm &x2) { |
692 | vmovmskps(x1, x2); |
693 | } |
694 | |
695 | void uni_vpackssdw(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op){ |
696 | assert(x1.getIdx() == x1.getIdx()); |
697 | packssdw(x1, op); |
698 | } |
699 | void uni_vpackssdw(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op){ |
700 | vpackssdw(x1, x2, op); |
701 | } |
702 | |
703 | void uni_vpackuswb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op){ |
704 | assert(x1.getIdx() == x1.getIdx()); |
705 | packuswb(x1, op); |
706 | } |
707 | void uni_vpackuswb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op){ |
708 | vpackuswb(x1, x2, op); |
709 | } |
710 | |
711 | |
712 | void mul_by_const(const Xbyak::Reg &out, |
713 | const Xbyak::Reg64 &tmp, int value) { |
714 | // Generates a shift + add sequence for multiplicating contents of the |
715 | // out register by a known JIT-time value. Clobbers the tmp register. |
716 | // |
717 | // Pros compared to mul/imul: |
718 | // - does not require using known registers |
719 | // - not microcoded on Intel(R) Xeon Phi(TM) processors |
720 | // Still, there are probably a lot of cases when mul/imul is faster on |
721 | // Intel(R) Core(TM) processors. Not intended for critical path. |
722 | |
723 | // TODO: detect when overflow is emminent (Roma) |
724 | // TODO: detect when using mul/imul is a better option (Roma) |
725 | |
726 | int p = 0; // the current power of 2 |
727 | int old_p = 0; // the last seen power of 2 such that value[old_p] != 0 |
728 | |
729 | xor_(tmp, tmp); |
730 | while (value) { |
731 | if (value & 1) { |
732 | int shift = p - old_p; |
733 | if (shift) { |
734 | shl(out, shift); |
735 | old_p = p; |
736 | } |
737 | add(tmp, out); |
738 | } |
739 | value >>= 1; |
740 | p++; |
741 | } |
742 | mov(out, tmp); |
743 | } |
744 | |
745 | public: |
746 | jit_generator( |
747 | void *code_ptr = nullptr, |
748 | size_t code_size = 256 * 1024 |
749 | ) : Xbyak::CodeGenerator(code_size, code_ptr) |
750 | { |
751 | } |
752 | virtual ~jit_generator() {} |
753 | |
754 | virtual const char *name() const = 0; |
755 | virtual const char *source_file() const = 0; |
756 | |
757 | const Xbyak::uint8 *getCode() { |
758 | const Xbyak::uint8 *code = CodeGenerator::getCode(); |
759 | size_t code_size = getSize(); |
760 | jit_utils::register_jit_code(code, code_size, name(), source_file()); |
761 | return code; |
762 | } |
763 | |
764 | template<typename F> const F getCode() { |
765 | return (const F)getCode(); |
766 | } |
767 | }; |
768 | |
769 | } |
770 | } |
771 | } |
772 | |
773 | #endif |
774 | |