1/*******************************************************************************
2* Copyright 2016-2018 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#ifndef CPU_JIT_AVX2_GENERATOR_HPP
18#define CPU_JIT_AVX2_GENERATOR_HPP
19
#include <limits.h>
#include <string.h>

#include "mkldnn_thread.hpp"
#include "utils.hpp"

#include "cpu_isa_traits.hpp"
#include "jit_utils/jit_utils.hpp"
27
28#if defined(_WIN32) && !defined(__GNUC__)
29# define STRUCT_ALIGN(al, ...) __declspec(align(al)) __VA_ARGS__
30#else
31# define STRUCT_ALIGN(al, ...) __VA_ARGS__ __attribute__((__aligned__(al)))
32#endif
33
34#if defined(_WIN32)
35# define OFFSET_SHADOWSPACE 0x28
36#endif
37
38#define DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_name) \
39 const char *name() const override { return STRINGIFY(jit_name); } \
40 const char *source_file() const override { return __FILE__; }
41
42namespace mkldnn {
43namespace impl {
44namespace cpu {
45
46// TODO: move this to jit_generator class?
47namespace {
48
// Page sizes relevant to JIT code buffer allocation / alignment.
typedef enum {
    PAGE_4K = 4096,    // regular 4KB page
    PAGE_2M = 2097152, // 2MB huge page
} cpu_page_size_t;
53
54// TODO: move this somewhere else? Although this is only used by jit kernels
55// (Roma)
// TODO: move this somewhere else? Although this is only used by jit kernels
// (Roma)
//
// Reinterprets the bit pattern of a float as an int, e.g. to embed a float
// immediate into generated code. Uses memcpy instead of a union write/read:
// reading a union member other than the last one written is undefined
// behavior in C++, while memcpy is the well-defined way to type-pun and
// compiles to the same single move on mainstream compilers.
static inline int float2int(float x) {
    static_assert(sizeof(int) == sizeof(float), "unexpected type sizes");
    int res;
    memcpy(&res, &x, sizeof(x));
    return res;
}
64
65// TODO: A GPR class that hides ABI details from the JIT kernels and allows
66// numbering registers from 0 to 14 (x86_64) / 6 (x32) (gpr0, gpr1, ...) and
67// stack register (sr).
68//
69// This will allow using syntax like this:
70//
71// param = gpr0;
72// reg_input = gpr0;
73// reg_output = gpr1;
74// ...
75//
76// #ifndef XBYAK64
77// mov(param, ptr[sr])
78// #endif
79//
80// (Roma)
81
#ifdef XBYAK64
// Callee-saved GPRs that preamble()/postamble() must preserve.
// RBX/RBP/R12-R15 are callee-saved in both the System V and the Win64
// calling conventions; Win64 additionally makes RDI and RSI callee-saved.
constexpr Xbyak::Operand::Code abi_save_gpr_regs[] = {
    Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
    Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15,
#ifdef _WIN32
    Xbyak::Operand::RDI, Xbyak::Operand::RSI,
#endif
};

#ifdef _WIN32
// Win64 ABI: first four integer arguments in RCX, RDX, R8, R9.
// abi_not_param1 is a scratch register guaranteed not to alias param1.
static const Xbyak::Reg64 abi_param1(Xbyak::Operand::RCX),
             abi_param2(Xbyak::Operand::RDX),
             abi_param3(Xbyak::Operand::R8),
             abi_param4(Xbyak::Operand::R9),
             abi_not_param1(Xbyak::Operand::RDI);
#else
// System V ABI: first six integer arguments in RDI, RSI, RDX, RCX, R8, R9.
static const Xbyak::Reg64 abi_param1(Xbyak::Operand::RDI),
             abi_param2(Xbyak::Operand::RSI),
             abi_param3(Xbyak::Operand::RDX),
             abi_param4(Xbyak::Operand::RCX),
             abi_param5(Xbyak::Operand::R8),
             abi_param6(Xbyak::Operand::R9),
             abi_not_param1(Xbyak::Operand::RCX);
#endif
#endif
107
108inline unsigned int get_cache_size(int level, bool per_core = true){
109 unsigned int l = level - 1;
110 // Currently, if XByak is not able to fetch the cache topology
111 // we default to 32KB of L1, 512KB of L2 and 1MB of L3 per core.
112 if (cpu.getDataCacheLevels() == 0){
113 const int L1_cache_per_core = 32000;
114 const int L2_cache_per_core = 512000;
115 const int L3_cache_per_core = 1024000;
116 int num_cores = per_core ? 1 : mkldnn_get_max_threads();
117 switch(l){
118 case(0): return L1_cache_per_core * num_cores;
119 case(1): return L2_cache_per_core * num_cores;
120 case(2): return L3_cache_per_core * num_cores;
121 default: return 0;
122 }
123 }
124 if (l < cpu.getDataCacheLevels()) {
125 return cpu.getDataCacheSize(l)
126 / (per_core ? cpu.getCoresSharingDataCache(l) : 1);
127 } else
128 return 0;
129}
130
131}
132
133class jit_generator : public Xbyak::CodeGenerator
134{
135private:
136 const size_t xmm_len = 16;
137#ifdef _WIN32
138 const size_t xmm_to_preserve_start = 6;
139 const size_t xmm_to_preserve = 10;
140#else
141 const size_t xmm_to_preserve_start = 0;
142 const size_t xmm_to_preserve = 0;
143#endif
144
145 const size_t num_abi_save_gpr_regs
146 = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]);
147
148 const size_t size_of_abi_save_regs
149 = num_abi_save_gpr_regs * rax.getBit() / 8
150 + xmm_to_preserve * xmm_len;
151
152public:
153 enum {
154 _cmp_eq_oq = 0u,
155 _cmp_lt_os = 1u,
156 _cmp_le_os = 2u,
157 _cmp_neq_uq = 4u,
158 _cmp_nlt_us = 5u,
159 _cmp_nle_us = 6u,
160
161 _op_floor = 1u,
162 _op_mxcsr = 4u,
163 };
164
165 Xbyak::Reg64 param1 = abi_param1;
166 const int EVEX_max_8b_offt = 0x200;
167 const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp;
168
169 inline size_t get_size_of_abi_save_regs() {
170 return size_of_abi_save_regs;
171 }
172
173 void preamble() {
174 if (xmm_to_preserve) {
175 sub(rsp, xmm_to_preserve * xmm_len);
176 for (size_t i = 0; i < xmm_to_preserve; ++i)
177 movdqu(ptr[rsp + i * xmm_len], Xbyak::Xmm(xmm_to_preserve_start + i));
178 }
179 for (size_t i = 0; i < num_abi_save_gpr_regs; ++i)
180 push(Xbyak::Reg64(abi_save_gpr_regs[i]));
181 if (mayiuse(avx512_common)) {
182 mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
183 }
184 }
185
186 void mic_prefetcht0(Xbyak::Address a) {
187 if (mayiuse(avx512_mic))
188 prefetcht0(a);
189 }
190
191 void mic_prefetcht1(Xbyak::Address a) {
192 if (mayiuse(avx512_mic))
193 prefetcht1(a);
194 }
195
196 void mic_prefetcht2(Xbyak::Address a) {
197 if (mayiuse(avx512_mic))
198 prefetcht2(a);
199 }
200
201 void uni_vzeroupper() {
202 if (mayiuse(avx) && !mayiuse(avx512_mic))
203 vzeroupper();
204 }
205
206 void postamble() {
207 for (size_t i = 0; i < num_abi_save_gpr_regs; ++i)
208 pop(Xbyak::Reg64(abi_save_gpr_regs[num_abi_save_gpr_regs - 1 - i]));
209 if (xmm_to_preserve) {
210 for (size_t i = 0; i < xmm_to_preserve; ++i)
211 movdqu(Xbyak::Xmm(xmm_to_preserve_start + i), ptr[rsp + i * xmm_len]);
212 add(rsp, xmm_to_preserve * xmm_len);
213 }
214 uni_vzeroupper();
215 ret();
216 }
217
218 template<typename T>
219 Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
220 T raw_offt, bool bcast = false)
221 {
222 using Xbyak::Zmm;
223 using Xbyak::Reg64;
224 using Xbyak::Address;
225 using Xbyak::RegExp;
226
227 assert(raw_offt <= INT_MAX);
228 auto offt = static_cast<int>(raw_offt);
229
230 int scale = 0;
231
232 if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) {
233 offt = offt - 2 * EVEX_max_8b_offt;
234 scale = 1;
235 } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) {
236 offt = offt - 4 * EVEX_max_8b_offt;
237 scale = 2;
238 }
239
240 auto re = RegExp() + base + offt;
241 if (scale)
242 re = re + reg_EVEX_max_8b_offt * scale;
243
244 if (bcast)
245 return zword_b [re];
246 else
247 return zword [re];
248 }
249
250 Xbyak::Address make_safe_addr(const Xbyak::Reg64 &reg_out, size_t offt,
251 const Xbyak::Reg64 &tmp_reg, bool bcast = false) {
252 if (offt > INT_MAX) {
253 mov(tmp_reg, offt);
254 return bcast ? ptr_b[reg_out + tmp_reg] : ptr[reg_out + tmp_reg];
255 } else {
256 return bcast ? ptr_b[reg_out + offt] : ptr[reg_out + offt];
257 }
258 }
259
260 Xbyak::Address EVEX_compress_addr_safe(const Xbyak::Reg64 &base,
261 size_t raw_offt, const Xbyak::Reg64 &reg_offt, bool bcast = false) {
262 if (raw_offt > INT_MAX) {
263 return make_safe_addr(base, raw_offt, reg_offt, bcast);
264 } else {
265 return EVEX_compress_addr(base, raw_offt, bcast);
266 }
267 }
268
269 void safe_add(const Xbyak::Reg64 &base, size_t raw_offt,
270 const Xbyak::Reg64 &reg_offt) {
271 if (raw_offt > INT_MAX) {
272 mov(reg_offt, raw_offt);
273 add(base, reg_offt);
274 } else {
275 add(base, raw_offt);
276 }
277 }
278
279 void safe_sub(const Xbyak::Reg64 &base, size_t raw_offt,
280 const Xbyak::Reg64 &reg_offt) {
281 if (raw_offt > INT_MAX) {
282 mov(reg_offt, raw_offt);
283 sub(base, reg_offt);
284 } else {
285 sub(base, raw_offt);
286 }
287 }
288
289 // Disallow char-based labels completely
290 void L(const char *label) = delete;
291 void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }
292
293 void uni_vpxor(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
294 const Xbyak::Operand &op) {
295 assert(x1.getIdx() == x2.getIdx());
296 pxor(x2, op);
297 }
298 void uni_vpxor(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
299 const Xbyak::Operand &op) {
300 if (mayiuse(avx2)) {
301 vpxor(x1, x2, op);
302 } else {
303 vxorps(x1, x2, op);
304 }
305 }
306 void uni_vpxor(const Xbyak::Zmm &x1, const Xbyak::Zmm &x2,
307 const Xbyak::Operand &op) {
308 vpxord(x1, x2, op);
309 }
310
311 void uni_vmovss(const Xbyak::Address& addr, const Xbyak::Xmm &x) {
312 movss(addr, x);
313 }
314 void uni_vmovss(const Xbyak::Address& addr, const Xbyak::Ymm &x) {
315 vmovss(addr, x);
316 }
317 void uni_vmovss(const Xbyak::Xmm &x, const Xbyak::Address& addr) {
318 movss(x, addr);
319 }
320 void uni_vmovss(const Xbyak::Ymm &x, const Xbyak::Address& addr) {
321 vmovss(x, addr);
322 }
323
324 void uni_vmovsd(const Xbyak::Address& addr, const Xbyak::Xmm &x) {
325 movsd(addr, x);
326 }
327 void uni_vmovsd(const Xbyak::Address& addr, const Xbyak::Ymm &x) {
328 vmovsd(addr, x);
329 }
330 void uni_vmovsd(const Xbyak::Xmm &x, const Xbyak::Address& addr) {
331 movsd(x, addr);
332 }
333 void uni_vmovsd(const Xbyak::Ymm &x, const Xbyak::Address& addr) {
334 vmovsd(x, addr);
335 }
336
337 void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Xmm &x) {
338 movdqu(addr, x);
339 }
340 void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Ymm &x) {
341 vmovdqu(addr, x);
342 }
343 void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Zmm &x) {
344 vmovdqu32(addr, x);
345 }
346
347 void uni_vmovdqu(const Xbyak::Xmm &x, const Xbyak::Address &addr) {
348 movdqu(x, addr);
349 }
350 void uni_vmovdqu(const Xbyak::Ymm &x, const Xbyak::Address &addr) {
351 vmovdqu(x, addr);
352 }
353 void uni_vmovdqu(const Xbyak::Zmm &x, const Xbyak::Address &addr) {
354 vmovdqu32(x, addr);
355 }
356
357 void uni_vmovups(const Xbyak::Address &addr, const Xbyak::Xmm &x) {
358 movups(addr, x);
359 }
360 void uni_vmovups(const Xbyak::Address &addr, const Xbyak::Ymm &x) {
361 vmovups(addr, x);
362 }
363
364 void uni_vmovups(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
365 movups(x, op);
366 }
367 void uni_vmovups(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
368 vmovups(x, op);
369 }
370
371 void uni_vmovntps(const Xbyak::Address &addr, const Xbyak::Xmm &x) {
372 movntps(addr, x);
373 }
374 void uni_vmovntps(const Xbyak::Address &addr, const Xbyak::Ymm &x) {
375 vmovntps(addr, x);
376 }
377
378 void uni_vbroadcastss(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
379 movss(x, op);
380 shufps(x, x, 0x0);
381 }
382 void uni_vbroadcastss(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
383 if (op.isMEM() || mayiuse(avx2)) {
384 vbroadcastss(x, op);
385 } else {
386 Xbyak::Xmm t(x.getIdx());
387 if (t.getIdx() != op.getIdx()) movss(t, op);
388 vinsertf128(x, x, t, 1);
389 vshufps(x, x, x, 0);
390 }
391 }
392
393 void uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
394 movsd(x, op);
395 pshufd(x, x, 0x0);
396 }
397 void uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
398 if (mayiuse(avx2)) {
399 vpbroadcastd(x, op);
400 } else {
401 Xbyak::Xmm t(x.getIdx());
402 if (t.getIdx() != op.getIdx()) movsd(t, op);
403 vinsertf128(x, x, t, 1);
404 vshufps(x, x, x, 0);
405 }
406 }
407
408 void uni_vrcpss(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
409 rcpss(x, op);
410 }
411 void uni_vrcpss(const Xbyak::Ymm &x1, const Xbyak::Xmm &x2) {
412 Xbyak::Xmm x1_(x1.getIdx());
413 Xbyak::Xmm x2_(x2.getIdx());
414 vrcpss(x1_, x1_, x2_);
415 }
416 void uni_vrcpss(const Xbyak::Ymm &x, const Xbyak::Address &op) {
417 Xbyak::Xmm x_(x.getIdx());
418 vrcpss(x_, x_, op);
419 }
420
421 void uni_vrcpps(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
422 rcpps(x, op);
423 }
424 void uni_vrcpps(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
425 vrcpps(x, op);
426 }
427 void uni_vrcpps(const Xbyak::Zmm &x, const Xbyak::Operand &op) {
428 vrcp14ps(x, op);
429 }
430
431 void uni_vdivps(const Xbyak::Xmm &x, const Xbyak::Operand &op1,
432 const Xbyak::Operand &op2 = Xbyak::Operand()) {
433 assert(x.getIdx() == op1.getIdx());
434 divps(x, op2);
435 }
436 void uni_vdivps(const Xbyak::Ymm &x, const Xbyak::Operand &op1,
437 const Xbyak::Operand &op2 = Xbyak::Operand()) {
438 vdivps(x, op1, op2);
439 }
440
441 void uni_vdivps(const Xbyak::Xmm &x, const Xbyak::Operand &op1,
442 const Xbyak::Operand &op2, const Xbyak::Xmm &buf) {
443 movups(buf, op1);
444 divps(buf, op2);
445 if (x.getIdx() != buf.getIdx()) {
446 movups(x, buf);
447 }
448 }
449
450 void uni_vdivps(const Xbyak::Ymm &x, const Xbyak::Operand &op1,
451 const Xbyak::Operand &op2, const Xbyak::Ymm &buf) {
452 vdivps(x, op1, op2);
453 }
454
455 void uni_vaddps(const Xbyak::Xmm &x, const Xbyak::Operand &op1,
456 const Xbyak::Operand &op2 = Xbyak::Operand()) {
457 assert(x.getIdx() == op1.getIdx());
458 addps(x, op2);
459 }
460 void uni_vaddps(const Xbyak::Ymm &x, const Xbyak::Operand &op1,
461 const Xbyak::Operand &op2 = Xbyak::Operand()) {
462 vaddps(x, op1, op2);
463 }
464
465 void uni_vpsignd(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2,
466 const Xbyak::Operand& op) {
467 assert(x1.getIdx() == x2.getIdx());
468 psignd(x1, op);
469 }
470 void uni_vpsignd(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2,
471 const Xbyak::Operand& op) {
472 vpsignd(x1, x2, op);
473 }
474
475 void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Operand &op1,
476 const Xbyak::Operand &op2 = Xbyak::Operand()) {
477 assert(x.getIdx() == op1.getIdx());
478 subps(x, op2);
479 }
480 void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Operand &op1,
481 const Xbyak::Operand &op2 = Xbyak::Operand()) {
482 vsubps(x, op1, op2);
483 }
484
485 void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Operand &op1,
486 const Xbyak::Operand &op2, const Xbyak::Xmm &buf) {
487 movups(buf, op1);
488 subps(buf, op2);
489 if (x.getIdx() != buf.getIdx()) {
490 movups(x, buf);
491 }
492 }
493
494 void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Operand &op1,
495 const Xbyak::Operand &op2, const Xbyak::Ymm &buf) {
496 vsubps(x, op1, op2);
497 }
498
499 void uni_vmulps(const Xbyak::Xmm &x, const Xbyak::Operand &op1,
500 const Xbyak::Operand &op2 = Xbyak::Operand()) {
501 assert(x.getIdx() == op1.getIdx());
502 mulps(x, op2);
503 }
504 void uni_vmulps(const Xbyak::Ymm &x, const Xbyak::Operand &op1,
505 const Xbyak::Operand &op2 = Xbyak::Operand()) {
506 vmulps(x, op1, op2);
507 }
508
509 void uni_vfmadd213ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
510 const Xbyak::Operand &op) {
511 mulps(x1, x2);
512 addps(x1, op);
513 }
514 void uni_vfmadd213ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
515 const Xbyak::Operand &op) {
516 vfmadd213ps(x1, x2, op);
517 }
518
519 void uni_vfmadd231ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
520 const Xbyak::Operand &op) {
521 mulps(x2, op);
522 addps(x1, x2);
523 }
524 void uni_vfmadd231ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
525 const Xbyak::Operand &op) {
526 vfmadd231ps(x1, x2, op);
527 }
528
529 void uni_vfnmadd231ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
530 const Xbyak::Operand &op) {
531 mulps(x2, op);
532 subps(x1, x2);
533 }
534
535 void uni_vfnmadd231ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
536 const Xbyak::Operand &op) {
537 vfnmadd231ps(x1, x2, op);
538 }
539
540 void uni_vsqrtps(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
541 sqrtps(x, op);
542 }
543 void uni_vsqrtps(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
544 vsqrtps(x, op);
545 }
546
547 void uni_vpaddd(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
548 const Xbyak::Operand &op) {
549 assert(x1.getIdx() == x2.getIdx());
550 paddd(x2, op);
551 }
552 void uni_vpaddd(const Xbyak::Ymm &x1, const Xbyak::Xmm &x2,
553 const Xbyak::Operand &op) {
554 vpaddd(x1, x2, op);
555 }
556
557 void uni_vandps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
558 const Xbyak::Operand &op = Xbyak::Operand()) {
559 assert(x1.getIdx() == x2.getIdx());
560 andps(x1, op);
561 }
562 void uni_vandps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
563 const Xbyak::Operand &op = Xbyak::Operand()) {
564 if (!mayiuse(avx512_common) || x1.getBit() < 512)
565 vandps(x1, x2, op);
566 else
567 vpandd(x1, x2, op);
568 }
569
570 void uni_vorps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
571 const Xbyak::Operand &op = Xbyak::Operand()) {
572 assert(x1.getIdx() == x2.getIdx());
573 orps(x1, op);
574 }
575 void uni_vorps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
576 const Xbyak::Operand &op = Xbyak::Operand()) {
577 if (!mayiuse(avx512_common) || x1.getBit() < 512)
578 vorps(x1, x2, op);
579 else
580 vpord(x1, x2, op);
581 }
582
583 void uni_vpslld(const Xbyak::Xmm &x, const Xbyak::Operand &op,
584 const int imm) {
585 assert(x.getIdx() == op.getIdx());
586 pslld(x, imm);
587 }
588 void uni_vpslld(const Xbyak::Ymm &x, const Xbyak::Operand &op,
589 const int imm) {
590 vpslld(x, op, imm);
591 }
592
593 void uni_vpsrld(const Xbyak::Xmm &x, const Xbyak::Operand &op,
594 const int imm) {
595 assert(x.getIdx() == op.getIdx());
596 psrld(x, imm);
597 }
598 void uni_vpsrld(const Xbyak::Ymm &x, const Xbyak::Operand &op,
599 const int imm) {
600 vpsrld(x, op, imm);
601 }
602
603 void uni_vmaxps(const Xbyak::Xmm &x, const Xbyak::Operand &op1,
604 const Xbyak::Operand &op2 = Xbyak::Operand()) {
605 assert(x.getIdx() == op1.getIdx());
606 maxps(x, op2);
607 }
608 void uni_vmaxps(const Xbyak::Ymm &x, const Xbyak::Operand &op1,
609 const Xbyak::Operand &op2 = Xbyak::Operand()) {
610 vmaxps(x, op1, op2);
611 }
612
613 void uni_vminps(const Xbyak::Xmm &x, const Xbyak::Operand &op1,
614 const Xbyak::Operand &op2 = Xbyak::Operand()) {
615 assert(x.getIdx() == op1.getIdx());
616 minps(x, op2);
617 }
618 void uni_vminps(const Xbyak::Ymm &x, const Xbyak::Operand &op1,
619 const Xbyak::Operand &op2 = Xbyak::Operand()) {
620 vminps(x, op1, op2);
621 }
622
623 void uni_vcmpgtps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
624 const Xbyak::Operand &op) {
625 assert(x1.getIdx() == x2.getIdx());
626 cmpps(x1, op, _cmp_nle_us);
627 }
628
629 void uni_vcmpgtps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
630 const Xbyak::Operand &op) {
631 vcmpgtps(x1, x2, op);
632 }
633
634 void uni_vcmpgeps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
635 const Xbyak::Operand &op) {
636 assert(x1.getIdx() == x2.getIdx());
637 cmpps(x1, op, _cmp_nlt_us);
638 }
639
640 void uni_vcmpgeps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
641 const Xbyak::Operand &op) {
642 vcmpps(x1, x2, op, _cmp_nlt_us);
643 }
644
645 void uni_vtestps(const Xbyak::Xmm &x1, const Xbyak::Operand &op) {
646 ptest(x1, op);
647 }
648
649 void uni_vtestps(const Xbyak::Ymm &x1, const Xbyak::Operand &op) {
650 assert(!(x1.isZMM() || op.isZMM()));
651 vtestps(x1, op);
652 }
653
654 void uni_vblendvps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
655 const Xbyak::Operand &op, const Xbyak::Xmm &msk) {
656 assert(x1.getIdx() == x2.getIdx());
657 assert(msk.getIdx() == 0);
658 blendvps(x1, op);
659 }
660 void uni_vblendvps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,
661 const Xbyak::Operand &op, const Xbyak::Ymm &msk) {
662 vblendvps(x1, x2, op, msk);
663 }
664
665 void uni_vroundps(const Xbyak::Xmm &x, const Xbyak::Operand &op,
666 const int imm) {
667 roundps(x, op, imm);
668 }
669 void uni_vroundps(const Xbyak::Ymm &x, const Xbyak::Operand &op,
670 const int imm) {
671 vroundps(x, op, imm);
672 }
673
674 void uni_vcvtps2dq(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
675 cvtps2dq(x, op);
676 }
677 void uni_vcvtps2dq(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
678 vcvtps2dq(x, op);
679 }
680
681 void uni_vcvtdq2ps(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
682 cvtdq2ps(x, op);
683 }
684 void uni_vcvtdq2ps(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
685 vcvtdq2ps(x, op);
686 }
687
688 void uni_vmovmskps(const Xbyak::Reg &x1, const Xbyak::Xmm &x2) {
689 movmskps(x1.cvt64(), x2);
690 }
691 void uni_vmovmskps(const Xbyak::Reg &x1, const Xbyak::Ymm &x2) {
692 vmovmskps(x1, x2);
693 }
694
695 void uni_vpackssdw(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op){
696 assert(x1.getIdx() == x1.getIdx());
697 packssdw(x1, op);
698 }
699 void uni_vpackssdw(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op){
700 vpackssdw(x1, x2, op);
701 }
702
703 void uni_vpackuswb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op){
704 assert(x1.getIdx() == x1.getIdx());
705 packuswb(x1, op);
706 }
707 void uni_vpackuswb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op){
708 vpackuswb(x1, x2, op);
709 }
710
711
712 void mul_by_const(const Xbyak::Reg &out,
713 const Xbyak::Reg64 &tmp, int value) {
714 // Generates a shift + add sequence for multiplicating contents of the
715 // out register by a known JIT-time value. Clobbers the tmp register.
716 //
717 // Pros compared to mul/imul:
718 // - does not require using known registers
719 // - not microcoded on Intel(R) Xeon Phi(TM) processors
720 // Still, there are probably a lot of cases when mul/imul is faster on
721 // Intel(R) Core(TM) processors. Not intended for critical path.
722
723 // TODO: detect when overflow is emminent (Roma)
724 // TODO: detect when using mul/imul is a better option (Roma)
725
726 int p = 0; // the current power of 2
727 int old_p = 0; // the last seen power of 2 such that value[old_p] != 0
728
729 xor_(tmp, tmp);
730 while (value) {
731 if (value & 1) {
732 int shift = p - old_p;
733 if (shift) {
734 shl(out, shift);
735 old_p = p;
736 }
737 add(tmp, out);
738 }
739 value >>= 1;
740 p++;
741 }
742 mov(out, tmp);
743 }
744
745public:
746 jit_generator(
747 void *code_ptr = nullptr,
748 size_t code_size = 256 * 1024
749 ) : Xbyak::CodeGenerator(code_size, code_ptr)
750 {
751 }
752 virtual ~jit_generator() {}
753
754 virtual const char *name() const = 0;
755 virtual const char *source_file() const = 0;
756
757 const Xbyak::uint8 *getCode() {
758 const Xbyak::uint8 *code = CodeGenerator::getCode();
759 size_t code_size = getSize();
760 jit_utils::register_jit_code(code, code_size, name(), source_file());
761 return code;
762 }
763
764 template<typename F> const F getCode() {
765 return (const F)getCode();
766 }
767};
768
769}
770}
771}
772
773#endif
774