/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <assert.h>

#include "mkldnn_thread.hpp"
#include "mkldnn_types.h"
#include "nstl.hpp"
#include "utils.hpp"

#include "cpu_reducer.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

using namespace memory_tracking::names;

void reduce_balancer_t::balance() {
    using namespace nstl;
    using namespace utils;

    assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0);

    const int job_complexity = 1;

    const int min_njobs_per_group = max(1, njobs_ / nthr_);
    const int max_njobs_per_group = max(1,
            static_cast<int>(max_buffer_size_ / (nthr_ * job_size_)));

    /* initial guess */
    int ngroups = min(njobs_ / min_njobs_per_group, nthr_);
    int nthr_per_group = syncable_ ? min(nthr_ / ngroups, reduction_size_) : 1;
    int njobs_per_group_ub = div_up(njobs_, ngroups);

    /* rough upper-bound estimation, will be fixed during brute force */
    size_t thread_complexity_ub = njobs_ * job_size_ * reduction_size_;
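
    /* Reading aid (illustrative, restating the loop below): the brute force
     * minimizes a per-thread cost model
     *     group_size_ub * (job_complexity * thread_reduction_ub + has_sync)
     * where group_size_ub = job_size_ * njobs_per_group_ub is the data one
     * group owns, thread_reduction_ub is the reduction depth each thread
     * handles, and has_sync (0 or 1) charges one extra pass over the group's
     * data for the final cross-thread reduction. E.g. with reduction_size_
     * = 32 split over 4 threads, a thread pays group_size_ub * (1 * 8 + 1). */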
    /* brute force parameters for the best balance... */
    for (int c_njobs_per_group = min_njobs_per_group;
            c_njobs_per_group < njobs_; ++c_njobs_per_group) {
        /* current assumption */
        int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_);
        int c_nthr_per_group = syncable_
                ? min(nthr_ / c_ngroups, reduction_size_) : 1;
        int c_njobs_per_group_ub = div_up(njobs_, c_ngroups);

        if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group)
            continue;

        int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group);
        size_t c_group_size_ub = job_size_ * c_njobs_per_group_ub;
        size_t c_thread_complexity_ub = c_group_size_ub * (
                job_complexity * c_thread_reduction_ub
                + (c_nthr_per_group != 1));

        if (c_thread_complexity_ub < thread_complexity_ub) {
            ngroups = c_ngroups;
            nthr_per_group = c_nthr_per_group;
            njobs_per_group_ub = c_njobs_per_group_ub;
            thread_complexity_ub = c_thread_complexity_ub;
        }
    }

    assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1);
    assert(ngroups * nthr_per_group <= nthr_);
    assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
            || nthr_per_group == 1); /* no reduction buffer overflow */
    assert(IMPLICATION(!syncable_, nthr_per_group == 1));

    ngroups_ = ngroups;
    nthr_per_group_ = nthr_per_group;
    njobs_per_group_ub_ = njobs_per_group_ub;
}
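
// Usage sketch (illustrative numbers; assumes the (nthr, job_size, njobs,
// reduction_size, max_buffer_size) constructor declared in cpu_reducer.hpp,
// which runs balance() on construction):
//
//     reduce_balancer_t balancer(/*nthr*/ 16, /*job_size*/ 1024,
//             /*njobs*/ 256, /*reduction_size*/ 32,
//             /*max_buffer_size*/ 1 << 20);
//     // the chosen decomposition is now in balancer.ngroups_,
//     // balancer.nthr_per_group_ and balancer.njobs_per_group_ub_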

/* reducer jit-ted driver */

using namespace Xbyak;

template <impl::data_type_t data_type>
struct reducer_2d_driver_t: public c_compatible {
    typedef typename prec_traits<data_type>::type data_t;

    reducer_2d_driver_t(int n_src, size_t src_ld,
            size_t src_step, size_t dst_step, bool nullify_dst)
        : n_src_(n_src), src_ld_(src_ld), src_step_(src_step)
        , dst_step_(dst_step), nullify_dst_(nullify_dst), ker_(nullptr) {}
    virtual ~reducer_2d_driver_t() {}
    void operator()(data_t *dst, const data_t *srcs, size_t ny, size_t nx)
    { assert(ker_); ker_(dst, srcs, ny, nx); }

protected:
    int n_src_;
    size_t src_ld_, src_step_, dst_step_;
    bool nullify_dst_;
    void (*ker_)(data_t *dst, const data_t *srcs, size_t ny, size_t nx);
};
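
/* The jit-generated kernel below implements this reference loop (pseudocode;
 * all strides in elements):
 *
 *     for (y = 0; y < ny; ++y)
 *     for (x = 0; x < nx; ++x) {
 *         acc = nullify_dst_ ? 0 : dst[y * dst_step_ + x];
 *         for (s = 0; s < n_src_; ++s)
 *             acc += srcs[s * src_ld_ + y * src_step_ + x];
 *         dst[y * dst_step_ + x] = acc;
 *     }
 */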

template <impl::data_type_t data_type, cpu_isa_t isa>
struct reducer_2d_driver_f_s_32_t: public reducer_2d_driver_t<data_type>,
    public jit_generator
{
    DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)

    /* cpu specific part */
    using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type;
    const AddressFrame &vmmword = (isa == avx2) ? yword : zword;
    void uni_vadd(const Xmm& x1, const Xmm& x2, const Operand& op)
    { if (data_type == data_type::f32) vaddps(x1, x2, op);
      else vpaddd(x1, x2, op); }
    void uni_add(const Xmm& x1, const Operand& op)
    { if (data_type == data_type::f32) addss(x1, op); else paddd(x1, op); }

    const int vlen = cpu_isa_traits<isa>::vlen;
    const int typesize
            = sizeof(typename mkldnn::impl::prec_traits<data_type>::type);
    Xbyak::Reg64 reg_dst = abi_param1;
    Xbyak::Reg64 reg_src = abi_param2;
    Xbyak::Reg64 reg_ny = abi_param3;
    Xbyak::Reg64 reg_nx = abi_param4;

    Xbyak::Reg64 reg_x = rax;
    Xbyak::Reg64 reg_src_id = r10;

    reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst)
        : reducer_2d_driver_t<data_type>(n_src, src_ld, src_step,
                dst_step, nullify_dst)
    { generate(); }

    void nullify_dst(int nloads, int load_len) {
        UNUSED(load_len);
        for (int i = 0; i < nloads; ++i)
            uni_vpxor(Vmm(i), Vmm(i), Vmm(i));
        /* TODO: prefetch dst? */
    }

    void load_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                movd(Xmm(i), ptr[reg_dst + i * load_len]);
            else if (load_len == vlen)
                vmovups(Vmm(i), ptr[reg_dst + i * load_len]);
            else
                assert(!"unsupported");
        }
    }

    void store_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                movd(ptr[reg_dst + i * load_len], Xmm(i));
            else if (load_len == vlen)
                vmovups(ptr[reg_dst + i * load_len], Vmm(i));
            else
                assert(!"unsupported");
        }
    }

    void accumulate(int nloads, int load_len, size_t base_off) {
        for (int i = 0; i < nloads; ++i) {
            size_t off = base_off + i * load_len;

            if (load_len == typesize)
                uni_add(Xmm(i), ptr[reg_src + off]);
            else if (load_len == vlen)
                uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
            else
                assert(!"unsupported");
        }
    }

    void loop_x() {
        const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1};
        const int nbranches = sizeof(nloads) / sizeof(nloads[0]);

        const int load_len[nbranches] = {vlen, vlen, typesize};
        Label loop_x_label[nbranches + 1];
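
        /* Three progressively narrower loops (a reading aid): branch 0 keeps
         * all vector registers in flight (n_vregs full vectors per
         * iteration), branch 1 retires one full vector per iteration, and
         * branch 2 drains the scalar tail one element at a time. Each branch
         * runs while reg_x can cover its width, then falls through to the
         * next label. */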

        mov(reg_x, reg_nx);

        for (int id = 0; id < nbranches; ++id) {
            L(loop_x_label[id]);

            cmp(reg_x, nloads[id] * load_len[id]);
            jl(loop_x_label[id + 1], T_NEAR);

            if (this->nullify_dst_)
                nullify_dst(nloads[id], load_len[id]);
            else
                load_dst(nloads[id], load_len[id]);

            if (nloads[id] > 1) {
                Label loop_srcs;
                mov(reg_src_id, this->n_src_);
                L(loop_srcs);

                accumulate(nloads[id], load_len[id], 0);
                add(reg_src, this->src_ld_ * typesize);

                dec(reg_src_id);
                jnz(loop_srcs, T_NEAR);

                sub(reg_src, this->n_src_ * this->src_ld_ * typesize);
            } else {
                for (int src_id = 0; src_id < this->n_src_; ++src_id) {
                    const size_t base_off = src_id * this->src_ld_ * typesize;
                    accumulate(nloads[id], load_len[id], base_off);
                }
            }

            store_dst(nloads[id], load_len[id]);

            add(reg_src, nloads[id] * load_len[id]);
            add(reg_dst, nloads[id] * load_len[id]);

            sub(reg_x, nloads[id] * load_len[id]);

            jmp(loop_x_label[id], T_NEAR);
        }

        L(loop_x_label[nbranches]);

        /* restore address registers */
        sub(reg_src, reg_nx);
        sub(reg_dst, reg_nx);
    }

    void generate() {
        assert(isa == avx2 || isa == avx512_common || isa == avx512_mic);

        preamble();

        shl(reg_nx, 2); /* reg_nx: elements -> bytes (typesize == 4) */

        Label ny_loop;
        L(ny_loop);

        loop_x();

        add(reg_dst, this->dst_step_ * typesize);
        add(reg_src, this->src_step_ * typesize);

        dec(reg_ny);
        jnz(ny_loop, T_NEAR);

        postamble();
        this->ker_ = reinterpret_cast<decltype(this->ker_)>(
                const_cast<uint8_t*>(this->getCode()));
    }
};

template <impl::data_type_t data_type>
inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src,
        size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) {
    if (mayiuse(avx512_common))
        return new reducer_2d_driver_f_s_32_t<data_type, avx512_common>(n_src,
                src_ld, src_step, dst_step, nullify_dst);
    else if (mayiuse(avx2))
        return new reducer_2d_driver_f_s_32_t<data_type, avx2>(n_src, src_ld,
                src_step, dst_step, nullify_dst);
    assert(!"unimplemented");
    return nullptr;
}
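
// Usage sketch (illustrative values): accumulate three partial buffers, laid
// out back to back with a leading dimension of 4096 elements, into dst with
// a single call; ny == 1 here, so the per-row steps are irrelevant and 0:
//
//     reducer_2d_driver_t<data_type::f32> *drv
//             = create_reduce_2d_drv<data_type::f32>(
//                     /*n_src*/ 3, /*src_ld*/ 4096,
//                     /*src_step*/ 0, /*dst_step*/ 0, /*nullify_dst*/ false);
//     (*drv)(dst, srcs, /*ny*/ 1, /*nx*/ 4096);
//     delete drv;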

/* cpu_reducer_t */

template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

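    /* thread 0 of each group accumulates directly into the user's dst (see
     * get_local_ptr() below), so only the remaining nthr_per_group_ - 1
     * threads of a group need a private buffer; one barrier context per
     * group guards the final cross-thread reduction */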
    const size_t space_size = balancer_.ngroups_
            * (balancer_.nthr_per_group_ - 1)
            * cpu_reducer_t<data_type>::space_per_thread(balancer_);
    scratchpad.book(key_reducer_space, sizeof(data_t) * space_size, PAGE_4K);
    scratchpad.book(key_reducer_space_bctx,
            sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr)
{
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1,
            space_per_thread(balancer()), 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::~cpu_reducer_t() { delete drv_; }

template <impl::data_type_t data_type>
typename cpu_reducer_t<data_type>::data_t *
cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);

    /* thread 0 of each group writes directly to the destination */
    if (id_in_grp == 0)
        return dst + balancer().ithr_job_off(ithr) * balancer().job_size_;

    const int grp_id = balancer().group_id(ithr);
    const int offset_factor = grp_id * (balancer().nthr_per_group_ - 1)
            + (id_in_grp - 1);

    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
    if (redundant_reduction) return;

#ifdef SIMPLE_IMPL
    if (balancer().id_in_group(ithr) != 0)
        return; /* only thread 0 of each group does the reduction */

    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    data_t *d = get_local_ptr(ithr, dst, scratchpad);
    for (int id_in_grp = 1; id_in_grp < balancer_.nthr_per_group_; ++id_in_grp)
    {
        const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad);
        for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i)
            d[i] += space[i];
    }
#else
    using namespace utils;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const size_t cl = 64 / sizeof(data_t); /* elements per 64-byte cache line */
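    /* split the group's flat reduction range among its threads at cache-line
     * granularity so no two threads ever store to the same line.
     * Illustrative numbers for f32 (cl = 16): a 10000-element reduction over
     * 4 threads becomes 625 cache lines, which balance211 hands out as
     * chunks of 157 or 156 lines per thread. */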

    const size_t reduction_size = njobs_in_grp * balancer().job_size_;
    size_t start{0}, end{0};
    balance211(div_up(reduction_size, cl), balancer().nthr_per_group_,
            id_in_grp, start, end);

    if (start == end) return;

    data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl;
    const data_t *space = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad)
            + start * cl;
    const size_t len = nstl::min(end * cl, reduction_size) - start * cl;

    (*drv_)(d, space, 1, len);
#endif
}

template struct cpu_reducer_t<data_type::f32>;
template struct cpu_reducer_t<data_type::s32>;

/* cpu_reducer_2d_t */

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_
            * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_);
    scratchpad.book(key_reducer_space, sizeof(data_t) * space_size);
    scratchpad.book(key_reducer_space_bctx,
            sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr)
{
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_,
            space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_,
            true);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() { delete drv_; }

template <impl::data_type_t data_type>
typename cpu_reducer_2d_t<data_type>::data_t *cpu_reducer_2d_t<data_type>::
get_local_ptr(int ithr, const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);
    const int grp_id = balancer().group_id(ithr);
    const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp;
    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
int cpu_reducer_2d_t<data_type>::choose_x_blocking(int nx, int ny,
        int nthr_per_grp) const {
    // find an x_blocking that better balances the reduction work across threads
    assert(conf_.x_block_ > 0 && nx > conf_.x_block_
            && nx % conf_.x_block_ == 0);
    int x_blocking = nx / conf_.x_block_;
    int min_x_blocking =
            utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny));
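    /* e.g. (illustrative): nx = 96, x_block_ = 8, ny = 1, nthr_per_grp = 2
     * gives x_blocking = 12 and min_x_blocking = 6; the loop below halves
     * once (12 -> 6), 6 resists further /2 and /3 cuts, and the returned
     * blocking is 6 * 8 = 48 elements, i.e. one span per thread */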
    while (true) {
        if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2)
            x_blocking /= 2;
        else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3)
            x_blocking /= 3;
        else
            break;
    }
    if (x_blocking >= min_x_blocking * 4) x_blocking = 1;
    x_blocking *= conf_.x_block_;
    return x_blocking;
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_block(const data_t* space_base,
        data_t *dst, int job, int start_y, int start_x,
        int ny_start, int nx_start, int ny_step, int nx_step) const {
    data_t *d = dst + (start_y + ny_start) * conf_.dst_x_
            + start_x + nx_start;
    const data_t *space = space_base + job * balancer().job_size_
            + ny_start * conf_.job_size_x_ + nx_start;
#ifdef SIMPLE_IMPL
    for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) {
        const data_t *w = &space[idg * space_per_thread(balancer())];
        for (int y = 0; y < ny_step; ++y)
            for (int x = 0; x < nx_step; ++x) {
                d[y * conf_.dst_x_ + x]
                        = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x])
                        + w[y * conf_.job_size_x_ + x];
            }
    }
#else
    (*drv_)(d, space, ny_step, nx_step);
#endif
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
    if (redundant_reduction) return;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_);
    const int global_job_start = balancer().ithr_job_off(ithr);

    const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad);

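    /* re-partition the group's threads for the reduction itself: pr_grps
     * sub-groups take disjoint sets of jobs, and within one job the
     * pr_nthr_per_grp threads of a sub-group split the rows/x-blocks */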
    const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_);
    const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps;

    if (id_in_grp >= pr_grps * pr_nthr_per_grp)
        return; /* idle */

    const int pr_my_grp = id_in_grp / pr_nthr_per_grp;
    const int pr_my_id = id_in_grp % pr_nthr_per_grp;

    int pr_job_start{0}, pr_job_end{0};
    balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end);

    for (int j = pr_job_start; j < pr_job_end; ++j) {
        const int global_job = global_job_start + j;
        const int j_y = global_job / njobs_x;
        const int j_x = global_job % njobs_x;
        const int start_y = j_y * conf_.job_size_y_;
        const int start_x = j_x * conf_.job_size_x_;
        const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_);
        const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_);
        int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp);

        int nxy_start{0}, nxy_end{0};
        balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id,
                nxy_start, nxy_end);
        if (nxy_start == nxy_end) continue;
        nxy_start *= x_blocking;
        nxy_end *= x_blocking;

        int nxy = nxy_start;
        if (nxy % nx != 0) {
            int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy);
            reduce_block(space_base, dst, j, start_y, start_x,
                    nxy / nx, nxy % nx, 1, nx_step);
            nxy += nx_step;
        }
        if ((nxy_end - nxy) > nx) {
            int ny_step = (nxy_end - nxy) / nx;
            reduce_block(space_base, dst, j, start_y, start_x,
                    nxy / nx, nxy % nx, ny_step, nx);
            nxy += nx * ny_step;
        }
        if ((nxy_end - nxy) > 0) {
            reduce_block(space_base, dst, j, start_y, start_x,
                    nxy / nx, nxy % nx, 1, nxy_end - nxy);
        }
    }
}

template struct cpu_reducer_2d_t<data_type::f32>;
template struct cpu_reducer_2d_t<data_type::s32>;

/* accumulator section */

template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t(): drv_(nullptr) {
    drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
void cpu_accumulator_1d_t<data_type>::accumulate(data_t *dst,
        const data_t *src, size_t size) {
    (*drv_)(dst, src, 1, size);
}

template struct cpu_accumulator_1d_t<data_type::f32>;
template struct cpu_accumulator_1d_t<data_type::s32>;

}
}
}

// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s