/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <assert.h>

#include "mkldnn_thread.hpp"
#include "mkldnn_types.h"
#include "nstl.hpp"
#include "utils.hpp"

#include "cpu_reducer.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

using namespace memory_tracking::names;

void reduce_balancer_t::balance() {
    using namespace nstl;
    using namespace utils;

    assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0);

    const int job_complexity = 1;

    const int min_njobs_per_group = max(1, njobs_ / nthr_);
    const int max_njobs_per_group = max(1,
            static_cast<int>(max_buffer_size_ / (nthr_ * job_size_)));

    /* initial guess */
    int ngroups = min(njobs_ / min_njobs_per_group, nthr_);
    int nthr_per_group = syncable_ ? min(nthr_ / ngroups, reduction_size_) : 1;
    int njobs_per_group_ub = div_up(njobs_, ngroups);

    /* rough upper-bound estimation, will be fixed during brute force */
    size_t thread_complexity_ub = njobs_ * job_size_ * reduction_size_;
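
    /* Reading aid (illustrative, restating the loop below): the brute force
     * minimizes a per-thread cost model
     *     group_size_ub * (job_complexity * thread_reduction_ub + has_sync)
     * where group_size_ub = job_size_ * njobs_per_group_ub is the data one
     * group owns, thread_reduction_ub is the reduction depth each thread
     * handles, and has_sync (0 or 1) charges one extra pass over the group's
     * data for the final cross-thread reduction. E.g. with reduction_size_
     * = 32 split over 4 threads, a thread pays group_size_ub * (1 * 8 + 1). */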
    /* brute force parameters for the best balance... */
    for (int c_njobs_per_group = min_njobs_per_group;
            c_njobs_per_group < njobs_; ++c_njobs_per_group) {
        /* current assumption */
        int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_);
        int c_nthr_per_group = syncable_
                ? min(nthr_ / c_ngroups, reduction_size_) : 1;
        int c_njobs_per_group_ub = div_up(njobs_, c_ngroups);

        if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group)
            continue;

        int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group);
        size_t c_group_size_ub = job_size_ * c_njobs_per_group_ub;
        size_t c_thread_complexity_ub = c_group_size_ub * (
                job_complexity * c_thread_reduction_ub
                + (c_nthr_per_group != 1));

        if (c_thread_complexity_ub < thread_complexity_ub) {
            ngroups = c_ngroups;
            nthr_per_group = c_nthr_per_group;
            njobs_per_group_ub = c_njobs_per_group_ub;
            thread_complexity_ub = c_thread_complexity_ub;
        }
    }

    assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1);
    assert(ngroups * nthr_per_group <= nthr_);
    assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
            || nthr_per_group == 1); /* no reduction buffer overflow */
    assert(IMPLICATION(!syncable_, nthr_per_group == 1));

    ngroups_ = ngroups;
    nthr_per_group_ = nthr_per_group;
    njobs_per_group_ub_ = njobs_per_group_ub;
}
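
// Usage sketch (illustrative numbers; assumes the (nthr, job_size, njobs,
// reduction_size, max_buffer_size) constructor declared in cpu_reducer.hpp,
// which runs balance() on construction):
//
//     reduce_balancer_t balancer(/*nthr*/ 16, /*job_size*/ 1024,
//             /*njobs*/ 256, /*reduction_size*/ 32,
//             /*max_buffer_size*/ 1 << 20);
//     // the chosen decomposition is now in balancer.ngroups_,
//     // balancer.nthr_per_group_ and balancer.njobs_per_group_ub_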

/* reducer jit-ted driver */

using namespace Xbyak;

template <impl::data_type_t data_type>
struct reducer_2d_driver_t: public c_compatible {
    typedef typename prec_traits<data_type>::type data_t;

    reducer_2d_driver_t(int n_src, size_t src_ld,
            size_t src_step, size_t dst_step, bool nullify_dst)
        : n_src_(n_src), src_ld_(src_ld), src_step_(src_step)
        , dst_step_(dst_step), nullify_dst_(nullify_dst), ker_(nullptr) {}
    virtual ~reducer_2d_driver_t() {}
    void operator()(data_t *dst, const data_t *srcs, size_t ny, size_t nx)
    { assert(ker_); ker_(dst, srcs, ny, nx); }

protected:
    int n_src_;
    size_t src_ld_, src_step_, dst_step_;
    bool nullify_dst_;
    void (*ker_)(data_t *dst, const data_t *srcs, size_t ny, size_t nx);
};
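
/* The jit-generated kernel below implements this reference loop (pseudocode;
 * all strides in elements):
 *
 *     for (y = 0; y < ny; ++y)
 *     for (x = 0; x < nx; ++x) {
 *         acc = nullify_dst_ ? 0 : dst[y * dst_step_ + x];
 *         for (s = 0; s < n_src_; ++s)
 *             acc += srcs[s * src_ld_ + y * src_step_ + x];
 *         dst[y * dst_step_ + x] = acc;
 *     }
 */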

template <impl::data_type_t data_type, cpu_isa_t isa>
struct reducer_2d_driver_f_s_32_t: public reducer_2d_driver_t<data_type>,
    public jit_generator
{
    DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)

    /* cpu specific part */
    using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type;
    const AddressFrame &vmmword = (isa == avx2) ? yword : zword;
    void uni_vadd(const Xmm& x1, const Xmm& x2, const Operand& op)
    { if (data_type == data_type::f32) vaddps(x1, x2, op);
      else vpaddd(x1, x2, op); }
    void uni_add(const Xmm& x1, const Operand& op)
    { if (data_type == data_type::f32) addss(x1, op); else paddd(x1, op); }

    const int vlen = cpu_isa_traits<isa>::vlen;
    const int typesize
            = sizeof(typename mkldnn::impl::prec_traits<data_type>::type);
    Xbyak::Reg64 reg_dst = abi_param1;
    Xbyak::Reg64 reg_src = abi_param2;
    Xbyak::Reg64 reg_ny = abi_param3;
    Xbyak::Reg64 reg_nx = abi_param4;

    Xbyak::Reg64 reg_x = rax;
    Xbyak::Reg64 reg_src_id = r10;

    reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step,
            size_t dst_step, bool nullify_dst)
        : reducer_2d_driver_t<data_type>(n_src, src_ld, src_step,
                dst_step, nullify_dst)
    { generate(); }

    void nullify_dst(int nloads, int load_len) {
        UNUSED(load_len);
        for (int i = 0; i < nloads; ++i)
            uni_vpxor(Vmm(i), Vmm(i), Vmm(i));
        /* TODO: prefetch dst? */
    }

    void load_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                movd(Xmm(i), ptr[reg_dst + i * load_len]);
            else if (load_len == vlen)
                vmovups(Vmm(i), ptr[reg_dst + i * load_len]);
            else
                assert(!"unsupported");
        }
    }

    void store_dst(int nloads, int load_len) {
        for (int i = 0; i < nloads; ++i) {
            if (load_len == typesize)
                movd(ptr[reg_dst + i * load_len], Xmm(i));
            else if (load_len == vlen)
                vmovups(ptr[reg_dst + i * load_len], Vmm(i));
            else
                assert(!"unsupported");
        }
    }

    void accumulate(int nloads, int load_len, size_t base_off) {
        for (int i = 0; i < nloads; ++i) {
            size_t off = base_off + i * load_len;

            if (load_len == typesize)
                uni_add(Xmm(i), ptr[reg_src + off]);
            else if (load_len == vlen)
                uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
            else
                assert(!"unsupported");
        }
    }

    void loop_x() {
        const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1};
        const int nbranches = sizeof(nloads) / sizeof(nloads[0]);

        const int load_len[nbranches] = {vlen, vlen, typesize};
        Label loop_x_label[nbranches + 1];
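
        /* Three progressively narrower loops (a reading aid): branch 0 keeps
         * all vector registers in flight (n_vregs full vectors per
         * iteration), branch 1 retires one full vector per iteration, and
         * branch 2 drains the scalar tail one element at a time. Each branch
         * runs while reg_x can cover its width, then falls through to the
         * next label. */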

        mov(reg_x, reg_nx);

        for (int id = 0; id < nbranches; ++id) {
            L(loop_x_label[id]);

            cmp(reg_x, nloads[id] * load_len[id]);
            jl(loop_x_label[id + 1], T_NEAR);

            if (this->nullify_dst_)
                nullify_dst(nloads[id], load_len[id]);
            else
                load_dst(nloads[id], load_len[id]);

            if (nloads[id] > 1) {
                Label loop_srcs;
                mov(reg_src_id, this->n_src_);
                L(loop_srcs);

                accumulate(nloads[id], load_len[id], 0);
                add(reg_src, this->src_ld_ * typesize);

                dec(reg_src_id);
                jnz(loop_srcs, T_NEAR);

                sub(reg_src, this->n_src_ * this->src_ld_ * typesize);
            } else {
                for (int src_id = 0; src_id < this->n_src_; ++src_id) {
                    const size_t base_off = src_id * this->src_ld_ * typesize;
                    accumulate(nloads[id], load_len[id], base_off);
                }
            }

            store_dst(nloads[id], load_len[id]);

            add(reg_src, nloads[id] * load_len[id]);
            add(reg_dst, nloads[id] * load_len[id]);

            sub(reg_x, nloads[id] * load_len[id]);

            jmp(loop_x_label[id], T_NEAR);
        }

        L(loop_x_label[nbranches]);

        /* restore address registers */
        sub(reg_src, reg_nx);
        sub(reg_dst, reg_nx);
    }

    void generate() {
        assert(isa == avx2 || isa == avx512_common || isa == avx512_mic);

        preamble();

        shl(reg_nx, 2); /* reg_nx: elements -> bytes (typesize == 4) */

        Label ny_loop;
        L(ny_loop);

        loop_x();

        add(reg_dst, this->dst_step_ * typesize);
        add(reg_src, this->src_step_ * typesize);

        dec(reg_ny);
        jnz(ny_loop, T_NEAR);

        postamble();
        this->ker_ = reinterpret_cast<decltype(this->ker_)>(
                const_cast<uint8_t*>(this->getCode()));
    }
};

template <impl::data_type_t data_type>
inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src,
        size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) {
    if (mayiuse(avx512_common))
        return new reducer_2d_driver_f_s_32_t<data_type, avx512_common>(n_src,
                src_ld, src_step, dst_step, nullify_dst);
    else if (mayiuse(avx2))
        return new reducer_2d_driver_f_s_32_t<data_type, avx2>(n_src, src_ld,
                src_step, dst_step, nullify_dst);
    assert(!"unimplemented");
    return nullptr;
}
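
// Usage sketch (illustrative values): accumulate three partial buffers, laid
// out back to back with a leading dimension of 4096 elements, into dst with
// a single call; ny == 1 here, so the per-row steps are irrelevant and 0:
//
//     reducer_2d_driver_t<data_type::f32> *drv
//             = create_reduce_2d_drv<data_type::f32>(
//                     /*n_src*/ 3, /*src_ld*/ 4096,
//                     /*src_step*/ 0, /*dst_step*/ 0, /*nullify_dst*/ false);
//     (*drv)(dst, srcs, /*ny*/ 1, /*nx*/ 4096);
//     delete drv;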

/* cpu_reducer_t */

template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

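    /* thread 0 of each group accumulates directly into the user's dst (see
     * get_local_ptr() below), so only the remaining nthr_per_group_ - 1
     * threads of a group need a private buffer; one barrier context per
     * group guards the final cross-thread reduction */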
    const size_t space_size = balancer_.ngroups_
            * (balancer_.nthr_per_group_ - 1)
            * cpu_reducer_t<data_type>::space_per_thread(balancer_);
    scratchpad.book(key_reducer_space, sizeof(data_t) * space_size, PAGE_4K);
    scratchpad.book(key_reducer_space_bctx,
            sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr)
{
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1,
            space_per_thread(balancer()), 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_reducer_t<data_type>::~cpu_reducer_t() { delete drv_; }

template <impl::data_type_t data_type>
typename cpu_reducer_t<data_type>::data_t *
cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);

    /* thread 0 of each group writes directly to the destination */
    if (id_in_grp == 0)
        return dst + balancer().ithr_job_off(ithr) * balancer().job_size_;

    const int grp_id = balancer().group_id(ithr);
    const int offset_factor = grp_id * (balancer().nthr_per_group_ - 1)
            + (id_in_grp - 1);

    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
    if (redundant_reduction) return;

#ifdef SIMPLE_IMPL
    if (balancer().id_in_group(ithr) != 0)
        return; /* only thread 0 of each group does the reduction */

    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    data_t *d = get_local_ptr(ithr, dst, scratchpad);
    for (int id_in_grp = 1; id_in_grp < balancer_.nthr_per_group_; ++id_in_grp)
    {
        const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad);
        for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i)
            d[i] += space[i];
    }
#else
    using namespace utils;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const size_t cl = 64 / sizeof(data_t); /* elements per 64-byte cache line */
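    /* split the group's flat reduction range among its threads at cache-line
     * granularity so no two threads ever store to the same line.
     * Illustrative numbers for f32 (cl = 16): a 10000-element reduction over
     * 4 threads becomes 625 cache lines, which balance211 hands out as
     * chunks of 157 or 156 lines per thread. */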

    const size_t reduction_size = njobs_in_grp * balancer().job_size_;
    size_t start{0}, end{0};
    balance211(div_up(reduction_size, cl), balancer().nthr_per_group_,
            id_in_grp, start, end);

    if (start == end) return;

    data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl;
    const data_t *space = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad)
            + start * cl;
    const size_t len = nstl::min(end * cl, reduction_size) - start * cl;

    (*drv_)(d, space, 1, len);
#endif
}

template struct cpu_reducer_t<data_type::f32>;
template struct cpu_reducer_t<data_type::s32>;

/* cpu_reducer_2d_t */

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad(
        memory_tracking::registrar_t &scratchpad) const {
    if (balancer_.nthr_per_group_ == 1) return;

    const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_
            * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_);
    scratchpad.book(key_reducer_space, sizeof(data_t) * space_size);
    scratchpad.book(key_reducer_space_bctx,
            sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf)
    : conf_(conf), drv_(nullptr)
{
    if (balancer().nthr_per_group_ == 1) return;

    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_,
            space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_,
            true);
}

template <impl::data_type_t data_type>
cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() { delete drv_; }

template <impl::data_type_t data_type>
typename cpu_reducer_2d_t<data_type>::data_t *cpu_reducer_2d_t<data_type>::
get_local_ptr(int ithr, const memory_tracking::grantor_t &scratchpad) const {
    const int id_in_grp = balancer().id_in_group(ithr);
    const int grp_id = balancer().group_id(ithr);
    const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp;
    auto space = scratchpad.template get<data_t>(key_reducer_space);
    return space + offset_factor * space_per_thread(balancer());
}

template <impl::data_type_t data_type>
int cpu_reducer_2d_t<data_type>::choose_x_blocking(int nx, int ny,
        int nthr_per_grp) const {
    // find an x_blocking that better balances the reduction work across threads
    assert(conf_.x_block_ > 0 && nx > conf_.x_block_
            && nx % conf_.x_block_ == 0);
    int x_blocking = nx / conf_.x_block_;
    int min_x_blocking =
            utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny));
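    /* e.g. (illustrative): nx = 96, x_block_ = 8, ny = 1, nthr_per_grp = 2
     * gives x_blocking = 12 and min_x_blocking = 6; the loop below halves
     * once (12 -> 6), 6 resists further /2 and /3 cuts, and the returned
     * blocking is 6 * 8 = 48 elements, i.e. one span per thread */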
    while (true) {
        if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2)
            x_blocking /= 2;
        else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3)
            x_blocking /= 3;
        else
            break;
    }
    if (x_blocking >= min_x_blocking * 4) x_blocking = 1;
    x_blocking *= conf_.x_block_;
    return x_blocking;
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_block(const data_t* space_base,
        data_t *dst, int job, int start_y, int start_x,
        int ny_start, int nx_start, int ny_step, int nx_step) const {
    data_t *d = dst + (start_y + ny_start) * conf_.dst_x_
            + start_x + nx_start;
    const data_t *space = space_base + job * balancer().job_size_
            + ny_start * conf_.job_size_x_ + nx_start;
#ifdef SIMPLE_IMPL
    for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) {
        const data_t *w = &space[idg * space_per_thread(balancer())];
        for (int y = 0; y < ny_step; ++y)
            for (int x = 0; x < nx_step; ++x) {
                d[y * conf_.dst_x_ + x]
                        = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x])
                        + w[y * conf_.job_size_x_ + x];
            }
    }
#else
    (*drv_)(d, space, ny_step, nx_step);
#endif
}

template <impl::data_type_t data_type>
void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst,
        const memory_tracking::grantor_t &scratchpad) const {
    bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
    if (redundant_reduction) return;

    const int id_in_grp = balancer().id_in_group(ithr);
    const int njobs_in_grp = balancer().ithr_njobs(ithr);
    const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_);
    const int global_job_start = balancer().ithr_job_off(ithr);

    const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad);

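    /* re-partition the group's threads for the reduction itself: pr_grps
     * sub-groups take disjoint sets of jobs, and within one job the
     * pr_nthr_per_grp threads of a sub-group split the rows/x-blocks */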
    const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_);
    const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps;

    if (id_in_grp >= pr_grps * pr_nthr_per_grp)
        return; /* idle */

    const int pr_my_grp = id_in_grp / pr_nthr_per_grp;
    const int pr_my_id = id_in_grp % pr_nthr_per_grp;

    int pr_job_start{0}, pr_job_end{0};
    balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end);

    for (int j = pr_job_start; j < pr_job_end; ++j) {
        const int global_job = global_job_start + j;
        const int j_y = global_job / njobs_x;
        const int j_x = global_job % njobs_x;
        const int start_y = j_y * conf_.job_size_y_;
        const int start_x = j_x * conf_.job_size_x_;
        const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_);
        const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_);
        int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp);

        int nxy_start{0}, nxy_end{0};
        balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id,
                nxy_start, nxy_end);
        if (nxy_start == nxy_end) continue;
        nxy_start *= x_blocking;
        nxy_end *= x_blocking;

        int nxy = nxy_start;
        if (nxy % nx != 0) {
            int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy);
            reduce_block(space_base, dst, j, start_y, start_x,
                    nxy / nx, nxy % nx, 1, nx_step);
            nxy += nx_step;
        }
        if ((nxy_end - nxy) > nx) {
            int ny_step = (nxy_end - nxy) / nx;
            reduce_block(space_base, dst, j, start_y, start_x,
                    nxy / nx, nxy % nx, ny_step, nx);
            nxy += nx * ny_step;
        }
        if ((nxy_end - nxy) > 0) {
            reduce_block(space_base, dst, j, start_y, start_x,
                    nxy / nx, nxy % nx, 1, nxy_end - nxy);
        }
    }
}

template struct cpu_reducer_2d_t<data_type::f32>;
template struct cpu_reducer_2d_t<data_type::s32>;

/* accumulator section */

template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t(): drv_(nullptr) {
    drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false);
}

template <impl::data_type_t data_type>
cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() {
    delete drv_;
}

template <impl::data_type_t data_type>
void cpu_accumulator_1d_t<data_type>::accumulate(data_t *dst,
        const data_t *src, size_t size) {
    (*drv_)(dst, src, 1, size);
}

template struct cpu_accumulator_1d_t<data_type::f32>;
template struct cpu_accumulator_1d_t<data_type::s32>;

}
}
}

// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s