/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef CPU_REDUCER_HPP
#define CPU_REDUCER_HPP

#include <assert.h>

#include "c_types_map.hpp"
#include "memory_tracking.hpp"
#include "mkldnn_thread.hpp"
#include "mkldnn_types.h"
#include "nstl.hpp"
#include "type_helpers.hpp"

#include "cpu_barrier.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

/** class to perform balancing over a 3D array
 *
 * Conceptually the reduction happens according to the picture below:
 *
 *    <--job_size->
 *    +-----------+  +-----------+            +-----------+  ^
 *    |           |  |           |            |           |  |
 *    |           |  |           |            |           |  |
 *    |     1     |  |     2     |   . . .    |   njobs   |  | reduction_size
 *    |           |  |           |            |           |  |
 *    |           |  |           |            |           |  |
 *    +-----------+  +-----------+            +-----------+  v
 *
 *      |   |   |      |   |   |                |   |   |
 *      v   v   v      v   v   v                v   v   v
 *    =====================================================  vertical reduction
 *
 *    +-----------+  +-----------+   . . .    +-----------+  result
 *
 * In the simplest case the result must be contiguous in memory.
 * @class cpu_reducer_t is an implementation.
 *
 * Threads are divided into groups. The groups are independent of each other.
 * Each group may work on several jobs (the distribution is not necessarily
 * uniform, since njobs might not be a multiple of the number of groups).
 * Threads within a group work on different parts of the reduction dimension.
 * Thread 0 in each group is called master (@sa reduce_balancer_t::master()).
 *
 * If the threading driver does not allow synchronization between a sub-group
 * of threads (e.g. Intel(R) TBB), the number of threads per group is enforced
 * to be 1.
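 *
 * A minimal usage sketch (the concrete sizes below are illustrative only,
 * they are not taken from the library):
 * @code
 * // balance 17 jobs of 1024 elements each, with a reduction depth of 80,
 * // across 8 threads, allowing up to 1 MB of auxiliary reduction buffer
 * reduce_balancer_t balancer(8, 1024, 17, 80, 1 << 20);
 * for (int ithr = 0; ithr < balancer.nthr_; ++ithr) {
 *     if (balancer.idle(ithr)) continue; // thread got no work at all
 *     int grp = balancer.group_id(ithr);
 *     int job_off = balancer.ithr_job_off(ithr); // first job of the group
 *     int njobs = balancer.ithr_njobs(ithr);     // # of jobs in the group
 *     // threads of group grp share jobs [job_off, job_off + njobs) and
 *     // split the reduction dimension among themselves
 * }
 * @endcode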
 */
struct reduce_balancer_t {
    reduce_balancer_t() { init(1, 1, 1, 1, 0); } /* trivial balance */
    reduce_balancer_t(int nthr, int job_size, int njobs, int reduction_size,
            size_t max_buffer_size)
    { init(nthr, job_size, njobs, reduction_size, max_buffer_size); }

    reduce_balancer_t &init(int nthr, int job_size, int njobs,
            int reduction_size, size_t max_buffer_size)
    {
        syncable_ = mkldnn_thr_syncable();
        nthr_ = nthr;
        job_size_ = job_size;
        njobs_ = njobs;
        reduction_size_ = reduction_size;
        max_buffer_size_ = max_buffer_size;
        balance();
        return *this;
    }

    bool syncable_;
    int nthr_;
    int job_size_, njobs_, reduction_size_;

    int ngroups_; /** number of independent work (thread) groups */
    int nthr_per_group_; /** number of threads within a single work group */
    int njobs_per_group_ub_; /** the max # of jobs within a work group */

    bool master(int ithr) const { return id_in_group(ithr) == 0; }
    bool idle(int ithr) const { return ithr >= nthr_per_group_ * ngroups_; }

    int group_id(int ithr) const { return ithr / nthr_per_group_; }
    int id_in_group(int ithr) const { return ithr % nthr_per_group_; }

    int grp_njobs(int grp) const {
        if (grp >= ngroups_) return 0;
        return njobs_ / ngroups_ + (grp < njobs_ % ngroups_);
    }
    int grp_job_off(int grp) const {
        if (grp >= ngroups_) return njobs_;
        return njobs_ / ngroups_ * grp + nstl::min(grp, njobs_ % ngroups_);
    }

    int ithr_njobs(int ithr) const { return grp_njobs(group_id(ithr)); }
    int ithr_job_off(int ithr) const { return grp_job_off(group_id(ithr)); }

private:
    size_t max_buffer_size_;
    void balance();
};

/** forward declaration of reduce driver */
template <impl::data_type_t data_type> struct reducer_2d_driver_t;

/** class to perform a reduction over a 3D array
 *
 * Balancing is based on @class reduce_balancer_t.
 * Restriction: the result of the reduction must be contiguous in memory.
 * The reduction happens according to the picture below (once more):
 *
 *    <--job_size->
 *    +-----------+  +-----------+            +-----------+  ^
 *    |           |  |           |            |           |  |
 *    |           |  |           |            |           |  |
 *    |     1     |  |     2     |   . . .    |   njobs   |  | reduction_size
 *    |           |  |           |            |           |  |
 *    |           |  |           |            |           |  |
 *    +-----------+  +-----------+            +-----------+  v
 *
 *      |   |   |      |   |   |                |   |   |
 *      v   v   v      v   v   v                v   v   v
 *    =====================================================  vertical reduction
 *
 *    +-----------+  +-----------+   . . .    +-----------+  (contiguous) result
 *
 * An example of how the work might be shared is shown below.
 *
 * In this example group 0 owns 2 (independent) jobs -- the 2 big squares.
 * The number of threads per group is also 2 (thread 0 of group 0 and thread 1
 * of group 0). The master thread of each group (i.e. the thread with id 0 in
 * the corresponding group) puts its partial result directly into the
 * destination memory, while all the other threads within the group use the
 * workspace (in the picture below that is only thread 1). Once the
 * intermediate results are obtained, each group reduces its own part (its own
 * jobs) into the destination memory.
 *
 *    <------- group 0 ------->
 *
 *    +-----------+  +-----------+  ^
 *    |           |  |           |  |  thread 0 of  reduces to the dest-memory
 *    |           |  |           |  |  group 0      +-----------+  +-----------+
 *    |- - - - - -|  |- - - - - -|  X
 *    |           |  |           |  |  thread 1 of  reduces to workspace[tid=1]:
 *    |           |  |           |  |  group 0      +-----------+  +-----------+
 *    +-----------+  +-----------+  v
 *      |   |   |      |   |   |
 *      v   v   v      v   v   v
 *    ((barrier))      ============================
 *
 *    dest-memory:     +-----------+  +-----------+
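 *
 * A hedged usage sketch: the scratchpad grantor, the destination pointer and
 * the parallel driver are assumed to be set up by the caller, and the sizes
 * are illustrative only.
 * @code
 * cpu_reducer_t<data_type::f32>::conf_t conf;
 * conf.init(reduce_balancer_t(nthr, job_size, njobs, reduction_size,
 *             max_buffer_size));
 * // at primitive creation time: conf.init_scratchpad(registrar);
 *
 * cpu_reducer_t<data_type::f32> reducer(conf);
 * reducer.init(scratchpad); // single thread, before the parallel section
 *
 * parallel(0, [&](int ithr, int nthr) {
 *     if (reducer.balancer().idle(ithr)) return;
 *     float *wspace = reducer.get_local_ptr(ithr, dst, scratchpad);
 *     // ... write the partial results of this thread's jobs to wspace ...
 *     reducer.reduce(ithr, dst, scratchpad); // sync group and reduce to dst
 * });
 * @endcode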
 */
template <impl::data_type_t data_type>
struct cpu_reducer_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer)
        { balancer_ = balancer; return *this; }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
    };

    cpu_reducer_t(const conf_t &conf);
    ~cpu_reducer_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for a given thread returns the pointer where its partial results
     * should be put. The reduction destination @p dst must be provided as
     * well (the master thread of each group writes its partial result there
     * directly, to reduce memory pressure).
     *
     * @note: the job offset is already applied by get_local_ptr(), which
     *        means every thread should start writing from the very beginning
     *        of the returned address.
     */
    data_t *get_local_ptr(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(&bctx[balancer().group_id(ithr)],
                balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer)
    { return balancer.njobs_per_group_ub_ * balancer.job_size_; }

    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;
};

template <impl::data_type_t data_type>
struct cpu_reducer_2d_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer, int job_size_x,
                int job_size_y, int x_block, int dst_x, int dst_y) {
            balancer_ = balancer;
            job_size_x_ = job_size_x;
            job_size_y_ = job_size_y;
            x_block_ = x_block;
            dst_x_ = dst_x;
            dst_y_ = dst_y;
            return *this;
        }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
        int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_;
    };

    cpu_reducer_2d_t(const conf_t &conf);
    ~cpu_reducer_2d_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for a given thread returns the pointer where its partial results
     * should be put */
    data_t *get_local_ptr(int ithr,
            const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(&bctx[balancer().group_id(ithr)],
                balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer)
    { return balancer.njobs_per_group_ub_ * balancer.job_size_; }

    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    int choose_x_blocking(int nx, int ny, int nthr_per_grp) const;
    void reduce_block(const data_t* space_base, data_t *dst,
            int job, int start_y, int start_x,
            int ny_start, int nx_start, int ny_step, int nx_step) const;
    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;
};

/** simple 1d accumulator: y[:] += x[:] */
template <impl::data_type_t data_type>
struct cpu_accumulator_1d_t {
    typedef typename prec_traits<data_type>::type data_t;

    cpu_accumulator_1d_t();
    ~cpu_accumulator_1d_t();
    void accumulate(data_t *dst, const data_t *src, size_t size);

    reducer_2d_driver_t<data_type> *drv_;
};
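
/* A minimal usage sketch of the accumulator above (the buffers y, x and the
 * size are illustrative, not taken from the library):
 *
 *     cpu_accumulator_1d_t<data_type::f32> acc;
 *     acc.accumulate(y, x, size); // y[i] += x[i] for i in [0, size)
 */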

}
}
}

#endif

// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s