/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef CPU_REDUCER_HPP
#define CPU_REDUCER_HPP

#include <assert.h>

#include "c_types_map.hpp"
#include "memory_tracking.hpp"
#include "mkldnn_thread.hpp"
#include "mkldnn_types.h"
#include "nstl.hpp"
#include "type_helpers.hpp"

#include "cpu_barrier.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

/** Class to perform balancing over a 3D array.
 *
 * Conceptually the reduction happens according to the picture below:
 *
 * <--job_size->
 * +-----------+ +-----------+ +-----------+ ^
 * | | | | | | |
 * | | | | | | |
 * | 1 | | 2 | . . . | njobs | | reduction_size
 * | | | | | | |
 * | | | | | | |
 * +-----------+ +-----------+ +-----------+ v
 *
 * | | | | | | | | |
 * v v v v v v v v v
 * ===================================================== vertical reduction
 *
 * +-----------+ +-----------+ . . . +-----------+ result
 *
 * In the simplest case the result must be contiguous in memory;
 * @class cpu_reducer_t is an implementation.
 *
 * Threads are divided into groups. The groups are independent of each other.
 * Each group may work on several jobs (the distribution is not uniform, since
 * njobs might not be a multiple of the number of groups). Threads within a
 * group work on different parts of the reduction dimension. Thread 0 in each
 * group is called the master (@sa reduce_balancer_t::master()).
 *
 * If the threading driver does not allow synchronization within a sub-group
 * of threads (e.g. Intel(R) TBB), the number of threads per group is forced
 * to be 1.
 */
struct reduce_balancer_t {
    reduce_balancer_t() { init(1, 1, 1, 1, 0); } /* trivial balance */
    reduce_balancer_t(int nthr, int job_size, int njobs, int reduction_size,
            size_t max_buffer_size)
    { init(nthr, job_size, njobs, reduction_size, max_buffer_size); }

    reduce_balancer_t &init(int nthr, int job_size, int njobs,
            int reduction_size, size_t max_buffer_size)
    {
        syncable_ = mkldnn_thr_syncable();
        nthr_ = nthr;
        job_size_ = job_size;
        njobs_ = njobs;
        reduction_size_ = reduction_size;
        max_buffer_size_ = max_buffer_size;
        balance();
        return *this;
    }

    bool syncable_;
    int nthr_;
    int job_size_, njobs_, reduction_size_;

    int ngroups_; /**< number of independent work (thread) groups */
    int nthr_per_group_; /**< number of threads within a single work group */
    int njobs_per_group_ub_; /**< the max # of jobs within a work group */

    bool master(int ithr) const { return id_in_group(ithr) == 0; }
    bool idle(int ithr) const { return ithr >= nthr_per_group_ * ngroups_; }

    int group_id(int ithr) const { return ithr / nthr_per_group_; }
    int id_in_group(int ithr) const { return ithr % nthr_per_group_; }

    int grp_njobs(int grp) const {
        if (grp >= ngroups_) return 0;
        return njobs_ / ngroups_ + (grp < njobs_ % ngroups_);
    }
    int grp_job_off(int grp) const {
        if (grp >= ngroups_) return njobs_;
        return njobs_ / ngroups_ * grp + nstl::min(grp, njobs_ % ngroups_);
    }
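
    /* A worked example of the distribution above (values are illustrative):
     * with njobs_ = 10 and ngroups_ = 4, the first njobs_ % ngroups_ = 2
     * groups take one extra job, giving per-group job counts {3, 3, 2, 2}
     * and job offsets {0, 3, 6, 8}. */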

    int ithr_njobs(int ithr) const { return grp_njobs(group_id(ithr)); }
    int ithr_job_off(int ithr) const { return grp_job_off(group_id(ithr)); }

private:
    size_t max_buffer_size_;
    void balance();
};
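
/* A minimal usage sketch (illustration only; `ithr` is a hypothetical thread
 * id): balance 16 jobs of size 256 with a reduction size of 128 across
 * 8 threads, capping the auxiliary buffer at 1 MB.
 *
 *     reduce_balancer_t balancer(8, 256, 16, 128, 1 << 20);
 *     if (!balancer.idle(ithr)) {
 *         int njobs = balancer.ithr_njobs(ithr);     // jobs of ithr's group
 *         int job_off = balancer.ithr_job_off(ithr); // first job index
 *         bool to_dst = balancer.master(ithr);       // writes to dst?
 *     }
 */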

/** Forward declaration of the reduce driver */
template <impl::data_type_t data_type> struct reducer_2d_driver_t;

/** Class to perform a reduction over a 3D array.
 *
 * Balancing is based on @class reduce_balancer_t.
 * Restriction: the result of the reduction must be contiguous in memory.
 * The reduction happens according to the picture below (once more):
 *
 * <--job_size->
 * +-----------+ +-----------+ +-----------+ ^
 * | | | | | | |
 * | | | | | | |
 * | 1 | | 2 | . . . | njobs | | reduction_size
 * | | | | | | |
 * | | | | | | |
 * +-----------+ +-----------+ +-----------+ v
 *
 * | | | | | | | | |
 * v v v v v v v v v
 * ===================================================== vertical reduction
 *
 * +-----------+ +-----------+ . . . +-----------+ (contiguous) result
 *
 * An example of how the work might be shared is shown below.
 *
 * In this example group 0 owns 2 (independent) jobs -- the 2 big squares.
 * The number of threads per group is also 2 (thread 0 of group 0 and thread 1
 * of group 0). Master threads (i.e. threads with id 0 in the corresponding
 * group) from each group put their partial results directly into the
 * destination memory, while all the other threads within the group use the
 * workspace (in the picture, only thread 1). Once the intermediate results
 * are obtained, each group reduces its part (its own jobs) to the
 * destination memory.
 *
 * <------- group 0 ------->
 *
 * +-----------+ +-----------+ ^
 * | | | | | thread 0 of reduces to the dest-memory
 * | | | | | group 0 +-----------+ +-----------+
 * |- - - - - -| |- - - - - -| X
 * | | | | | thread 1 of reduces to workspace[tid=1]:
 * | | | | | group 0 +-----------+ +-----------+
 * +-----------+ +-----------+ v
 * | | | | | |
 * v v v v v v
 * ((barrier)) =============================
 *
 * dest-memory: +-----------+ +-----------+
 */
template <impl::data_type_t data_type>
struct cpu_reducer_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer)
        { balancer_ = balancer; return *this; }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
    };

    cpu_reducer_t(const conf_t &conf);
    ~cpu_reducer_t();

    /** Initializes the reducer.
     * Must be called from a single thread prior to actual usage. */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** For a given thread, returns the pointer where the partial results
     * should be stored. The reduction destination @p dst must be provided
     * as well (master threads from each group use it directly for their
     * partial results, to reduce memory pressure).
     *
     * @note The job offset is already applied by get_local_ptr(), which
     *       means all threads should start writing from the very beginning
     *       of the returned address.
     */
    data_t *get_local_ptr(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    /** Performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(&bctx[balancer().group_id(ithr)],
                balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer)
    { return balancer.njobs_per_group_ub_ * balancer.job_size_; }

    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;
};
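
/* A minimal usage sketch (illustration only; `scratchpad`, `dst`, and
 * compute_partial() are hypothetical; the threading call uses the parallel()
 * helper from mkldnn_thread.hpp):
 *
 *     cpu_reducer_t<data_type::f32> reducer(conf);
 *     reducer.init(scratchpad); // single thread, before the parallel region
 *     parallel(0, [&](int ithr, int nthr) {
 *         float *ws = reducer.get_local_ptr(ithr, dst, scratchpad);
 *         compute_partial(ithr, ws);             // write partial results
 *         reducer.reduce(ithr, dst, scratchpad); // barrier + final reduction
 *     });
 */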

template <impl::data_type_t data_type>
struct cpu_reducer_2d_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer, int job_size_x,
                int job_size_y, int x_block, int dst_x, int dst_y) {
            balancer_ = balancer;
            job_size_x_ = job_size_x;
            job_size_y_ = job_size_y;
            x_block_ = x_block;
            dst_x_ = dst_x;
            dst_y_ = dst_y;
            return *this;
        }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
        int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_;
    };

    cpu_reducer_2d_t(const conf_t &conf);
    ~cpu_reducer_2d_t();

    /** Initializes the reducer.
     * Must be called from a single thread prior to actual usage. */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** For a given thread, returns the pointer where the partial results
     * should be stored. */
    data_t *get_local_ptr(int ithr,
            const memory_tracking::grantor_t &scratchpad) const;

    /** Performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(&bctx[balancer().group_id(ithr)],
                balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer)
    { return balancer.njobs_per_group_ub_ * balancer.job_size_; }

    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    int choose_x_blocking(int nx, int ny, int nthr_per_grp) const;
    void reduce_block(const data_t *space_base, data_t *dst,
            int job, int start_y, int start_x,
            int ny_start, int nx_start, int ny_step, int nx_step) const;
    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;
};
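
/* A minimal configuration sketch (illustration only; the sizes and the
 * `scratchpad_registrar` name are hypothetical): jobs are 2D tiles of
 * 32x16 elements, blocked by 8 along x, reduced into a 64x48 destination.
 *
 *     cpu_reducer_2d_t<data_type::f32>::conf_t conf;
 *     conf.init(balancer, 32, 16, 8, 64, 48); // x, y, x_block, dst_x, dst_y
 *     conf.init_scratchpad(scratchpad_registrar);
 *     cpu_reducer_2d_t<data_type::f32> reducer(conf);
 */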

/** Simple 1D accumulator: y[:] += x[:] */
template <impl::data_type_t data_type>
struct cpu_accumulator_1d_t {
    typedef typename prec_traits<data_type>::type data_t;

    cpu_accumulator_1d_t();
    ~cpu_accumulator_1d_t();
    void accumulate(data_t *dst, const data_t *src, size_t size);

    reducer_2d_driver_t<data_type> *drv_;
};
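
/* A minimal usage sketch (illustration only; `dst` and `src` are
 * hypothetical buffers holding at least `len` valid elements):
 *
 *     cpu_accumulator_1d_t<data_type::f32> acc;
 *     acc.accumulate(dst, src, len); // dst[0:len] += src[0:len]
 */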

}
}
}

#endif

// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s