1 | /******************************************************************************* |
2 | * Copyright 2017-2018 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef CPU_REDUCER_HPP |
18 | #define CPU_REDUCER_HPP |
19 | |
20 | #include <assert.h> |
21 | |
22 | #include "c_types_map.hpp" |
23 | #include "memory_tracking.hpp" |
24 | #include "mkldnn_thread.hpp" |
25 | #include "mkldnn_types.h" |
26 | #include "nstl.hpp" |
27 | #include "type_helpers.hpp" |
28 | |
29 | #include "cpu_barrier.hpp" |
30 | |
31 | namespace mkldnn { |
32 | namespace impl { |
33 | namespace cpu { |
34 | |
/** class to perform balancing over a 3D array
36 | * |
37 | * Conceptually the reduction happens according to the picture below: |
38 | * |
39 | * <--job_size-> |
40 | * +-----------+ +-----------+ +-----------+ ^ |
41 | * | | | | | | | |
42 | * | | | | | | | |
43 | * | 1 | | 2 | . . . | njobs | | reduction_size |
44 | * | | | | | | | |
45 | * | | | | | | | |
46 | * +-----------+ +-----------+ +-----------+ v |
47 | * |
48 | * | | | | | | | | | |
49 | * v v v v v v v v v |
50 | * ===================================================== vertical reduction |
51 | * |
52 | * +-----------+ +-----------+ . . . +-----------+ result |
53 | * |
 * In the simple case the result must be contiguous in memory;
 * @class cpu_reducer_t implements this case.
56 | * |
 * Threads are divided into groups. The groups are independent of each
 * other. Each group may work on several jobs (the distribution is not
 * uniform, since njobs might not be a multiple of the number of groups).
 * Threads within a group work on different parts of the reduction
 * dimension. Thread 0 in each group is called the master
 * (@sa reduce_balancer_t::master()).
 *
 * If the threading driver does not allow synchronization within a
 * sub-group of threads (e.g. Intel(R) TBB), the number of threads per
 * group is forced to be 1.
65 | */ |
66 | struct reduce_balancer_t { |
67 | reduce_balancer_t() { init(1, 1, 1, 1, 0); } /* trivial balance */ |
68 | reduce_balancer_t(int nthr, int job_size, int njobs, int reduction_size, |
69 | size_t max_buffer_size) |
70 | { init(nthr, job_size, njobs, reduction_size, max_buffer_size); } |
71 | |
72 | reduce_balancer_t &init(int nthr, int job_size, int njobs, |
73 | int reduction_size, size_t max_buffer_size) |
74 | { |
75 | syncable_ = mkldnn_thr_syncable(); |
76 | nthr_ = nthr; |
77 | job_size_ = job_size; |
78 | njobs_ = njobs; |
79 | reduction_size_ = reduction_size; |
80 | max_buffer_size_ = max_buffer_size; |
81 | balance(); |
82 | return *this; |
83 | } |
84 | |
85 | bool syncable_; |
86 | int nthr_; |
87 | int job_size_, njobs_, reduction_size_; |
88 | |
    int ngroups_; /**< number of independent work (thread) groups */
    int nthr_per_group_; /**< number of threads within a single work group */
    int njobs_per_group_ub_; /**< the max # of jobs within a work group */
92 | |
93 | bool master(int ithr) const { return id_in_group(ithr) == 0; } |
94 | bool idle(int ithr) const { return ithr >= nthr_per_group_ * ngroups_; } |
95 | |
96 | int group_id(int ithr) const { return ithr / nthr_per_group_; } |
97 | int id_in_group(int ithr) const { return ithr % nthr_per_group_; } |
98 | |
99 | int grp_njobs(int grp) const { |
100 | if (grp >= ngroups_) return 0; |
101 | return njobs_ / ngroups_ + (grp < njobs_ % ngroups_); |
102 | } |
103 | int grp_job_off(int grp) const { |
104 | if (grp >= ngroups_) return njobs_; |
105 | return njobs_ / ngroups_ * grp + nstl::min(grp, njobs_ % ngroups_); |
106 | } |
107 | |
108 | int ithr_njobs(int ithr) const { return grp_njobs(group_id(ithr)); } |
109 | int ithr_job_off(int ithr) const { return grp_job_off(group_id(ithr)); } |
110 | |
111 | private: |
112 | size_t max_buffer_size_; |
113 | void balance(); |
114 | }; |
115 | |
116 | /** forward declaration of reduce driver */ |
117 | template <impl::data_type_t data_type> struct reducer_2d_driver_t; |
118 | |
/** class to perform a reduction over a 3D array
 *
 * Balancing is based on @class reduce_balancer_t.
 * Restriction: the result of the reduction must be contiguous in memory.
 * The reduction happens according to the picture below (once more):
124 | * |
125 | * <--job_size-> |
126 | * +-----------+ +-----------+ +-----------+ ^ |
127 | * | | | | | | | |
128 | * | | | | | | | |
129 | * | 1 | | 2 | . . . | njobs | | reduction_size |
130 | * | | | | | | | |
131 | * | | | | | | | |
132 | * +-----------+ +-----------+ +-----------+ v |
133 | * |
134 | * | | | | | | | | | |
135 | * v v v v v v v v v |
136 | * ===================================================== vertical reduction |
137 | * |
138 | * +-----------+ +-----------+ . . . +-----------+ (contiguous) result |
139 | * |
 * An example of how the work might be shared is shown below.
141 | * |
 * In this example group 0 owns 2 (independent) jobs -- the 2 big squares.
 * The number of threads per group is also 2 (thread 0 and thread 1 of
 * group 0). Master threads (i.e. threads with id 0 in the corresponding
 * group) from each group put their partial results directly into the
 * destination memory, while all the other threads within the group use
 * the workspace (in the picture, only thread 1). Once the intermediate
 * results are obtained, each group reduces its part (its own jobs) into
 * the destination memory.
149 | * |
150 | * <------- group 0 -------> |
151 | * |
152 | * +-----------+ +-----------+ ^ |
153 | * | | | | | thread 0 of reduces to the dest-memory |
154 | * | | | | | group 0 +-----------+ +-----------+ |
155 | * |- - - - - -| |- - - - - -| X |
156 | * | | | | | thread 1 of reduces to workspace[tid=1]: |
157 | * | | | | | group 0 +-----------+ +-----------+ |
158 | * +-----------+ +-----------+ v |
159 | * | | | | | | |
160 | * v v v v v v |
161 | * ((barrier)) ============================= |
162 | * |
163 | * dest-memory: +-----------+ +-----------+ |
164 | */ |
165 | template <impl::data_type_t data_type> |
166 | struct cpu_reducer_t { |
167 | typedef typename prec_traits<data_type>::type data_t; |
168 | |
169 | struct conf_t { |
170 | conf_t() = default; |
171 | conf_t &init(const reduce_balancer_t &balancer) |
172 | { balancer_ = balancer; return *this; } |
173 | |
174 | void init_scratchpad(memory_tracking::registrar_t &scratchpad) const; |
175 | |
176 | reduce_balancer_t balancer_; |
177 | }; |
178 | |
179 | cpu_reducer_t(const conf_t &conf); |
180 | ~cpu_reducer_t(); |
181 | |
    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage */
184 | void init(const memory_tracking::grantor_t &scratchpad) const { |
185 | if (balancer().nthr_per_group_ == 1) return; |
186 | |
187 | auto bctx = scratchpad.template get<simple_barrier::ctx_t>( |
188 | memory_tracking::names::key_reducer_space_bctx); |
189 | for (int i = 0; i < balancer().ngroups_; ++i) |
190 | simple_barrier::ctx_init(&bctx[i]); |
191 | } |
192 | |
    /** for a given thread returns the pointer where the partial results
     * should be stored. The reduction destination @p dst must be provided
     * as well (master threads from each group use it directly for their
     * partial results, to reduce memory pressure).
     *
     * @note the job offset is already applied by get_local_ptr(), which
     *      means all threads should start writing from the very beginning
     *      of the returned address.
201 | */ |
202 | data_t *get_local_ptr(int ithr, data_t *dst, |
203 | const memory_tracking::grantor_t &scratchpad) const; |
204 | |
205 | /** performs the reduction with built-in synchronization. */ |
206 | void reduce(int ithr, data_t *dst, |
207 | const memory_tracking::grantor_t &scratchpad) const { |
208 | bool redundant_reduction = balancer().nthr_per_group_ == 1 |
209 | || balancer().idle(ithr); |
210 | if (redundant_reduction) return; |
211 | |
212 | auto bctx = scratchpad.template get<simple_barrier::ctx_t>( |
213 | memory_tracking::names::key_reducer_space_bctx); |
214 | simple_barrier::barrier(&bctx[balancer().group_id(ithr)], |
215 | balancer().nthr_per_group_); |
216 | |
217 | reduce_nolock(ithr, dst, scratchpad); |
218 | } |
219 | |
220 | const reduce_balancer_t &balancer() const { return conf_.balancer_; } |
221 | |
222 | private: |
223 | static size_t space_per_thread(const reduce_balancer_t &balancer) |
224 | { return balancer.njobs_per_group_ub_ * balancer.job_size_; } |
225 | |
    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */
230 | |
231 | const conf_t conf_; |
232 | reducer_2d_driver_t<data_type> *drv_; |
233 | |
234 | void reduce_nolock(int ithr, data_t *dst, |
235 | const memory_tracking::grantor_t &scratchpad) const; |
236 | }; |
237 | |
238 | template <impl::data_type_t data_type> |
239 | struct cpu_reducer_2d_t { |
240 | typedef typename prec_traits<data_type>::type data_t; |
241 | |
242 | struct conf_t { |
243 | conf_t() = default; |
244 | conf_t &init(const reduce_balancer_t &balancer, int job_size_x, |
245 | int job_size_y, int x_block, int dst_x, int dst_y) { |
246 | balancer_ = balancer; |
247 | job_size_x_ = job_size_x; |
248 | job_size_y_ = job_size_y; |
249 | x_block_ = x_block; |
250 | dst_x_ = dst_x; |
251 | dst_y_ = dst_y; |
252 | return *this; |
253 | } |
254 | |
255 | void init_scratchpad(memory_tracking::registrar_t &scratchpad) const; |
256 | |
257 | reduce_balancer_t balancer_; |
258 | int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_; |
259 | }; |
260 | |
261 | cpu_reducer_2d_t(const conf_t &conf); |
262 | ~cpu_reducer_2d_t(); |
263 | |
    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage */
266 | void init(const memory_tracking::grantor_t &scratchpad) const { |
267 | if (balancer().nthr_per_group_ == 1) return; |
268 | |
269 | auto bctx = scratchpad.template get<simple_barrier::ctx_t>( |
270 | memory_tracking::names::key_reducer_space_bctx); |
271 | for (int i = 0; i < balancer().ngroups_; ++i) |
272 | simple_barrier::ctx_init(&bctx[i]); |
273 | } |
274 | |
    /** for a given thread returns the pointer where the partial results
     * should be stored */
276 | data_t *get_local_ptr(int ithr, |
277 | const memory_tracking::grantor_t &scratchpad) const; |
278 | |
279 | /** performs the reduction with built-in synchronization. */ |
280 | void reduce(int ithr, data_t *dst, |
281 | const memory_tracking::grantor_t &scratchpad) const { |
282 | bool redundant_reduction = balancer().nthr_per_group_ == 1 |
283 | || balancer().idle(ithr); |
284 | if (redundant_reduction) return; |
285 | |
286 | auto bctx = scratchpad.template get<simple_barrier::ctx_t>( |
287 | memory_tracking::names::key_reducer_space_bctx); |
288 | simple_barrier::barrier(&bctx[balancer().group_id(ithr)], |
289 | balancer().nthr_per_group_); |
290 | |
291 | reduce_nolock(ithr, dst, scratchpad); |
292 | } |
293 | |
294 | const reduce_balancer_t &balancer() const { return conf_.balancer_; } |
295 | |
296 | private: |
297 | static size_t space_per_thread(const reduce_balancer_t &balancer) |
298 | { return balancer.njobs_per_group_ub_ * balancer.job_size_; } |
299 | |
    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */
304 | |
305 | const conf_t conf_; |
306 | reducer_2d_driver_t<data_type> *drv_; |
307 | |
308 | int choose_x_blocking(int nx, int ny, int nthr_per_grp) const; |
309 | void reduce_block(const data_t* space_base, data_t *dst, |
310 | int job, int start_y, int start_x, |
311 | int ny_start, int nx_start, int ny_step, int nx_step) const; |
312 | void reduce_nolock(int ithr, data_t *dst, |
313 | const memory_tracking::grantor_t &scratchpad) const; |
314 | }; |
315 | |
316 | /** simple 1d accumulator: y[:] += x[:] */ |
317 | template <impl::data_type_t data_type> |
318 | struct cpu_accumulator_1d_t { |
319 | typedef typename prec_traits<data_type>::type data_t; |
320 | |
321 | cpu_accumulator_1d_t(); |
322 | ~cpu_accumulator_1d_t(); |
323 | void accumulate(data_t *dst, const data_t *src, size_t size); |
324 | |
325 | reducer_2d_driver_t<data_type> *drv_; |
326 | }; |
327 | |
328 | } |
329 | } |
330 | } |
331 | |
332 | #endif |
333 | |
334 | // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s |
335 | |