/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef CPU_REDUCER_HPP
#define CPU_REDUCER_HPP

#include <assert.h>

#include "c_types_map.hpp"
#include "memory_tracking.hpp"
#include "mkldnn_thread.hpp"
#include "mkldnn_types.h"
#include "nstl.hpp"
#include "type_helpers.hpp"

#include "cpu_barrier.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

/** class to perform balancing over a 3D array
 *
 * Conceptually the reduction happens according to the picture below:
 *
 *    <--job_size->
 *    +-----------+  +-----------+            +-----------+  ^
 *    |           |  |           |            |           |  |
 *    |           |  |           |            |           |  |
 *    |     1     |  |     2     |   . . .    |   njobs   |  | reduction_size
 *    |           |  |           |            |           |  |
 *    |           |  |           |            |           |  |
 *    +-----------+  +-----------+            +-----------+  v
 *
 *      |   |   |      |   |   |                |   |   |
 *      v   v   v      v   v   v                v   v   v
 *    =====================================================  vertical reduction
 *
 *    +-----------+  +-----------+   . . .    +-----------+  result
 *
 * In the simplest case the result must be contiguous in memory.
 * @class cpu_reducer_t is an implementation.
 *
 * Threads are divided into groups. The groups are independent of each other.
 * Each group may work on several jobs (the distribution is not necessarily
 * uniform, since njobs might not be a multiple of the number of groups).
 * Threads within a group work on different parts of the reduction dimension.
 * Thread 0 in each group is called master (@sa reduce_balancer_t::master()).
 *
 * If the threading driver does not allow synchronization between a sub-group
 * of threads (e.g. Intel(R) TBB), the number of threads per group is enforced
 * to be 1.
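 *
 * A minimal usage sketch (the concrete sizes below are illustrative only,
 * they are not taken from the library):
 * @code
 * // balance 17 jobs of 1024 elements each, with a reduction depth of 80,
 * // across 8 threads, allowing up to 1 MB of auxiliary reduction buffer
 * reduce_balancer_t balancer(8, 1024, 17, 80, 1 << 20);
 * for (int ithr = 0; ithr < balancer.nthr_; ++ithr) {
 *     if (balancer.idle(ithr)) continue; // thread got no work at all
 *     int grp = balancer.group_id(ithr);
 *     int job_off = balancer.ithr_job_off(ithr); // first job of the group
 *     int njobs = balancer.ithr_njobs(ithr);     // # of jobs in the group
 *     // threads of group grp share jobs [job_off, job_off + njobs) and
 *     // split the reduction dimension among themselves
 * }
 * @endcode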
 */
struct reduce_balancer_t {
    reduce_balancer_t() { init(1, 1, 1, 1, 0); } /* trivial balance */
    reduce_balancer_t(int nthr, int job_size, int njobs, int reduction_size,
            size_t max_buffer_size)
    { init(nthr, job_size, njobs, reduction_size, max_buffer_size); }

    reduce_balancer_t &init(int nthr, int job_size, int njobs,
            int reduction_size, size_t max_buffer_size)
    {
        syncable_ = mkldnn_thr_syncable();
        nthr_ = nthr;
        job_size_ = job_size;
        njobs_ = njobs;
        reduction_size_ = reduction_size;
        max_buffer_size_ = max_buffer_size;
        balance();
        return *this;
    }

    bool syncable_;
    int nthr_;
    int job_size_, njobs_, reduction_size_;

    int ngroups_; /** number of independent work (thread) groups */
    int nthr_per_group_; /** number of threads within a single work group */
    int njobs_per_group_ub_; /** the max # of jobs within a work group */

    bool master(int ithr) const { return id_in_group(ithr) == 0; }
    bool idle(int ithr) const { return ithr >= nthr_per_group_ * ngroups_; }

    int group_id(int ithr) const { return ithr / nthr_per_group_; }
    int id_in_group(int ithr) const { return ithr % nthr_per_group_; }

    int grp_njobs(int grp) const {
        if (grp >= ngroups_) return 0;
        return njobs_ / ngroups_ + (grp < njobs_ % ngroups_);
    }
    int grp_job_off(int grp) const {
        if (grp >= ngroups_) return njobs_;
        return njobs_ / ngroups_ * grp + nstl::min(grp, njobs_ % ngroups_);
    }

    int ithr_njobs(int ithr) const { return grp_njobs(group_id(ithr)); }
    int ithr_job_off(int ithr) const { return grp_job_off(group_id(ithr)); }

private:
    size_t max_buffer_size_;
    void balance();
};

/** forward declaration of reduce driver */
template <impl::data_type_t data_type> struct reducer_2d_driver_t;

/** class to perform a reduction over a 3D array
 *
 * Balancing is based on @class reduce_balancer_t.
 * Restriction: the result of the reduction must be contiguous in memory.
 * The reduction happens according to the picture below (once more):
 *
 *    <--job_size->
 *    +-----------+  +-----------+            +-----------+  ^
 *    |           |  |           |            |           |  |
 *    |           |  |           |            |           |  |
 *    |     1     |  |     2     |   . . .    |   njobs   |  | reduction_size
 *    |           |  |           |            |           |  |
 *    |           |  |           |            |           |  |
 *    +-----------+  +-----------+            +-----------+  v
 *
 *      |   |   |      |   |   |                |   |   |
 *      v   v   v      v   v   v                v   v   v
 *    =====================================================  vertical reduction
 *
 *    +-----------+  +-----------+   . . .    +-----------+  (contiguous) result
 *
 * An example of how the work might be shared is shown below.
 *
 * In this example group 0 owns 2 (independent) jobs -- the 2 big squares.
 * The number of threads per group is also 2 (thread 0 of group 0 and thread 1
 * of group 0). The master thread of each group (i.e. the thread with id 0 in
 * the corresponding group) puts its partial result directly into the
 * destination memory, while all the other threads within the group use the
 * workspace (in the picture below that is only thread 1). Once the
 * intermediate results are obtained, each group reduces its own part (its own
 * jobs) into the destination memory.
 *
 *    <------- group 0 ------->
 *
 *    +-----------+  +-----------+  ^
 *    |           |  |           |  |  thread 0 of  reduces to the dest-memory
 *    |           |  |           |  |  group 0      +-----------+  +-----------+
 *    |- - - - - -|  |- - - - - -|  X
 *    |           |  |           |  |  thread 1 of  reduces to workspace[tid=1]:
 *    |           |  |           |  |  group 0      +-----------+  +-----------+
 *    +-----------+  +-----------+  v
 *      |   |   |      |   |   |
 *      v   v   v      v   v   v
 *    ((barrier))      ============================
 *
 *    dest-memory:     +-----------+  +-----------+
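 *
 * A hedged usage sketch: the scratchpad grantor, the destination pointer and
 * the parallel driver are assumed to be set up by the caller, and the sizes
 * are illustrative only.
 * @code
 * cpu_reducer_t<data_type::f32>::conf_t conf;
 * conf.init(reduce_balancer_t(nthr, job_size, njobs, reduction_size,
 *             max_buffer_size));
 * // at primitive creation time: conf.init_scratchpad(registrar);
 *
 * cpu_reducer_t<data_type::f32> reducer(conf);
 * reducer.init(scratchpad); // single thread, before the parallel section
 *
 * parallel(0, [&](int ithr, int nthr) {
 *     if (reducer.balancer().idle(ithr)) return;
 *     float *wspace = reducer.get_local_ptr(ithr, dst, scratchpad);
 *     // ... write the partial results of this thread's jobs to wspace ...
 *     reducer.reduce(ithr, dst, scratchpad); // sync group and reduce to dst
 * });
 * @endcode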
 */
template <impl::data_type_t data_type>
struct cpu_reducer_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer)
        { balancer_ = balancer; return *this; }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
    };

    cpu_reducer_t(const conf_t &conf);
    ~cpu_reducer_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for a given thread returns the pointer where its partial results
     * should be put. The reduction destination @p dst must be provided as
     * well (the master thread of each group writes its partial result there
     * directly, to reduce memory pressure).
     *
     * @note: the job offset is already applied by get_local_ptr(), which
     *        means every thread should start writing from the very beginning
     *        of the returned address.
     */
    data_t *get_local_ptr(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(&bctx[balancer().group_id(ithr)],
                balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer)
    { return balancer.njobs_per_group_ub_ * balancer.job_size_; }

    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;
};

template <impl::data_type_t data_type>
struct cpu_reducer_2d_t {
    typedef typename prec_traits<data_type>::type data_t;

    struct conf_t {
        conf_t() = default;
        conf_t &init(const reduce_balancer_t &balancer, int job_size_x,
                int job_size_y, int x_block, int dst_x, int dst_y) {
            balancer_ = balancer;
            job_size_x_ = job_size_x;
            job_size_y_ = job_size_y;
            x_block_ = x_block;
            dst_x_ = dst_x;
            dst_y_ = dst_y;
            return *this;
        }

        void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;

        reduce_balancer_t balancer_;
        int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_;
    };

    cpu_reducer_2d_t(const conf_t &conf);
    ~cpu_reducer_2d_t();

    /** initializes the reducer.
     * Must be called from a single thread prior to actual usage */
    void init(const memory_tracking::grantor_t &scratchpad) const {
        if (balancer().nthr_per_group_ == 1) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        for (int i = 0; i < balancer().ngroups_; ++i)
            simple_barrier::ctx_init(&bctx[i]);
    }

    /** for a given thread returns the pointer where its partial results
     * should be put */
    data_t *get_local_ptr(int ithr,
            const memory_tracking::grantor_t &scratchpad) const;

    /** performs the reduction with built-in synchronization. */
    void reduce(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const {
        bool redundant_reduction = balancer().nthr_per_group_ == 1
            || balancer().idle(ithr);
        if (redundant_reduction) return;

        auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
                memory_tracking::names::key_reducer_space_bctx);
        simple_barrier::barrier(&bctx[balancer().group_id(ithr)],
                balancer().nthr_per_group_);

        reduce_nolock(ithr, dst, scratchpad);
    }

    const reduce_balancer_t &balancer() const { return conf_.balancer_; }

private:
    static size_t space_per_thread(const reduce_balancer_t &balancer)
    { return balancer.njobs_per_group_ub_ * balancer.job_size_; }

    /* The scratchpad is organized as follows:
     *
     * data_t space[nthr_][njobs_per_group_ub_][job_size_];
     * simple_barrier::ctx_t barriers[ngroups_]; */

    const conf_t conf_;
    reducer_2d_driver_t<data_type> *drv_;

    int choose_x_blocking(int nx, int ny, int nthr_per_grp) const;
    void reduce_block(const data_t* space_base, data_t *dst,
            int job, int start_y, int start_x,
            int ny_start, int nx_start, int ny_step, int nx_step) const;
    void reduce_nolock(int ithr, data_t *dst,
            const memory_tracking::grantor_t &scratchpad) const;
};

/** simple 1d accumulator: y[:] += x[:] */
template <impl::data_type_t data_type>
struct cpu_accumulator_1d_t {
    typedef typename prec_traits<data_type>::type data_t;

    cpu_accumulator_1d_t();
    ~cpu_accumulator_1d_t();
    void accumulate(data_t *dst, const data_t *src, size_t size);

    reducer_2d_driver_t<data_type> *drv_;
};
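
/* A minimal usage sketch of the accumulator above (the buffers y, x and the
 * size are illustrative, not taken from the library):
 *
 *     cpu_accumulator_1d_t<data_type::f32> acc;
 *     acc.accumulate(y, x, size); // y[i] += x[i] for i in [0, size)
 */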

}
}
}

#endif

// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s