| 1 | /******************************************************************************* |
| 2 | * Copyright 2018 Intel Corporation |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | *******************************************************************************/ |
| 16 | |
| 17 | #ifndef MEMORY_TRACKING_HPP |
| 18 | #define MEMORY_TRACKING_HPP |
| 19 | |
| 20 | #include <assert.h> |
| 21 | #include <unordered_map> |
| 22 | |
| 23 | #include "nstl.hpp" |
| 24 | #include "utils.hpp" |
| 25 | |
| 26 | namespace mkldnn { |
| 27 | namespace impl { |
| 28 | namespace memory_tracking { |
| 29 | |
| 30 | /* Memory tracking capabilities |
| 31 | * |
| 32 | * The main purpose of this header file is to provide uniform way to register |
| 33 | * required memory for a scratchpad at a primitive descriptor creation time |
| 34 | * and then easily access it having only the base address of the scratchpad. |
| 35 | * |
| 36 | * Primitives might contain multiple disjoint parts that require temporary |
| 37 | * buffers (known as scratchpad) during their execution. A primitive descriptor |
| 38 | * should summarize all the needs into one single number -- the buffer size |
| 39 | * that would be requested from a user. At execution time, the corresponding |
| 40 | * primitive will receive a base pointer to a scratchpad. It then needs to |
| 41 | * provide each part of algorithm the corresponding piece of memory. Three main |
| 42 | * challenges here are: |
| 43 | * 1. Track correct offset (from the base scratchpad address) for each piece |
| 44 | * 2. Algorithms might require different memory pieces to be aligned, so |
| 45 | * the scratchpad size is no longer just the sum of the sizes of the |
| 46 | * corresponding subparts. |
| 47 | * 3. While a primitive is responsible for its scratchpad, the implementation |
| 48 | * might use some other basic blocks (e.g. cpu_reducer) that also require |
| 49 | * scratchpad memory. So there should be a simple way of passing the |
| 50 | * information back and forth between the main algorithm (a primitive) and |
| 51 | * auxiliary stuff that lives completely separately from it (e.g. reducer). |
| 52 | * |
| 53 | * To address these challenges this header file provides 3 structures: |
| 54 | * 1. registry_t -- the class that stores the information about requested |
| 55 | * memory. The information includes required size and desired |
| 56 | * alignment for each piece. This class is also responsible |
| 57 | * for computing the right offset to a given piece using the |
| 58 | * base pointer. |
| 59 | * This class is basically a ledger with all entries. |
| 60 | * Lives in primitive descriptors. |
| 61 | * |
| 62 | * 2. registrar_t -- the interface to a registry_t to book memory. Used at |
| 63 | * primitive descriptor creation time only. Contains a |
| 64 | * reference to the corresponding *mutable* registry. |
| 65 | * Always modifiable. |
| 66 | * Allows chaining (using prefixes). |
| 67 | * |
| 68 | * 3. grantor_t -- the interface to a registry_t to access memory. Used at |
| 69 | * primitive execution time only. Contains a reference to |
| 70 | * the corresponding *constant* registry and base pointer. |
| 71 | * Always constant. |
| 72 | * Allows chaining (using prefixes). |
| 73 | * |
| 74 | * Both registrar_t and grantor_t allow chaining with extra prefix provided. |
| 75 | * The feature is useful when a primitive offloads a part of computations to |
| 76 | * some other primitives which require their own scratchpad space |
| 77 | * (e.g. reducer). Prefixes are used to avoid key collision in cases when |
| 78 | * multiple sub-primitives (e.g. multiple reducers) are used. |
| 79 | * |
| 80 | * A short example below demonstrates how to use aforementioned classes. In it |
| 81 | * the main primitive is convolution that uses scratchpad for keeping padded |
| 82 | * bias. It also needs a reducer, that needs its own space as well. |
| 83 | * |
| 84 | * ``` c++ |
| 85 | * struct reducer_t { |
| 86 | * static void init(registrar_t &scratchpad) { |
| 87 | * // preserve space for the reduction (one page aligned) |
| 88 | * scratchpad.book(key_reducer_space, sizeof(float) * 980 * 1024, 4096); |
| 89 | * } |
| 90 | * |
| 91 | * void exec(const grantor_t &scratchpad) { |
| 92 | * // get the pointer to preserved space. scratchpad came from |
| 93 | * // upper primitive (convolution in this example) |
| 94 | * auto space = scratchpad.get<float>(key_reducer_space); |
| 95 | * |
| 96 | * space[:] += ...; |
| 97 | * } |
| 98 | * }; |
| 99 | * |
| 100 | * struct conv_t { |
| 101 | * struct pd_t { |
| 102 | * void init() { |
| 103 | * registrar_t scratchpad(scratchpad_registry_); |
| 104 | * |
| 105 | * // preserve a space for padded bias (using default alignment) |
| 106 | * scratchpad.book(key_conv_padded_bias, 128); |
| 107 | * |
| 108 | * // create a proxy registrar for the reducer. All entries made |
| 109 | * // by reducer would live in convolution's registry, but would |
| 110 | * // have their own `prefix`, so no interference with conv's |
| 111 | * // buffers. |
| 112 | * registrar_t reducer_scratchpad(scratchpad, prefix_reducer); |
| 113 | * |
| 114 | * reducer_t::init(reducer_scratchpad); |
| 115 | * } |
| 116 | * |
| 117 | * registry_t scratchpad_registry_; |
| 118 | * } |
| 119 | * |
| 120 | * void exec() { |
| 121 | * // get the base pointer to a scratchpad memory from a user |
| 122 | * void *scratchpad_ptr = this->input(MKLDNN_MEM_SCRATCHPAD); |
| 123 | * |
| 124 | * // create a grantor to the scratchpad (and provide the base |
| 125 | * // pointer). |
| 126 | * grantor_t scratchpad(pd()->scratchpad_registry_, scratchpad_ptr); |
| 127 | * |
| 128 | * // access the padded_bias (need only key name and the grantor) |
| 129 | * auto padded_bias = scratchpad.get<float>(key_conv_padded_bias); |
| 130 | * |
| 131 | * // to give the `right` grantor to reducer we need to add the |
| 132 | * // corresponding prefix, so that reducer would be able to access |
| 133 | * // its keys. The call is very similar to the one in pd_t::init |
| 134 | * // with only difference in types: grantor_t vs registrar_t. |
| 135 | * grantor_t reducer_scratchpad(scratchpad, prefix_reducer); |
| 136 | * reducer->exec(reducer_scratchpad); |
| 137 | * } |
| 138 | * }; |
| 139 | * ``` |
| 140 | */ |
| 141 | |
| 142 | |
| 143 | /* namespace with common keys and prefixes */ |
namespace names {

/// Local keys that identify individual scratchpad buffers.
///
/// A key is combined with a (possibly zero) prefix by make_key() to form the
/// global key stored in a registry. Enumerators are numbered consecutively
/// starting from key_none == 0; new keys must keep the total below MAX_KEY.
enum {
    key_none = 0,
    key_bnorm_tmp_mean,
    key_bnorm_tmp_var,
    key_bnorm_tmp_diff_ss,
    key_bnorm_tmp_stats,
    key_bnorm_reduction,
    key_concat_iptrs,
    key_concat_istrides,
    key_concat_nelems,
    key_concat_optrs,
    key_conv_adjusted_scales,
    key_conv_bia_reduction,
    key_conv_gemm_col,
    key_conv_gemm_imtr,
    key_conv_int_dat_in_acc_dt,
    key_conv_padded_bias,
    key_conv_rtus_space,
    key_conv_tr_diff_dst,
    key_conv_tr_diff_dst_bctx,
    key_conv_tr_src,
    key_conv_tr_src_bctx,
    key_conv_wei_reduction,
    key_conv_wei_bia_reduction,
    key_conv_wei_bia_reduction_bctx,
    key_iprod_int_dat_in_acc_dt,
    key_reducer_space,
    key_reducer_space_bctx,
    key_reorder_wino_plain,
    key_reorder_wino_transform_space,
    key_reorder_rnn_weights_quantization,
    key_reorder_rnn_weights_reduction,
    key_rnn_space,
    key_rnn_ptrs_bia,
    key_rnn_ptrs_wei_layer,
    key_rnn_ptrs_wei_iter,
    key_softmax_reduction,
    key_wino_U,
    key_wino_V,
    key_wino_M,
    key_barrier,
};

/// Local prefixes used to give a sub-component its own key space inside the
/// parent's registry (see make_prefix()). prefix_none == 0 means "no prefix".
enum {
    prefix_none = 0,
    prefix_reducer_bia,
    prefix_reducer_wei,
};

}
| 194 | |
// Layout of a global key (most significant group first):
//   level 0: 00 00 00 xxx
//   level 1: 00 00 aa xxx
//   level 2: 00 aa bb xxx
//   level 3: aa bb cc xxx
// max # of levels: 3 + 1 (base_level)
// here:
//   xxx        : [1 .. MAX_KEY)    : key
//   aa, bb, cc : [1 .. MAX_PREFIX) : prefixes for levels 1, 2, and 3

using key_t = uint32_t;
enum { MAX_KEY = (1u << 10), MAX_PREFIX = (1u << 7), };

/// generates global key based on a prefix and a local key
///
/// @param prefix Global prefix as returned by make_prefix() (0 for level 0).
/// @param key    Local key; must be below MAX_KEY.
/// @return       prefix + key.
inline key_t make_key(key_t prefix, key_t key) {
    // a key that spills into the prefix bits would alias another entry
    assert(key < static_cast<key_t>(MAX_KEY));
    return prefix + key;
}

/// generates global prefix based on the global parent and the local ones
///
/// @param parent_prefix Global prefix of the parent (0 for the base level).
/// @param prefix        Local prefix for the new level; must be below
///                      MAX_PREFIX.
/// @return              The parent prefix moved one level up, with the local
///                      prefix placed in the lowest prefix slot.
inline key_t make_prefix(key_t parent_prefix, key_t prefix) {
    // a prefix that spills into the next level's bits would alias another one
    assert(prefix < static_cast<key_t>(MAX_PREFIX));
    // the parent may be at most a level 2 prefix: going deeper than level 3
    // does not fit into the 32 bits of key_t
    assert(parent_prefix
            < static_cast<key_t>(MAX_KEY) * MAX_PREFIX * MAX_PREFIX);
    return MAX_PREFIX * parent_prefix + MAX_KEY * prefix;
}
| 213 | |
| 214 | struct registrar_t; |
| 215 | struct grantor_t; |
| 216 | |
| 217 | struct registry_t { |
| 218 | void book(const key_t &key, size_t size, size_t alignment) { |
| 219 | if (size == 0) return; |
| 220 | assert(offset_map_.count(key) == 0); |
| 221 | |
| 222 | size = utils::rnd_up(size, minimal_alignment); |
| 223 | alignment = nstl::max<size_t>(alignment, minimal_alignment); |
| 224 | offset_map_[key] = entry_t{size_, size, alignment}; |
| 225 | |
| 226 | size_ += size + alignment - minimal_alignment; |
| 227 | } |
| 228 | |
| 229 | void *get(const key_t &key, void *base_ptr) const { |
| 230 | if (base_ptr == nullptr) { assert(size() == 0); return nullptr; } |
| 231 | if (offset_map_.count(key) != 1) return nullptr; |
| 232 | |
| 233 | const auto &e = offset_map_.at(key); |
| 234 | base_ptr = utils::align_ptr<void>(base_ptr, minimal_alignment); |
| 235 | char *ptr = (char *)base_ptr + e.offset; |
| 236 | return utils::align_ptr<void>(ptr, e.alignment); |
| 237 | } |
| 238 | |
| 239 | size_t size() const |
| 240 | { return size_ > 0 ? size_ + minimal_alignment - 1 : 0; } |
| 241 | |
| 242 | registrar_t registrar(); |
| 243 | grantor_t grantor(void *base_ptr) const; |
| 244 | |
| 245 | protected: |
| 246 | enum { minimal_alignment = 64 }; |
| 247 | struct entry_t { size_t offset, size, alignment; }; |
| 248 | |
| 249 | std::unordered_map<key_t, entry_t> offset_map_; |
| 250 | size_t size_ = 0; |
| 251 | }; |
| 252 | |
| 253 | struct registrar_t { |
| 254 | enum { default_alignment = 64 }; |
| 255 | |
| 256 | registrar_t(registry_t ®istry): registry_(registry), prefix_(0) {} |
| 257 | registrar_t(registrar_t &parent, const key_t &prefix) |
| 258 | : registry_(parent.registry_) |
| 259 | , prefix_(make_prefix(parent.prefix_, prefix)) {} |
| 260 | |
| 261 | void book(const key_t &key, size_t size, |
| 262 | size_t alignment = default_alignment) |
| 263 | { registry_.book(make_key(prefix_, key), size, alignment); } |
| 264 | |
| 265 | protected: |
| 266 | registry_t ®istry_; |
| 267 | const key_t prefix_; |
| 268 | }; |
| 269 | |
| 270 | struct grantor_t { |
| 271 | grantor_t(const registry_t ®istry, void *base_ptr) |
| 272 | : registry_(registry), prefix_(0), base_ptr_(base_ptr) {} |
| 273 | grantor_t(const grantor_t &parent, const key_t &prefix) |
| 274 | : registry_(parent.registry_) |
| 275 | , prefix_(make_prefix(parent.prefix_, prefix)) |
| 276 | , base_ptr_(parent.base_ptr_) {} |
| 277 | |
| 278 | template <typename T = void> T *get(const key_t &key) const |
| 279 | { return (T *)registry_.get(make_key(prefix_, key), base_ptr_); } |
| 280 | |
| 281 | protected: |
| 282 | const registry_t ®istry_; |
| 283 | const key_t prefix_; |
| 284 | void *base_ptr_; |
| 285 | }; |
| 286 | |
| 287 | inline registrar_t registry_t::registrar() { return registrar_t(*this); } |
| 288 | inline grantor_t registry_t::grantor(void *base_ptr) const |
| 289 | { return grantor_t(*this, base_ptr); } |
| 290 | |
| 291 | } |
| 292 | } |
| 293 | } |
| 294 | |
| 295 | #endif |
| 296 | |