1/*******************************************************************************
2* Copyright 2018 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#ifndef MEMORY_TRACKING_HPP
18#define MEMORY_TRACKING_HPP
19
20#include <assert.h>
21#include <unordered_map>
22
23#include "nstl.hpp"
24#include "utils.hpp"
25
26namespace mkldnn {
27namespace impl {
28namespace memory_tracking {
29
30/* Memory tracking capabilities
31 *
32 * The main purpose of this header file is to provide uniform way to register
33 * required memory for a scratchpad at a primitive descriptor creation time
34 * and then easily access it having only the base address of the scratchpad.
35 *
36 * Primitives might contain multiple disjoint parts that require temporary
37 * buffers (known as scratchpad) during their execution. A primitive descriptor
38 * should summarize all the needs into one single number -- the buffer size
39 * that would be requested from a user. At execution time, the corresponding
40 * primitive will receive a base pointer to a scratchpad. It then needs to
41 * provide each part of algorithm the corresponding piece of memory. Three main
42 * challenges here are:
43 * 1. Track correct offset (from the base scratchpad address) for each piece
 * 2. An algorithm might require different memory pieces to be aligned, so
 * the scratchpad size is no longer just the sum of the sizes of the
 * corresponding subparts.
47 * 3. While a primitive is responsible for its scratchpad, the implementation
48 * might use some other basic blocks (e.g. cpu_reducer) that also require
49 * scratchpad memory. So there should be a simple way of passing the
 * information back and forth between the main algorithm (a primitive) and
51 * auxiliary stuff that lives completely separately from it (e.g. reducer).
52 *
53 * To address these challenges this header file provides 3 structures:
 * 1. registry_t -- the class that stores the information about requested
55 * memory. The information includes required size and desired
56 * alignment for each piece. This class is also responsible
57 * for computing the right offset to a given piece using the
58 * base pointer.
59 * This class is basically a ledger with all entries.
60 * Lives in primitive descriptors.
61 *
62 * 2. registrar_t -- the interface to a registry_t to book memory. Used at
63 * primitive descriptor creation time only. Contains a
64 * reference to the corresponding *mutable* registry.
65 * Always modifiable.
66 * Allows chaining (using prefixes).
67 *
68 * 3. grantor_t -- the interface to a registry_t to access memory. Used at
69 * primitive execution time only. Contains a reference to
70 * the corresponding *constant* registry and base pointer.
71 * Always constant.
72 * Allows chaining (using prefixes).
73 *
74 * Both registrar_t and grantor_t allow chaining with extra prefix provided.
 * The feature is useful when a primitive offloads a part of computations to
76 * some other primitives which require their own scratchpad space
77 * (e.g. reducer). Prefixes are used to avoid key collision in cases when
 * multiple sub-primitives (e.g. multiple reducers) are used.
79 *
80 * A short example below demonstrates how to use aforementioned classes. In it
81 * the main primitive is convolution that uses scratchpad for keeping padded
82 * bias. It also needs a reducer, that needs its own space as well.
83 *
84 * ``` c++
85 * struct reducer_t {
86 * static void init(registrar_t &scratchpad) {
87 * // preserve space for the reduction (one page aligned)
 * scratchpad.book(key_reducer_space, sizeof(float) * 980 * 1024, 4096);
89 * }
90 *
91 * void exec(const grantor_t &scratchpad) {
92 * // get the pointer to preserved space. scratchpad came from
93 * // upper primitive (convolution in this example)
94 * auto space = scratchpad.get<float>(key_reducer_space);
95 *
96 * space[:] += ...;
97 * }
98 * };
99 *
100 * struct conv_t {
101 * struct pd_t {
102 * void init() {
103 * registrar_t scratchpad(scratchpad_registry_);
104 *
105 * // preserve a space for padded bias (using default alignment)
106 * scratchpad.book(key_conv_padded_bias, 128);
107 *
 * // create a proxy registrar for the reducer. All entries made
109 * // by reducer would live in convolution's registry, but would
110 * // have their own `prefix`, so no interference with conv's
111 * // buffers.
112 * registrar_t reducer_scratchpad(scratchpad, prefix_reducer);
113 *
114 * reducer_t::init(reducer_scratchpad);
115 * }
116 *
117 * registry_t scratchpad_registry_;
118 * }
119 *
120 * void exec() {
121 * // get the base pointer to a scratchpad memory from a user
122 * void *scratchpad_ptr = this->input(MKLDNN_MEM_SCRATCHPAD);
123 *
124 * // create a grantor to the scratchpad (and provide the base
125 * // pointer).
126 * grantor_t scratchpad(pd()->scratchpad_registry_, scratchpad_ptr);
127 *
128 * // access the padded_bias (need only key name and the grantor)
129 * auto padded_bias = scratchpad.get<float>(key_conv_padded_bias);
130 *
131 * // to give the `right` grantor to reducer we need to add the
132 * // corresponding prefix, so that reducer would be able to access
133 * // its keys. The call is very similar to the one in pd_t::init
134 * // with only difference in types: grantor_t vs registrar_t.
135 * grantor_t reducer_scratchpad(scratchpad, prefix_reducer);
136 * reducer->exec(reducer_scratchpad);
137 * }
138 * };
139 * ```
140 */
141
142
/* Common scratchpad identifiers: keys name individual buffers, prefixes
 * separate the key spaces of sub-primitives.
 *
 * Both enums rely on implicit numbering starting from 0, so the position of
 * an enumerator defines its value. */
namespace names {

/* Buffer keys, listed by name family. */
enum {
    key_none = 0,

    // key_bnorm_*
    key_bnorm_tmp_mean,
    key_bnorm_tmp_var,
    key_bnorm_tmp_diff_ss,
    key_bnorm_tmp_stats,
    key_bnorm_reduction,

    // key_concat_*
    key_concat_iptrs,
    key_concat_istrides,
    key_concat_nelems,
    key_concat_optrs,

    // key_conv_*
    key_conv_adjusted_scales,
    key_conv_bia_reduction,
    key_conv_gemm_col,
    key_conv_gemm_imtr,
    key_conv_int_dat_in_acc_dt,
    key_conv_padded_bias,
    key_conv_rtus_space,
    key_conv_tr_diff_dst,
    key_conv_tr_diff_dst_bctx,
    key_conv_tr_src,
    key_conv_tr_src_bctx,
    key_conv_wei_reduction,
    key_conv_wei_bia_reduction,
    key_conv_wei_bia_reduction_bctx,

    // key_iprod_*
    key_iprod_int_dat_in_acc_dt,

    // key_reducer_*
    key_reducer_space,
    key_reducer_space_bctx,

    // key_reorder_*
    key_reorder_wino_plain,
    key_reorder_wino_transform_space,
    key_reorder_rnn_weights_quantization,
    key_reorder_rnn_weights_reduction,

    // key_rnn_*
    key_rnn_space,
    key_rnn_ptrs_bia,
    key_rnn_ptrs_wei_layer,
    key_rnn_ptrs_wei_iter,

    // key_softmax_*
    key_softmax_reduction,

    // key_wino_*
    key_wino_U,
    key_wino_V,
    key_wino_M,

    key_barrier,
};

/* Prefixes handed to chained registrars / grantors. */
enum {
    prefix_none = 0,
    prefix_reducer_bia,
    prefix_reducer_wei,
};

} // namespace names
194
195// level 0: 00 00 00 xxx
196// level 1: 00 00 aa xxx
197// level 2: 00 aa bb xxx
198// level 3: aa bb cc xxx
199// max # of levels: 3 + 1 (base_level)
200// here:
201// xxx : [1 .. MAX_KEY) : key
202// aa, bb, cc : [1 .. MAX_PREFIX) : prefixes for levels 1, 2, and 3
203
/// Integral type used for local keys, local prefixes and the global
/// (prefix + key) identifiers stored in a registry.
using key_t = uint32_t;

/// Field widths of a global key: the local key occupies the low 10 bits and
/// every prefix level above it occupies 7 bits.
enum {
    MAX_KEY = (1u << 10),
    MAX_PREFIX = (1u << 7),
};

/// Generates a global key based on a global prefix and a local key.
///
/// @param prefix Global prefix (0 for the base level, otherwise a value
///               returned by make_prefix()).
/// @param key    Local key; must be smaller than MAX_KEY.
/// @return The sum of @p prefix and @p key.
inline key_t make_key(key_t prefix, key_t key) {
    // A local key that does not fit into its field would spill into the
    // prefix bits and silently alias an entry of another prefix.
    assert(key < static_cast<key_t>(MAX_KEY));
    return prefix + key;
}

/// Generates a global prefix based on the global parent prefix and a local
/// prefix.
///
/// The parent is shifted up by one prefix field and the local prefix is
/// placed right above the key field.
///
/// @param parent_prefix Global prefix of the parent (0 for the base level);
///                      may already contain at most two prefix levels.
/// @param prefix        Local prefix; must be smaller than MAX_PREFIX.
/// @return The combined global prefix.
inline key_t make_prefix(key_t parent_prefix, key_t prefix) {
    // An oversized local prefix would overlap the parent's lowest level.
    assert(prefix < static_cast<key_t>(MAX_PREFIX));
    // Only three prefix levels are supported: a parent that already holds
    // three levels cannot be shifted up by one more field.
    assert(parent_prefix
            < static_cast<key_t>(MAX_KEY) * MAX_PREFIX * MAX_PREFIX);
    return MAX_PREFIX * parent_prefix + MAX_KEY * prefix;
}
213
214struct registrar_t;
215struct grantor_t;
216
217struct registry_t {
218 void book(const key_t &key, size_t size, size_t alignment) {
219 if (size == 0) return;
220 assert(offset_map_.count(key) == 0);
221
222 size = utils::rnd_up(size, minimal_alignment);
223 alignment = nstl::max<size_t>(alignment, minimal_alignment);
224 offset_map_[key] = entry_t{size_, size, alignment};
225
226 size_ += size + alignment - minimal_alignment;
227 }
228
229 void *get(const key_t &key, void *base_ptr) const {
230 if (base_ptr == nullptr) { assert(size() == 0); return nullptr; }
231 if (offset_map_.count(key) != 1) return nullptr;
232
233 const auto &e = offset_map_.at(key);
234 base_ptr = utils::align_ptr<void>(base_ptr, minimal_alignment);
235 char *ptr = (char *)base_ptr + e.offset;
236 return utils::align_ptr<void>(ptr, e.alignment);
237 }
238
239 size_t size() const
240 { return size_ > 0 ? size_ + minimal_alignment - 1 : 0; }
241
242 registrar_t registrar();
243 grantor_t grantor(void *base_ptr) const;
244
245protected:
246 enum { minimal_alignment = 64 };
247 struct entry_t { size_t offset, size, alignment; };
248
249 std::unordered_map<key_t, entry_t> offset_map_;
250 size_t size_ = 0;
251};
252
253struct registrar_t {
254 enum { default_alignment = 64 };
255
256 registrar_t(registry_t &registry): registry_(registry), prefix_(0) {}
257 registrar_t(registrar_t &parent, const key_t &prefix)
258 : registry_(parent.registry_)
259 , prefix_(make_prefix(parent.prefix_, prefix)) {}
260
261 void book(const key_t &key, size_t size,
262 size_t alignment = default_alignment)
263 { registry_.book(make_key(prefix_, key), size, alignment); }
264
265protected:
266 registry_t &registry_;
267 const key_t prefix_;
268};
269
270struct grantor_t {
271 grantor_t(const registry_t &registry, void *base_ptr)
272 : registry_(registry), prefix_(0), base_ptr_(base_ptr) {}
273 grantor_t(const grantor_t &parent, const key_t &prefix)
274 : registry_(parent.registry_)
275 , prefix_(make_prefix(parent.prefix_, prefix))
276 , base_ptr_(parent.base_ptr_) {}
277
278 template <typename T = void> T *get(const key_t &key) const
279 { return (T *)registry_.get(make_key(prefix_, key), base_ptr_); }
280
281protected:
282 const registry_t &registry_;
283 const key_t prefix_;
284 void *base_ptr_;
285};
286
287inline registrar_t registry_t::registrar() { return registrar_t(*this); }
288inline grantor_t registry_t::grantor(void *base_ptr) const
289{ return grantor_t(*this, base_ptr); }
290
291}
292}
293}
294
295#endif
296