1 | /******************************************************************************* |
2 | * Copyright 2018 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef MEMORY_TRACKING_HPP |
18 | #define MEMORY_TRACKING_HPP |
19 | |
20 | #include <assert.h> |
21 | #include <unordered_map> |
22 | |
23 | #include "nstl.hpp" |
24 | #include "utils.hpp" |
25 | |
26 | namespace mkldnn { |
27 | namespace impl { |
28 | namespace memory_tracking { |
29 | |
30 | /* Memory tracking capabilities |
31 | * |
32 | * The main purpose of this header file is to provide uniform way to register |
33 | * required memory for a scratchpad at a primitive descriptor creation time |
34 | * and then easily access it having only the base address of the scratchpad. |
35 | * |
36 | * Primitives might contain multiple disjoint parts that require temporary |
37 | * buffers (known as scratchpad) during their execution. A primitive descriptor |
38 | * should summarize all the needs into one single number -- the buffer size |
39 | * that would be requested from a user. At execution time, the corresponding |
40 | * primitive will receive a base pointer to a scratchpad. It then needs to |
41 | * provide each part of algorithm the corresponding piece of memory. Three main |
42 | * challenges here are: |
43 | * 1. Track correct offset (from the base scratchpad address) for each piece |
 *  2. An algorithm might require different memory pieces to be aligned, so
 *     the scratchpad size is no longer just a sum of the sizes of the
 *     corresponding subparts.
47 | * 3. While a primitive is responsible for its scratchpad, the implementation |
48 | * might use some other basic blocks (e.g. cpu_reducer) that also require |
 *     scratchpad memory. So there should be a simple way of passing the
 *     information back and forth between the main algorithm (a primitive) and
 *     auxiliary stuff that lives completely separately from it (e.g. reducer).
52 | * |
53 | * To address these challenges this header file provides 3 structures: |
 *  1. registry_t  -- the class that stores the information about requested
55 | * memory. The information includes required size and desired |
56 | * alignment for each piece. This class is also responsible |
57 | * for computing the right offset to a given piece using the |
58 | * base pointer. |
59 | * This class is basically a ledger with all entries. |
60 | * Lives in primitive descriptors. |
61 | * |
62 | * 2. registrar_t -- the interface to a registry_t to book memory. Used at |
63 | * primitive descriptor creation time only. Contains a |
64 | * reference to the corresponding *mutable* registry. |
65 | * Always modifiable. |
66 | * Allows chaining (using prefixes). |
67 | * |
68 | * 3. grantor_t -- the interface to a registry_t to access memory. Used at |
69 | * primitive execution time only. Contains a reference to |
70 | * the corresponding *constant* registry and base pointer. |
71 | * Always constant. |
72 | * Allows chaining (using prefixes). |
73 | * |
74 | * Both registrar_t and grantor_t allow chaining with extra prefix provided. |
 * The feature is useful when a primitive offloads a part of computations to
 * some other primitives which require their own scratchpad space
 * (e.g. reducer). Prefixes are used to avoid key collisions in cases when
 * multiple sub-primitives (e.g. multiple reducers) are used.
79 | * |
80 | * A short example below demonstrates how to use aforementioned classes. In it |
81 | * the main primitive is convolution that uses scratchpad for keeping padded |
82 | * bias. It also needs a reducer, that needs its own space as well. |
83 | * |
84 | * ``` c++ |
85 | * struct reducer_t { |
86 | * static void init(registrar_t &scratchpad) { |
87 | * // preserve space for the reduction (one page aligned) |
 *          scratchpad.book(key_reducer_space, sizeof(float) * 980 * 1024,
 *                  4096);
89 | * } |
90 | * |
91 | * void exec(const grantor_t &scratchpad) { |
92 | * // get the pointer to preserved space. scratchpad came from |
93 | * // upper primitive (convolution in this example) |
94 | * auto space = scratchpad.get<float>(key_reducer_space); |
95 | * |
96 | * space[:] += ...; |
97 | * } |
98 | * }; |
99 | * |
100 | * struct conv_t { |
101 | * struct pd_t { |
102 | * void init() { |
103 | * registrar_t scratchpad(scratchpad_registry_); |
104 | * |
105 | * // preserve a space for padded bias (using default alignment) |
106 | * scratchpad.book(key_conv_padded_bias, 128); |
107 | * |
 *              // create a proxy registrar for the reducer. All entries made
 *              // by reducer would live in convolution's registry, but would
110 | * // have their own `prefix`, so no interference with conv's |
111 | * // buffers. |
112 | * registrar_t reducer_scratchpad(scratchpad, prefix_reducer); |
113 | * |
114 | * reducer_t::init(reducer_scratchpad); |
115 | * } |
116 | * |
117 | * registry_t scratchpad_registry_; |
118 | * } |
119 | * |
120 | * void exec() { |
121 | * // get the base pointer to a scratchpad memory from a user |
122 | * void *scratchpad_ptr = this->input(MKLDNN_MEM_SCRATCHPAD); |
123 | * |
124 | * // create a grantor to the scratchpad (and provide the base |
125 | * // pointer). |
126 | * grantor_t scratchpad(pd()->scratchpad_registry_, scratchpad_ptr); |
127 | * |
128 | * // access the padded_bias (need only key name and the grantor) |
129 | * auto padded_bias = scratchpad.get<float>(key_conv_padded_bias); |
130 | * |
131 | * // to give the `right` grantor to reducer we need to add the |
132 | * // corresponding prefix, so that reducer would be able to access |
133 | * // its keys. The call is very similar to the one in pd_t::init |
134 | * // with only difference in types: grantor_t vs registrar_t. |
135 | * grantor_t reducer_scratchpad(scratchpad, prefix_reducer); |
136 | * reducer->exec(reducer_scratchpad); |
137 | * } |
138 | * }; |
139 | * ``` |
140 | */ |
141 | |
142 | |
/* Namespace with common keys and prefixes.
 *
 * Both enumerations below are anonymous and unscoped, so their enumerators
 * are plain integral constants living directly in `names`. Only the first
 * enumerator of each enum has an explicit value (0); every following one is
 * numbered implicitly, in declaration order. Inserting or reordering entries
 * therefore changes the numeric values of everything declared after them. */
namespace names {
/* Local keys identifying the individual scratchpad buffers. These are the
 * values passed as `key` when booking and accessing memory (see the usage
 * example in the header comment of this file). */
enum {
    key_none = 0,
    key_bnorm_tmp_mean,
    key_bnorm_tmp_var,
    key_bnorm_tmp_diff_ss,
    key_bnorm_tmp_stats,
    key_bnorm_reduction,
    key_concat_iptrs,
    key_concat_istrides,
    key_concat_nelems,
    key_concat_optrs,
    key_conv_adjusted_scales,
    key_conv_bia_reduction,
    key_conv_gemm_col,
    key_conv_gemm_imtr,
    key_conv_int_dat_in_acc_dt,
    key_conv_padded_bias,
    key_conv_rtus_space,
    key_conv_tr_diff_dst,
    key_conv_tr_diff_dst_bctx,
    key_conv_tr_src,
    key_conv_tr_src_bctx,
    key_conv_wei_reduction,
    key_conv_wei_bia_reduction,
    key_conv_wei_bia_reduction_bctx,
    key_iprod_int_dat_in_acc_dt,
    key_reducer_space,
    key_reducer_space_bctx,
    key_reorder_wino_plain,
    key_reorder_wino_transform_space,
    key_reorder_rnn_weights_quantization,
    key_reorder_rnn_weights_reduction,
    key_rnn_space,
    key_rnn_ptrs_bia,
    key_rnn_ptrs_wei_layer,
    key_rnn_ptrs_wei_iter,
    key_softmax_reduction,
    key_wino_U,
    key_wino_V,
    key_wino_M,
    key_barrier,
};

/* Local prefixes used when chaining a registrar or a grantor, so that the
 * keys of a nested user do not collide with the keys of its parent.
 * NOTE(review): judging by the names, these separate the bias and the
 * weights reducers -- confirm against the callers. */
enum {
    prefix_none = 0,
    prefix_reducer_bia,
    prefix_reducer_wei,
};
} // namespace names
194 | |
195 | // level 0: 00 00 00 xxx |
196 | // level 1: 00 00 aa xxx |
197 | // level 2: 00 aa bb xxx |
198 | // level 3: aa bb cc xxx |
199 | // max # of levels: 3 + 1 (base_level) |
200 | // here: |
201 | // xxx : [1 .. MAX_KEY) : key |
202 | // aa, bb, cc : [1 .. MAX_PREFIX) : prefixes for levels 1, 2, and 3 |
203 | |
/// Integral type used for local keys, local prefixes, and for the global
/// (combined) values produced by make_key() and make_prefix().
using key_t = uint32_t;

/// Exclusive upper bounds of a local key (10 bits) and of a local prefix of a
/// single level (7 bits). A key plus three prefix levels occupy
/// 10 + 3 * 7 = 31 bits and hence fit into key_t.
enum { MAX_KEY = (1u << 10), MAX_PREFIX = (1u << 7), };

/// Generates a global key based on a global prefix and a local key.
///
/// @param prefix Global prefix as returned by make_prefix(), or 0 for the
///               base level.
/// @param key Local key. Must be less than MAX_KEY: a bigger value would
///            spill into the bits reserved for prefixes and alias a key that
///            belongs to another prefix (checked in debug builds only).
/// @return The sum of @p prefix and @p key.
inline key_t make_key(key_t prefix, key_t key) {
    assert(key < static_cast<key_t>(MAX_KEY));
    return prefix + key;
}

/// Generates a global prefix based on the global parent one and a local one.
///
/// The parent prefix is moved one level up (multiplied by MAX_PREFIX) and the
/// local prefix is placed right above the key bits (multiplied by MAX_KEY).
///
/// @param parent_prefix Global prefix of the parent (0 for the base level).
///                      Must be at most a level-2 prefix, i.e. less than
///                      MAX_KEY * MAX_PREFIX * MAX_PREFIX; otherwise a fourth
///                      level would be created and the result would not fit
///                      into key_t (checked in debug builds only).
/// @param prefix Local prefix. Must be less than MAX_PREFIX so that it does
///               not overlap with the bits of the parent prefix (checked in
///               debug builds only).
/// @return The global prefix to be passed to make_key() or to a further
///         make_prefix() call.
inline key_t make_prefix(key_t parent_prefix, key_t prefix) {
    assert(prefix < static_cast<key_t>(MAX_PREFIX));
    assert(parent_prefix
            < static_cast<key_t>(MAX_KEY) * MAX_PREFIX * MAX_PREFIX);
    return MAX_PREFIX * parent_prefix + MAX_KEY * prefix;
}
213 | |
214 | struct registrar_t; |
215 | struct grantor_t; |
216 | |
217 | struct registry_t { |
218 | void book(const key_t &key, size_t size, size_t alignment) { |
219 | if (size == 0) return; |
220 | assert(offset_map_.count(key) == 0); |
221 | |
222 | size = utils::rnd_up(size, minimal_alignment); |
223 | alignment = nstl::max<size_t>(alignment, minimal_alignment); |
224 | offset_map_[key] = entry_t{size_, size, alignment}; |
225 | |
226 | size_ += size + alignment - minimal_alignment; |
227 | } |
228 | |
229 | void *get(const key_t &key, void *base_ptr) const { |
230 | if (base_ptr == nullptr) { assert(size() == 0); return nullptr; } |
231 | if (offset_map_.count(key) != 1) return nullptr; |
232 | |
233 | const auto &e = offset_map_.at(key); |
234 | base_ptr = utils::align_ptr<void>(base_ptr, minimal_alignment); |
235 | char *ptr = (char *)base_ptr + e.offset; |
236 | return utils::align_ptr<void>(ptr, e.alignment); |
237 | } |
238 | |
239 | size_t size() const |
240 | { return size_ > 0 ? size_ + minimal_alignment - 1 : 0; } |
241 | |
242 | registrar_t registrar(); |
243 | grantor_t grantor(void *base_ptr) const; |
244 | |
245 | protected: |
246 | enum { minimal_alignment = 64 }; |
247 | struct entry_t { size_t offset, size, alignment; }; |
248 | |
249 | std::unordered_map<key_t, entry_t> offset_map_; |
250 | size_t size_ = 0; |
251 | }; |
252 | |
253 | struct registrar_t { |
254 | enum { default_alignment = 64 }; |
255 | |
256 | registrar_t(registry_t ®istry): registry_(registry), prefix_(0) {} |
257 | registrar_t(registrar_t &parent, const key_t &prefix) |
258 | : registry_(parent.registry_) |
259 | , prefix_(make_prefix(parent.prefix_, prefix)) {} |
260 | |
261 | void book(const key_t &key, size_t size, |
262 | size_t alignment = default_alignment) |
263 | { registry_.book(make_key(prefix_, key), size, alignment); } |
264 | |
265 | protected: |
266 | registry_t ®istry_; |
267 | const key_t prefix_; |
268 | }; |
269 | |
270 | struct grantor_t { |
271 | grantor_t(const registry_t ®istry, void *base_ptr) |
272 | : registry_(registry), prefix_(0), base_ptr_(base_ptr) {} |
273 | grantor_t(const grantor_t &parent, const key_t &prefix) |
274 | : registry_(parent.registry_) |
275 | , prefix_(make_prefix(parent.prefix_, prefix)) |
276 | , base_ptr_(parent.base_ptr_) {} |
277 | |
278 | template <typename T = void> T *get(const key_t &key) const |
279 | { return (T *)registry_.get(make_key(prefix_, key), base_ptr_); } |
280 | |
281 | protected: |
282 | const registry_t ®istry_; |
283 | const key_t prefix_; |
284 | void *base_ptr_; |
285 | }; |
286 | |
287 | inline registrar_t registry_t::registrar() { return registrar_t(*this); } |
288 | inline grantor_t registry_t::grantor(void *base_ptr) const |
289 | { return grantor_t(*this, base_ptr); } |
290 | |
291 | } |
292 | } |
293 | } |
294 | |
295 | #endif |
296 | |