// ======================================================================== //
// Copyright 2009-2019 Intel Corporation                                    //
//                                                                          //
// Licensed under the Apache License, Version 2.0 (the "License");          //
// you may not use this file except in compliance with the License.         //
// You may obtain a copy of the License at                                  //
//                                                                          //
//     http://www.apache.org/licenses/LICENSE-2.0                           //
//                                                                          //
// Unless required by applicable law or agreed to in writing, software      //
// distributed under the License is distributed on an "AS IS" BASIS,        //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and      //
// limitations under the License.                                           //
// ======================================================================== //

#include "upsample.h"
#include "weights_reorder.h"
#include "network.h"
// -- GODOT start --
#include <cstring>
// -- GODOT end --

namespace oidn {

  template<int K>
  Network<K>::Network(const Ref<Device>& device, const std::map<std::string, Tensor>& weightMap)
    : device(device),
      eng(engine::cpu, 0),
      sm(eng),
      weightMap(weightMap)
  {
  }

  template<int K>
  void Network<K>::execute(const Progress& progress, int taskIndex)
  {
    if (progress.func)
    {
      const double value = double(taskIndex) / double(progress.taskCount);
      if (!progress.func(progress.userPtr, value))
        throw Exception(Error::Cancelled, "execution was cancelled");
    }

    for (size_t i = 0; i < nodes.size(); ++i)
    {
      nodes[i]->execute(sm);

      if (progress.func)
      {
        const double value = (double(taskIndex) + double(i+1) / double(nodes.size())) / double(progress.taskCount);
        if (!progress.func(progress.userPtr, value))
          throw Exception(Error::Cancelled, "execution was cancelled");
      }
    }
  }

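  // Allocates a new tensor or wraps an existing buffer. If no format is specified,
  // 4D tensors use the blocked nChwKc layout and 1D tensors the plain x layout.
  // When 'data' is null, fresh memory is allocated and counted in the allocation
  // statistics; otherwise the given buffer is wrapped without copying.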
  template<int K>
  std::shared_ptr<memory> Network<K>::allocTensor(const memory::dims& dims,
                                                  memory::format_tag format,
                                                  void* data)
  {
    if (format == memory::format_tag::any)
    {
      if (dims.size() == 4)
        format = BlockedFormat<K>::nChwKc;
      else if (dims.size() == 1)
        format = memory::format_tag::x;
      else
        assert(0);
    }
    memory::desc desc(dims, memory::data_type::f32, format);
    if (data == nullptr)
    {
      const size_t bytes = getTensorSize(dims) * sizeof(float);
      if (format == BlockedFormat<K>::nChwKc)
        activationAllocBytes += bytes;
      totalAllocBytes += bytes;

      return std::make_shared<memory>(desc, eng);
    }
    else
    {
      return std::make_shared<memory>(desc, eng, data);
    }
  }

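  // Reinterprets part of an existing tensor as a tensor with different dimensions,
  // starting at the given element offset. No data is copied; the result aliases 'src'.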
  template<int K>
  std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
                                                 const std::shared_ptr<memory>& src,
                                                 size_t srcOffset,
                                                 memory::format_tag format)
  {
    const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
    MAYBE_UNUSED(srcDesc);
    assert(srcDesc.data_type == memory::data_type::f32);
    assert(getTensorSize(src) >= srcOffset + getTensorSize(dims));

    if (format == memory::format_tag::any)
    {
      if (dims.size() == 4)
        format = BlockedFormat<K>::nChwKc;
      else if (dims.size() == 1)
        format = memory::format_tag::x;
      else
        assert(0);
    }
    memory::desc desc(dims, memory::data_type::f32, format);
    float* srcPtr = (float*)src->get_data_handle() + srcOffset;
    return std::make_shared<memory>(desc, eng, srcPtr);
  }

  template<int K>
  std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
                                                 const std::shared_ptr<memory>& src,
                                                 const memory::dims& srcOffset)
  {
    return castTensor(dims, src, getTensorSize(srcOffset));
  }

  template<int K>
  void Network<K>::zeroTensor(const std::shared_ptr<memory>& dst)
  {
    assert(getTensorType(dst) == memory::data_type::f32);
    memset(dst->get_data_handle(), 0, getTensorSize(dst)*sizeof(float));
  }

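  // Returns the padded input dimensions: C is rounded up to a multiple of K,
  // and H/W are rounded up to the given alignment.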
  template<int K>
  memory::dims Network<K>::getInputReorderDims(const memory::dims& srcDims, int alignment)
  {
    memory::dims dstDims = srcDims;
    dstDims[1] = getPadded<K>(srcDims[1]); // round up C
    dstDims[2] = roundUp(srcDims[2], memory::dim(alignment)); // round up H
    dstDims[3] = roundUp(srcDims[3], memory::dim(alignment)); // round up W
    return dstDims;
  }

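  // Adds a node that reorders the input image(s) into the padded, blocked tensor
  // layout, applying the specified transfer function.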
  template<int K>
  std::shared_ptr<Node> Network<K>::addInputReorder(const Image& color,
                                                    const Image& albedo,
                                                    const Image& normal,
                                                    const std::shared_ptr<TransferFunction>& transferFunc,
                                                    int alignment,
                                                    const std::shared_ptr<memory>& userDst)
  {
    assert(color);
    int inputC = 3;
    if (albedo) inputC += 3;
    if (normal) inputC += 3;

    memory::dims srcDims = {1, inputC, color.height, color.width};
    memory::dims dstDims = getInputReorderDims(srcDims, alignment);

    // Allocate padded memory
    auto dst = userDst;
    if (!dst)
      dst = allocTensor(dstDims);

    // Push node
    std::shared_ptr<Node> node;

    if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
      node = std::make_shared<InputReorderNode<K, LinearTransferFunction>>(color, albedo, normal, dst, tf);
    else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
      node = std::make_shared<InputReorderNode<K, GammaTransferFunction>>(color, albedo, normal, dst, tf);
    else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
      node = std::make_shared<InputReorderNode<K, LogTransferFunction>>(color, albedo, normal, dst, tf);
    else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
      node = std::make_shared<InputReorderNode<K, PQXTransferFunction>>(color, albedo, normal, dst, tf);
    else
      assert(0);

    nodes.push_back(node);
    return node;
  }

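  // Adds a node that converts the network output tensor back into an image,
  // applying the inverse of the specified transfer function.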
  template<int K>
  std::shared_ptr<Node> Network<K>::addOutputReorder(const std::shared_ptr<memory>& src,
                                                     const std::shared_ptr<TransferFunction>& transferFunc,
                                                     const Image& output)
  {
    memory::dims srcDims = getTensorDims(src);
    assert(srcDims[1] == K);

    // Push node
    std::shared_ptr<Node> node;

    if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
      node = std::make_shared<OutputReorderNode<K, LinearTransferFunction>>(src, output, tf);
    else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
      node = std::make_shared<OutputReorderNode<K, GammaTransferFunction>>(src, output, tf);
    else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
      node = std::make_shared<OutputReorderNode<K, LogTransferFunction>>(src, output, tf);
    else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
      node = std::make_shared<OutputReorderNode<K, PQXTransferFunction>>(src, output, tf);
    else
      assert(0);

    nodes.push_back(node);
    return node;
  }

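  // Returns the output dimensions of a convolution: same spatial size as the source,
  // with the channel count taken from the bias tensor and padded to a multiple of K.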
  template<int K>
  memory::dims Network<K>::getConvDims(const std::string& name, const memory::dims& srcDims)
  {
    auto b = weightMap[name + "/b"];
    memory::dims dstDims = srcDims;
    dstDims[1] = getPadded<K>(b.dims[0]); // dstDims[C] = getPadded(OC)
    return dstDims;
  }

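  // Adds a convolution node (stride 1, padding 1) with an optional fused ReLU.
  // The weights and biases are loaded from the weight map, padded to multiples of K
  // channels, and reordered to the format preferred by the convolution primitive.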
  template<int K>
  std::shared_ptr<Node> Network<K>::addConv(const std::string& name,
                                            const std::shared_ptr<memory>& src,
                                            const std::shared_ptr<memory>& userDst,
                                            bool relu)
  {
    const memory::dims strides = {1, 1};
    const memory::dims padding = {1, 1};

    memory::dims srcDims = getTensorDims(src);

    // Get the weights
    const auto& W = weightMap[name + "/W"];
    if (W.ndims() != 4 || W.format != "oihw")
      throw Exception(Error::InvalidOperation, "invalid convolution weights");
    memory::dims weightsDims = W.dims;
    auto userWeights = allocTensor(weightsDims, memory::format_tag::oihw, W.data);

    // Pad the weights
    memory::dims weightsPadDims = weightsDims;
    weightsPadDims[1] = getPadded<K>(weightsDims[1]); // IC
    weightsPadDims[0] = getPadded<K>(weightsDims[0]); // OC
    assert(srcDims[1] == weightsPadDims[1]); // srcDims[C] == weightsPadDims[IC]
    auto weightsPad = allocTensor(weightsPadDims, memory::format_tag::oihw);
    WeightsReorderNode<K>(userWeights, weightsPad).execute(sm);

    // Get the biases
    const auto& b = weightMap[name + "/b"];
    if (b.ndims() != 1)
      throw Exception(Error::InvalidOperation, "invalid convolution biases");
    memory::dims biasDims = b.dims;

    // Copy/pad the biases
    memory::dims biasPadDims = {getPadded<K>(biasDims[0])};
    auto bias = allocTensor(biasPadDims);
    if (biasDims[0] != biasPadDims[0])
      memset(bias->get_data_handle(), 0, biasPadDims[0]*sizeof(float));
    memcpy(bias->get_data_handle(), b.data, biasDims[0]*sizeof(float));

    // Allocate memory for destination
    memory::dims dstDims = srcDims;
    dstDims[1] = weightsPadDims[0]; // dstDims[C] = weightsPadDims[OC]

    std::shared_ptr<memory> dst;
    if (!userDst)
      dst = allocTensor(dstDims);
    else if (getTensorDims(userDst) == dstDims)
      dst = userDst;
    else
      dst = castTensor(dstDims, userDst);

    // Create a convolution
    // Let the convolution primitive choose the weights format
    auto weightsDesc = memory::desc({ weightsPadDims }, memory::data_type::f32, memory::format_tag::any);

    auto convAlgo = (K == 16) ? convolution_winograd : convolution_direct;
    auto convDesc = convolution_forward::desc(
      prop_kind::forward_inference, convAlgo,
      src->get_desc(),
      weightsDesc,
      bias->get_desc(),
      dst->get_desc(),
      strides, padding, padding, padding_kind::zero);

    // Incorporate relu
    mkldnn::primitive_attr convAttr;
    if (relu)
    {
      mkldnn::post_ops ops;
      ops.append_eltwise(
        1.f,                     // scale factor, not used
        algorithm::eltwise_relu,
        0.f,                     // alpha: negative slope, 0 = clamp at 0 (max with 0)
        0.f                      // beta: unused for ReLU
      );
      convAttr.set_post_ops(ops);
    }
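    // Use a user-managed scratchpad: a single shared buffer is allocated in finalize()
    // and assigned to all nodes instead of each primitive allocating its own.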
    convAttr.set_scratchpad_mode(scratchpad_mode_user);

    auto convPrimDesc = convolution_forward::primitive_desc(convDesc, convAttr, eng);

    // Reorder the weights to the final format, if necessary
    auto weights = weightsPad;
    if (convPrimDesc.weights_desc() != weightsPad->get_desc())
    {
      weights = std::make_shared<memory>(convPrimDesc.weights_desc(), eng);
      ReorderNode(weightsPad, weights).execute(sm);
    }

    // Create convolution node and add it to the net
    auto node = std::make_shared<ConvNode>(convPrimDesc, src, weights, bias, dst);
    nodes.push_back(node);
    return node;
  }

  template<int K>
  memory::dims Network<K>::getPoolDims(const memory::dims& srcDims)
  {
    memory::dims dstDims = srcDims;
    dstDims[2] /= 2; // H/2
    dstDims[3] /= 2; // W/2
    return dstDims;
  }

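  // Adds a 2x2 max pooling node which halves the spatial resolution.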
  template<int K>
  std::shared_ptr<Node> Network<K>::addPool(const std::shared_ptr<memory>& src,
                                            const std::shared_ptr<memory>& userDst)
  {
    const memory::dims kernel  = {2, 2};
    const memory::dims strides = {2, 2};
    const memory::dims padding = {0, 0};

    memory::dims srcDims = getTensorDims(src);
    memory::dims dstDims = getPoolDims(srcDims);

    std::shared_ptr<memory> dst;
    if (!userDst)
      dst = allocTensor(dstDims);
    else if (getTensorDims(userDst) == dstDims)
      dst = userDst;
    else
      dst = castTensor(dstDims, userDst);

    auto poolDesc = pooling_forward::desc(
      prop_kind::forward_inference, pooling_max,
      src->get_desc(),
      dst->get_desc(),
      strides, kernel, padding, padding, padding_kind::zero);

    mkldnn::primitive_attr poolAttr;
    poolAttr.set_scratchpad_mode(scratchpad_mode_user);

    auto poolPrimDesc = pooling_forward::primitive_desc(poolDesc, poolAttr, eng);

    auto node = std::make_shared<PoolNode>(poolPrimDesc, src, dst);
    nodes.push_back(node);
    return node;
  }

  template<int K>
  memory::dims Network<K>::getUpsampleDims(const memory::dims& srcDims)
  {
    memory::dims dstDims = srcDims;
    dstDims[2] *= 2; // H*2
    dstDims[3] *= 2; // W*2
    return dstDims;
  }

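  // Adds a node which upsamples the input by 2x in both spatial dimensions.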
  template<int K>
  std::shared_ptr<Node> Network<K>::addUpsample(const std::shared_ptr<memory>& src,
                                                const std::shared_ptr<memory>& userDst)
  {
    memory::dims srcDims = getTensorDims(src);
    memory::dims dstDims = getUpsampleDims(srcDims);

    std::shared_ptr<memory> dst;
    if (!userDst)
      dst = allocTensor(dstDims);
    else if (getTensorDims(userDst) == dstDims)
      dst = userDst;
    else
      dst = castTensor(dstDims, userDst);

    // Create upsampling node and add it to net
    auto node = std::make_shared<UpsampleNode<K>>(src, dst);
    nodes.push_back(node);
    return node;
  }

  template<int K>
  memory::dims Network<K>::getConcatDims(const memory::dims& src1Dims, const memory::dims& src2Dims)
  {
    assert(src1Dims[0] == src2Dims[0]); // N
    assert(src1Dims[2] == src2Dims[2]); // H
    assert(src1Dims[3] == src2Dims[3]); // W

    memory::dims dstDims = src1Dims;
    dstDims[1] += src2Dims[1]; // C
    return dstDims;
  }

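  // Adds a node which computes an autoexposure value from the color image for the
  // given HDR transfer function.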
  template<int K>
  std::shared_ptr<Node> Network<K>::addAutoexposure(const Image& color,
                                                    const std::shared_ptr<HDRTransferFunction>& transferFunc)
  {
    auto node = std::make_shared<AutoexposureNode>(color, transferFunc);
    nodes.push_back(node);
    return node;
  }

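  // Finalizes the network: allocates a single scratchpad sized for the largest
  // per-node requirement, assigns it to all nodes, and releases the weight map.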
  template<int K>
  void Network<K>::finalize()
  {
    // Compute the size of the scratchpad
    size_t scratchpadSize = 0;
    for (const auto& node : nodes)
      scratchpadSize = max(scratchpadSize, node->getScratchpadSize());

    // Allocate the scratchpad
    memory::dims scratchpadDims = { memory::dim(scratchpadSize) };
    memory::desc scratchpadDesc(scratchpadDims, memory::data_type::u8, memory::format_tag::x);
    auto scratchpad = std::make_shared<memory>(scratchpadDesc, eng);
    activationAllocBytes += scratchpadSize;
    totalAllocBytes += scratchpadSize;

    // Set the scratchpad for the nodes
    for (auto& node : nodes)
      node->setScratchpad(scratchpad);

    // Free the weights
    weightMap.clear();

    // Print statistics
    if (device->isVerbose(2))
    {
      std::cout << "Activation bytes: " << activationAllocBytes << std::endl;
      std::cout << "Scratchpad bytes: " << scratchpadSize << std::endl;
      std::cout << "Total bytes     : " << totalAllocBytes << std::endl;
    }
  }

  template class Network<8>;
  template class Network<16>;

} // namespace oidn