// ======================================================================== //
// Copyright 2009-2019 Intel Corporation                                    //
//                                                                          //
// Licensed under the Apache License, Version 2.0 (the "License");          //
// you may not use this file except in compliance with the License.         //
// You may obtain a copy of the License at                                  //
//                                                                          //
//     http://www.apache.org/licenses/LICENSE-2.0                           //
//                                                                          //
// Unless required by applicable law or agreed to in writing, software      //
// distributed under the License is distributed on an "AS IS" BASIS,        //
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
// See the License for the specific language governing permissions and      //
// limitations under the License.                                           //
// ======================================================================== //

#include "upsample.h"
#include "weights_reorder.h"
#include "network.h"
// -- GODOT start --
#include <cstring>
// -- GODOT end --

namespace oidn {

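  // The Network class owns the MKL-DNN engine and stream, the parsed weight
  // tensors, and the ordered list of nodes forming the inference graph.
  //
  // Rough usage sketch (illustrative only; the real graph is assembled by the
  // filter implementation, and the getDst() accessor and layer name below are
  // assumptions, not part of this file):
  //
  //   Network<K> net(device, weightMap);
  //   auto input = net.addInputReorder(color, albedo, normal, transferFunc, alignment);
  //   auto conv  = net.addConv("conv1", input->getDst());
  //   ...                                   // further conv/pool/upsample layers
  //   net.addOutputReorder(conv->getDst(), transferFunc, output);
  //   net.finalize();
  //   net.execute(progress, 0);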
  template<int K>
  Network<K>::Network(const Ref<Device>& device, const std::map<std::string, Tensor>& weightMap)
    : device(device),
      eng(engine::kind::cpu, 0),
      sm(eng),
      weightMap(weightMap)
  {
  }

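  // Runs all nodes in order. The progress callback, if set, is invoked before the
  // first node and after each node; returning false from it cancels execution.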
  template<int K>
  void Network<K>::execute(const Progress& progress, int taskIndex)
  {
    if (progress.func)
    {
      const double value = double(taskIndex) / double(progress.taskCount);
      if (!progress.func(progress.userPtr, value))
        throw Exception(Error::Cancelled, "execution was cancelled");
    }

    for (size_t i = 0; i < nodes.size(); ++i)
    {
      nodes[i]->execute(sm);

      if (progress.func)
      {
        const double value = (double(taskIndex) + double(i+1) / double(nodes.size())) / double(progress.taskCount);
        if (!progress.func(progress.userPtr, value))
          throw Exception(Error::Cancelled, "execution was cancelled");
      }
    }
  }

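  // Allocates an f32 tensor with the given dimensions, or wraps user-provided data
  // without copying. With format_tag::any, 4D tensors get the blocked nChwKc layout
  // and 1D tensors the plain x layout. Only freshly allocated memory is counted in
  // the allocation statistics.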
  template<int K>
  std::shared_ptr<memory> Network<K>::allocTensor(const memory::dims& dims,
                                                  memory::format_tag format,
                                                  void* data)
  {
    if (format == memory::format_tag::any)
    {
      if (dims.size() == 4)
        format = BlockedFormat<K>::nChwKc;
      else if (dims.size() == 1)
        format = memory::format_tag::x;
      else
        assert(0);
    }
    memory::desc desc(dims, memory::data_type::f32, format);
    if (data == nullptr)
    {
      const size_t bytes = getTensorSize(dims) * sizeof(float);
      if (format == BlockedFormat<K>::nChwKc)
        activationAllocBytes += bytes;
      totalAllocBytes += bytes;

      return std::make_shared<memory>(desc, eng);
    }
    else
    {
      return std::make_shared<memory>(desc, eng, data);
    }
  }

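  // Returns a tensor that aliases a sub-range of an existing f32 tensor, starting
  // at the given element offset; no data is copied.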
  template<int K>
  std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
                                                 const std::shared_ptr<memory>& src,
                                                 size_t srcOffset,
                                                 memory::format_tag format)
  {
    const mkldnn_memory_desc_t& srcDesc = src->get_desc().data;
    MAYBE_UNUSED(srcDesc);
    assert(srcDesc.data_type == memory::data_type::f32);
    assert(getTensorSize(src) >= srcOffset + getTensorSize(dims));

    if (format == memory::format_tag::any)
    {
      if (dims.size() == 4)
        format = BlockedFormat<K>::nChwKc;
      else if (dims.size() == 1)
        format = memory::format_tag::x;
      else
        assert(0);
    }
    memory::desc desc(dims, memory::data_type::f32, format);
    float* srcPtr = (float*)src->get_data_handle() + srcOffset;
    return std::make_shared<memory>(desc, eng, srcPtr);
  }

  template<int K>
  std::shared_ptr<memory> Network<K>::castTensor(const memory::dims& dims,
                                                 const std::shared_ptr<memory>& src,
                                                 const memory::dims& srcOffset)
  {
    return castTensor(dims, src, getTensorSize(srcOffset));
  }

  template<int K>
  void Network<K>::zeroTensor(const std::shared_ptr<memory>& dst)
  {
    assert(getTensorType(dst) == memory::data_type::f32);
    memset(dst->get_data_handle(), 0, getTensorSize(dst)*sizeof(float));
  }

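  // Computes the padded input dimensions: C is rounded up to a multiple of K, and
  // H and W are rounded up to the given alignment.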
  template<int K>
  memory::dims Network<K>::getInputReorderDims(const memory::dims& srcDims, int alignment)
  {
    memory::dims dstDims = srcDims;
    dstDims[1] = getPadded<K>(srcDims[1]); // round up C
    dstDims[2] = roundUp(srcDims[2], memory::dim(alignment)); // round up H
    dstDims[3] = roundUp(srcDims[3], memory::dim(alignment)); // round up W
    return dstDims;
  }

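  // Adds the input reorder node, which packs the color/albedo/normal images
  // (3 channels each) into the padded, blocked input tensor, applying the given
  // transfer function.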
  template<int K>
  std::shared_ptr<Node> Network<K>::addInputReorder(const Image& color,
                                                    const Image& albedo,
                                                    const Image& normal,
                                                    const std::shared_ptr<TransferFunction>& transferFunc,
                                                    int alignment,
                                                    const std::shared_ptr<memory>& userDst)
  {
    assert(color);
    int inputC = 3;
    if (albedo) inputC += 3;
    if (normal) inputC += 3;

    memory::dims srcDims = {1, inputC, color.height, color.width};
    memory::dims dstDims = getInputReorderDims(srcDims, alignment);

    // Allocate padded memory
    auto dst = userDst;
    if (!dst)
      dst = allocTensor(dstDims);

    // Push node
    std::shared_ptr<Node> node;

    if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
      node = std::make_shared<InputReorderNode<K, LinearTransferFunction>>(color, albedo, normal, dst, tf);
    else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
      node = std::make_shared<InputReorderNode<K, GammaTransferFunction>>(color, albedo, normal, dst, tf);
    else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
      node = std::make_shared<InputReorderNode<K, LogTransferFunction>>(color, albedo, normal, dst, tf);
    else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
      node = std::make_shared<InputReorderNode<K, PQXTransferFunction>>(color, albedo, normal, dst, tf);
    else
      assert(0);

    nodes.push_back(node);
    return node;
  }

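  // Adds the output reorder node, which converts the K-channel blocked result back
  // to the output image through the given transfer function.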
  template<int K>
  std::shared_ptr<Node> Network<K>::addOutputReorder(const std::shared_ptr<memory>& src,
                                                     const std::shared_ptr<TransferFunction>& transferFunc,
                                                     const Image& output)
  {
    memory::dims srcDims = getTensorDims(src);
    assert(srcDims[1] == K);

    // Push node
    std::shared_ptr<Node> node;

    if (auto tf = std::dynamic_pointer_cast<LinearTransferFunction>(transferFunc))
      node = std::make_shared<OutputReorderNode<K, LinearTransferFunction>>(src, output, tf);
    else if (auto tf = std::dynamic_pointer_cast<GammaTransferFunction>(transferFunc))
      node = std::make_shared<OutputReorderNode<K, GammaTransferFunction>>(src, output, tf);
    else if (auto tf = std::dynamic_pointer_cast<LogTransferFunction>(transferFunc))
      node = std::make_shared<OutputReorderNode<K, LogTransferFunction>>(src, output, tf);
    else if (auto tf = std::dynamic_pointer_cast<PQXTransferFunction>(transferFunc))
      node = std::make_shared<OutputReorderNode<K, PQXTransferFunction>>(src, output, tf);
    else
      assert(0);

    nodes.push_back(node);
    return node;
  }

  template<int K>
  memory::dims Network<K>::getConvDims(const std::string& name, const memory::dims& srcDims)
  {
    auto b = weightMap[name + "/b"];
    memory::dims dstDims = srcDims;
    dstDims[1] = getPadded<K>(b.dims[0]); // dstDims[C] = getPadded(OC)
    return dstDims;
  }

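  // Adds a convolution layer (stride 1, spatial padding 1) with an optionally fused
  // ReLU. The weights and biases from the weight map are padded to multiples of K
  // channels, and the weights are reordered to whatever format the convolution
  // primitive prefers.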
  template<int K>
  std::shared_ptr<Node> Network<K>::addConv(const std::string& name,
                                            const std::shared_ptr<memory>& src,
                                            const std::shared_ptr<memory>& userDst,
                                            bool relu)
  {
    const memory::dims strides = {1, 1};
    const memory::dims padding = {1, 1};

    memory::dims srcDims = getTensorDims(src);

    // Get the weights
    const auto& W = weightMap[name + "/W"];
    if (W.ndims() != 4 || W.format != "oihw")
      throw Exception(Error::InvalidOperation, "invalid convolution weights");
    memory::dims weightsDims = W.dims;
    auto userWeights = allocTensor(weightsDims, memory::format_tag::oihw, W.data);

    // Pad the weights
    memory::dims weightsPadDims = weightsDims;
    weightsPadDims[1] = getPadded<K>(weightsDims[1]); // IC
    weightsPadDims[0] = getPadded<K>(weightsDims[0]); // OC
    assert(srcDims[1] == weightsPadDims[1]); // srcDims[C] == weightsPadDims[IC]
    auto weightsPad = allocTensor(weightsPadDims, memory::format_tag::oihw);
    WeightsReorderNode<K>(userWeights, weightsPad).execute(sm);

    // Get the biases
    const auto& b = weightMap[name + "/b"];
    if (b.ndims() != 1)
      throw Exception(Error::InvalidOperation, "invalid convolution biases");
    memory::dims biasDims = b.dims;

    // Copy/pad the biases
    memory::dims biasPadDims = {getPadded<K>(biasDims[0])};
    auto bias = allocTensor(biasPadDims);
    if (biasDims[0] != biasPadDims[0])
      memset(bias->get_data_handle(), 0, biasPadDims[0]*sizeof(float));
    memcpy(bias->get_data_handle(), b.data, biasDims[0]*sizeof(float));

    // Allocate memory for destination
    memory::dims dstDims = srcDims;
    dstDims[1] = weightsPadDims[0]; // dstDims[C] = weightsPadDims[OC]

    std::shared_ptr<memory> dst;
    if (!userDst)
      dst = allocTensor(dstDims);
    else if (getTensorDims(userDst) == dstDims)
      dst = userDst;
    else
      dst = castTensor(dstDims, userDst);

    // Create a convolution
    // Let the convolution primitive choose the weights format
    auto weightsDesc = memory::desc({ weightsPadDims }, memory::data_type::f32, memory::format_tag::any);

    // Use Winograd convolution for the 16-wide kernels, direct convolution otherwise
    auto convAlgo = (K == 16) ? algorithm::convolution_winograd : algorithm::convolution_direct;
    auto convDesc = convolution_forward::desc(
      prop_kind::forward_inference, convAlgo,
      src->get_desc(),
      weightsDesc,
      bias->get_desc(),
      dst->get_desc(),
      strides, padding, padding);

    // Incorporate relu
    mkldnn::primitive_attr convAttr;
    if (relu)
    {
      mkldnn::post_ops ops;
      ops.append_eltwise(
        1.f,   // scale factor, not used
        algorithm::eltwise_relu,
        0.f,   // alpha: max with 0
        0.f    // beta: unused
      );
      convAttr.set_post_ops(ops);
    }
    convAttr.set_scratchpad_mode(scratchpad_mode::user);

    auto convPrimDesc = convolution_forward::primitive_desc(convDesc, convAttr, eng);

    // Reorder the weights to the final format, if necessary
    auto weights = weightsPad;
    if (convPrimDesc.weights_desc() != weightsPad->get_desc())
    {
      weights = std::make_shared<memory>(convPrimDesc.weights_desc(), eng);
      ReorderNode(weightsPad, weights).execute(sm);
    }

    // Create convolution node and add it to the net
    auto node = std::make_shared<ConvNode>(convPrimDesc, src, weights, bias, dst);
    nodes.push_back(node);
    return node;
  }

  template<int K>
  memory::dims Network<K>::getPoolDims(const memory::dims& srcDims)
  {
    memory::dims dstDims = srcDims;
    dstDims[2] /= 2; // H/2
    dstDims[3] /= 2; // W/2
    return dstDims;
  }

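  // Adds a 2x2 max pooling node with stride 2, halving the spatial resolution.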
  template<int K>
  std::shared_ptr<Node> Network<K>::addPool(const std::shared_ptr<memory>& src,
                                            const std::shared_ptr<memory>& userDst)
  {
    const memory::dims kernel  = {2, 2};
    const memory::dims strides = {2, 2};
    const memory::dims padding = {0, 0};

    memory::dims srcDims = getTensorDims(src);
    memory::dims dstDims = getPoolDims(srcDims);

    std::shared_ptr<memory> dst;
    if (!userDst)
      dst = allocTensor(dstDims);
    else if (getTensorDims(userDst) == dstDims)
      dst = userDst;
    else
      dst = castTensor(dstDims, userDst);

    auto poolDesc = pooling_forward::desc(
      prop_kind::forward_inference, algorithm::pooling_max,
      src->get_desc(),
      dst->get_desc(),
      strides, kernel, padding, padding);

    mkldnn::primitive_attr poolAttr;
    poolAttr.set_scratchpad_mode(scratchpad_mode::user);

    auto poolPrimDesc = pooling_forward::primitive_desc(poolDesc, poolAttr, eng);

    auto node = std::make_shared<PoolNode>(poolPrimDesc, src, dst);
    nodes.push_back(node);
    return node;
  }

  template<int K>
  memory::dims Network<K>::getUpsampleDims(const memory::dims& srcDims)
  {
    memory::dims dstDims = srcDims;
    dstDims[2] *= 2; // H*2
    dstDims[3] *= 2; // W*2
    return dstDims;
  }

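  // Adds a 2x upsampling node, doubling the spatial resolution.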
  template<int K>
  std::shared_ptr<Node> Network<K>::addUpsample(const std::shared_ptr<memory>& src,
                                                const std::shared_ptr<memory>& userDst)
  {
    memory::dims srcDims = getTensorDims(src);
    memory::dims dstDims = getUpsampleDims(srcDims);

    std::shared_ptr<memory> dst;
    if (!userDst)
      dst = allocTensor(dstDims);
    else if (getTensorDims(userDst) == dstDims)
      dst = userDst;
    else
      dst = castTensor(dstDims, userDst);

    // Create upsampling node and add it to net
    auto node = std::make_shared<UpsampleNode<K>>(src, dst);
    nodes.push_back(node);
    return node;
  }

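  // Computes the shape of two tensors concatenated along the channel dimension.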
  template<int K>
  memory::dims Network<K>::getConcatDims(const memory::dims& src1Dims, const memory::dims& src2Dims)
  {
    assert(src1Dims[0] == src2Dims[0]); // N
    assert(src1Dims[2] == src2Dims[2]); // H
    assert(src1Dims[3] == src2Dims[3]); // W

    memory::dims dstDims = src1Dims;
    dstDims[1] += src2Dims[1]; // C
    return dstDims;
  }

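  // Adds a node that estimates the exposure of the input color image for the HDR
  // transfer function.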
  template<int K>
  std::shared_ptr<Node> Network<K>::addAutoexposure(const Image& color,
                                                    const std::shared_ptr<HDRTransferFunction>& transferFunc)
  {
    auto node = std::make_shared<AutoexposureNode>(color, transferFunc);
    nodes.push_back(node);
    return node;
  }

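  // Finalizes the network: allocates a single scratchpad large enough for every
  // node, assigns it to all nodes, and releases the weight map.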
  template <int K>
  void Network<K>::finalize()
  {
    // Compute the size of the scratchpad
    size_t scratchpadSize = 0;
    for (const auto& node : nodes)
      scratchpadSize = max(scratchpadSize, node->getScratchpadSize());

    // Allocate the scratchpad
    memory::dims scratchpadDims = { memory::dim(scratchpadSize) };
    memory::desc scratchpadDesc(scratchpadDims, memory::data_type::u8, memory::format_tag::x);
    auto scratchpad = std::make_shared<memory>(scratchpadDesc, eng);
    activationAllocBytes += scratchpadSize;
    totalAllocBytes += scratchpadSize;

    // Set the scratchpad for the nodes
    for (auto& node : nodes)
      node->setScratchpad(scratchpad);

    // Free the weights
    weightMap.clear();

    // Print statistics
    if (device->isVerbose(2))
    {
      std::cout << "Activation bytes: " << activationAllocBytes << std::endl;
      std::cout << "Scratchpad bytes: " << scratchpadSize << std::endl;
      std::cout << "Total bytes     : " << totalAllocBytes << std::endl;
    }
  }

  template class Network<8>;
  template class Network<16>;

} // namespace oidn