1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "parallel_for.h" |
7 | |
8 | namespace embree |
9 | { |
10 | template<typename ArrayArray, typename Func> |
11 | __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) |
12 | { |
13 | size_t k=0; |
14 | for (size_t i=0; i!=array2.size(); ++i) { |
15 | const size_t N = array2[i]->size(); |
16 | if (N) func(array2[i],range<size_t>(0,N),k); |
17 | k+=N; |
18 | } |
19 | } |
20 | |
21 | class ParallelForForState |
22 | { |
23 | public: |
24 | |
25 | enum { MAX_TASKS = 64 }; |
26 | |
27 | __forceinline ParallelForForState () |
28 | : taskCount(0) {} |
29 | |
30 | template<typename ArrayArray> |
31 | __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { |
32 | init(array2,minStepSize); |
33 | } |
34 | |
35 | template<typename SizeFunc> |
36 | __forceinline ParallelForForState (const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize) { |
37 | init(numArrays,getSize,minStepSize); |
38 | } |
39 | |
40 | template<typename SizeFunc> |
41 | __forceinline void init ( const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize ) |
42 | { |
43 | /* first calculate total number of elements */ |
44 | size_t N = 0; |
45 | for (size_t i=0; i<numArrays; i++) { |
46 | N += getSize(i); |
47 | } |
48 | this->N = N; |
49 | |
50 | /* calculate number of tasks to use */ |
51 | const size_t numThreads = TaskScheduler::threadCount(); |
52 | const size_t numBlocks = (N+minStepSize-1)/minStepSize; |
53 | taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS))); |
54 | |
55 | /* calculate start (i,j) for each task */ |
56 | size_t taskIndex = 0; |
57 | i0[taskIndex] = 0; |
58 | j0[taskIndex] = 0; |
59 | size_t k0 = (++taskIndex)*N/taskCount; |
60 | for (size_t i=0, k=0; taskIndex < taskCount; i++) |
61 | { |
62 | assert(i<numArrays); |
63 | size_t j=0, M = getSize(i); |
64 | while (j<M && k+M-j >= k0 && taskIndex < taskCount) { |
65 | assert(taskIndex<taskCount); |
66 | i0[taskIndex] = i; |
67 | j0[taskIndex] = j += k0-k; |
68 | k=k0; |
69 | k0 = (++taskIndex)*N/taskCount; |
70 | } |
71 | k+=M-j; |
72 | } |
73 | } |
74 | |
75 | template<typename ArrayArray> |
76 | __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) |
77 | { |
78 | init(array2.size(),[&](size_t i) { return array2[i] ? array2[i]->size() : 0; },minStepSize); |
79 | } |
80 | |
81 | __forceinline size_t size() const { |
82 | return N; |
83 | } |
84 | |
85 | public: |
86 | size_t i0[MAX_TASKS]; |
87 | size_t j0[MAX_TASKS]; |
88 | size_t taskCount; |
89 | size_t N; |
90 | }; |
91 | |
92 | template<typename ArrayArray, typename Func> |
93 | __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) |
94 | { |
95 | ParallelForForState state(array2,minStepSize); |
96 | |
97 | parallel_for(state.taskCount, [&](const size_t taskIndex) |
98 | { |
99 | /* calculate range */ |
100 | const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; |
101 | const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; |
102 | size_t i0 = state.i0[taskIndex]; |
103 | size_t j0 = state.j0[taskIndex]; |
104 | |
105 | /* iterate over arrays */ |
106 | size_t k=k0; |
107 | for (size_t i=i0; k<k1; i++) { |
108 | const size_t N = array2[i] ? array2[i]->size() : 0; |
109 | const size_t r0 = j0, r1 = min(N,r0+k1-k); |
110 | if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k); |
111 | k+=r1-r0; j0 = 0; |
112 | } |
113 | }); |
114 | } |
115 | |
116 | template<typename ArrayArray, typename Func> |
117 | __forceinline void parallel_for_for( ArrayArray& array2, const Func& func ) |
118 | { |
119 | parallel_for_for(array2,1,func); |
120 | } |
121 | |
122 | template<typename ArrayArray, typename Value, typename Func, typename Reduction> |
123 | __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) |
124 | { |
125 | ParallelForForState state(array2,minStepSize); |
126 | Value temp[ParallelForForState::MAX_TASKS]; |
127 | |
128 | for (size_t i=0; i<state.taskCount; i++) |
129 | temp[i] = identity; |
130 | |
131 | parallel_for(state.taskCount, [&](const size_t taskIndex) |
132 | { |
133 | /* calculate range */ |
134 | const size_t k0 = (taskIndex+0)*state.size()/state.taskCount; |
135 | const size_t k1 = (taskIndex+1)*state.size()/state.taskCount; |
136 | size_t i0 = state.i0[taskIndex]; |
137 | size_t j0 = state.j0[taskIndex]; |
138 | |
139 | /* iterate over arrays */ |
140 | size_t k=k0; |
141 | for (size_t i=i0; k<k1; i++) { |
142 | const size_t N = array2[i] ? array2[i]->size() : 0; |
143 | const size_t r0 = j0, r1 = min(N,r0+k1-k); |
144 | if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k)); |
145 | k+=r1-r0; j0 = 0; |
146 | } |
147 | }); |
148 | |
149 | Value ret = identity; |
150 | for (size_t i=0; i<state.taskCount; i++) |
151 | ret = reduction(ret,temp[i]); |
152 | return ret; |
153 | } |
154 | |
155 | template<typename ArrayArray, typename Value, typename Func, typename Reduction> |
156 | __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) |
157 | { |
158 | return parallel_for_for_reduce(array2,1,identity,func,reduction); |
159 | } |
160 | } |
161 | |