// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "parallel_for.h"
#include "../math/range.h"

namespace embree
{
  /* serial in-place partitioning with user-supplied reductions over the left and right halves */
  template<typename T, typename V, typename IsLeft, typename Reduction_T>
  __forceinline size_t serial_partitioning(T* array,
                                           const size_t begin,
                                           const size_t end,
                                           V& leftReduction,
                                           V& rightReduction,
                                           const IsLeft& is_left,
                                           const Reduction_T& reduction_t)
  {
    T* l = array + begin;
    T* r = array + end - 1;

    while(1)
    {
      /* advance l over elements that already belong to the left side (*l < pivot) */
      while (likely(l <= r && is_left(*l)))
      {
        //prefetchw(l+4); // FIXME: enable?
        reduction_t(leftReduction,*l);
        ++l;
      }
      /* retreat r over elements that already belong to the right side (*r >= pivot) */
      while (likely(l <= r && !is_left(*r)))
      {
        //prefetchw(r-4); // FIXME: enable?
        reduction_t(rightReduction,*r);
        --r;
      }
      if (r<l) break;

      /* both elements are misplaced: fold them into the reductions and swap */
      reduction_t(leftReduction ,*r);
      reduction_t(rightReduction,*l);
      xchg(*l,*r);
      l++; r--;
    }

    return l - array;
  }

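  /* Usage sketch (hypothetical example, not part of this header): partition a small
     integer array around a pivot while summing both halves. The array, the pivot and
     the sum reductions are illustrative assumptions only.

       int data[8] = { 5, 1, 7, 3, 9, 2, 8, 4 };
       size_t leftSum  = 0;
       size_t rightSum = 0;
       const int pivot = 5;
       const size_t mid = serial_partitioning(data, 0, 8, leftSum, rightSum,
         [&] (const int& v) { return v < pivot; },               // is_left predicate
         []  (size_t& sum, const int& v) { sum += (size_t)v; }); // fold one item into a sum

     Afterwards data[0..mid) holds the values for which is_left returned true (mid == 4
     for this data) and data[mid..8) the remaining ones; leftSum/rightSum hold the
     corresponding sums. */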
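  /* helper task for partitioning an array in parallel: every task first partitions its
     own subrange serially, then the blocks that ended up on the wrong side of the global
     split position are swapped into place in parallel */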
  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  class __aligned(64) parallel_partition_task
  {
    ALIGNED_CLASS_(64);
  private:

    static const size_t MAX_TASKS = 64;

    T* array;
    size_t N;
    const IsLeft& is_left;
    const Reduction_T& reduction_t;
    const Reduction_V& reduction_v;
    const Vi& identity;

    size_t numTasks;
    __aligned(64) size_t counter_start[MAX_TASKS+1];
    __aligned(64) size_t counter_left[MAX_TASKS+1];
    __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
    __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
    __aligned(64) V leftReductions[MAX_TASKS];
    __aligned(64) V rightReductions[MAX_TASKS];

  public:

    __forceinline parallel_partition_task(T* array,
                                          const size_t N,
                                          const Vi& identity,
                                          const IsLeft& is_left,
                                          const Reduction_T& reduction_t,
                                          const Reduction_V& reduction_v,
                                          const size_t BLOCK_SIZE)
      : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
        numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}

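    /* maps a global item index into the sequence of ranges r[0..numRanges): returns the
       range that contains the item and rewrites index to be local to that range */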
    __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
    {
      size_t i = 0;
      while(index >= (size_t)r[i].size())
      {
        assert(i < numRanges);
        index -= (size_t)r[i].size();
        i++;
      }
      return &r[i];
    }

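    /* swaps the items [startID,endID) of the concatenated left misplaced ranges with the
       corresponding items of the concatenated right misplaced ranges */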
    __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
                                                  const size_t numRightMisplacedRanges,
                                                  const size_t startID,
                                                  const size_t endID)
    {
      size_t leftLocalIndex  = startID;
      size_t rightLocalIndex = startID;
      const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
      const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);

      size_t l_left = l_range->size() - leftLocalIndex;
      size_t r_left = r_range->size() - rightLocalIndex;
      T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
      T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
      size_t size = endID - startID;
      size_t items = min(size,min(l_left,r_left));

      while (size)
      {
        if (unlikely(l_left == 0))
        {
          l_range++;
          l_left = l_range->size();
          l = &array[l_range->begin()];
          items = min(size,min(l_left,r_left));
        }

        if (unlikely(r_left == 0))
        {
          r_range++;
          r_left = r_range->size();
          r = &array[r_range->begin()];
          items = min(size,min(l_left,r_left));
        }

        size   -= items;
        l_left -= items;
        r_left -= items;

        while(items) {
          items--;
          xchg(*l++,*r++);
        }
      }
    }

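    /* performs the parallel partitioning: phase 1 partitions and reduces each task's
       subrange, phase 2 merges the per-task reductions and computes the global split
       position, phase 3 swaps the misplaced blocks across that split in parallel */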
    __forceinline size_t partition(V& leftReduction, V& rightReduction)
    {
      /* partition the individual subranges of each task */
      parallel_for(numTasks,[&] (const size_t taskID) {
          const size_t startID = (taskID+0)*N/numTasks;
          const size_t endID   = (taskID+1)*N/numTasks;
          V local_left (identity);
          V local_right(identity);
          const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
          counter_start[taskID] = startID;
          counter_left [taskID] = mid-startID;
          leftReductions [taskID] = local_left;
          rightReductions[taskID] = local_right;
        });
      counter_start[numTasks] = N;
      counter_left [numTasks] = 0;

      /* merge the per-task reductions */
      for (size_t i=0; i<numTasks; i++) {
        reduction_v(leftReduction,leftReductions[i]);
        reduction_v(rightReduction,rightReductions[i]);
      }

      /* calculate the global split position (total number of left items) */
      size_t mid = counter_left[0];
      for (size_t i=1; i<numTasks; i++)
        mid += counter_left[i];
      const range<ssize_t> globalLeft (0,mid);
      const range<ssize_t> globalRight(mid,N);

      /* calculate all left and right ranges that are on the wrong global side */
      size_t numMisplacedRangesLeft  = 0;
      size_t numMisplacedRangesRight = 0;
      size_t numMisplacedItemsLeft   = 0;
      size_t numMisplacedItemsRight  = 0;

      for (size_t i=0; i<numTasks; i++)
      {
        const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
        const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
        const range<ssize_t> left_misplaced  = globalLeft .intersect(right_range);
        const range<ssize_t> right_misplaced = globalRight.intersect(left_range);

        if (!left_misplaced.empty())
        {
          numMisplacedItemsLeft += left_misplaced.size();
          leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
        }

        if (!right_misplaced.empty())
        {
          numMisplacedItemsRight += right_misplaced.size();
          rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
        }
      }
      assert( numMisplacedItemsLeft == numMisplacedItemsRight );

      /* if no items are misplaced we are done */
      if (numMisplacedItemsLeft == 0)
        return mid;

      /* otherwise swap the misplaced items into place in parallel */
      parallel_for(numTasks,[&] (const size_t taskID) {
          const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
          const size_t endID   = (taskID+1)*numMisplacedItemsLeft/numTasks;
          swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
        });

      return mid;
    }
  };

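  /* parallel partitioning of the range [begin,end) of array: also computes a reduction
     over both resulting sides (reduction_t folds single items, reduction_v merges
     per-task results, identity initializes them); returns the split position */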
  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  __noinline size_t parallel_partitioning(T* array,
                                          const size_t begin,
                                          const size_t end,
                                          const Vi &identity,
                                          V &leftReduction,
                                          V &rightReduction,
                                          const IsLeft& is_left,
                                          const Reduction_T& reduction_t,
                                          const Reduction_V& reduction_v,
                                          size_t BLOCK_SIZE = 128)
  {
    /* fall back to single-threaded partitioning for small N */
    if (unlikely(end-begin < BLOCK_SIZE))
      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);

    /* otherwise use parallel code */
    else {
      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
      return begin+p->partition(leftReduction,rightReduction);
    }
  }

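  /* same as above, but with a separate PARALLEL_THRESHOLD that controls when to fall
     back to the serial code, independent of the per-task BLOCK_SIZE */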
  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
  __noinline size_t parallel_partitioning(T* array,
                                          const size_t begin,
                                          const size_t end,
                                          const Vi &identity,
                                          V &leftReduction,
                                          V &rightReduction,
                                          const IsLeft& is_left,
                                          const Reduction_T& reduction_t,
                                          const Reduction_V& reduction_v,
                                          size_t BLOCK_SIZE,
                                          size_t PARALLEL_THRESHOLD)
  {
    /* fall back to single-threaded partitioning for small N */
    if (unlikely(end-begin < PARALLEL_THRESHOLD))
      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);

    /* otherwise use parallel code */
    else {
      typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
      std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
      return begin+p->partition(leftReduction,rightReduction);
    }
  }

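  /* convenience overload without reductions: only partitions [begin,end) of array by is_left */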
  template<typename T, typename IsLeft>
  inline size_t parallel_partitioning(T* array,
                                      const size_t begin,
                                      const size_t end,
                                      const IsLeft& is_left,
                                      size_t BLOCK_SIZE = 128)
  {
    size_t leftReduction = 0;
    size_t rightReduction = 0;
    return parallel_partitioning(
      array,begin,end,0,leftReduction,rightReduction,is_left,
      [] (size_t& t,const T& ref) { },
      [] (size_t& t0,size_t& t1) { },
      BLOCK_SIZE);
  }
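
  /* Usage sketch (hypothetical example, not part of this header): move all negative
     values to the front of a buffer in parallel while counting both sides. The buffer,
     the predicate and the counter reductions are illustrative assumptions only.

       std::vector<float> values = { 0.5f, -1.0f, 2.0f, -3.5f, 4.0f, -0.25f };
       size_t numLeft = 0, numRight = 0;
       const size_t split = parallel_partitioning(
         values.data(), 0, values.size(),
         size_t(0), numLeft, numRight,                                  // identity and reduction targets
         [] (const float& v) { return v < 0.0f; },                      // is_left predicate
         [] (size_t& count, const float& v) { count++; },               // fold one item into a task-local count
         [] (size_t& count, const size_t& other) { count += other; });  // merge two task-local counts

     Afterwards values[0..split) contains the negative entries and numLeft == split;
     for inputs smaller than BLOCK_SIZE the call falls back to serial_partitioning. */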

}