| 1 | // Copyright 2009-2021 Intel Corporation |
| 2 | // SPDX-License-Identifier: Apache-2.0 |
| 3 | |
| 4 | #pragma once |
| 5 | |
| 6 | #include "parallel_for.h" |
| 7 | |
| 8 | namespace embree |
| 9 | { |
| 10 | template<typename Value> |
| 11 | struct ParallelPrefixSumState |
| 12 | { |
| 13 | enum { MAX_TASKS = 64 }; |
| 14 | Value counts[MAX_TASKS]; |
| 15 | Value sums [MAX_TASKS]; |
| 16 | }; |
| 17 | |
| 18 | template<typename Index, typename Value, typename Func, typename Reduction> |
| 19 | __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) |
| 20 | { |
| 21 | /* calculate number of tasks to use */ |
| 22 | const size_t numThreads = TaskScheduler::threadCount(); |
| 23 | const size_t numBlocks = (last-first+minStepSize-1)/minStepSize; |
| 24 | const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS)); |
| 25 | |
| 26 | /* perform parallel prefix sum */ |
| 27 | parallel_for(taskCount, [&](const size_t taskIndex) |
| 28 | { |
| 29 | const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; |
| 30 | const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; |
| 31 | state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]); |
| 32 | }); |
| 33 | |
| 34 | /* calculate prefix sum */ |
| 35 | Value sum=identity; |
| 36 | for (size_t i=0; i<taskCount; i++) |
| 37 | { |
| 38 | const Value c = state.counts[i]; |
| 39 | state.sums[i] = sum; |
| 40 | sum=reduction(sum,c); |
| 41 | } |
| 42 | |
| 43 | return sum; |
| 44 | } |
| 45 | |
| 46 | /*! parallel calculation of prefix sums */ |
| 47 | template<typename SrcArray, typename DstArray, typename Value, typename Add> |
| 48 | __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) |
| 49 | { |
| 50 | /* perform single threaded prefix operation for small N */ |
| 51 | if (N < SINGLE_THREAD_THRESHOLD) |
| 52 | { |
| 53 | Value sum=identity; |
| 54 | for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum; |
| 55 | return sum; |
| 56 | } |
| 57 | |
| 58 | /* perform parallel prefix operation for large N */ |
| 59 | else |
| 60 | { |
| 61 | ParallelPrefixSumState<Value> state; |
| 62 | |
| 63 | /* initial run just sets up start values for subtasks */ |
| 64 | parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { |
| 65 | |
| 66 | Value s = identity; |
| 67 | for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]); |
| 68 | return s; |
| 69 | |
| 70 | }, add); |
| 71 | |
| 72 | /* final run calculates prefix sum */ |
| 73 | return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { |
| 74 | |
| 75 | Value s = identity; |
| 76 | for (size_t i=r.begin(); i<r.end(); i++) { |
| 77 | dst[i] = add(sum,s); |
| 78 | s = add(s,src[i]); |
| 79 | } |
| 80 | return s; |
| 81 | |
| 82 | }, add); |
| 83 | } |
| 84 | } |
| 85 | } |
| 86 | |