1// Copyright 2009-2021 Intel Corporation
2// SPDX-License-Identifier: Apache-2.0
3
4#pragma once
5
6#include "parallel_for.h"
7
8namespace embree
9{
10 template<typename Value>
11 struct ParallelPrefixSumState
12 {
13 enum { MAX_TASKS = 64 };
14 Value counts[MAX_TASKS];
15 Value sums [MAX_TASKS];
16 };
17
18 template<typename Index, typename Value, typename Func, typename Reduction>
19 __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
20 {
21 /* calculate number of tasks to use */
22 const size_t numThreads = TaskScheduler::threadCount();
23 const size_t numBlocks = (last-first+minStepSize-1)/minStepSize;
24 const size_t taskCount = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
25
26 /* perform parallel prefix sum */
27 parallel_for(taskCount, [&](const size_t taskIndex)
28 {
29 const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount;
30 const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount;
31 state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]);
32 });
33
34 /* calculate prefix sum */
35 Value sum=identity;
36 for (size_t i=0; i<taskCount; i++)
37 {
38 const Value c = state.counts[i];
39 state.sums[i] = sum;
40 sum=reduction(sum,c);
41 }
42
43 return sum;
44 }
45
46 /*! parallel calculation of prefix sums */
47 template<typename SrcArray, typename DstArray, typename Value, typename Add>
48 __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)
49 {
50 /* perform single threaded prefix operation for small N */
51 if (N < SINGLE_THREAD_THRESHOLD)
52 {
53 Value sum=identity;
54 for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum;
55 return sum;
56 }
57
58 /* perform parallel prefix operation for large N */
59 else
60 {
61 ParallelPrefixSumState<Value> state;
62
63 /* initial run just sets up start values for subtasks */
64 parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
65
66 Value s = identity;
67 for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
68 return s;
69
70 }, add);
71
72 /* final run calculates prefix sum */
73 return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value {
74
75 Value s = identity;
76 for (size_t i=r.begin(); i<r.end(); i++) {
77 dst[i] = add(sum,s);
78 s = add(s,src[i]);
79 }
80 return s;
81
82 }, add);
83 }
84 }
85}
86