// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../simd/simd.h"
#include "parallel_for.h"
#include <algorithm>

namespace embree
{
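  /* Sorts the first 'length' elements of 'array' in ascending order using
     insertion sort. O(n^2) in general, but fast for the small ranges it is
     used on below (base case of quicksort_insertionsort_ascending and
     radixsort32). A minimal usage sketch with hypothetical data:

       int keys[4] = { 3, 1, 2, 0 };
       insertionsort_ascending(keys, 4);   // keys == { 0, 1, 2, 3 }
  */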
  template<class T>
  __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
  {
    for(size_t i = 1;i<length;++i)
    {
      T v = array[i];
      size_t j = i;
      while(j > 0 && v < array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

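  /* Same as insertionsort_ascending, but sorts the elements into descending
     order. */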
  template<class T>
  __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
  {
    for(size_t i = 1;i<length;++i)
    {
      T v = array[i];
      size_t j = i;
      while(j > 0 && v > array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

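  /* Recursive quicksort over t[begin..end] (inclusive bounds) in ascending
     order. Uses a Hoare-style partition with t[begin] as the pivot value;
     after the partition loop 'right' is the last index of the lower
     partition, so the recursion covers [begin,pivot] and [pivot+1,end]. */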
  template<class T>
  void quicksort_ascending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left = begin - 1;
      ssize_t right = end + 1;

      while(1)
      {
        while (t[--right] > pivotvalue);
        while (t[++left] < pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      const ssize_t pivot = right;
      quicksort_ascending(t, begin, pivot);
      quicksort_ascending(t, pivot + 1, end);
    }
  }

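  /* Same as quicksort_ascending, but with the comparisons reversed so the
     range t[begin..end] ends up in descending order. */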
  template<class T>
  void quicksort_decending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left = begin - 1;
      ssize_t right = end + 1;

      while(1)
      {
        while (t[--right] < pivotvalue);
        while (t[++left] > pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      const ssize_t pivot = right;
      quicksort_decending(t, begin, pivot);
      quicksort_decending(t, pivot + 1, end);
    }
  }

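  /* Hybrid sort: quicksort as above, but ranges of at most THRESHOLD
     elements are handled with insertion sort instead of recursing further,
     which is typically faster for small ranges. */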
  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_ascending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_ascending<T>(&t[begin],size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left = begin - 1;
        ssize_t right = end + 1;

        while(1)
        {
          while (t[--right] > pivotvalue);
          while (t[++left] < pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }

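  /* Descending-order counterpart of quicksort_insertionsort_ascending. */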
  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_decending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_decending<T>(&t[begin],size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left = begin - 1;
        ssize_t right = end + 1;

        while(1)
        {
          while (t[--right] < pivotvalue);
          while (t[++left] > pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_decending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }

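  /* Single-threaded, in-place MSD radix sort for keys that convert to a
     32-bit unsigned integer via unsigned(T). Processes 8 bits per level,
     starting at the most significant byte (shift = 24), permutes the
     elements into their buckets in place, and then recurses into each
     bucket with the next lower byte; buckets smaller than
     CMP_SORT_THRESHOLD fall back to insertion sort. A minimal usage sketch
     with hypothetical data:

       uint32_t keys[4] = { 7u, 0xFF00u, 3u, 42u };
       radixsort32(keys, 4);   // keys sorted ascending
  */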
  template<typename T>
  static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
  {
    static const unsigned int BITS = 8;
    static const unsigned int BUCKETS = (1 << BITS);
    static const unsigned int CMP_SORT_THRESHOLD = 16;

    __aligned(64) unsigned int count[BUCKETS];

    /* clear buckets */
    for (size_t i=0;i<BUCKETS;i++) count[i] = 0;

    /* count items per bucket */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
    for (size_t i=0;i<num;i++)
      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;

    /* prefix sums: head[i] = first slot of bucket i, tail[i] = one past its last slot */
    __aligned(64) unsigned int head[BUCKETS];
    __aligned(64) unsigned int tail[BUCKETS];

    head[0] = 0;
    for (size_t i=1; i<BUCKETS; i++)
      head[i] = head[i-1] + count[i-1];

    for (size_t i=0; i<BUCKETS-1; i++)
      tail[i] = head[i+1];

    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];

    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
    assert(tail[BUCKETS-1] == num);

    /* in-place permutation: move each element into its destination bucket */
    for (size_t i=0;i<BUCKETS;i++)
    {
      /* process bucket */
      while(head[i] < tail[i])
      {
        T v = morton[head[i]];
        while(1)
        {
          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
          if (b == i) break;
          std::swap(v,morton[head[b]++]);
        }
        assert(((unsigned(v) >> shift) & (BUCKETS-1)) == i);
        morton[head[i]++] = v;
      }
    }

    /* lowest byte processed, no further bits to distribute by */
    if (shift == 0) return;

    /* recursively sort each non-empty bucket by the next lower byte */
    size_t offset = 0;
    for (size_t i=0;i<BUCKETS;i++)
      if (count[i])
      {
        for (size_t j=offset;j<offset+count[i]-1;j++)
          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);

        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
          insertionsort_ascending(morton + offset, count[i]);
        else
          radixsort32(morton + offset, count[i], shift-BITS);

        for (size_t j=offset;j<offset+count[i]-1;j++)
          assert(morton[j] <= morton[j+1]);

        offset += count[i];
      }
  }

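  /* Parallel LSD radix sort. Ty is the element type stored in the arrays and
     Key the unsigned integer type its sort key converts to via (Key)Ty.
     Elements are ping-ponged between 'src' and 'tmp' one 8-bit digit per
     iteration (4 passes for 32-bit keys, 8 for 64-bit keys); since the
     number of passes is even, the sorted result always ends up back in
     'src'. Each pass first builds one bucket histogram per thread and then
     scatters the elements using per-thread prefix sums, see
     tbbRadixIteration0/1 below. */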
  template<typename Ty, typename Key>
  class ParallelRadixSort
  {
    static const size_t MAX_TASKS = 64;
    static const size_t BITS = 8;
    static const size_t BUCKETS = (1 << BITS);
    typedef unsigned int TyRadixCount[BUCKETS];

    template<typename T>
    static bool compare(const T& v0, const T& v1) {
      return (Key)v0 < (Key)v1;
    }

  private:
    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement

  public:
    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}

    void sort(const size_t blockSize)
    {
      assert(blockSize > 0);

      /* perform single-threaded sort for small N */
      if (N<=blockSize) // also handles the special case N == 0
      {
        /* in-place comparison sort directly inside the src array */
        std::sort(src,src+N,compare<Ty>);
      }

      /* perform parallel radix sort for large N */
      else
      {
        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
        tbbRadixSort(numThreads);
      }
    }

    ~ParallelRadixSort()
    {
      alignedFree(radixCount);
      radixCount = nullptr;
    }

  private:

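    /* pass 1 of a radix iteration: each thread counts, for its slice of src,
       how many elements fall into each of the BUCKETS buckets of the digit
       selected by 'shift'; the counts go into radixCount[threadIndex] */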
    void tbbRadixIteration0(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract the lowest BITS bits (one radix digit) */
      const Key mask = BUCKETS-1;

      /* clear the bucket counters of this thread */
      for (size_t i=0; i<BUCKETS; i++)
        radixCount[threadIndex][i] = 0;

      /* iterate over this thread's slice of src and count the buckets */
      unsigned int * __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const Key index = ((Key)src[i] >> shift) & mask;
#endif
        count[index]++;
      }
    }

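    /* pass 2 of a radix iteration: using the per-thread histograms from
       pass 1, compute for every thread the start offset of each bucket in
       dst and scatter this thread's slice of src into dst accordingly */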
    void tbbRadixIteration1(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract the lowest BITS bits (one radix digit) */
      const Key mask = BUCKETS-1;

      /* calculate total number of items for each bucket */
      __aligned(64) unsigned int total[BUCKETS];
      /* scalar code:
         for (size_t i=0; i<BUCKETS; i++)
           total[i] = 0;
      */
      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
        vintx::store(&total[i], zero);

      for (size_t i=0; i<threadCount; i++)
      {
        /* scalar code:
           for (size_t j=0; j<BUCKETS; j++)
             total[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
      }

      /* calculate start offset of each bucket */
      __aligned(64) unsigned int offset[BUCKETS];
      offset[0] = 0;
      for (size_t i=1; i<BUCKETS; i++)
        offset[i] = offset[i-1] + total[i-1];

      /* calculate start offset of each bucket for this thread */
      for (size_t i=0; i<threadIndex; i++)
      {
        /* scalar code:
           for (size_t j=0; j<BUCKETS; j++)
             offset[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
      }

      /* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
        const Ty elt = src[i];
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const size_t index = ((Key)src[i] >> shift) & mask;
#endif
        dst[offset[index]++] = elt;
      }
    }

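    /* one full radix iteration over the digit selected by 'shift': run the
       counting pass and then the scatter pass over 'numTasks' parallel
       tasks, reusing the same affinity_partitioner so both passes tend to
       see the same slice-to-thread assignment */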
    void tbbRadixIteration(const Key shift, const bool last,
                           const Ty* __restrict src, Ty* __restrict dst,
                           const size_t numTasks)
    {
      affinity_partitioner ap;
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
    }

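    /* full parallel radix sort: allocates the per-task bucket counters and
       runs one iteration per 8-bit digit of the key (4 iterations for
       32-bit keys, 8 for 64-bit keys), alternating between src and tmp so
       that the final result lands back in src */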
    void tbbRadixSort(const size_t numTasks)
    {
      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);

      if (sizeof(Key) == sizeof(uint32_t)) {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
      }
      else if (sizeof(Key) == sizeof(uint64_t))
      {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
      }
    }

  private:
    TyRadixCount* radixCount;
    Ty* const src;
    Ty* const tmp;
    const size_t N;
  };

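  /* Convenience wrappers around ParallelRadixSort. 'src' holds the N input
     elements and receives the sorted result, 'tmp' must point to scratch
     space for another N elements, and inputs of at most 'blockSize' elements
     are sorted with std::sort on the calling thread instead of in parallel.
     A minimal usage sketch with hypothetical data:

       std::vector<uint32_t> keys = { 5u, 1u, 9u, 2u };
       std::vector<uint32_t> temp(keys.size());
       radix_sort_u32(keys.data(), temp.data(), keys.size());
       // keys is now sorted ascending; temp is scratch only
  */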
  template<typename Ty>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty, typename Key>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty>
  void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
  }

  template<typename Ty>
  void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
  }
}