// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../simd/simd.h"
#include "parallel_for.h"
#include <algorithm>

namespace embree
{
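  /* in-place insertion sort in ascending order; used below as the
     small-array base case of the hybrid quicksort and the radix sort */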
  template<class T>
  __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
  {
    for (size_t i = 1; i < length; ++i)
    {
      T v = array[i];
      size_t j = i;
      while (j > 0 && v < array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

  template<class T>
  __forceinline void insertionsort_descending(T *__restrict__ array, const size_t length)
  {
    for (size_t i = 1; i < length; ++i)
    {
      T v = array[i];
      size_t j = i;
      while (j > 0 && v > array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

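  /* recursive quicksort over the inclusive range [begin,end], using the
     first element as pivot and a Hoare-style partition */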
  template<class T>
  void quicksort_ascending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left = begin - 1;
      ssize_t right = end + 1;

      while (1)
      {
        while (t[--right] > pivotvalue);
        while (t[++left] < pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      const ssize_t pivot = right;
      quicksort_ascending(t, begin, pivot);
      quicksort_ascending(t, pivot + 1, end);
    }
  }

  template<class T>
  void quicksort_descending(T *__restrict__ t,
                            const ssize_t begin,
                            const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left = begin - 1;
      ssize_t right = end + 1;

      while (1)
      {
        while (t[--right] < pivotvalue);
        while (t[++left] > pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      const ssize_t pivot = right;
      quicksort_descending(t, begin, pivot);
      quicksort_descending(t, pivot + 1, end);
    }
  }

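  /* hybrid quicksort that switches to insertion sort once a partition
     shrinks to at most THRESHOLD elements */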
  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_ascending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_ascending<T>(&t[begin],size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left = begin - 1;
        ssize_t right = end + 1;

        while (1)
        {
          while (t[--right] > pivotvalue);
          while (t[++left] < pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }

  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_descending(T *__restrict__ t,
                                          const ssize_t begin,
                                          const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_descending<T>(&t[begin],size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left = begin - 1;
        ssize_t right = end + 1;

        while (1)
        {
          while (t[--right] < pivotvalue);
          while (t[++left] > pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_descending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_descending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }

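  /* single-threaded MSD radix sort on 32-bit keys: one in-place pass
     permutes items into 256 buckets by the byte selected by 'shift',
     then each non-empty bucket is sorted recursively on the next lower
     byte, or by insertion sort when it is small */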
  template<typename T>
  static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
  {
    static const unsigned int BITS = 8;
    static const unsigned int BUCKETS = (1 << BITS);
    static const unsigned int CMP_SORT_THRESHOLD = 16;

    __aligned(64) unsigned int count[BUCKETS];

    /* clear buckets */
    for (size_t i=0; i<BUCKETS; i++) count[i] = 0;

    /* count buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
    for (size_t i=0; i<num; i++)
      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;

    /* prefix sums yield the start (head) and end (tail) of each bucket */
    __aligned(64) unsigned int head[BUCKETS];
    __aligned(64) unsigned int tail[BUCKETS];

    head[0] = 0;
    for (size_t i=1; i<BUCKETS; i++)
      head[i] = head[i-1] + count[i-1];

    for (size_t i=0; i<BUCKETS-1; i++)
      tail[i] = head[i+1];

    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];

    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);
    assert(tail[BUCKETS-1] == num);

    /* in-place swap */
    for (size_t i=0; i<BUCKETS; i++)
    {
      /* process bucket: follow permutation cycles until every slot of
         bucket i holds an item that belongs there */
      while (head[i] < tail[i])
      {
        T v = morton[head[i]];
        while (1)
        {
          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
          if (b == i) break;
          std::swap(v,morton[head[b]++]);
        }
        assert(((unsigned(v) >> shift) & (BUCKETS-1)) == i);
        morton[head[i]++] = v;
      }
    }
    if (shift == 0) return;

    /* recurse into each non-empty bucket on the next lower byte */
    size_t offset = 0;
    for (size_t i=0; i<BUCKETS; i++)
      if (count[i])
      {
        for (size_t j=offset; j<offset+count[i]-1; j++)
          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);

        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
          insertionsort_ascending(morton + offset, count[i]);
        else
          radixsort32(morton + offset, count[i], shift-BITS);

        for (size_t j=offset; j<offset+count[i]-1; j++)
          assert(morton[j] <= morton[j+1]);

        offset += count[i];
      }
  }

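  /* parallel LSD radix sort that orders items of type Ty by their Key
     value: each 8-bit digit is handled in one pass that ping-pongs the
     data between 'src' and 'tmp', using per-thread bucket histograms
     (radixCount) to compute scatter offsets */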
  template<typename Ty, typename Key>
  class ParallelRadixSort
  {
    static const size_t MAX_TASKS = 64;
    static const size_t BITS = 8;
    static const size_t BUCKETS = (1 << BITS);
    typedef unsigned int TyRadixCount[BUCKETS];

    template<typename T>
    static bool compare(const T& v0, const T& v1) {
      return (Key)v0 < (Key)v1;
    }

  private:
    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement

  public:
    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}

    void sort(const size_t blockSize)
    {
      assert(blockSize > 0);

      /* perform single-threaded sort for small N */
      if (N <= blockSize) // also covers the special case N == 0
      {
        /* sort in place inside the destination array */
        std::sort(src,src+N,compare<Ty>);
      }

      /* perform parallel sort for large N */
      else
      {
        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
        tbbRadixSort(numThreads);
      }
    }

    ~ParallelRadixSort()
    {
      alignedFree(radixCount);
      radixCount = nullptr;
    }

  private:

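    /* pass 1 of a radix iteration: each thread counts, for its slice of
       src, how many items fall into each of the 256 buckets of the
       current digit */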
    void tbbRadixIteration0(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* clear the bucket counts of this thread */
      for (size_t i=0; i<BUCKETS; i++)
        radixCount[threadIndex][i] = 0;

      /* iterate over this thread's slice of the src array and count buckets */
      unsigned int * __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const Key index = ((Key)src[i] >> shift) & mask;
#endif
        count[index]++;
      }
    }

    void tbbRadixIteration1(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* calculate total number of items for each bucket
         (vectorized form of: total[i] = 0) */
      __aligned(64) unsigned int total[BUCKETS];
      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
        vintx::store(&total[i], zero);

      /* vectorized form of: total[j] += radixCount[i][j] */
      for (size_t i=0; i<threadCount; i++)
      {
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
      }

      /* calculate start offset of each bucket */
      __aligned(64) unsigned int offset[BUCKETS];
      offset[0] = 0;
      for (size_t i=1; i<BUCKETS; i++)
        offset[i] = offset[i-1] + total[i-1];

      /* calculate start offset of each bucket for this thread
         (vectorized form of: offset[j] += radixCount[i][j]) */
      for (size_t i=0; i<threadIndex; i++)
      {
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
      }

      /* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
        const Ty elt = src[i];
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const size_t index = ((Key)src[i] >> shift) & mask;
#endif
        dst[offset[index]++] = elt;
      }
    }

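    /* one full radix iteration = count phase + scatter phase; both run
       over the same task partitioning and reuse the affinity_partitioner,
       so a thread touches the same slice of the array in both phases */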
    void tbbRadixIteration(const Key shift, const bool last,
                           const Ty* __restrict src, Ty* __restrict dst,
                           const size_t numTasks)
    {
      affinity_partitioner ap;
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
    }

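    /* digits are processed from least to most significant byte; the even
       number of passes (4 for 32-bit keys, 8 for 64-bit keys) guarantees
       the sorted result ends up back in 'src' */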
    void tbbRadixSort(const size_t numTasks)
    {
      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);

      if (sizeof(Key) == sizeof(uint32_t)) {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
      }
      else if (sizeof(Key) == sizeof(uint64_t))
      {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
      }
    }

  private:
    TyRadixCount* radixCount;
    Ty* const src;
    Ty* const tmp;
    const size_t N;
  };

  template<typename Ty>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty, typename Key>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty>
  void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
  }

  template<typename Ty>
  void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
  }
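
  /* illustrative usage (not part of this header): sorting 64-bit keys
     with a caller-provided scratch buffer of equal size

       std::vector<uint64_t> keys = ...;       // keys to sort
       std::vector<uint64_t> tmp(keys.size()); // scratch buffer
       embree::radix_sort_u64(keys.data(), tmp.data(), keys.size());

     after the call the sorted result is in 'keys'; 'tmp' holds only
     intermediate data */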
}