| 1 | //************************************ bs::framework - Copyright 2018 Marko Pintera **************************************// | 
| 2 | //*********** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ***********// | 
| 3 | #include "Utility/BsGpuSort.h" | 
| 4 | #include "RenderAPI/BsGpuBuffer.h" | 
| 5 | #include "Math/BsRandom.h" | 
| 6 | #include "Renderer/BsRendererUtility.h" | 
| 7 |  | 
| 8 | namespace bs { namespace ct | 
| 9 | { | 
	// Total number of bits in a sort key
	static constexpr UINT32 BIT_COUNT = 32;
	// Number of bits (one radix digit) processed per sort pass
	static constexpr UINT32 RADIX_NUM_BITS = 4;
	// Number of possible values of a single digit
	static constexpr UINT32 NUM_DIGITS = 1 << RADIX_NUM_BITS;
	// Mask extracting a single (shifted-down) digit from a key
	static constexpr UINT32 KEY_MASK = (NUM_DIGITS - 1);
	// Number of passes required to cover all bits of a key
	static constexpr UINT32 NUM_PASSES = BIT_COUNT / RADIX_NUM_BITS;

	// Number of threads in a single compute thread group
	static constexpr UINT32 NUM_THREADS = 128;
	// Number of keys processed by a single thread per loop iteration
	static constexpr UINT32 KEYS_PER_LOOP = 8;
	// Number of keys processed by an entire thread group per loop iteration
	static constexpr UINT32 TILE_SIZE = NUM_THREADS * KEYS_PER_LOOP;
	// Upper limit on the number of thread groups dispatched per pass
	static constexpr UINT32 MAX_NUM_GROUPS = 64;

	// Parameter block definition shared by all radix sort shaders
	RadixSortParamsDef gRadixSortParamsDef;
| 22 |  | 
| 23 | 	/** Contains various constants required during the GpuSort algorithm. */ | 
| 24 | 	struct GpuSortProperties | 
| 25 | 	{ | 
| 26 | 		GpuSortProperties(UINT32 count) | 
| 27 | 			: count(count) | 
| 28 | 		{ } | 
| 29 |  | 
| 30 | 		const UINT32 count; | 
| 31 | 		const UINT32 numTiles = count / TILE_SIZE; | 
| 32 | 		const UINT32 numGroups = Math::clamp(numTiles, 1U, MAX_NUM_GROUPS); | 
| 33 |  | 
| 34 | 		const UINT32 tilesPerGroup = numTiles / numGroups; | 
| 35 | 		const UINT32  = numTiles % numGroups; | 
| 36 | 		const UINT32  = count % TILE_SIZE; | 
| 37 | 	}; | 
| 38 |  | 
| 39 | 	/** Set up common defines required by all radix sort shaders. */ | 
| 40 | 	void initCommonDefines(ShaderDefines& defines) | 
| 41 | 	{ | 
| 42 | 		defines.set("RADIX_NUM_BITS" , RADIX_NUM_BITS); | 
| 43 | 		defines.set("NUM_THREADS" , NUM_THREADS); | 
| 44 | 		defines.set("KEYS_PER_LOOP" , KEYS_PER_LOOP); | 
| 45 | 		defines.set("MAX_NUM_GROUPS" , MAX_NUM_GROUPS); | 
| 46 | 	} | 
| 47 |  | 
| 48 | 	void runSortTest(); | 
| 49 |  | 
| 50 | 	/**  | 
| 51 | 	 * Creates a new GPU parameter block buffer according to gRadixSortParamDef definition and writes GpuSort properties | 
| 52 | 	 * into the buffer. | 
| 53 | 	 */ | 
| 54 | 	SPtr<GpuParamBlockBuffer> createGpuSortParams(const GpuSortProperties& props) | 
| 55 | 	{ | 
| 56 | 		SPtr<GpuParamBlockBuffer> buffer = gRadixSortParamsDef.createBuffer(); | 
| 57 |  | 
| 58 | 		gRadixSortParamsDef.gTilesPerGroup.set(buffer, props.tilesPerGroup); | 
| 59 | 		gRadixSortParamsDef.gNumGroups.set(buffer, props.numGroups); | 
| 60 | 		gRadixSortParamsDef.gNumExtraTiles.set(buffer, props.extraTiles); | 
| 61 | 		gRadixSortParamsDef.gNumExtraKeys.set(buffer, props.extraKeys); | 
| 62 | 		gRadixSortParamsDef.gBitOffset.set(buffer, 0); | 
| 63 |  | 
| 64 | 		return buffer; | 
| 65 | 	} | 
| 66 |  | 
| 67 | 	/**  | 
| 68 | 	 * Checks can the provided buffer be used for GPU sort operation. Returns a pointer to the error message if check failed | 
| 69 | 	 * or nullptr if check passed. | 
| 70 | 	 */ | 
| 71 | 	const char* checkSortBuffer(GpuBuffer& buffer) | 
| 72 | 	{ | 
| 73 | 		static constexpr const char* INVALID_GPU_WRITE_MSG =  | 
| 74 | 			"All buffers provided to GpuSort must be created with GBU_LOADSTORE flags enabled." ; | 
| 75 | 		static constexpr const char* INVALID_TYPE_MSG =  | 
| 76 | 			"All buffers provided to GpuSort must be of GBT_STANDARD type." ; | 
| 77 | 		static constexpr const char* INVALID_FORMAT_MSG =  | 
| 78 | 			"All buffers provided to GpuSort must use a 32-bit unsigned integer format." ; | 
| 79 |  | 
| 80 | 		const GpuBufferProperties& bufferProps = buffer.getProperties(); | 
| 81 | 		if ((bufferProps.getUsage() & GBU_LOADSTORE) != GBU_LOADSTORE) | 
| 82 | 			return INVALID_GPU_WRITE_MSG; | 
| 83 |  | 
| 84 | 		if(bufferProps.getType() != GBT_STANDARD) | 
| 85 | 			return INVALID_TYPE_MSG; | 
| 86 |  | 
| 87 | 		if(bufferProps.getFormat() != BF_32X1U) | 
| 88 | 			return INVALID_FORMAT_MSG; | 
| 89 |  | 
| 90 | 		return nullptr; | 
| 91 | 	} | 
| 92 |  | 
| 93 | 	/** Creates a helper buffers used for storing intermediate information during GpuSort::sort. */ | 
| 94 | 	SPtr<GpuBuffer> createHelperBuffer() | 
| 95 | 	{ | 
| 96 | 		GPU_BUFFER_DESC desc; | 
| 97 | 		desc.elementCount = MAX_NUM_GROUPS * NUM_DIGITS; | 
| 98 | 		desc.format = BF_32X1U; | 
| 99 | 		desc.usage = GBU_LOADSTORE; | 
| 100 | 		desc.type = GBT_STANDARD; | 
| 101 |  | 
| 102 | 		return GpuBuffer::create(desc); | 
| 103 | 	} | 
| 104 |  | 
	RadixSortClearMat::RadixSortClearMat()
	{
		// Look up the output buffer binding of the clear compute shader
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gOutput" , mOutputParam);
	}
| 109 |  | 
	/** Initializes the defines the shader variant is compiled with. */
	void RadixSortClearMat::_initDefines(ShaderDefines& defines)
	{
		// All radix sort shaders share the same set of defines
		initCommonDefines(defines);
	}
| 114 |  | 
| 115 | 	void RadixSortClearMat::execute(const SPtr<GpuBuffer>& outputOffsets) | 
| 116 | 	{ | 
| 117 | 		BS_RENMAT_PROFILE_BLOCK | 
| 118 |  | 
| 119 | 		mOutputParam.set(outputOffsets); | 
| 120 |  | 
| 121 | 		bind(); | 
| 122 | 		RenderAPI::instance().dispatchCompute(1); | 
| 123 | 	} | 
| 124 |  | 
	RadixSortCountMat::RadixSortCountMat()
	{
		// Look up the input key buffer and the per-group digit count output buffer bindings
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gInputKeys" , mInputKeysParam);
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gOutputCounts" , mOutputCountsParam);
	}
| 130 |  | 
	/** Initializes the defines the shader variant is compiled with. */
	void RadixSortCountMat::_initDefines(ShaderDefines& defines)
	{
		// All radix sort shaders share the same set of defines
		initCommonDefines(defines);
	}
| 135 |  | 
| 136 | 	void RadixSortCountMat::execute(UINT32 numGroups, const SPtr<GpuParamBlockBuffer>& params,  | 
| 137 | 		const SPtr<GpuBuffer>& inputKeys, const SPtr<GpuBuffer>& outputOffsets) | 
| 138 | 	{ | 
| 139 | 		BS_RENMAT_PROFILE_BLOCK | 
| 140 |  | 
| 141 | 		mInputKeysParam.set(inputKeys); | 
| 142 | 		mOutputCountsParam.set(outputOffsets); | 
| 143 |  | 
| 144 | 		mParams->setParamBlockBuffer("Params" , params); | 
| 145 |  | 
| 146 | 		bind(); | 
| 147 | 		RenderAPI::instance().dispatchCompute(numGroups); | 
| 148 | 	} | 
| 149 |  | 
	RadixSortPrefixScanMat::RadixSortPrefixScanMat()
	{
		// Look up the digit count input buffer and the scanned offset output buffer bindings
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gInputCounts" , mInputCountsParam);
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gOutputOffsets" , mOutputOffsetsParam);
	}
| 155 |  | 
	/** Initializes the defines the shader variant is compiled with. */
	void RadixSortPrefixScanMat::_initDefines(ShaderDefines& defines)
	{
		// All radix sort shaders share the same set of defines
		initCommonDefines(defines);
	}
| 160 |  | 
| 161 | 	void RadixSortPrefixScanMat::execute(const SPtr<GpuParamBlockBuffer>& params, const SPtr<GpuBuffer>& inputCounts,  | 
| 162 | 		const SPtr<GpuBuffer>& outputOffsets) | 
| 163 | 	{ | 
| 164 | 		BS_RENMAT_PROFILE_BLOCK | 
| 165 |  | 
| 166 | 		mInputCountsParam.set(inputCounts); | 
| 167 | 		mOutputOffsetsParam.set(outputOffsets); | 
| 168 |  | 
| 169 | 		mParams->setParamBlockBuffer("Params" , params); | 
| 170 |  | 
| 171 | 		bind(); | 
| 172 | 		RenderAPI::instance().dispatchCompute(1); | 
| 173 | 	} | 
| 174 |  | 
	RadixSortReorderMat::RadixSortReorderMat()
	{
		// Look up the scanned offsets, the input key/value buffers and the output key/value buffer bindings
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gInputOffsets" , mInputOffsetsBufferParam);
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gInputKeys" , mInputKeysBufferParam);
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gInputValues" , mInputValuesBufferParam);
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gOutputKeys" , mOutputKeysBufferParam);
		mParams->getBufferParam(GPT_COMPUTE_PROGRAM, "gOutputValues" , mOutputValuesBufferParam);
	}
| 183 |  | 
	/** Initializes the defines the shader variant is compiled with. */
	void RadixSortReorderMat::_initDefines(ShaderDefines& defines)
	{
		// All radix sort shaders share the same set of defines
		initCommonDefines(defines);
	}
| 188 |  | 
| 189 | 	void RadixSortReorderMat::execute(UINT32 numGroups, const SPtr<GpuParamBlockBuffer>& params,  | 
| 190 | 		const SPtr<GpuBuffer>& inputPrefix, const GpuSortBuffers& buffers, UINT32 inputBufferIdx) | 
| 191 | 	{ | 
| 192 | 		BS_RENMAT_PROFILE_BLOCK | 
| 193 |  | 
| 194 | 		const UINT32 outputBufferIdx = (inputBufferIdx + 1) % 2; | 
| 195 |  | 
| 196 | 		mInputOffsetsBufferParam.set(inputPrefix); | 
| 197 | 		mInputKeysBufferParam.set(buffers.keys[inputBufferIdx]); | 
| 198 | 		mInputValuesBufferParam.set(buffers.values[inputBufferIdx]); | 
| 199 | 		mOutputKeysBufferParam.set(buffers.keys[outputBufferIdx]); | 
| 200 | 		mOutputValuesBufferParam.set(buffers.values[outputBufferIdx]); | 
| 201 | 		 | 
| 202 | 		mParams->setParamBlockBuffer("Params" , params); | 
| 203 |  | 
| 204 | 		bind(); | 
| 205 | 		RenderAPI::instance().dispatchCompute(numGroups); | 
| 206 | 	} | 
| 207 |  | 
| 208 | 	GpuSort::GpuSort() | 
| 209 | 	{ | 
| 210 | 		mHelperBuffers[0] = createHelperBuffer(); | 
| 211 | 		mHelperBuffers[1] = createHelperBuffer(); | 
| 212 | 	} | 
| 213 |  | 
| 214 | 	UINT32 GpuSort::sort(const GpuSortBuffers& buffers, UINT32 numKeys, UINT32 keyMask) | 
| 215 | 	{ | 
| 216 | 		// Nothing to do if no input or output key buffers | 
| 217 | 		if(buffers.keys[0] == nullptr || buffers.keys[1] == nullptr) | 
| 218 | 			return 0; | 
| 219 |  | 
| 220 | 		// Check if all buffers have been created with required options | 
| 221 | 		const char* errorMsg = nullptr; | 
| 222 | 		for(UINT32 i = 0; i < 2; i++) | 
| 223 | 		{ | 
| 224 | 			errorMsg = checkSortBuffer(*buffers.keys[i]); | 
| 225 | 			if(errorMsg) break; | 
| 226 |  | 
| 227 | 			if(buffers.values[i]) | 
| 228 | 			{ | 
| 229 | 				errorMsg = checkSortBuffer(*buffers.values[i]); | 
| 230 | 				if(errorMsg) break; | 
| 231 | 			} | 
| 232 | 		} | 
| 233 |  | 
| 234 | 		if(errorMsg) | 
| 235 | 		{ | 
| 236 | 			LOGERR("GpuSort failed: "  + String(errorMsg)); | 
| 237 | 			return 0; | 
| 238 | 		} | 
| 239 |  | 
| 240 | 		// Check if all buffers have the same size | 
| 241 | 		bool validSize = buffers.keys[0]->getSize() == buffers.keys[1]->getSize(); | 
| 242 | 		if(buffers.values[0] && buffers.values[1]) | 
| 243 | 		{ | 
| 244 | 			validSize = buffers.keys[0]->getSize() == buffers.values[0]->getSize() && | 
| 245 | 				buffers.keys[0]->getSize() == buffers.values[1]->getSize(); | 
| 246 |  | 
| 247 | 		} | 
| 248 |  | 
| 249 | 		if (!validSize) | 
| 250 | 		{ | 
| 251 | 			LOGERR("GpuSort failed: All sort buffers must have the same size." ); | 
| 252 | 			return 0; | 
| 253 | 		} | 
| 254 |  | 
| 255 | 		const GpuSortProperties gpuSortProps(numKeys); | 
| 256 | 		SPtr<GpuParamBlockBuffer> params = createGpuSortParams(gpuSortProps); | 
| 257 |  | 
| 258 | 		UINT32 bitOffset = 0; | 
| 259 | 		UINT32 inputBufferIdx = 0; | 
| 260 | 		for(UINT32 i = 0; i < NUM_PASSES; i++) | 
| 261 | 		{ | 
| 262 | 			if(((KEY_MASK << bitOffset) & keyMask) != 0) | 
| 263 | 			{ | 
| 264 | 				gRadixSortParamsDef.gBitOffset.set(params, bitOffset); | 
| 265 |  | 
| 266 | 				RadixSortClearMat::get()->execute(mHelperBuffers[0]); | 
| 267 | 				RadixSortCountMat::get()->execute(gpuSortProps.numGroups, params, buffers.keys[inputBufferIdx], mHelperBuffers[0]); | 
| 268 | 				RadixSortPrefixScanMat::get()->execute(params, mHelperBuffers[0], mHelperBuffers[1]); | 
| 269 | 				RadixSortReorderMat::get()->execute(gpuSortProps.numGroups, params, mHelperBuffers[1], buffers, inputBufferIdx); | 
| 270 |  | 
| 271 | 				inputBufferIdx = (inputBufferIdx + 1) % 2; | 
| 272 | 			} | 
| 273 |  | 
| 274 | 			bitOffset += RADIX_NUM_BITS; | 
| 275 | 		} | 
| 276 |  | 
| 277 | 		return inputBufferIdx; | 
| 278 | 	} | 
| 279 |  | 
| 280 | 	GpuSortBuffers GpuSort::createSortBuffers(UINT32 numElements, bool values) | 
| 281 | 	{ | 
| 282 | 		GpuSortBuffers output; | 
| 283 |  | 
| 284 | 		GPU_BUFFER_DESC bufferDesc; | 
| 285 | 		bufferDesc.elementCount = numElements; | 
| 286 | 		bufferDesc.format = BF_32X1U; | 
| 287 | 		bufferDesc.type = GBT_STANDARD; | 
| 288 | 		bufferDesc.usage = GBU_LOADSTORE; | 
| 289 |  | 
| 290 | 		output.keys[0] = GpuBuffer::create(bufferDesc); | 
| 291 | 		output.keys[1] = GpuBuffer::create(bufferDesc); | 
| 292 |  | 
| 293 | 		if(values) | 
| 294 | 		{ | 
| 295 | 			output.values[0] = GpuBuffer::create(bufferDesc); | 
| 296 | 			output.values[1] = GpuBuffer::create(bufferDesc); | 
| 297 | 		} | 
| 298 |  | 
| 299 | 		return output; | 
| 300 | 	} | 
| 301 |  | 
	// Note: This test isn't currently hooked up anywhere. It might be a good idea to set it up as a unit test, but it would
	// require exposing parts of GpuSort to the public, which I don't feel like it's worth doing just for a test. So instead
	// just make sure to run the test below if you modify any of the GpuSort code.
	/**
	 * Verifies each stage of the GPU radix sort (count, prefix scan, reorder) by mimicking the shader logic on the CPU,
	 * thread-by-thread, and comparing the CPU results against the buffers produced by the GPU dispatches.
	 */
	void runSortTest()
	{
		// Generate test keys
		static constexpr UINT32 NUM_INPUT_KEYS = 10000;
		Vector<UINT32> inputKeys;
		inputKeys.reserve(NUM_INPUT_KEYS);

		// Lower nibble increases with the key index (roughly pre-sorted), upper nibble is random. The single pass
		// verified below sorts on the upper nibble (bit offset 4).
		Random random;
		for(UINT32 i = 0; i < NUM_INPUT_KEYS; i++)
			inputKeys.push_back(random.getRange(0, 15) << 4 | std::min(NUM_DIGITS - 1, (i / (NUM_INPUT_KEYS / 16))));

		const auto count = (UINT32)inputKeys.size();
		UINT32 bitOffset = 4;
		UINT32 bitMask = (1 << RADIX_NUM_BITS) - 1;

		// Prepare buffers
		const GpuSortProperties gpuSortProps(count);
		SPtr<GpuParamBlockBuffer> params = createGpuSortParams(gpuSortProps);

		gRadixSortParamsDef.gBitOffset.set(params, bitOffset);

		GpuSortBuffers sortBuffers = GpuSort::createSortBuffers(count);
		sortBuffers.keys[0]->writeData(0, sortBuffers.keys[0]->getSize(), inputKeys.data(), BWT_DISCARD);

		SPtr<GpuBuffer> helperBuffers[2];
		helperBuffers[0] = createHelperBuffer();
		helperBuffers[1] = createHelperBuffer();

		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		//////////////////////////////////////////// Count keys per group //////////////////////////////////////////////////
		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

		// SERIAL:
		Vector<UINT32> counts(gpuSortProps.numGroups * NUM_DIGITS);
		for(UINT32 groupIdx = 0; groupIdx < gpuSortProps.numGroups; groupIdx++)
		{
			// Count keys per thread
			UINT32 localCounts[NUM_THREADS * NUM_DIGITS] = { 0 };

			// When tiles don't divide evenly among groups, the first 'extraTiles' groups process one extra tile each
			UINT32 tileIdx;
			UINT32 numTiles;
			if(groupIdx < gpuSortProps.extraTiles)
			{
				numTiles = gpuSortProps.tilesPerGroup + 1;
				tileIdx = groupIdx * numTiles;
			}
			else
			{
				numTiles = gpuSortProps.tilesPerGroup;
				tileIdx = groupIdx * numTiles + gpuSortProps.extraTiles;
			}

			UINT32 keyBegin = tileIdx * TILE_SIZE;
			UINT32 keyEnd = keyBegin + numTiles * TILE_SIZE;

			while(keyBegin < keyEnd)
			{
				for(UINT32 threadIdx = 0; threadIdx < NUM_THREADS; threadIdx++)
				{
					UINT32 key = inputKeys[keyBegin + threadIdx];
					UINT32 digit = (key >> bitOffset) & bitMask;

					localCounts[threadIdx * NUM_DIGITS + digit] += 1;
				}

				keyBegin += NUM_THREADS;
			}

			// The last group additionally counts any keys that don't fill a whole tile
			if(groupIdx == (gpuSortProps.numGroups - 1))
			{
				keyBegin = keyEnd;
				keyEnd = keyBegin + gpuSortProps.extraKeys;

				while(keyBegin < keyEnd)
				{
					for (UINT32 threadIdx = 0; threadIdx < NUM_THREADS; threadIdx++)
					{
						if((keyBegin + threadIdx) < keyEnd)
						{
							UINT32 key = inputKeys[keyBegin + threadIdx];
							UINT32 digit = (key >> bitOffset) & bitMask;

							localCounts[threadIdx * NUM_DIGITS + digit] += 1;
						}
					}

					keyBegin += NUM_THREADS;
				}
			}

			// Sum up all key counts in a group
			static constexpr UINT32 NUM_REDUCE_THREADS = 64;
			static constexpr UINT32 NUM_REDUCE_THREADS_PER_DIGIT = NUM_REDUCE_THREADS / NUM_DIGITS;
			static constexpr UINT32 NUM_REDUCE_ELEMS_PER_THREAD_PER_DIGIT = NUM_THREADS / NUM_REDUCE_THREADS_PER_DIGIT;

			// Each reduce "thread" serially sums a slice of the per-thread counts for a single digit
			UINT32 reduceCounters[NUM_REDUCE_THREADS] = { 0 };
			UINT32 reduceTotals[NUM_REDUCE_THREADS] = { 0 };
			for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
			{
				if(threadId < NUM_REDUCE_THREADS)
				{
					UINT32 digitIdx = threadId / NUM_REDUCE_THREADS_PER_DIGIT;
					UINT32 setIdx = threadId & (NUM_REDUCE_THREADS_PER_DIGIT - 1);

					UINT32 total = 0;
					for(UINT32 i = 0; i < NUM_REDUCE_ELEMS_PER_THREAD_PER_DIGIT; i++)
					{
						UINT32 threadIdx = (setIdx * NUM_REDUCE_ELEMS_PER_THREAD_PER_DIGIT + i) * NUM_DIGITS;
						total += localCounts[threadIdx + digitIdx];
					}

					reduceCounters[digitIdx * NUM_REDUCE_THREADS_PER_DIGIT + setIdx] = total;
					reduceTotals[threadId] = total;

				}
			}

			// And do parallel reduction on the result of serial additions
			for (UINT32 i = 1; i < NUM_REDUCE_THREADS_PER_DIGIT; i <<= 1)
			{
				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
				{
					if (threadId < NUM_REDUCE_THREADS)
					{
						UINT32 digitIdx = threadId / NUM_REDUCE_THREADS_PER_DIGIT;
						UINT32 setIdx = threadId & (NUM_REDUCE_THREADS_PER_DIGIT - 1);

						reduceTotals[threadId] += reduceCounters[digitIdx * NUM_REDUCE_THREADS_PER_DIGIT + setIdx + i];
					}
				}

				// Write-back runs as a separate "pass" to mimic the barrier between read and write on the GPU
				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
				{
					if (threadId < NUM_REDUCE_THREADS)
					{
						UINT32 digitIdx = threadId / NUM_REDUCE_THREADS_PER_DIGIT;
						UINT32 setIdx = threadId & (NUM_REDUCE_THREADS_PER_DIGIT - 1);

						reduceCounters[digitIdx * NUM_REDUCE_THREADS_PER_DIGIT + setIdx] = reduceTotals[threadId];
					}
				}
			}

			// First NUM_DIGITS "threads" publish the per-group totals
			for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
			{
				if(threadId < NUM_DIGITS)
					counts[groupIdx * NUM_DIGITS + threadId] = reduceCounters[threadId * NUM_REDUCE_THREADS_PER_DIGIT];
			}
		}

		// PARALLEL:
		RadixSortClearMat::get()->execute(helperBuffers[0]);
		RadixSortCountMat::get()->execute(gpuSortProps.numGroups, params, sortBuffers.keys[0], helperBuffers[0]);
		RenderAPI::instance().submitCommandBuffer(nullptr);

		// Compare with GPU count
		const UINT32 helperBufferLength = helperBuffers[0]->getProperties().getElementCount();
		Vector<UINT32> bufferCounts(helperBufferLength);
		helperBuffers[0]->readData(0, helperBufferLength * sizeof(UINT32), bufferCounts.data());

		for(UINT32 i = 0; i < (UINT32)counts.size(); i++)
			assert(bufferCounts[i] == counts[i]);

		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/////////////////////////////////////////////// Calculate offsets //////////////////////////////////////////////////
		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

		// SERIAL:
		// Prefix sum over per-digit counts over all groups
		Vector<UINT32> perDigitPrefixSum(NUM_DIGITS * MAX_NUM_GROUPS);
		for(UINT32 groupIdx = 0; groupIdx < gpuSortProps.numGroups; groupIdx++)
		{
			for (UINT32 j = 0; j < NUM_DIGITS; j++)
				perDigitPrefixSum[groupIdx * NUM_DIGITS + j] = counts[groupIdx * NUM_DIGITS + j];
		}

		// Prefix sum over per-digit counts over all groups
		//// Upsweep
		UINT32 offset = 1;
		for (UINT32 i = MAX_NUM_GROUPS >> 1; i > 0; i >>= 1)
		{
			for (UINT32 groupIdx = 0; groupIdx < MAX_NUM_GROUPS; groupIdx++)
			{
				if (groupIdx < i)
				{
					for (UINT32 j = 0; j < NUM_DIGITS; j++)
					{
						UINT32 idx0 = (offset * (2 * groupIdx + 1) - 1) * NUM_DIGITS + j;
						UINT32 idx1 = (offset * (2 * groupIdx + 2) - 1) * NUM_DIGITS + j;

						perDigitPrefixSum[idx1] += perDigitPrefixSum[idx0];
					}
				}
			}

			offset <<= 1;
		}

		//// Downsweep
		// Save the tree roots (per-digit totals), then zero them to make the scan exclusive
		UINT32 totalsPrefixSum[NUM_DIGITS] = { 0 };
		for(UINT32 groupIdx = 0; groupIdx < NUM_DIGITS; groupIdx++)
		{
			if (groupIdx < NUM_DIGITS)
			{
				UINT32 idx = (MAX_NUM_GROUPS - 1) * NUM_DIGITS + groupIdx;
				totalsPrefixSum[groupIdx] = perDigitPrefixSum[idx];
				perDigitPrefixSum[idx] = 0;
			}
		}

		for (UINT32 i = 1; i < MAX_NUM_GROUPS; i <<= 1)
		{
			offset >>= 1;

			for (UINT32 groupIdx = 0; groupIdx < MAX_NUM_GROUPS; groupIdx++)
			{
				if (groupIdx < i)
				{
					for (UINT32 j = 0; j < NUM_DIGITS; j++)
					{
						UINT32 idx0 = (offset * (2 * groupIdx + 1) - 1) * NUM_DIGITS + j;
						UINT32 idx1 = (offset * (2 * groupIdx + 2) - 1) * NUM_DIGITS + j;

						UINT32 temp = perDigitPrefixSum[idx0];
						perDigitPrefixSum[idx0] = perDigitPrefixSum[idx1];
						perDigitPrefixSum[idx1] += temp;
					}
				}
			}
		}

		// Prefix sum over the total count
		for(UINT32 i = 1; i < NUM_DIGITS; i++)
			totalsPrefixSum[i] += totalsPrefixSum[i - 1];

		// Make it exclusive by shifting
		for(UINT32 i = NUM_DIGITS - 1; i > 0; i--)
			totalsPrefixSum[i] = totalsPrefixSum[i - 1];

		totalsPrefixSum[0] = 0;

		// Final offset for (group, digit) = global start of that digit + keys of that digit in preceding groups
		Vector<UINT32> offsets(gpuSortProps.numGroups * NUM_DIGITS);
		for (UINT32 groupIdx = 0; groupIdx < gpuSortProps.numGroups; groupIdx++)
		{
			for (UINT32 i = 0; i < NUM_DIGITS; i++)
				offsets[groupIdx * NUM_DIGITS + i] = totalsPrefixSum[i] + perDigitPrefixSum[groupIdx * NUM_DIGITS + i];
		}

		// PARALLEL:
		RadixSortPrefixScanMat::get()->execute(params, helperBuffers[0], helperBuffers[1]);
		RenderAPI::instance().submitCommandBuffer(nullptr);

		// Compare with GPU offsets
		Vector<UINT32> bufferOffsets(helperBufferLength);
		helperBuffers[1]->readData(0, helperBufferLength * sizeof(UINT32), bufferOffsets.data());

		for(UINT32 i = 0; i < (UINT32)offsets.size(); i++)
			assert(bufferOffsets[i] == offsets[i]);

		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/////////////////////////////////////////////////// Reorder ////////////////////////////////////////////////////////
		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

		// SERIAL:
		// Reorder within each tile
		Vector<UINT32> sortedKeys(inputKeys.size());
		UINT32 sGroupOffsets[NUM_DIGITS];
		UINT32 sLocalScratch[NUM_DIGITS * NUM_THREADS];
		UINT32 sTileTotals[NUM_DIGITS];
		UINT32 sCurrentTileTotal[NUM_DIGITS];

		for(UINT32 groupIdx = 0; groupIdx < gpuSortProps.numGroups; groupIdx++)
		{
			for(UINT32 i = 0; i < NUM_DIGITS; i++)
			{
				// Load offsets for this group to local memory
				sGroupOffsets[i] = offsets[groupIdx * NUM_DIGITS + i];

				// Clear tile totals
				sTileTotals[i] = 0;
			}

			// Handle case when number of tiles isn't exactly divisible by number of groups, in
			// which case first N groups handle those extra tiles
			UINT32 tileIdx;
			UINT32 numTiles;
			if(groupIdx < gpuSortProps.extraTiles)
			{
				numTiles = gpuSortProps.tilesPerGroup + 1;
				tileIdx = groupIdx * numTiles;
			}
			else
			{
				numTiles = gpuSortProps.tilesPerGroup;
				tileIdx = groupIdx * numTiles + gpuSortProps.extraTiles;
			}

			// We need to generate per-thread offsets (prefix sum) of where to store the keys at
			// (This is equivalent to what was done in count & prefix sum shaders, except that was done per-group)

			//// First, count all digits
			UINT32 keyBegin[NUM_THREADS];
			for(UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
				keyBegin[threadId] = tileIdx * TILE_SIZE;

			// Blelchoch-style exclusive scan over sLocalScratch; also captures per-digit tile totals before zeroing roots
			auto prefixSum = [&sLocalScratch, &sCurrentTileTotal]()
			{
				// Upsweep to generate partial sums
				UINT32 offsets[NUM_THREADS];
				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
					offsets[threadId] = 1;

				for (UINT32 i = NUM_THREADS >> 1; i > 0; i >>= 1)
				{
					for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
					{
						if (threadId < i)
						{
							// Note: If I run more than NUM_THREADS threads I wouldn't have to
							// iterate over all digits in a single thread
							// Note: Perhaps run part of this step serially for better performance
							for (UINT32 j = 0; j < NUM_DIGITS; j++)
							{
								UINT32 idx0 = (offsets[threadId] * (2 * threadId + 1) - 1) * NUM_DIGITS + j;
								UINT32 idx1 = (offsets[threadId] * (2 * threadId + 2) - 1) * NUM_DIGITS + j;

								// Note: Check and remove bank conflicts
								sLocalScratch[idx1] += sLocalScratch[idx0];
							}
						}

						offsets[threadId] <<= 1;
					}
				}

				// Set tree roots to zero (prepare for downsweep)
				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
				{
					if (threadId < NUM_DIGITS)
					{
						UINT32 idx = (NUM_THREADS - 1) * NUM_DIGITS + threadId;
						sCurrentTileTotal[threadId] = sLocalScratch[idx];

						sLocalScratch[idx] = 0;
					}
				}

				// Downsweep to calculate the prefix sum from partial sums that were generated
				// during upsweep
				for (UINT32 i = 1; i < NUM_THREADS; i <<= 1)
				{
					for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
					{
						offsets[threadId] >>= 1;

						if (threadId < i)
						{
							for (UINT32 j = 0; j < NUM_DIGITS; j++)
							{
								UINT32 idx0 = (offsets[threadId] * (2 * threadId + 1) - 1) * NUM_DIGITS + j;
								UINT32 idx1 = (offsets[threadId] * (2 * threadId + 2) - 1) * NUM_DIGITS + j;

								// Note: Check and resolve bank conflicts
								UINT32 temp = sLocalScratch[idx0];
								sLocalScratch[idx0] = sLocalScratch[idx1];
								sLocalScratch[idx1] += temp;
							}
						}
					}
				}
			};

			// Note: this loop counter shadows the outer 'tileIdx' (the group's starting tile)
			for(UINT32 tileIdx = 0; tileIdx < numTiles; tileIdx++)
			{
				// Zero out local counter
				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
					for (UINT32 i = 0; i < NUM_DIGITS; i++)
						sLocalScratch[i * NUM_THREADS + threadId] = 0;

				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
				{
					for (UINT32 i = 0; i < KEYS_PER_LOOP; i++)
					{
						UINT32 idx = keyBegin[threadId] + threadId * KEYS_PER_LOOP + i;
						UINT32 key = inputKeys[idx];
						UINT32 digit = (key >> bitOffset) & KEY_MASK;

						sLocalScratch[threadId * NUM_DIGITS + digit] += 1;
					}
				}

				prefixSum();

				// Actually re-order the keys
				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
				{
					UINT32 localOffsets[NUM_DIGITS];
					for (UINT32 i = 0; i < NUM_DIGITS; i++)
						localOffsets[i] = 0;

					for (UINT32 i = 0; i < KEYS_PER_LOOP; i++)
					{
						UINT32 idx = keyBegin[threadId] + threadId * KEYS_PER_LOOP + i;
						UINT32 key = inputKeys[idx];
						UINT32 digit = (key >> bitOffset) & KEY_MASK;

						UINT32 offset = sGroupOffsets[digit] + sTileTotals[digit] + sLocalScratch[threadId * NUM_DIGITS + digit] + localOffsets[digit];
						localOffsets[digit]++;

						// Note: First write to local memory then attempt to coalesce when writing to global?
						sortedKeys[offset] = key;
					}
				}

				// Update tile totals
				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
				{
					if (threadId < NUM_DIGITS)
						sTileTotals[threadId] += sCurrentTileTotal[threadId];
				}

				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
					keyBegin[threadId] += TILE_SIZE;
			}

			// The last group re-orders the keys that don't fill a whole tile, guarding reads against the key count
			if (groupIdx == (gpuSortProps.numGroups - 1) && gpuSortProps.extraKeys > 0)
			{
				// Zero out local counter
				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
					for (UINT32 i = 0; i < NUM_DIGITS; i++)
						sLocalScratch[i * NUM_THREADS + threadId] = 0;

				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
				{
					for (UINT32 i = 0; i < KEYS_PER_LOOP; i++)
					{
						UINT32 localIdx = threadId * KEYS_PER_LOOP + i;

						if (localIdx >= gpuSortProps.extraKeys)
							continue;

						UINT32 idx = keyBegin[threadId] + localIdx;
						UINT32 key = inputKeys[idx];
						UINT32 digit = (key >> bitOffset) & KEY_MASK;

						sLocalScratch[threadId * NUM_DIGITS + digit] += 1;
					}
				}

				prefixSum();

				// Actually re-order the keys
				for (UINT32 threadId = 0; threadId < NUM_THREADS; threadId++)
				{
					UINT32 localOffsets[NUM_DIGITS];
					for (UINT32 i = 0; i < NUM_DIGITS; i++)
						localOffsets[i] = 0;

					for (UINT32 i = 0; i < KEYS_PER_LOOP; i++)
					{
						UINT32 localIdx = threadId * KEYS_PER_LOOP + i;

						if (localIdx >= gpuSortProps.extraKeys)
							continue;

						UINT32 idx = keyBegin[threadId] + localIdx;
						UINT32 key = inputKeys[idx];
						UINT32 digit = (key >> bitOffset) & KEY_MASK;

						UINT32 offset = sGroupOffsets[digit] + sTileTotals[digit] + sLocalScratch[threadId * NUM_DIGITS + digit] + localOffsets[digit];
						localOffsets[digit]++;

						// Note: First write to local memory then attempt to coalesce when writing to global?
						sortedKeys[offset] = key;
					}
				}
			}
		}

		// PARALLEL:
		RadixSortReorderMat::get()->execute(gpuSortProps.numGroups, params, helperBuffers[1], sortBuffers, 0);
		RenderAPI::instance().submitCommandBuffer(nullptr);

		// Compare with GPU keys (reorder writes into the second buffer of the ping-pong pair)
		Vector<UINT32> bufferSortedKeys(count);
		sortBuffers.keys[1]->readData(0, count * sizeof(UINT32), bufferSortedKeys.data());

		for(UINT32 i = 0; i < count; i++)
			assert(bufferSortedKeys[i] == sortedKeys[i]);

		// Ensure everything is actually sorted
		assert(std::is_sorted(sortedKeys.begin(), sortedKeys.end()));
	}
| 798 | }} | 
| 799 |  |