1// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
#include "ComputeProgram.hpp"
#include "Constants.hpp"

#include "Vulkan/VkDebug.hpp"
#include "Vulkan/VkPipelineLayout.hpp"

#include "marl/defer.h"
#include "marl/trace.h"
#include "marl/waitgroup.h"

#include <cstdint>
#include <memory>
#include <queue>
#include <vector>
26
namespace
{
	// Names for the three axes of the compute grid. They are used in this
	// file both as plain array subscripts and as vector lane selectors, so
	// they are kept as an unscoped enum that converts implicitly to int.
	enum
	{
		X = 0,
		Y = 1,
		Z = 2,
	};
} // anonymous namespace
31
32namespace sw
33{
34 ComputeProgram::ComputeProgram(SpirvShader const *shader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets)
35 : shader(shader),
36 pipelineLayout(pipelineLayout),
37 descriptorSets(descriptorSets)
38 {
39 }
40
41 ComputeProgram::~ComputeProgram()
42 {
43 }
44
45 void ComputeProgram::generate()
46 {
47 MARL_SCOPED_EVENT("ComputeProgram::generate");
48
49 SpirvRoutine routine(pipelineLayout);
50 shader->emitProlog(&routine);
51 emit(&routine);
52 shader->emitEpilog(&routine);
53 }
54
55 void ComputeProgram::setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3])
56 {
57 routine->setInputBuiltin(shader, spv::BuiltInNumWorkgroups, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
58 {
59 auto numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
60 for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
61 {
62 value[builtin.FirstComponent + component] =
63 As<SIMD::Float>(SIMD::Int(Extract(numWorkgroups, component)));
64 }
65 });
66
67 routine->setInputBuiltin(shader, spv::BuiltInWorkgroupId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
68 {
69 for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
70 {
71 value[builtin.FirstComponent + component] =
72 As<SIMD::Float>(SIMD::Int(workgroupID[component]));
73 }
74 });
75
76 routine->setInputBuiltin(shader, spv::BuiltInWorkgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
77 {
78 auto workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
79 for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
80 {
81 value[builtin.FirstComponent + component] =
82 As<SIMD::Float>(SIMD::Int(Extract(workgroupSize, component)));
83 }
84 });
85
86 routine->setInputBuiltin(shader, spv::BuiltInNumSubgroups, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
87 {
88 ASSERT(builtin.SizeInComponents == 1);
89 auto subgroupsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, subgroupsPerWorkgroup));
90 value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupsPerWorkgroup));
91 });
92
93 routine->setInputBuiltin(shader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
94 {
95 ASSERT(builtin.SizeInComponents == 1);
96 auto invocationsPerSubgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerSubgroup));
97 value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(invocationsPerSubgroup));
98 });
99
100 routine->setImmutableInputBuiltins(shader);
101 }
102
103 void ComputeProgram::setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine* routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex)
104 {
105 Int4 numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
106 Int4 workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
107
108 // TODO: Fix Int4 swizzles so we can just use workgroupSize.x, workgroupSize.y.
109 Int workgroupSizeX = Extract(workgroupSize, X);
110 Int workgroupSizeY = Extract(workgroupSize, Y);
111
112 SIMD::Int localInvocationID[3];
113 {
114 SIMD::Int idx = localInvocationIndex;
115 localInvocationID[Z] = idx / SIMD::Int(workgroupSizeX * workgroupSizeY);
116 idx -= localInvocationID[Z] * SIMD::Int(workgroupSizeX * workgroupSizeY); // modulo
117 localInvocationID[Y] = idx / SIMD::Int(workgroupSizeX);
118 idx -= localInvocationID[Y] * SIMD::Int(workgroupSizeX); // modulo
119 localInvocationID[X] = idx;
120 }
121
122 routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
123 {
124 ASSERT(builtin.SizeInComponents == 1);
125 value[builtin.FirstComponent] = As<SIMD::Float>(localInvocationIndex);
126 });
127
128 routine->setInputBuiltin(shader, spv::BuiltInSubgroupId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
129 {
130 ASSERT(builtin.SizeInComponents == 1);
131 value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupIndex));
132 });
133
134 routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
135 {
136 for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
137 {
138 value[builtin.FirstComponent + component] =
139 As<SIMD::Float>(localInvocationID[component]);
140 }
141 });
142
143 routine->setInputBuiltin(shader, spv::BuiltInGlobalInvocationId, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
144 {
145 SIMD::Int wgID = 0;
146 wgID = Insert(wgID, workgroupID[X], X);
147 wgID = Insert(wgID, workgroupID[Y], Y);
148 wgID = Insert(wgID, workgroupID[Z], Z);
149 auto localBase = workgroupSize * wgID;
150 for (uint32_t component = 0; component < builtin.SizeInComponents; component++)
151 {
152 auto globalInvocationID = SIMD::Int(Extract(localBase, component)) + localInvocationID[component];
153 value[builtin.FirstComponent + component] = As<SIMD::Float>(globalInvocationID);
154 }
155 });
156 }
157
158 void ComputeProgram::emit(SpirvRoutine* routine)
159 {
160 Pointer<Byte> data = Arg<0>();
161 Int workgroupX = Arg<1>();
162 Int workgroupY = Arg<2>();
163 Int workgroupZ = Arg<3>();
164 Pointer<Byte> workgroupMemory = Arg<4>();
165 Int firstSubgroup = Arg<5>();
166 Int subgroupCount = Arg<6>();
167
168 routine->descriptorSets = data + OFFSET(Data, descriptorSets);
169 routine->descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
170 routine->pushConstants = data + OFFSET(Data, pushConstants);
171 routine->constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
172 routine->workgroupMemory = workgroupMemory;
173
174 Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));
175
176 Int workgroupID[3] = {workgroupX, workgroupY, workgroupZ};
177 setWorkgroupBuiltins(data, routine, workgroupID);
178
179 For(Int i = 0, i < subgroupCount, i++)
180 {
181 auto subgroupIndex = firstSubgroup + i;
182
183 // TODO: Replace SIMD::Int(0, 1, 2, 3) with SIMD-width equivalent
184 auto localInvocationIndex = SIMD::Int(subgroupIndex * SIMD::Width) + SIMD::Int(0, 1, 2, 3);
185
186 // Disable lanes where (invocationIDs >= invocationsPerWorkgroup)
187 auto activeLaneMask = CmpLT(localInvocationIndex, SIMD::Int(invocationsPerWorkgroup));
188
189 setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);
190
191 shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
192 }
193 }
194
195 void ComputeProgram::run(
196 vk::DescriptorSet::Bindings const &descriptorSets,
197 vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
198 PushConstantStorage const &pushConstants,
199 uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
200 uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
201 {
202 auto &modes = shader->getModes();
203
204 auto invocationsPerSubgroup = SIMD::Width;
205 auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ;
206 auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
207
208 Data data;
209 data.descriptorSets = descriptorSets;
210 data.descriptorDynamicOffsets = descriptorDynamicOffsets;
211 data.numWorkgroups[X] = groupCountX;
212 data.numWorkgroups[Y] = groupCountY;
213 data.numWorkgroups[Z] = groupCountZ;
214 data.numWorkgroups[3] = 0;
215 data.workgroupSize[X] = modes.WorkgroupSizeX;
216 data.workgroupSize[Y] = modes.WorkgroupSizeY;
217 data.workgroupSize[Z] = modes.WorkgroupSizeZ;
218 data.workgroupSize[3] = 0;
219 data.invocationsPerSubgroup = invocationsPerSubgroup;
220 data.invocationsPerWorkgroup = invocationsPerWorkgroup;
221 data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
222 data.pushConstants = pushConstants;
223 data.constants = &sw::constants;
224
225 marl::WaitGroup wg;
226 const uint32_t batchCount = 16;
227
228 auto groupCount = groupCountX * groupCountY * groupCountZ;
229
230 for (uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
231 {
232 wg.add(1);
233 marl::schedule([=, &data]
234 {
235 defer(wg.done());
236 std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
237
238 for (uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
239 {
240 auto modulo = groupIndex;
241 auto groupOffsetZ = modulo / (groupCountX * groupCountY);
242 modulo -= groupOffsetZ * (groupCountX * groupCountY);
243 auto groupOffsetY = modulo / groupCountX;
244 modulo -= groupOffsetY * groupCountX;
245 auto groupOffsetX = modulo;
246
247 auto groupZ = baseGroupZ + groupOffsetZ;
248 auto groupY = baseGroupY + groupOffsetY;
249 auto groupX = baseGroupX + groupOffsetX;
250 MARL_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);
251
252 using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
253 std::queue<Coroutine> coroutines;
254
255 if (modes.ContainsControlBarriers)
256 {
257 // Make a function call per subgroup so each subgroup
258 // can yield, bringing all subgroups to the barrier
259 // together.
260 for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
261 {
262 auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
263 coroutines.push(std::move(coroutine));
264 }
265 }
266 else
267 {
268 auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
269 coroutines.push(std::move(coroutine));
270 }
271
272 while (coroutines.size() > 0)
273 {
274 auto coroutine = std::move(coroutines.front());
275 coroutines.pop();
276
277 SpirvShader::YieldResult result;
278 if (coroutine->await(result))
279 {
280 // TODO: Consider result (when the enum is more than 1 entry).
281 coroutines.push(std::move(coroutine));
282 }
283 }
284 }
285 });
286 }
287
288 wg.wait();
289 }
290
291} // namespace sw
292