1 | /* |
2 | * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. |
3 | * |
4 | * NVIDIA CORPORATION and its licensors retain all intellectual property |
5 | * and proprietary rights in and to this software, related documentation |
6 | * and any modifications thereto. Any use, reproduction, disclosure or |
7 | * distribution of this software and related documentation without an express |
8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. |
9 | */ |
10 | |
11 | #ifndef PX_GPU_DISPATCHER_H |
12 | #define PX_GPU_DISPATCHER_H |
13 | |
14 | #include "pxtask/PxTask.h" |
15 | #include "pxtask/PxGpuCopyDesc.h" |
16 | |
17 | /* forward decl to avoid including <cuda.h> */ |
18 | typedef struct CUstream_st* CUstream; |
19 | |
20 | #ifndef PX_DOXYGEN |
21 | namespace physx |
22 | { |
23 | #endif |
24 | |
25 | PX_PUSH_PACK_DEFAULT |
26 | |
27 | class PxCudaContextManager; |
28 | class PxTaskManager; |
29 | |
/** \brief A GpuTask dispatcher
 *
 * A PxGpuDispatcher executes GpuTasks submitted by one or more TaskManagers (one
 * or more scenes). It maintains a CPU worker thread which waits on GpuTask
 * "groups" to be submitted. The submission API is explicitly sessioned so that
 * GpuTasks are dispatched together as a group whenever possible to improve
 * parallelism on the GPU.
 *
 * A PxGpuDispatcher cannot be allocated ad-hoc, they are created as a result of
 * creating a PxCudaContextManager. Every PxCudaContextManager has a PxGpuDispatcher
 * instance that can be queried. In this way, each PxGpuDispatcher is tied to
 * exactly one CUDA context.
 *
 * A scene will use CPU fallback Tasks for GpuTasks if the PxTaskManager provided
 * to it does not have a PxGpuDispatcher. For this reason, the PxGpuDispatcher must
 * be assigned to the PxTaskManager before the PxTaskManager is given to a scene.
 *
 * Multiple TaskManagers may safely share a single PxGpuDispatcher instance, thus
 * enabling scenes to share a CUDA context.
 *
 * Only failureDetected() is intended for use by the user. The rest of the
 * PxGpuDispatcher public methods are reserved for internal use by
 * TaskManagers and GpuTasks.
 */
class PxGpuDispatcher
{
public:
    /** \brief Record the start of a simulation step
     *
     * A PxTaskManager calls this function to record the beginning of a simulation
     * step. The PxGpuDispatcher uses this notification to initialize the
     * profiler state.
     */
    virtual void startSimulation() = 0;

    /** \brief Record the start of a GpuTask batch submission
     *
     * A PxTaskManager calls this function to notify the PxGpuDispatcher that one or
     * more GpuTasks are about to be submitted for execution. The PxGpuDispatcher
     * will not read the incoming task queue until it receives one finishGroup()
     * call for each startGroup() call. This is to ensure as many GpuTasks as
     * possible are executed together as a group, generating optimal parallelism
     * on the GPU.
     */
    virtual void startGroup() = 0;

    /** \brief Submit a GpuTask for execution
     *
     * Submitted tasks are pushed onto an incoming queue. The PxGpuDispatcher
     * will take the contents of this queue every time the pending group count
     * reaches 0 and run the group of submitted GpuTasks as an interleaved
     * group.
     */
    virtual void submitTask(PxTask& task) = 0;

    /** \brief Record the end of a GpuTask batch submission
     *
     * A PxTaskManager calls this function to notify the PxGpuDispatcher that it is
     * done submitting a group of GpuTasks (GpuTasks which were all made ready
     * to run by the same prerequisite dependency becoming resolved). If no
     * other group submissions are in progress, the PxGpuDispatcher will execute
     * the set of ready tasks.
     */
    virtual void finishGroup() = 0;

    /** \brief Add a CUDA completion prerequisite dependency to a task
     *
     * A GpuTask calls this function to add a prerequisite dependency on another
     * task (usually a CpuTask) preventing that task from starting until all of
     * the CUDA kernels and copies already launched have been completed. The
     * PxGpuDispatcher will increment that task's reference count, blocking its
     * execution, until the CUDA work is complete.
     *
     * This is generally only required when a CPU task is expecting the results
     * of the CUDA kernels to have been copied into host memory.
     *
     * This mechanism is not required to ensure CUDA kernels and
     * copies are issued in the correct order. Kernel issue order is determined
     * by normal task dependencies. The rule of thumb is to only use a blocking
     * completion prerequisite if the task in question depends on a completed
     * GPU->Host DMA.
     *
     * The PxGpuDispatcher issues a blocking event record to CUDA for the purposes
     * of tracking the already submitted CUDA work. When this event is
     * resolved, the PxGpuDispatcher manually decrements the reference count of
     * the specified task, allowing it to execute (assuming it does not have
     * other pending prerequisites).
     */
    virtual void addCompletionPrereq(PxBaseTask& task) = 0;

    /** \brief Retrieve the PxCudaContextManager associated with this
     * PxGpuDispatcher
     *
     * Every PxCudaContextManager has one PxGpuDispatcher, and every PxGpuDispatcher
     * has one PxCudaContextManager.
     */
    virtual PxCudaContextManager* getCudaContextManager() = 0;

    /** \brief Record the end of a simulation frame
     *
     * A PxTaskManager calls this function to record the completion of its
     * dependency graph. If profiling is enabled, the PxGpuDispatcher will
     * trigger the retrieval of profiling data from the GPU at this point.
     */
    virtual void stopSimulation() = 0;

    /** \brief Returns true if a CUDA call has returned a non-recoverable error
     *
     * A return value of true indicates a fatal error has occurred. To protect
     * itself, the PxGpuDispatcher enters a fall through mode that allows GpuTasks
     * to complete without being executed. This allows simulations to continue
     * but leaves GPU content static or corrupted.
     *
     * The user may try to recover from these failures by deleting GPU content
     * so the visual artifacts are minimized. But there is no way to recover
     * the state of the GPU actors before the failure. Once a CUDA context is
     * in this state, the only recourse is to create a new CUDA context, a new
     * scene, and start over.
     *
     * This is our "Best Effort" attempt to not turn a soft failure into a hard
     * failure because continued use of a CUDA context after it has returned an
     * error will usually result in a driver reset. However if the initial
     * failure was serious enough, a reset may have already occurred by the time
     * we learn of it.
     */
    virtual bool failureDetected() const = 0;

    /** \brief Force the PxGpuDispatcher into failure mode
     *
     * This API should be used if user code detects a non-recoverable CUDA
     * error. This ensures the PxGpuDispatcher does not launch any further
     * CUDA work. Subsequent calls to failureDetected() will return true.
     */
    virtual void forceFailureMode() = 0;

    /** \brief Returns a pointer to the current in-use profile buffer
     *
     * The returned pointer should be passed to all kernel launches to enable
     * CTA/Warp level profiling. If a data collector is not attached, or CTA
     * profiling is not enabled, the pointer will be zero.
     */
    virtual void* getCurrentProfileBuffer() const = 0;

    /** \brief Register kernel names with PlatformAnalyzer
     *
     * The returned PxU16 must be stored and used as a base offset for the ID
     * passed to the KERNEL_START|STOP_EVENT macros.
     */
    virtual PxU16 registerKernelNames(const char**, PxU16 count) = 0;

    /** \brief Launch a copy kernel with arbitrary number of copy commands
     *
     * This method is intended to be called from Kernel GpuTasks, but it can
     * function outside of that context as well.
     *
     * If count is 1, the descriptor is passed to the kernel as arguments, so it
     * may be declared on the stack.
     *
     * If count is greater than 1, the kernel will read the descriptors out of
     * host memory. Because of this, the descriptor array must be located in
     * page locked (pinned) memory. The provided descriptors may be modified by
     * this method (converting host pointers to their GPU mapped equivalents)
     * and should be considered *owned* by CUDA until the current batch of work
     * has completed, so descriptor arrays should not be freed or modified until
     * you have received a completion notification.
     *
     * If your GPU does not support mapping of page locked memory (a capability
     * of devices of SM 1.1 and higher), this function degrades to calling CUDA
     * copy methods.
     */
    virtual void launchCopyKernel(PxGpuCopyDesc* desc, PxU32 count, CUstream stream) = 0;

    /** \brief Query pre launch task that runs before launching gpu kernels.
     *
     * This is part of an optional feature to schedule multiple gpu features
     * at the same time to get kernels to run in parallel.
     * \note Do *not* set the continuation on the returned task, but use addPreLaunchDependent().
     */
    virtual PxBaseTask& getPreLaunchTask() = 0;

    /** \brief Adds a gpu launch task that gets executed after the pre launch task.
     *
     * This is part of an optional feature to schedule multiple gpu features
     * at the same time to get kernels to run in parallel.
     * \note Each call adds a reference to the pre-launch task.
     */
    virtual void addPreLaunchDependent(PxBaseTask& dependent) = 0;

    /** \brief Query post launch task that runs after the gpu is done.
     *
     * This is part of an optional feature to schedule multiple gpu features
     * at the same time to get kernels to run in parallel.
     * \note Do *not* set the continuation on the returned task, but use addPostLaunchDependent().
     */
    virtual PxBaseTask& getPostLaunchTask() = 0;

    /** \brief Adds a task that gets executed after the post launch task.
     *
     * This is part of an optional feature to schedule multiple gpu features
     * at the same time to get kernels to run in parallel.
     * \note Each call adds a reference to the post-launch task.
     */
    virtual void addPostLaunchDependent(PxBaseTask& dependent) = 0;

protected:
    /** \brief protected destructor
     *
     * GpuDispatchers are allocated and freed by their PxCudaContextManager,
     * so user code must never delete one directly.
     */
    virtual ~PxGpuDispatcher() {}
};
240 | |
241 | PX_POP_PACK |
242 | |
243 | #ifndef PX_DOXYGEN |
244 | } // end physx namespace |
245 | #endif |
246 | |
247 | #endif |
248 | |