/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

#ifndef PX_GPU_DISPATCHER_H
#define PX_GPU_DISPATCHER_H

#include "pxtask/PxTask.h"
#include "pxtask/PxGpuCopyDesc.h"

/* forward decl to avoid including <cuda.h> */
typedef struct CUstream_st* CUstream;

#ifndef PX_DOXYGEN
namespace physx
{
#endif

PX_PUSH_PACK_DEFAULT

class PxCudaContextManager;
class PxTaskManager;

/** \brief A GpuTask dispatcher
 *
 * A PxGpuDispatcher executes GpuTasks submitted by one or more TaskManagers (one
 * or more scenes). It maintains a CPU worker thread which waits on GpuTask
 * "groups" to be submitted. The submission API is explicitly sessioned so that
 * GpuTasks are dispatched together as a group whenever possible, to improve
 * parallelism on the GPU.
 *
 * A PxGpuDispatcher cannot be allocated ad hoc; one is created as a result of
 * creating a PxCudaContextManager. Every PxCudaContextManager has a PxGpuDispatcher
 * instance that can be queried. In this way, each PxGpuDispatcher is tied to
 * exactly one CUDA context.
 *
 * A scene will use CPU fallback Tasks for GpuTasks if the PxTaskManager provided
 * to it does not have a PxGpuDispatcher. For this reason, the PxGpuDispatcher must
 * be assigned to the PxTaskManager before the PxTaskManager is given to a scene.
 *
 * Multiple TaskManagers may safely share a single PxGpuDispatcher instance, thus
 * enabling scenes to share a CUDA context.
 *
 * Only failureDetected() is intended for use by the user. The remaining public
 * methods of PxGpuDispatcher are reserved for internal use by TaskManagers and
 * GpuTasks.
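 *
 * Example (an illustrative sketch, not normative): wiring a PxGpuDispatcher
 * into a scene via its scene descriptor. PxCreateCudaContextManager,
 * PxSceneDesc, and the foundation/physics/cpuDispatcher/profileZoneManager
 * objects are assumed here from the wider PhysX 3.x API; they are not
 * declared in this header:
 * \code
 * PxCudaContextManagerDesc ctxDesc;
 * PxCudaContextManager* ctxMgr = PxCreateCudaContextManager(foundation, ctxDesc, profileZoneManager);
 * if (ctxMgr && !ctxMgr->contextIsValid())
 * {
 *     ctxMgr->release(); // CUDA context could not be created
 *     ctxMgr = NULL;
 * }
 * PxSceneDesc sceneDesc(physics->getTolerancesScale());
 * sceneDesc.cpuDispatcher = cpuDispatcher;
 * if (ctxMgr)
 * {
 *     // must be assigned before the scene is created, or the scene
 *     // will fall back to CPU Tasks for all GpuTasks
 *     sceneDesc.gpuDispatcher = ctxMgr->getGpuDispatcher();
 * }
 * PxScene* scene = physics->createScene(sceneDesc);
 * \endcode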
 */
class PxGpuDispatcher
{
public:
    /** \brief Record the start of a simulation step
     *
     * A PxTaskManager calls this function to record the beginning of a simulation
     * step. The PxGpuDispatcher uses this notification to initialize the
     * profiler state.
     */
    virtual void startSimulation() = 0;

    /** \brief Record the start of a GpuTask batch submission
     *
     * A PxTaskManager calls this function to notify the PxGpuDispatcher that one or
     * more GpuTasks are about to be submitted for execution. The PxGpuDispatcher
     * will not read the incoming task queue until it receives one finishGroup()
     * call for each startGroup() call. This is to ensure as many GpuTasks as
     * possible are executed together as a group, generating optimal parallelism
     * on the GPU.
     */
    virtual void startGroup() = 0;

    /** \brief Submit a GpuTask for execution
     *
     * Submitted tasks are pushed onto an incoming queue. The PxGpuDispatcher
     * will take the contents of this queue every time the pending group count
     * reaches 0 and run the group of submitted GpuTasks as an interleaved
     * group.
     */
    virtual void submitTask(PxTask& task) = 0;

    /** \brief Record the end of a GpuTask batch submission
     *
     * A PxTaskManager calls this function to notify the PxGpuDispatcher that it is
     * done submitting a group of GpuTasks (GpuTasks which were all made ready
     * to run by the resolution of the same prerequisite dependency). If no
     * other group submissions are in progress, the PxGpuDispatcher will execute
     * the set of ready tasks.
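     *
     * For illustration, the driving sequence looks roughly like this (a sketch
     * of how a PxTaskManager uses the sessioned API internally; user code does
     * not normally call these methods):
     * \code
     * gpuDispatcher->startGroup();          // open a submission session
     * gpuDispatcher->submitTask(gpuTaskA);  // queue GpuTasks made ready together
     * gpuDispatcher->submitTask(gpuTaskB);
     * gpuDispatcher->finishGroup();         // pending count reaches 0, group is dispatched
     * \endcode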
     */
    virtual void finishGroup() = 0;

    /** \brief Add a CUDA completion prerequisite dependency to a task
     *
     * A GpuTask calls this function to add a prerequisite dependency on another
     * task (usually a CpuTask), preventing that task from starting until all of
     * the CUDA kernels and copies already launched have been completed. The
     * PxGpuDispatcher will increment that task's reference count, blocking its
     * execution, until the CUDA work is complete.
     *
     * This is generally only required when a CPU task is expecting the results
     * of the CUDA kernels to have been copied into host memory.
     *
     * This mechanism is not required to ensure CUDA kernels and copies are
     * issued in the correct order. Kernel issue order is determined by normal
     * task dependencies. The rule of thumb is to only use a blocking
     * completion prerequisite if the task in question depends on a completed
     * GPU->Host DMA.
     *
     * The PxGpuDispatcher issues a blocking event record to CUDA for the purposes
     * of tracking the already submitted CUDA work. When this event is
     * resolved, the PxGpuDispatcher manually decrements the reference count of
     * the specified task, allowing it to execute (assuming it does not have
     * other pending prerequisites).
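     *
     * A minimal sketch of the intended pattern, called from within a GpuTask
     * after it has queued a device-to-host copy (gpuDispatcher and
     * processResultsTask are assumed names, not part of this header):
     * \code
     * // inside a GpuTask, after issuing the GPU->Host DMA on its stream:
     * gpuDispatcher->addCompletionPrereq(processResultsTask);
     * // processResultsTask will not run until the queued CUDA work completes
     * \endcode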
     */
    virtual void addCompletionPrereq(PxBaseTask& task) = 0;

    /** \brief Retrieve the PxCudaContextManager associated with this
     * PxGpuDispatcher
     *
     * Every PxCudaContextManager has one PxGpuDispatcher, and every PxGpuDispatcher
     * has one PxCudaContextManager.
     */
    virtual PxCudaContextManager* getCudaContextManager() = 0;

    /** \brief Record the end of a simulation frame
     *
     * A PxTaskManager calls this function to record the completion of its
     * dependency graph. If profiling is enabled, the PxGpuDispatcher will
     * trigger the retrieval of profiling data from the GPU at this point.
     */
    virtual void stopSimulation() = 0;

    /** \brief Returns true if a CUDA call has returned a non-recoverable error
     *
     * A return value of true indicates a fatal error has occurred. To protect
     * itself, the PxGpuDispatcher enters a fall-through mode that allows GpuTasks
     * to complete without being executed. This allows simulations to continue
     * but leaves GPU content static or corrupted.
     *
     * The user may try to recover from these failures by deleting GPU content
     * so the visual artifacts are minimized. But there is no way to recover
     * the state of the GPU actors before the failure. Once a CUDA context is
     * in this state, the only recourse is to create a new CUDA context, a new
     * scene, and start over.
     *
     * This is a "best effort" attempt not to turn a soft failure into a hard
     * failure, because continued use of a CUDA context after it has returned an
     * error will usually result in a driver reset. However, if the initial
     * failure was serious enough, a reset may have already occurred by the time
     * we learn of it.
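     *
     * A hedged sketch of the intended per-frame check (the scene/context
     * teardown described is an assumption about application structure, not an
     * API defined in this header):
     * \code
     * if (gpuDispatcher->failureDetected())
     * {
     *     // GPU state is unrecoverable: release the scene and the CUDA
     *     // context, then recreate both to resume GPU simulation.
     * }
     * \endcode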
     */
    virtual bool failureDetected() const = 0;

    /** \brief Force the PxGpuDispatcher into failure mode
     *
     * This API should be used if user code detects a non-recoverable CUDA
     * error. This ensures the PxGpuDispatcher does not launch any further
     * CUDA work. Subsequent calls to failureDetected() will return true.
     */
    virtual void forceFailureMode() = 0;

    /** \brief Returns a pointer to the current in-use profile buffer
     *
     * The returned pointer should be passed to all kernel launches to enable
     * CTA/Warp level profiling. If a data collector is not attached, or CTA
     * profiling is not enabled, the pointer will be zero.
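     *
     * For example (illustrative only; how the pointer is consumed by a kernel
     * is outside the scope of this header):
     * \code
     * void* profileBuffer = gpuDispatcher->getCurrentProfileBuffer();
     * // pass profileBuffer to each kernel launch; it may be NULL when CTA
     * // profiling is disabled, and kernels must tolerate that.
     * \endcode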
     */
    virtual void* getCurrentProfileBuffer() const = 0;

    /** \brief Register kernel names with PlatformAnalyzer
     *
     * The returned PxU16 must be stored and used as a base offset for the ID
     * passed to the KERNEL_START|STOP_EVENT macros.
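     *
     * A minimal sketch of the intended use (the kernel names shown are
     * placeholders):
     * \code
     * static const char* kernelNames[] = { "updateKernel", "integrateKernel" };
     * PxU16 baseId = gpuDispatcher->registerKernelNames(kernelNames, 2);
     * // later, pass (baseId + 0) or (baseId + 1) as the kernel ID to the
     * // KERNEL_START/STOP_EVENT profiling macros
     * \endcode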
     */
    virtual PxU16 registerKernelNames(const char**, PxU16 count) = 0;

    /** \brief Launch a copy kernel with an arbitrary number of copy commands
     *
     * This method is intended to be called from Kernel GpuTasks, but it can
     * function outside of that context as well.
     *
     * If count is 1, the descriptor is passed to the kernel as arguments, so it
     * may be declared on the stack.
     *
     * If count is greater than 1, the kernel will read the descriptors out of
     * host memory. Because of this, the descriptor array must be located in
     * page locked (pinned) memory. The provided descriptors may be modified by
     * this method (converting host pointers to their GPU mapped equivalents)
     * and should be considered *owned* by CUDA until the current batch of work
     * has completed, so descriptor arrays should not be freed or modified until
     * you have received a completion notification.
     *
     * If your GPU does not support mapping of page locked memory (a feature of
     * SM 1.1 and later devices), this method degrades to calling CUDA copy
     * methods.
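     *
     * A sketch of a single-descriptor launch, assuming the PxGpuCopyDesc layout
     * from "pxtask/PxGpuCopyDesc.h" (addresses stored as size_t) and a stream
     * owned by the calling GpuTask; the buffer names are placeholders:
     * \code
     * PxGpuCopyDesc desc;                       // count == 1, so stack storage is fine
     * desc.type   = PxGpuCopyDesc::DeviceToHost;
     * desc.dest   = (size_t)hostResultBuffer;   // page locked host memory
     * desc.source = (size_t)deviceResultBuffer; // CUDA device pointer
     * desc.bytes  = resultSizeInBytes;
     * gpuDispatcher->launchCopyKernel(&desc, 1, myStream);
     * \endcode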
     */
    virtual void launchCopyKernel(PxGpuCopyDesc* desc, PxU32 count, CUstream stream) = 0;

    /** \brief Query the pre-launch task that runs before launching GPU kernels.
     *
     * This is part of an optional feature to schedule multiple GPU features
     * at the same time, to get kernels to run in parallel.
     * \note Do *not* set the continuation on the returned task; use addPreLaunchDependent() instead.
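     *
     * A sketch of the intended pattern across the pre/post launch methods
     * (myLaunchTask and myFinishTask are assumed application tasks, not part
     * of this header):
     * \code
     * // run myLaunchTask after the shared pre-launch task, so its kernels
     * // are issued alongside those of other GPU features:
     * gpuDispatcher->addPreLaunchDependent(myLaunchTask);
     * // run myFinishTask once the GPU work of the whole group is done:
     * gpuDispatcher->addPostLaunchDependent(myFinishTask);
     * \endcode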
     */
    virtual PxBaseTask& getPreLaunchTask() = 0;

    /** \brief Adds a GPU launch task that gets executed after the pre-launch task.
     *
     * This is part of an optional feature to schedule multiple GPU features
     * at the same time, to get kernels to run in parallel.
     * \note Each call adds a reference to the pre-launch task.
     */
    virtual void addPreLaunchDependent(PxBaseTask& dependent) = 0;

    /** \brief Query the post-launch task that runs after the GPU work is done.
     *
     * This is part of an optional feature to schedule multiple GPU features
     * at the same time, to get kernels to run in parallel.
     * \note Do *not* set the continuation on the returned task; use addPostLaunchDependent() instead.
     */
    virtual PxBaseTask& getPostLaunchTask() = 0;

    /** \brief Adds a task that gets executed after the post-launch task.
     *
     * This is part of an optional feature to schedule multiple GPU features
     * at the same time, to get kernels to run in parallel.
     * \note Each call adds a reference to the post-launch task.
     */
    virtual void addPostLaunchDependent(PxBaseTask& dependent) = 0;

protected:
    /** \brief Protected destructor
     *
     * GpuDispatchers are allocated and freed by their PxCudaContextManager.
     */
    virtual ~PxGpuDispatcher() {}
};

PX_POP_PACK

#ifndef PX_DOXYGEN
} // end physx namespace
#endif

#endif