| 1 | /* |
| 2 | * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. |
| 3 | * |
| 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property |
| 5 | * and proprietary rights in and to this software, related documentation |
| 6 | * and any modifications thereto. Any use, reproduction, disclosure or |
| 7 | * distribution of this software and related documentation without an express |
| 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. |
| 9 | */ |
| 10 | |
| 11 | #ifndef PX_GPU_DISPATCHER_H |
| 12 | #define PX_GPU_DISPATCHER_H |
| 13 | |
| 14 | #include "pxtask/PxTask.h" |
| 15 | #include "pxtask/PxGpuCopyDesc.h" |
| 16 | |
| 17 | /* forward decl to avoid including <cuda.h> */ |
| 18 | typedef struct CUstream_st* CUstream; |
| 19 | |
| 20 | #ifndef PX_DOXYGEN |
| 21 | namespace physx |
| 22 | { |
| 23 | #endif |
| 24 | |
| 25 | PX_PUSH_PACK_DEFAULT |
| 26 | |
| 27 | class PxCudaContextManager; |
| 28 | class PxTaskManager; |
| 29 | |
| 30 | /** \brief A GpuTask dispatcher |
| 31 | * |
| 32 | * A PxGpuDispatcher executes GpuTasks submitted by one or more TaskManagers (one |
| 33 | * or more scenes). It maintains a CPU worker thread which waits on GpuTask |
| 34 | * "groups" to be submitted. The submission API is explicitly sessioned so that |
| 35 | * GpuTasks are dispatched together as a group whenever possible to improve |
| 36 | * parallelism on the GPU. |
| 37 | * |
| 38 | * A PxGpuDispatcher cannot be allocated ad-hoc, they are created as a result of |
| 39 | * creating a PxCudaContextManager. Every PxCudaContextManager has a PxGpuDispatcher |
| 40 | * instance that can be queried. In this way, each PxGpuDispatcher is tied to |
| 41 | * exactly one CUDA context. |
| 42 | * |
| 43 | * A scene will use CPU fallback Tasks for GpuTasks if the PxTaskManager provided |
| 44 | * to it does not have a PxGpuDispatcher. For this reason, the PxGpuDispatcher must |
| 45 | * be assigned to the PxTaskManager before the PxTaskManager is given to a scene. |
| 46 | * |
| 47 | * Multiple TaskManagers may safely share a single PxGpuDispatcher instance, thus |
| 48 | * enabling scenes to share a CUDA context. |
| 49 | * |
 * Only failureDetected() is intended for use by the user. The rest of the
 * PxGpuDispatcher public methods are reserved for internal use by
 * TaskManagers and GpuTasks.
| 53 | */ |
class PxGpuDispatcher
{
public:
	/** \brief Record the start of a simulation step
	 *
	 * A PxTaskManager calls this function to record the beginning of a simulation
	 * step. The PxGpuDispatcher uses this notification to initialize the
	 * profiler state.
	 */
	virtual void startSimulation() = 0;

	/** \brief Record the start of a GpuTask batch submission
	 *
	 * A PxTaskManager calls this function to notify the PxGpuDispatcher that one or
	 * more GpuTasks are about to be submitted for execution. The PxGpuDispatcher
	 * will not read the incoming task queue until it receives one finishGroup()
	 * call for each startGroup() call. This is to ensure as many GpuTasks as
	 * possible are executed together as a group, generating optimal parallelism
	 * on the GPU.
	 */
	virtual void startGroup() = 0;

	/** \brief Submit a GpuTask for execution
	 *
	 * Submitted tasks are pushed onto an incoming queue. The PxGpuDispatcher
	 * will take the contents of this queue every time the pending group count
	 * reaches 0 and run the group of submitted GpuTasks as an interleaved
	 * group.
	 *
	 * \param task The GpuTask to queue for execution in the current group.
	 */
	virtual void submitTask(PxTask& task) = 0;

	/** \brief Record the end of a GpuTask batch submission
	 *
	 * A PxTaskManager calls this function to notify the PxGpuDispatcher that it is
	 * done submitting a group of GpuTasks (GpuTasks which were all made ready
	 * to run by the same prerequisite dependency becoming resolved). If no
	 * other group submissions are in progress, the PxGpuDispatcher will execute
	 * the set of ready tasks.
	 */
	virtual void finishGroup() = 0;

	/** \brief Add a CUDA completion prerequisite dependency to a task
	 *
	 * A GpuTask calls this function to add a prerequisite dependency on another
	 * task (usually a CpuTask) preventing that task from starting until all of
	 * the CUDA kernels and copies already launched have been completed. The
	 * PxGpuDispatcher will increment that task's reference count, blocking its
	 * execution, until the CUDA work is complete.
	 *
	 * This is generally only required when a CPU task is expecting the results
	 * of the CUDA kernels to have been copied into host memory.
	 *
	 * This mechanism is not required to ensure CUDA kernels and
	 * copies are issued in the correct order. Kernel issue order is determined
	 * by normal task dependencies. The rule of thumb is to only use a blocking
	 * completion prerequisite if the task in question depends on a completed
	 * GPU->Host DMA.
	 *
	 * The PxGpuDispatcher issues a blocking event record to CUDA for the purposes
	 * of tracking the already submitted CUDA work. When this event is
	 * resolved, the PxGpuDispatcher manually decrements the reference count of
	 * the specified task, allowing it to execute (assuming it does not have
	 * other pending prerequisites).
	 *
	 * \param task The task whose execution must wait on already-launched CUDA work.
	 */
	virtual void addCompletionPrereq(PxBaseTask& task) = 0;

	/** \brief Retrieve the PxCudaContextManager associated with this
	 * PxGpuDispatcher
	 *
	 * Every PxCudaContextManager has one PxGpuDispatcher, and every PxGpuDispatcher
	 * has one PxCudaContextManager.
	 *
	 * \return The PxCudaContextManager this dispatcher is bound to.
	 */
	virtual PxCudaContextManager* getCudaContextManager() = 0;

	/** \brief Record the end of a simulation frame
	 *
	 * A PxTaskManager calls this function to record the completion of its
	 * dependency graph. If profiling is enabled, the PxGpuDispatcher will
	 * trigger the retrieval of profiling data from the GPU at this point.
	 */
	virtual void stopSimulation() = 0;

	/** \brief Returns true if a CUDA call has returned a non-recoverable error
	 *
	 * A return value of true indicates a fatal error has occurred. To protect
	 * itself, the PxGpuDispatcher enters a fall through mode that allows GpuTasks
	 * to complete without being executed. This allows simulations to continue
	 * but leaves GPU content static or corrupted.
	 *
	 * The user may try to recover from these failures by deleting GPU content
	 * so the visual artifacts are minimized. But there is no way to recover
	 * the state of the GPU actors before the failure. Once a CUDA context is
	 * in this state, the only recourse is to create a new CUDA context, a new
	 * scene, and start over.
	 *
	 * This is our "Best Effort" attempt to not turn a soft failure into a hard
	 * failure because continued use of a CUDA context after it has returned an
	 * error will usually result in a driver reset. However if the initial
	 * failure was serious enough, a reset may have already occurred by the time
	 * we learn of it.
	 *
	 * \return True if the dispatcher is in failure (fall-through) mode.
	 */
	virtual bool failureDetected() const = 0;

	/** \brief Force the PxGpuDispatcher into failure mode
	 *
	 * This API should be used if user code detects a non-recoverable CUDA
	 * error. This ensures the PxGpuDispatcher does not launch any further
	 * CUDA work. Subsequent calls to failureDetected() will return true.
	 */
	virtual void forceFailureMode() = 0;

	/** \brief Returns a pointer to the current in-use profile buffer
	 *
	 * The returned pointer should be passed to all kernel launches to enable
	 * CTA/Warp level profiling. If a data collector is not attached, or CTA
	 * profiling is not enabled, the pointer will be zero.
	 *
	 * \return The active profile buffer, or zero when profiling is unavailable.
	 */
	virtual void* getCurrentProfileBuffer() const = 0;

	/** \brief Register kernel names with PlatformAnalyzer
	 *
	 * The returned PxU16 must be stored and used as a base offset for the ID
	 * passed to the KERNEL_START|STOP_EVENT macros.
	 *
	 * \param count The number of kernel name strings being registered.
	 * \return The base offset assigned to the registered kernel names.
	 */
	virtual PxU16 registerKernelNames(const char**, PxU16 count) = 0;

	/** \brief Launch a copy kernel with arbitrary number of copy commands
	 *
	 * This method is intended to be called from Kernel GpuTasks, but it can
	 * function outside of that context as well.
	 *
	 * If count is 1, the descriptor is passed to the kernel as arguments, so it
	 * may be declared on the stack.
	 *
	 * If count is greater than 1, the kernel will read the descriptors out of
	 * host memory. Because of this, the descriptor array must be located in
	 * page locked (pinned) memory. The provided descriptors may be modified by
	 * this method (converting host pointers to their GPU mapped equivalents)
	 * and should be considered *owned* by CUDA until the current batch of work
	 * has completed, so descriptor arrays should not be freed or modified until
	 * you have received a completion notification.
	 *
	 * If your GPU does not support mapping of page locked memory (SM>=1.1),
	 * this function degrades to calling CUDA copy methods.
	 *
	 * \param desc   Pointer to one or more copy descriptors (pinned memory when count > 1).
	 * \param count  Number of descriptors in the array.
	 * \param stream CUDA stream on which the copy work is issued.
	 */
	virtual void launchCopyKernel(PxGpuCopyDesc* desc, PxU32 count, CUstream stream) = 0;

	/** \brief Query pre launch task that runs before launching gpu kernels.
	 *
	 * This is part of an optional feature to schedule multiple gpu features
	 * at the same time to get kernels to run in parallel.
	 * \note Do *not* set the continuation on the returned task, but use addPreLaunchDependent().
	 */
	virtual PxBaseTask& getPreLaunchTask() = 0;

	/** \brief Adds a gpu launch task that gets executed after the pre launch task.
	 *
	 * This is part of an optional feature to schedule multiple gpu features
	 * at the same time to get kernels to run in parallel.
	 * \note Each call adds a reference to the pre-launch task.
	 *
	 * \param dependent The task to run once the pre-launch task has executed.
	 */
	virtual void addPreLaunchDependent(PxBaseTask& dependent) = 0;

	/** \brief Query post launch task that runs after the gpu is done.
	 *
	 * This is part of an optional feature to schedule multiple gpu features
	 * at the same time to get kernels to run in parallel.
	 * \note Do *not* set the continuation on the returned task, but use addPostLaunchDependent().
	 */
	virtual PxBaseTask& getPostLaunchTask() = 0;

	/** \brief Adds a task that gets executed after the post launch task.
	 *
	 * This is part of an optional feature to schedule multiple gpu features
	 * at the same time to get kernels to run in parallel.
	 * \note Each call adds a reference to the post-launch task.
	 *   (Original said "pre-launch task" — a copy-paste slip from
	 *   addPreLaunchDependent(); this method mirrors that API for the
	 *   post-launch task returned by getPostLaunchTask().)
	 *
	 * \param dependent The task to run once the post launch task has executed.
	 */
	virtual void addPostLaunchDependent(PxBaseTask& dependent) = 0;

protected:
	/** \brief protected destructor
	 *
	 * GpuDispatchers are allocated and freed by their PxCudaContextManager.
	 */
	virtual ~PxGpuDispatcher() {}
};
| 240 | |
| 241 | PX_POP_PACK |
| 242 | |
| 243 | #ifndef PX_DOXYGEN |
| 244 | } // end physx namespace |
| 245 | #endif |
| 246 | |
| 247 | #endif |
| 248 | |