| 1 | //************************************ bs::framework - Copyright 2018 Marko Pintera **************************************// |
| 2 | //*********** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ***********// |
| 3 | #pragma once |
| 4 | |
| 5 | #include "BsCorePrerequisites.h" |
| 6 | #include "Utility/BsModule.h" |
| 7 | |
| 8 | namespace bs |
| 9 | { |
| 10 | /** @addtogroup Profiling |
| 11 | * @{ |
| 12 | */ |
| 13 | |
| 14 | class CPUProfilerReport; |
| 15 | |
| 16 | /** |
| 17 | * Provides various performance measuring methods. |
| 18 | * |
| 19 | * @note Thread safe. Matching begin* \ end* calls must belong to the same thread though. |
| 20 | */ |
| 21 | class BS_CORE_EXPORT ProfilerCPU : public Module<ProfilerCPU> |
| 22 | { |
| 23 | /** Timer class responsible for tracking elapsed time. */ |
| 24 | class Timer |
| 25 | { |
| 26 | public: |
| 27 | Timer(); |
| 28 | |
| 29 | /** Sets the start time for the timer. */ |
| 30 | void start(); |
| 31 | |
| 32 | /** Stops the timer and calculates the elapsed time from start time to now. */ |
| 33 | void stop(); |
| 34 | |
| 35 | /** Resets the elapsed time to zero. */ |
| 36 | void reset(); |
| 37 | |
| 38 | double time; |
| 39 | private: |
| 40 | double startTime = 0.0f; |
| 41 | std::chrono::high_resolution_clock mHRClock; |
| 42 | |
| 43 | /** Returns time elapsed since CPU was started in millseconds. */ |
| 44 | inline double getCurrentTime() const; |
| 45 | }; |
| 46 | |
| 47 | /** Timer class responsible for tracking number of elapsed CPU cycles. */ |
| 48 | class TimerPrecise |
| 49 | { |
| 50 | public: |
| 51 | TimerPrecise(); |
| 52 | |
| 53 | /** Starts the counter marking the current number of executed CPU cycles since CPU was started. */ |
| 54 | void start(); |
| 55 | |
| 56 | /** Ends the counter and calculates the number of CPU cycles between now and the start time. */ |
| 57 | void stop(); |
| 58 | |
| 59 | /** Resets the cycle count to zero. */ |
| 60 | void reset(); |
| 61 | |
| 62 | UINT64 cycles; |
| 63 | private: |
| 64 | UINT64 startCycles; |
| 65 | |
| 66 | /** Queries the CPU for the current number of CPU cycles executed since the program was started. */ |
| 67 | static inline UINT64 getNumCycles(); |
| 68 | }; |
| 69 | |
| 70 | /** |
| 71 | * Contains data about a single profiler sample (counting time in milliseconds). |
| 72 | * |
| 73 | * @note |
| 74 | * A sample is created whenever a named profile block is entered. For example if you have a function you are |
| 75 | * profiling, and it gets called 10 times, there will be 10 samples. |
| 76 | */ |
| 77 | struct ProfileSample |
| 78 | { |
| 79 | ProfileSample(double _time, UINT64 _numAllocs, UINT64 _numFrees) |
| 80 | :time(_time), numAllocs(_numAllocs), numFrees(_numFrees) |
| 81 | { } |
| 82 | |
| 83 | double time; |
| 84 | UINT64 numAllocs; |
| 85 | UINT64 numFrees; |
| 86 | }; |
| 87 | |
| 88 | /** |
| 89 | * Contains data about a single precise profiler sample (counting CPU cycles). |
| 90 | * |
| 91 | * @note |
| 92 | * A sample is created whenever a named profile block is entered. For example if you have a function you are |
| 93 | * profiling, and it gets called 10 times, there will be 10 samples. |
| 94 | */ |
| 95 | struct PreciseProfileSample |
| 96 | { |
| 97 | PreciseProfileSample(UINT64 _cycles, UINT64 _numAllocs, UINT64 _numFrees) |
| 98 | :cycles(_cycles), numAllocs(_numAllocs), numFrees(_numFrees) |
| 99 | { } |
| 100 | |
| 101 | UINT64 cycles; |
| 102 | UINT64 numAllocs; |
| 103 | UINT64 numFrees; |
| 104 | }; |
| 105 | |
| 106 | /** Contains basic (time based) profiling data contained in a profiling block. */ |
| 107 | struct ProfileData |
| 108 | { |
| 109 | ProfileData(FrameAlloc* alloc); |
| 110 | |
| 111 | /** Begins a new sample and records current sample state. Previous sample must not be active. */ |
| 112 | void beginSample(); |
| 113 | |
| 114 | /** |
| 115 | * Records current sample state and creates a new sample based on start and end state. Adds the sample to the |
| 116 | * sample list. |
| 117 | */ |
| 118 | void endSample(); |
| 119 | |
| 120 | /** |
| 121 | * Removes the last added sample from the sample list and makes it active again. You must call endSample() |
| 122 | * when done as if you called beginSample(). |
| 123 | */ |
| 124 | void resumeLastSample(); |
| 125 | |
| 126 | Vector<ProfileSample, StdFrameAlloc<ProfileSample>> samples; |
| 127 | Timer timer; |
| 128 | |
| 129 | UINT64 memAllocs; |
| 130 | UINT64 memFrees; |
| 131 | }; |
| 132 | |
| 133 | /** Contains precise (CPU cycle based) profiling data contained in a profiling block. */ |
| 134 | struct PreciseProfileData |
| 135 | { |
| 136 | PreciseProfileData(FrameAlloc* alloc); |
| 137 | |
| 138 | /** Begins a new sample and records current sample state. Previous sample must not be active. */ |
| 139 | void beginSample(); |
| 140 | |
| 141 | /** |
| 142 | * Records current sample state and creates a new sample based on start and end state. Adds the sample to the |
| 143 | * sample list. |
| 144 | */ |
| 145 | void endSample(); |
| 146 | |
| 147 | /** |
| 148 | * Removes the last added sample from the sample list and makes it active again. You must call endSample() |
| 149 | * when done as if you called beginSample. |
| 150 | */ |
| 151 | void resumeLastSample(); |
| 152 | |
| 153 | Vector<PreciseProfileSample, StdFrameAlloc<PreciseProfileSample>> samples; |
| 154 | TimerPrecise timer; |
| 155 | |
| 156 | UINT64 memAllocs; |
| 157 | UINT64 memFrees; |
| 158 | }; |
| 159 | |
| 160 | /** |
| 161 | * Contains all sampling information about a single named profiling block. Each block has its own sampling |
| 162 | * information and optionally child blocks. |
| 163 | */ |
| 164 | struct ProfiledBlock |
| 165 | { |
| 166 | ProfiledBlock(FrameAlloc* alloc); |
| 167 | ~ProfiledBlock(); |
| 168 | |
| 169 | /** Attempts to find a child block with the specified name. Returns null if not found. */ |
| 170 | ProfiledBlock* findChild(const char* name) const; |
| 171 | |
| 172 | char* name; |
| 173 | |
| 174 | ProfileData basic; |
| 175 | PreciseProfileData precise; |
| 176 | |
| 177 | Vector<ProfiledBlock*, StdFrameAlloc<ProfiledBlock*>> children; |
| 178 | }; |
| 179 | |
| 180 | /** CPU sampling type. */ |
| 181 | enum class ActiveSamplingType |
| 182 | { |
| 183 | Basic, /**< Sample using milliseconds. */ |
| 184 | Precise /**< Sample using CPU cycles. */ |
| 185 | }; |
| 186 | |
| 187 | /** Contains data about the currently active profiling block. */ |
| 188 | struct ActiveBlock |
| 189 | { |
| 190 | ActiveBlock() |
| 191 | :type(ActiveSamplingType::Basic), block(nullptr) |
| 192 | { } |
| 193 | |
| 194 | ActiveBlock(ActiveSamplingType _type, ProfiledBlock* _block) |
| 195 | :type(_type), block(_block) |
| 196 | { } |
| 197 | |
| 198 | ActiveSamplingType type; |
| 199 | ProfiledBlock* block; |
| 200 | }; |
| 201 | |
| 202 | /** Contains data about an active profiling thread. */ |
| 203 | struct ThreadInfo |
| 204 | { |
| 205 | ThreadInfo(); |
| 206 | |
| 207 | /** |
| 208 | * Starts profiling on the thread. New primary profiling block is created with the given name. |
| 209 | */ |
| 210 | void begin(const char* _name); |
| 211 | |
| 212 | /** |
| 213 | * Ends profiling on the thread. You should end all samples before calling this, but if you don't they will be |
| 214 | * terminated automatically. |
| 215 | */ |
| 216 | void end(); |
| 217 | |
| 218 | /** |
| 219 | * Deletes all internal profiling data and makes the object ready for another iteration. Should be called |
| 220 | * after end in order to delete any existing data. |
| 221 | */ |
| 222 | void reset(); |
| 223 | |
| 224 | /** Gets the primary profiling block used by the thread. */ |
| 225 | ProfiledBlock* getBlock(const char* name); |
| 226 | |
| 227 | /** Deletes the provided block. */ |
| 228 | void releaseBlock(ProfiledBlock* block); |
| 229 | |
| 230 | static BS_THREADLOCAL ThreadInfo* activeThread; |
| 231 | bool isActive = false; |
| 232 | |
| 233 | ProfiledBlock* rootBlock = nullptr; |
| 234 | |
| 235 | FrameAlloc frameAlloc; |
| 236 | ActiveBlock activeBlock; |
| 237 | Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>* activeBlocks = nullptr; |
| 238 | }; |
| 239 | |
| 240 | public: |
| 241 | ProfilerCPU(); |
| 242 | ~ProfilerCPU(); |
| 243 | |
| 244 | /** |
| 245 | * Registers a new thread we will be doing sampling in. This needs to be called before any beginSample* \ endSample* |
| 246 | * calls are made in that thread. |
| 247 | * |
| 248 | * @param[in] name Name that will allow you to more easily identify the thread. |
| 249 | */ |
| 250 | void beginThread(const char* name); |
| 251 | |
| 252 | /** Ends sampling for the current thread. No beginSample* \ endSample* calls after this point. */ |
| 253 | void endThread(); |
| 254 | |
| 255 | /** |
| 256 | * Begins sample measurement. Must be followed by endSample(). |
| 257 | * |
| 258 | * @param[in] name Unique name for the sample you can later use to find the sampling data. |
| 259 | */ |
| 260 | void beginSample(const char* name); |
| 261 | |
| 262 | /** |
| 263 | * Ends sample measurement. |
| 264 | * |
| 265 | * @param[in] name Unique name for the sample. |
| 266 | * |
| 267 | * @note |
| 268 | * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name in |
| 269 | * beginSample() would be enough. |
| 270 | */ |
| 271 | void endSample(const char* name); |
| 272 | |
| 273 | /** |
| 274 | * Begins precise sample measurement. Must be followed by endSamplePrecise(). |
| 275 | * |
| 276 | * @param[in] name Unique name for the sample you can later use to find the sampling data. |
| 277 | * |
| 278 | * @note |
| 279 | * This method uses very precise CPU counters to determine variety of data not provided by standard beginSample(). |
| 280 | * However due to the way these counters work you should not use this method for larger parts of code. It does not |
| 281 | * consider context switches so if the OS decides to switch context between measurements you will get invalid data. |
| 282 | */ |
| 283 | void beginSamplePrecise(const char* name); |
| 284 | |
| 285 | /** |
| 286 | * Ends precise sample measurement. |
| 287 | * |
| 288 | * @param[in] name Unique name for the sample. |
| 289 | * |
| 290 | * @note |
| 291 | * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name |
| 292 | * in beginSamplePrecise() would be enough. |
| 293 | */ |
| 294 | void endSamplePrecise(const char* name); |
| 295 | |
| 296 | /** Clears all sampling data, and ends any unfinished sampling blocks. */ |
| 297 | void reset(); |
| 298 | |
| 299 | /** |
| 300 | * Generates a report from all previously sampled data. |
| 301 | * |
| 302 | * @note Generating a report will stop all in-progress sampling. You should make sure |
| 303 | * you call endSample* manually beforehand so this doesn't have to happen. |
| 304 | */ |
| 305 | CPUProfilerReport generateReport(); |
| 306 | |
| 307 | private: |
| 308 | /** |
| 309 | * Calculates overhead that the timing and sampling methods themselves introduce so we might get more accurate |
| 310 | * measurements when creating reports. |
| 311 | */ |
| 312 | void estimateTimerOverhead(); |
| 313 | |
| 314 | private: |
| 315 | double mBasicTimerOverhead = 0.0; |
| 316 | UINT64 mPreciseTimerOverhead = 0; |
| 317 | |
| 318 | double mBasicSamplingOverheadMs = 0.0; |
| 319 | double mPreciseSamplingOverheadMs = 0.0; |
| 320 | UINT64 mBasicSamplingOverheadCycles = 0; |
| 321 | UINT64 mPreciseSamplingOverheadCycles = 0; |
| 322 | |
| 323 | ProfilerVector<ThreadInfo*> mActiveThreads; |
| 324 | Mutex mThreadSync; |
| 325 | }; |
| 326 | |
| 327 | /** Profiling entry containing information about a single CPU profiling block containing timing information. */ |
| 328 | struct BS_CORE_EXPORT CPUProfilerBasicSamplingEntry |
| 329 | { |
| 330 | struct BS_CORE_EXPORT Data |
| 331 | { |
| 332 | Data() = default; |
| 333 | |
| 334 | String name; /**< Name of the profiling block. */ |
| 335 | UINT32 numCalls = 0; /**< Number of times the block was entered. */ |
| 336 | |
| 337 | UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */ |
| 338 | UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */ |
| 339 | |
| 340 | double avgTimeMs = 0.0; /**< Average time it took to execute the block, per call. In milliseconds. */ |
| 341 | double maxTimeMs = 0.0; /**< Maximum time of a single call in the block. In milliseconds. */ |
| 342 | double totalTimeMs = 0.0; /**< Total time the block took, across all calls. In milliseconds. */ |
| 343 | |
| 344 | double avgSelfTimeMs = 0.0; /**< Average time it took to execute the block, per call. Ignores time used by child blocks. In milliseconds. */ |
| 345 | double totalSelfTimeMs = 0.0; /**< Total time the block took, across all calls. Ignores time used by child blocks. In milliseconds. */ |
| 346 | |
| 347 | double estimatedSelfOverheadMs = 0.0; /**< Estimated overhead of profiling methods, only for this exact block. In milliseconds. */ |
| 348 | double estimatedOverheadMs = 0.0; /**< Estimated overhead of profiling methods for this block and all children. In milliseconds. */ |
| 349 | |
| 350 | float pctOfParent = 1.0f; /**< Percent of parent block time this block took to execute. Ranging [0.0, 1.0]. */ |
| 351 | } data; |
| 352 | |
| 353 | ProfilerVector<CPUProfilerBasicSamplingEntry> childEntries; |
| 354 | }; |
| 355 | |
| 356 | /** |
| 357 | * Profiling entry containing information about a single CPU profiling block containing CPU cycle count based |
| 358 | * information. |
| 359 | */ |
| 360 | struct BS_CORE_EXPORT CPUProfilerPreciseSamplingEntry |
| 361 | { |
| 362 | struct BS_CORE_EXPORT Data |
| 363 | { |
| 364 | Data() = default; |
| 365 | |
| 366 | String name; /**< Name of the profiling block. */ |
| 367 | UINT32 numCalls = 0; /**< Number of times the block was entered. */ |
| 368 | |
| 369 | UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */ |
| 370 | UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */ |
| 371 | |
| 372 | UINT64 avgCycles = 0; /**< Average number of cycles it took to execute the block, per call. */ |
| 373 | UINT64 maxCycles = 0; /**< Maximum number of cycles of a single call in the block. */ |
| 374 | UINT64 totalCycles = 0; /**< Total number of cycles across all calls in the block. */ |
| 375 | |
| 376 | UINT64 avgSelfCycles = 0; /**< Average number of cycles it took to execute the block, per call. Ignores cycles used by child blocks. */ |
| 377 | UINT64 totalSelfCycles = 0; /**< Total number of cycles across all calls in the block. Ignores time used by child blocks. */ |
| 378 | |
| 379 | UINT64 estimatedSelfOverhead = 0; /**< Estimated overhead of profiling methods, only for this exact block. In cycles. */ |
| 380 | UINT64 estimatedOverhead = 0; /**< Estimated overhead of profiling methods for this block and all children. In cycles. */ |
| 381 | |
| 382 | float pctOfParent = 1.0f; /**< Percent of parent block cycles used by this block. Ranging [0.0, 1.0]. */ |
| 383 | } data; |
| 384 | |
| 385 | ProfilerVector<CPUProfilerPreciseSamplingEntry> childEntries; |
| 386 | }; |
| 387 | |
| 388 | /** CPU profiling report containing all profiling information for a single profiling session. */ |
| 389 | class BS_CORE_EXPORT CPUProfilerReport |
| 390 | { |
| 391 | public: |
| 392 | CPUProfilerReport() = default; |
| 393 | |
| 394 | /** |
| 395 | * Returns root entry for the basic (time based) sampling data. Root entry always contains the profiling block |
| 396 | * associated with the entire thread. |
| 397 | */ |
| 398 | const CPUProfilerBasicSamplingEntry& getBasicSamplingData() const { return mBasicSamplingRootEntry; } |
| 399 | |
| 400 | /** |
| 401 | * Returns root entry for the precise (CPU cycle based) sampling data. Root entry always contains the profiling |
| 402 | * block associated with the entire thread. |
| 403 | */ |
| 404 | const CPUProfilerPreciseSamplingEntry& getPreciseSamplingData() const { return mPreciseSamplingRootEntry; } |
| 405 | |
| 406 | private: |
| 407 | friend class ProfilerCPU; |
| 408 | |
| 409 | CPUProfilerBasicSamplingEntry mBasicSamplingRootEntry; |
| 410 | CPUProfilerPreciseSamplingEntry mPreciseSamplingRootEntry; |
| 411 | }; |
| 412 | |
| 413 | /** Provides global access to ProfilerCPU instance. */ |
| 414 | BS_CORE_EXPORT ProfilerCPU& gProfilerCPU(); |
| 415 | |
/**
 * Shortcut for profiling a single function call: wraps @p call between a beginSample() / endSample() pair issued on
 * the global CPU profiler.
 *
 * @note
 * Expands to a plain braced block, so @p name is evaluated twice and anything declared by @p call is scoped to that
 * block. Because the expansion is not a single statement, wrap the macro in braces when using it as the body of an
 * if that has an else branch.
 */
#define PROFILE_CALL(call, name)					\
	{												\
		bs::gProfilerCPU().beginSample(name);		\
		call;										\
		bs::gProfilerCPU().endSample(name);			\
	}
| 423 | |
| 424 | /** @} */ |
| 425 | } |
| 426 | |