1 | //************************************ bs::framework - Copyright 2018 Marko Pintera **************************************// |
2 | //*********** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ***********// |
3 | #pragma once |
4 | |
5 | #include "BsCorePrerequisites.h" |
6 | #include "Utility/BsModule.h" |
7 | |
8 | namespace bs |
9 | { |
10 | /** @addtogroup Profiling |
11 | * @{ |
12 | */ |
13 | |
14 | class CPUProfilerReport; |
15 | |
16 | /** |
17 | * Provides various performance measuring methods. |
18 | * |
19 | * @note Thread safe. Matching begin* \ end* calls must belong to the same thread though. |
20 | */ |
21 | class BS_CORE_EXPORT ProfilerCPU : public Module<ProfilerCPU> |
22 | { |
23 | /** Timer class responsible for tracking elapsed time. */ |
24 | class Timer |
25 | { |
26 | public: |
27 | Timer(); |
28 | |
29 | /** Sets the start time for the timer. */ |
30 | void start(); |
31 | |
32 | /** Stops the timer and calculates the elapsed time from start time to now. */ |
33 | void stop(); |
34 | |
35 | /** Resets the elapsed time to zero. */ |
36 | void reset(); |
37 | |
38 | double time; |
39 | private: |
40 | double startTime = 0.0f; |
41 | std::chrono::high_resolution_clock mHRClock; |
42 | |
43 | /** Returns time elapsed since CPU was started in millseconds. */ |
44 | inline double getCurrentTime() const; |
45 | }; |
46 | |
47 | /** Timer class responsible for tracking number of elapsed CPU cycles. */ |
48 | class TimerPrecise |
49 | { |
50 | public: |
51 | TimerPrecise(); |
52 | |
53 | /** Starts the counter marking the current number of executed CPU cycles since CPU was started. */ |
54 | void start(); |
55 | |
56 | /** Ends the counter and calculates the number of CPU cycles between now and the start time. */ |
57 | void stop(); |
58 | |
59 | /** Resets the cycle count to zero. */ |
60 | void reset(); |
61 | |
62 | UINT64 cycles; |
63 | private: |
64 | UINT64 startCycles; |
65 | |
66 | /** Queries the CPU for the current number of CPU cycles executed since the program was started. */ |
67 | static inline UINT64 getNumCycles(); |
68 | }; |
69 | |
70 | /** |
71 | * Contains data about a single profiler sample (counting time in milliseconds). |
72 | * |
73 | * @note |
74 | * A sample is created whenever a named profile block is entered. For example if you have a function you are |
75 | * profiling, and it gets called 10 times, there will be 10 samples. |
76 | */ |
77 | struct ProfileSample |
78 | { |
79 | ProfileSample(double _time, UINT64 _numAllocs, UINT64 _numFrees) |
80 | :time(_time), numAllocs(_numAllocs), numFrees(_numFrees) |
81 | { } |
82 | |
83 | double time; |
84 | UINT64 numAllocs; |
85 | UINT64 numFrees; |
86 | }; |
87 | |
88 | /** |
89 | * Contains data about a single precise profiler sample (counting CPU cycles). |
90 | * |
91 | * @note |
92 | * A sample is created whenever a named profile block is entered. For example if you have a function you are |
93 | * profiling, and it gets called 10 times, there will be 10 samples. |
94 | */ |
95 | struct PreciseProfileSample |
96 | { |
97 | PreciseProfileSample(UINT64 _cycles, UINT64 _numAllocs, UINT64 _numFrees) |
98 | :cycles(_cycles), numAllocs(_numAllocs), numFrees(_numFrees) |
99 | { } |
100 | |
101 | UINT64 cycles; |
102 | UINT64 numAllocs; |
103 | UINT64 numFrees; |
104 | }; |
105 | |
106 | /** Contains basic (time based) profiling data contained in a profiling block. */ |
107 | struct ProfileData |
108 | { |
109 | ProfileData(FrameAlloc* alloc); |
110 | |
111 | /** Begins a new sample and records current sample state. Previous sample must not be active. */ |
112 | void beginSample(); |
113 | |
114 | /** |
115 | * Records current sample state and creates a new sample based on start and end state. Adds the sample to the |
116 | * sample list. |
117 | */ |
118 | void endSample(); |
119 | |
120 | /** |
121 | * Removes the last added sample from the sample list and makes it active again. You must call endSample() |
122 | * when done as if you called beginSample(). |
123 | */ |
124 | void resumeLastSample(); |
125 | |
126 | Vector<ProfileSample, StdFrameAlloc<ProfileSample>> samples; |
127 | Timer timer; |
128 | |
129 | UINT64 memAllocs; |
130 | UINT64 memFrees; |
131 | }; |
132 | |
133 | /** Contains precise (CPU cycle based) profiling data contained in a profiling block. */ |
134 | struct PreciseProfileData |
135 | { |
136 | PreciseProfileData(FrameAlloc* alloc); |
137 | |
138 | /** Begins a new sample and records current sample state. Previous sample must not be active. */ |
139 | void beginSample(); |
140 | |
141 | /** |
142 | * Records current sample state and creates a new sample based on start and end state. Adds the sample to the |
143 | * sample list. |
144 | */ |
145 | void endSample(); |
146 | |
147 | /** |
148 | * Removes the last added sample from the sample list and makes it active again. You must call endSample() |
149 | * when done as if you called beginSample. |
150 | */ |
151 | void resumeLastSample(); |
152 | |
153 | Vector<PreciseProfileSample, StdFrameAlloc<PreciseProfileSample>> samples; |
154 | TimerPrecise timer; |
155 | |
156 | UINT64 memAllocs; |
157 | UINT64 memFrees; |
158 | }; |
159 | |
160 | /** |
161 | * Contains all sampling information about a single named profiling block. Each block has its own sampling |
162 | * information and optionally child blocks. |
163 | */ |
164 | struct ProfiledBlock |
165 | { |
166 | ProfiledBlock(FrameAlloc* alloc); |
167 | ~ProfiledBlock(); |
168 | |
169 | /** Attempts to find a child block with the specified name. Returns null if not found. */ |
170 | ProfiledBlock* findChild(const char* name) const; |
171 | |
172 | char* name; |
173 | |
174 | ProfileData basic; |
175 | PreciseProfileData precise; |
176 | |
177 | Vector<ProfiledBlock*, StdFrameAlloc<ProfiledBlock*>> children; |
178 | }; |
179 | |
180 | /** CPU sampling type. */ |
181 | enum class ActiveSamplingType |
182 | { |
183 | Basic, /**< Sample using milliseconds. */ |
184 | Precise /**< Sample using CPU cycles. */ |
185 | }; |
186 | |
187 | /** Contains data about the currently active profiling block. */ |
188 | struct ActiveBlock |
189 | { |
190 | ActiveBlock() |
191 | :type(ActiveSamplingType::Basic), block(nullptr) |
192 | { } |
193 | |
194 | ActiveBlock(ActiveSamplingType _type, ProfiledBlock* _block) |
195 | :type(_type), block(_block) |
196 | { } |
197 | |
198 | ActiveSamplingType type; |
199 | ProfiledBlock* block; |
200 | }; |
201 | |
202 | /** Contains data about an active profiling thread. */ |
203 | struct ThreadInfo |
204 | { |
205 | ThreadInfo(); |
206 | |
207 | /** |
208 | * Starts profiling on the thread. New primary profiling block is created with the given name. |
209 | */ |
210 | void begin(const char* _name); |
211 | |
212 | /** |
213 | * Ends profiling on the thread. You should end all samples before calling this, but if you don't they will be |
214 | * terminated automatically. |
215 | */ |
216 | void end(); |
217 | |
218 | /** |
219 | * Deletes all internal profiling data and makes the object ready for another iteration. Should be called |
220 | * after end in order to delete any existing data. |
221 | */ |
222 | void reset(); |
223 | |
224 | /** Gets the primary profiling block used by the thread. */ |
225 | ProfiledBlock* getBlock(const char* name); |
226 | |
227 | /** Deletes the provided block. */ |
228 | void releaseBlock(ProfiledBlock* block); |
229 | |
230 | static BS_THREADLOCAL ThreadInfo* activeThread; |
231 | bool isActive = false; |
232 | |
233 | ProfiledBlock* rootBlock = nullptr; |
234 | |
235 | FrameAlloc frameAlloc; |
236 | ActiveBlock activeBlock; |
237 | Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>* activeBlocks = nullptr; |
238 | }; |
239 | |
240 | public: |
241 | ProfilerCPU(); |
242 | ~ProfilerCPU(); |
243 | |
244 | /** |
245 | * Registers a new thread we will be doing sampling in. This needs to be called before any beginSample* \ endSample* |
246 | * calls are made in that thread. |
247 | * |
248 | * @param[in] name Name that will allow you to more easily identify the thread. |
249 | */ |
250 | void beginThread(const char* name); |
251 | |
252 | /** Ends sampling for the current thread. No beginSample* \ endSample* calls after this point. */ |
253 | void endThread(); |
254 | |
255 | /** |
256 | * Begins sample measurement. Must be followed by endSample(). |
257 | * |
258 | * @param[in] name Unique name for the sample you can later use to find the sampling data. |
259 | */ |
260 | void beginSample(const char* name); |
261 | |
262 | /** |
263 | * Ends sample measurement. |
264 | * |
265 | * @param[in] name Unique name for the sample. |
266 | * |
267 | * @note |
268 | * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name in |
269 | * beginSample() would be enough. |
270 | */ |
271 | void endSample(const char* name); |
272 | |
273 | /** |
274 | * Begins precise sample measurement. Must be followed by endSamplePrecise(). |
275 | * |
276 | * @param[in] name Unique name for the sample you can later use to find the sampling data. |
277 | * |
278 | * @note |
279 | * This method uses very precise CPU counters to determine variety of data not provided by standard beginSample(). |
280 | * However due to the way these counters work you should not use this method for larger parts of code. It does not |
281 | * consider context switches so if the OS decides to switch context between measurements you will get invalid data. |
282 | */ |
283 | void beginSamplePrecise(const char* name); |
284 | |
285 | /** |
286 | * Ends precise sample measurement. |
287 | * |
288 | * @param[in] name Unique name for the sample. |
289 | * |
290 | * @note |
291 | * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name |
292 | * in beginSamplePrecise() would be enough. |
293 | */ |
294 | void endSamplePrecise(const char* name); |
295 | |
296 | /** Clears all sampling data, and ends any unfinished sampling blocks. */ |
297 | void reset(); |
298 | |
299 | /** |
300 | * Generates a report from all previously sampled data. |
301 | * |
302 | * @note Generating a report will stop all in-progress sampling. You should make sure |
303 | * you call endSample* manually beforehand so this doesn't have to happen. |
304 | */ |
305 | CPUProfilerReport generateReport(); |
306 | |
307 | private: |
308 | /** |
309 | * Calculates overhead that the timing and sampling methods themselves introduce so we might get more accurate |
310 | * measurements when creating reports. |
311 | */ |
312 | void estimateTimerOverhead(); |
313 | |
314 | private: |
315 | double mBasicTimerOverhead = 0.0; |
316 | UINT64 mPreciseTimerOverhead = 0; |
317 | |
318 | double mBasicSamplingOverheadMs = 0.0; |
319 | double mPreciseSamplingOverheadMs = 0.0; |
320 | UINT64 mBasicSamplingOverheadCycles = 0; |
321 | UINT64 mPreciseSamplingOverheadCycles = 0; |
322 | |
323 | ProfilerVector<ThreadInfo*> mActiveThreads; |
324 | Mutex mThreadSync; |
325 | }; |
326 | |
327 | /** Profiling entry containing information about a single CPU profiling block containing timing information. */ |
328 | struct BS_CORE_EXPORT CPUProfilerBasicSamplingEntry |
329 | { |
330 | struct BS_CORE_EXPORT Data |
331 | { |
332 | Data() = default; |
333 | |
334 | String name; /**< Name of the profiling block. */ |
335 | UINT32 numCalls = 0; /**< Number of times the block was entered. */ |
336 | |
337 | UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */ |
338 | UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */ |
339 | |
340 | double avgTimeMs = 0.0; /**< Average time it took to execute the block, per call. In milliseconds. */ |
341 | double maxTimeMs = 0.0; /**< Maximum time of a single call in the block. In milliseconds. */ |
342 | double totalTimeMs = 0.0; /**< Total time the block took, across all calls. In milliseconds. */ |
343 | |
344 | double avgSelfTimeMs = 0.0; /**< Average time it took to execute the block, per call. Ignores time used by child blocks. In milliseconds. */ |
345 | double totalSelfTimeMs = 0.0; /**< Total time the block took, across all calls. Ignores time used by child blocks. In milliseconds. */ |
346 | |
347 | double estimatedSelfOverheadMs = 0.0; /**< Estimated overhead of profiling methods, only for this exact block. In milliseconds. */ |
348 | double estimatedOverheadMs = 0.0; /**< Estimated overhead of profiling methods for this block and all children. In milliseconds. */ |
349 | |
350 | float pctOfParent = 1.0f; /**< Percent of parent block time this block took to execute. Ranging [0.0, 1.0]. */ |
351 | } data; |
352 | |
353 | ProfilerVector<CPUProfilerBasicSamplingEntry> childEntries; |
354 | }; |
355 | |
356 | /** |
357 | * Profiling entry containing information about a single CPU profiling block containing CPU cycle count based |
358 | * information. |
359 | */ |
360 | struct BS_CORE_EXPORT CPUProfilerPreciseSamplingEntry |
361 | { |
362 | struct BS_CORE_EXPORT Data |
363 | { |
364 | Data() = default; |
365 | |
366 | String name; /**< Name of the profiling block. */ |
367 | UINT32 numCalls = 0; /**< Number of times the block was entered. */ |
368 | |
369 | UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */ |
370 | UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */ |
371 | |
372 | UINT64 avgCycles = 0; /**< Average number of cycles it took to execute the block, per call. */ |
373 | UINT64 maxCycles = 0; /**< Maximum number of cycles of a single call in the block. */ |
374 | UINT64 totalCycles = 0; /**< Total number of cycles across all calls in the block. */ |
375 | |
376 | UINT64 avgSelfCycles = 0; /**< Average number of cycles it took to execute the block, per call. Ignores cycles used by child blocks. */ |
377 | UINT64 totalSelfCycles = 0; /**< Total number of cycles across all calls in the block. Ignores time used by child blocks. */ |
378 | |
379 | UINT64 estimatedSelfOverhead = 0; /**< Estimated overhead of profiling methods, only for this exact block. In cycles. */ |
380 | UINT64 estimatedOverhead = 0; /**< Estimated overhead of profiling methods for this block and all children. In cycles. */ |
381 | |
382 | float pctOfParent = 1.0f; /**< Percent of parent block cycles used by this block. Ranging [0.0, 1.0]. */ |
383 | } data; |
384 | |
385 | ProfilerVector<CPUProfilerPreciseSamplingEntry> childEntries; |
386 | }; |
387 | |
388 | /** CPU profiling report containing all profiling information for a single profiling session. */ |
389 | class BS_CORE_EXPORT CPUProfilerReport |
390 | { |
391 | public: |
392 | CPUProfilerReport() = default; |
393 | |
394 | /** |
395 | * Returns root entry for the basic (time based) sampling data. Root entry always contains the profiling block |
396 | * associated with the entire thread. |
397 | */ |
398 | const CPUProfilerBasicSamplingEntry& getBasicSamplingData() const { return mBasicSamplingRootEntry; } |
399 | |
400 | /** |
401 | * Returns root entry for the precise (CPU cycle based) sampling data. Root entry always contains the profiling |
402 | * block associated with the entire thread. |
403 | */ |
404 | const CPUProfilerPreciseSamplingEntry& getPreciseSamplingData() const { return mPreciseSamplingRootEntry; } |
405 | |
406 | private: |
407 | friend class ProfilerCPU; |
408 | |
409 | CPUProfilerBasicSamplingEntry mBasicSamplingRootEntry; |
410 | CPUProfilerPreciseSamplingEntry mPreciseSamplingRootEntry; |
411 | }; |
412 | |
413 | /** Provides global access to ProfilerCPU instance. */ |
414 | BS_CORE_EXPORT ProfilerCPU& gProfilerCPU(); |
415 | |
/**
 * Shortcut for profiling a single function call. Expands to a brace-enclosed block that wraps @p call between
 * matching beginSample() / endSample() calls on the global CPU profiler.
 *
 * @note    @p name is expanded twice (once for each call), so it should be a side-effect free expression.
 */
#define PROFILE_CALL(call, name)                \
    {                                           \
        bs::gProfilerCPU().beginSample(name);   \
        call;                                   \
        bs::gProfilerCPU().endSample(name);     \
    }
423 | |
424 | /** @} */ |
425 | } |
426 | |