1//************************************ bs::framework - Copyright 2018 Marko Pintera **************************************//
2//*********** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ***********//
3#pragma once
4
5#include "BsCorePrerequisites.h"
6#include "Utility/BsModule.h"
7
8namespace bs
9{
10 /** @addtogroup Profiling
11 * @{
12 */
13
// Forward declaration: ProfilerCPU::generateReport() returns this type by value, while the full class definition
// only appears further down in this header.
class CPUProfilerReport;
15
16 /**
17 * Provides various performance measuring methods.
18 *
19 * @note Thread safe. Matching begin* \ end* calls must belong to the same thread though.
20 */
21 class BS_CORE_EXPORT ProfilerCPU : public Module<ProfilerCPU>
22 {
23 /** Timer class responsible for tracking elapsed time. */
24 class Timer
25 {
26 public:
27 Timer();
28
29 /** Sets the start time for the timer. */
30 void start();
31
32 /** Stops the timer and calculates the elapsed time from start time to now. */
33 void stop();
34
35 /** Resets the elapsed time to zero. */
36 void reset();
37
38 double time;
39 private:
40 double startTime = 0.0f;
41 std::chrono::high_resolution_clock mHRClock;
42
43 /** Returns time elapsed since CPU was started in millseconds. */
44 inline double getCurrentTime() const;
45 };
46
47 /** Timer class responsible for tracking number of elapsed CPU cycles. */
48 class TimerPrecise
49 {
50 public:
51 TimerPrecise();
52
53 /** Starts the counter marking the current number of executed CPU cycles since CPU was started. */
54 void start();
55
56 /** Ends the counter and calculates the number of CPU cycles between now and the start time. */
57 void stop();
58
59 /** Resets the cycle count to zero. */
60 void reset();
61
62 UINT64 cycles;
63 private:
64 UINT64 startCycles;
65
66 /** Queries the CPU for the current number of CPU cycles executed since the program was started. */
67 static inline UINT64 getNumCycles();
68 };
69
70 /**
71 * Contains data about a single profiler sample (counting time in milliseconds).
72 *
73 * @note
74 * A sample is created whenever a named profile block is entered. For example if you have a function you are
75 * profiling, and it gets called 10 times, there will be 10 samples.
76 */
77 struct ProfileSample
78 {
79 ProfileSample(double _time, UINT64 _numAllocs, UINT64 _numFrees)
80 :time(_time), numAllocs(_numAllocs), numFrees(_numFrees)
81 { }
82
83 double time;
84 UINT64 numAllocs;
85 UINT64 numFrees;
86 };
87
88 /**
89 * Contains data about a single precise profiler sample (counting CPU cycles).
90 *
91 * @note
92 * A sample is created whenever a named profile block is entered. For example if you have a function you are
93 * profiling, and it gets called 10 times, there will be 10 samples.
94 */
95 struct PreciseProfileSample
96 {
97 PreciseProfileSample(UINT64 _cycles, UINT64 _numAllocs, UINT64 _numFrees)
98 :cycles(_cycles), numAllocs(_numAllocs), numFrees(_numFrees)
99 { }
100
101 UINT64 cycles;
102 UINT64 numAllocs;
103 UINT64 numFrees;
104 };
105
106 /** Contains basic (time based) profiling data contained in a profiling block. */
107 struct ProfileData
108 {
109 ProfileData(FrameAlloc* alloc);
110
111 /** Begins a new sample and records current sample state. Previous sample must not be active. */
112 void beginSample();
113
114 /**
115 * Records current sample state and creates a new sample based on start and end state. Adds the sample to the
116 * sample list.
117 */
118 void endSample();
119
120 /**
121 * Removes the last added sample from the sample list and makes it active again. You must call endSample()
122 * when done as if you called beginSample().
123 */
124 void resumeLastSample();
125
126 Vector<ProfileSample, StdFrameAlloc<ProfileSample>> samples;
127 Timer timer;
128
129 UINT64 memAllocs;
130 UINT64 memFrees;
131 };
132
133 /** Contains precise (CPU cycle based) profiling data contained in a profiling block. */
134 struct PreciseProfileData
135 {
136 PreciseProfileData(FrameAlloc* alloc);
137
138 /** Begins a new sample and records current sample state. Previous sample must not be active. */
139 void beginSample();
140
141 /**
142 * Records current sample state and creates a new sample based on start and end state. Adds the sample to the
143 * sample list.
144 */
145 void endSample();
146
147 /**
148 * Removes the last added sample from the sample list and makes it active again. You must call endSample()
149 * when done as if you called beginSample.
150 */
151 void resumeLastSample();
152
153 Vector<PreciseProfileSample, StdFrameAlloc<PreciseProfileSample>> samples;
154 TimerPrecise timer;
155
156 UINT64 memAllocs;
157 UINT64 memFrees;
158 };
159
160 /**
161 * Contains all sampling information about a single named profiling block. Each block has its own sampling
162 * information and optionally child blocks.
163 */
164 struct ProfiledBlock
165 {
166 ProfiledBlock(FrameAlloc* alloc);
167 ~ProfiledBlock();
168
169 /** Attempts to find a child block with the specified name. Returns null if not found. */
170 ProfiledBlock* findChild(const char* name) const;
171
172 char* name;
173
174 ProfileData basic;
175 PreciseProfileData precise;
176
177 Vector<ProfiledBlock*, StdFrameAlloc<ProfiledBlock*>> children;
178 };
179
180 /** CPU sampling type. */
181 enum class ActiveSamplingType
182 {
183 Basic, /**< Sample using milliseconds. */
184 Precise /**< Sample using CPU cycles. */
185 };
186
187 /** Contains data about the currently active profiling block. */
188 struct ActiveBlock
189 {
190 ActiveBlock()
191 :type(ActiveSamplingType::Basic), block(nullptr)
192 { }
193
194 ActiveBlock(ActiveSamplingType _type, ProfiledBlock* _block)
195 :type(_type), block(_block)
196 { }
197
198 ActiveSamplingType type;
199 ProfiledBlock* block;
200 };
201
202 /** Contains data about an active profiling thread. */
203 struct ThreadInfo
204 {
205 ThreadInfo();
206
207 /**
208 * Starts profiling on the thread. New primary profiling block is created with the given name.
209 */
210 void begin(const char* _name);
211
212 /**
213 * Ends profiling on the thread. You should end all samples before calling this, but if you don't they will be
214 * terminated automatically.
215 */
216 void end();
217
218 /**
219 * Deletes all internal profiling data and makes the object ready for another iteration. Should be called
220 * after end in order to delete any existing data.
221 */
222 void reset();
223
224 /** Gets the primary profiling block used by the thread. */
225 ProfiledBlock* getBlock(const char* name);
226
227 /** Deletes the provided block. */
228 void releaseBlock(ProfiledBlock* block);
229
230 static BS_THREADLOCAL ThreadInfo* activeThread;
231 bool isActive = false;
232
233 ProfiledBlock* rootBlock = nullptr;
234
235 FrameAlloc frameAlloc;
236 ActiveBlock activeBlock;
237 Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>* activeBlocks = nullptr;
238 };
239
240 public:
241 ProfilerCPU();
242 ~ProfilerCPU();
243
244 /**
245 * Registers a new thread we will be doing sampling in. This needs to be called before any beginSample* \ endSample*
246 * calls are made in that thread.
247 *
248 * @param[in] name Name that will allow you to more easily identify the thread.
249 */
250 void beginThread(const char* name);
251
252 /** Ends sampling for the current thread. No beginSample* \ endSample* calls after this point. */
253 void endThread();
254
255 /**
256 * Begins sample measurement. Must be followed by endSample().
257 *
258 * @param[in] name Unique name for the sample you can later use to find the sampling data.
259 */
260 void beginSample(const char* name);
261
262 /**
263 * Ends sample measurement.
264 *
265 * @param[in] name Unique name for the sample.
266 *
267 * @note
268 * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name in
269 * beginSample() would be enough.
270 */
271 void endSample(const char* name);
272
273 /**
274 * Begins precise sample measurement. Must be followed by endSamplePrecise().
275 *
276 * @param[in] name Unique name for the sample you can later use to find the sampling data.
277 *
278 * @note
279 * This method uses very precise CPU counters to determine variety of data not provided by standard beginSample().
280 * However due to the way these counters work you should not use this method for larger parts of code. It does not
281 * consider context switches so if the OS decides to switch context between measurements you will get invalid data.
282 */
283 void beginSamplePrecise(const char* name);
284
285 /**
286 * Ends precise sample measurement.
287 *
288 * @param[in] name Unique name for the sample.
289 *
290 * @note
291 * Unique name is primarily needed to more easily identify mismatched begin/end sample pairs. Otherwise the name
292 * in beginSamplePrecise() would be enough.
293 */
294 void endSamplePrecise(const char* name);
295
296 /** Clears all sampling data, and ends any unfinished sampling blocks. */
297 void reset();
298
299 /**
300 * Generates a report from all previously sampled data.
301 *
302 * @note Generating a report will stop all in-progress sampling. You should make sure
303 * you call endSample* manually beforehand so this doesn't have to happen.
304 */
305 CPUProfilerReport generateReport();
306
307 private:
308 /**
309 * Calculates overhead that the timing and sampling methods themselves introduce so we might get more accurate
310 * measurements when creating reports.
311 */
312 void estimateTimerOverhead();
313
314 private:
315 double mBasicTimerOverhead = 0.0;
316 UINT64 mPreciseTimerOverhead = 0;
317
318 double mBasicSamplingOverheadMs = 0.0;
319 double mPreciseSamplingOverheadMs = 0.0;
320 UINT64 mBasicSamplingOverheadCycles = 0;
321 UINT64 mPreciseSamplingOverheadCycles = 0;
322
323 ProfilerVector<ThreadInfo*> mActiveThreads;
324 Mutex mThreadSync;
325 };
326
327 /** Profiling entry containing information about a single CPU profiling block containing timing information. */
328 struct BS_CORE_EXPORT CPUProfilerBasicSamplingEntry
329 {
330 struct BS_CORE_EXPORT Data
331 {
332 Data() = default;
333
334 String name; /**< Name of the profiling block. */
335 UINT32 numCalls = 0; /**< Number of times the block was entered. */
336
337 UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */
338 UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */
339
340 double avgTimeMs = 0.0; /**< Average time it took to execute the block, per call. In milliseconds. */
341 double maxTimeMs = 0.0; /**< Maximum time of a single call in the block. In milliseconds. */
342 double totalTimeMs = 0.0; /**< Total time the block took, across all calls. In milliseconds. */
343
344 double avgSelfTimeMs = 0.0; /**< Average time it took to execute the block, per call. Ignores time used by child blocks. In milliseconds. */
345 double totalSelfTimeMs = 0.0; /**< Total time the block took, across all calls. Ignores time used by child blocks. In milliseconds. */
346
347 double estimatedSelfOverheadMs = 0.0; /**< Estimated overhead of profiling methods, only for this exact block. In milliseconds. */
348 double estimatedOverheadMs = 0.0; /**< Estimated overhead of profiling methods for this block and all children. In milliseconds. */
349
350 float pctOfParent = 1.0f; /**< Percent of parent block time this block took to execute. Ranging [0.0, 1.0]. */
351 } data;
352
353 ProfilerVector<CPUProfilerBasicSamplingEntry> childEntries;
354 };
355
356 /**
357 * Profiling entry containing information about a single CPU profiling block containing CPU cycle count based
358 * information.
359 */
360 struct BS_CORE_EXPORT CPUProfilerPreciseSamplingEntry
361 {
362 struct BS_CORE_EXPORT Data
363 {
364 Data() = default;
365
366 String name; /**< Name of the profiling block. */
367 UINT32 numCalls = 0; /**< Number of times the block was entered. */
368
369 UINT64 memAllocs; /**< Number of memory allocations that happened within the block. */
370 UINT64 memFrees; /**< Number of memory deallocations that happened within the block. */
371
372 UINT64 avgCycles = 0; /**< Average number of cycles it took to execute the block, per call. */
373 UINT64 maxCycles = 0; /**< Maximum number of cycles of a single call in the block. */
374 UINT64 totalCycles = 0; /**< Total number of cycles across all calls in the block. */
375
376 UINT64 avgSelfCycles = 0; /**< Average number of cycles it took to execute the block, per call. Ignores cycles used by child blocks. */
377 UINT64 totalSelfCycles = 0; /**< Total number of cycles across all calls in the block. Ignores time used by child blocks. */
378
379 UINT64 estimatedSelfOverhead = 0; /**< Estimated overhead of profiling methods, only for this exact block. In cycles. */
380 UINT64 estimatedOverhead = 0; /**< Estimated overhead of profiling methods for this block and all children. In cycles. */
381
382 float pctOfParent = 1.0f; /**< Percent of parent block cycles used by this block. Ranging [0.0, 1.0]. */
383 } data;
384
385 ProfilerVector<CPUProfilerPreciseSamplingEntry> childEntries;
386 };
387
388 /** CPU profiling report containing all profiling information for a single profiling session. */
389 class BS_CORE_EXPORT CPUProfilerReport
390 {
391 public:
392 CPUProfilerReport() = default;
393
394 /**
395 * Returns root entry for the basic (time based) sampling data. Root entry always contains the profiling block
396 * associated with the entire thread.
397 */
398 const CPUProfilerBasicSamplingEntry& getBasicSamplingData() const { return mBasicSamplingRootEntry; }
399
400 /**
401 * Returns root entry for the precise (CPU cycle based) sampling data. Root entry always contains the profiling
402 * block associated with the entire thread.
403 */
404 const CPUProfilerPreciseSamplingEntry& getPreciseSamplingData() const { return mPreciseSamplingRootEntry; }
405
406 private:
407 friend class ProfilerCPU;
408
409 CPUProfilerBasicSamplingEntry mBasicSamplingRootEntry;
410 CPUProfilerPreciseSamplingEntry mPreciseSamplingRootEntry;
411 };
412
/**
 * Provides global access to ProfilerCPU instance.
 *
 * @return	Reference to the ProfilerCPU instance. The function is only declared in this header.
 */
BS_CORE_EXPORT ProfilerCPU& gProfilerCPU();
415
/**
 * Shortcut for profiling a single function call.
 *
 * Expands to a braced block that calls bs::gProfilerCPU().beginSample(name), then evaluates @p call as a
 * statement, then calls bs::gProfilerCPU().endSample(name).
 *
 * @param	call	Expression to profile. Any value it produces is discarded.
 * @param	name	Sample name. It is substituted into both the beginSample() and the endSample() call, so the
 *					argument expression is evaluated twice.
 *
 * @note
 * The expansion is a plain `{ ... }` block, not a single statement. Writing `PROFILE_CALL(...);` as the unbraced
 * body of an `if` that is followed by `else` therefore does not compile; wrap the invocation in braces there.
 * If @p call leaves the block early (for example by throwing), the endSample() call is not reached.
 */
#define PROFILE_CALL(call, name) \
	{ \
		bs::gProfilerCPU().beginSample(name); \
		call; \
		bs::gProfilerCPU().endSample(name); \
	}
423
424 /** @} */
425}
426