//************************************ bs::framework - Copyright 2018 Marko Pintera **************************************//
//*********** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ***********//
#include "Profiling/BsProfilerCPU.h"
#include "Debug/BsDebug.h"
#include "Platform/BsPlatform.h"
#include <chrono>

#if BS_COMPILER == BS_COMPILER_MSVC
	#include <intrin.h>
#endif

#if BS_COMPILER == BS_COMPILER_GNUC || BS_COMPILER == BS_COMPILER_CLANG
	#include "BsCpuid.h"
#endif

#if BS_COMPILER == BS_COMPILER_CLANG
	#if BS_PLATFORM == BS_PLATFORM_WIN32
		#include <intrin.h>
	#else
		#include <x86intrin.h>
	#endif
#endif

using namespace std::chrono;

namespace bs
{
	ProfilerCPU::Timer::Timer()
	{
		time = 0.0f;
	}

	void ProfilerCPU::Timer::start()
	{
		startTime = getCurrentTime();
	}

	void ProfilerCPU::Timer::stop()
	{
		time += getCurrentTime() - startTime;
	}

	void ProfilerCPU::Timer::reset()
	{
		time = 0.0f;
	}

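	// Returns the current time in milliseconds, derived from the high-resolution clock's nanosecond timestamp.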
	inline double ProfilerCPU::Timer::getCurrentTime() const
	{
		high_resolution_clock::time_point timeNow = mHRClock.now();
		nanoseconds timeNowNs = timeNow.time_since_epoch();

		return timeNowNs.count() * 0.000001;
	}

	ProfilerCPU::TimerPrecise::TimerPrecise()
	{
		cycles = 0;
	}

	void ProfilerCPU::TimerPrecise::start()
	{
		startCycles = getNumCycles();
	}

	void ProfilerCPU::TimerPrecise::stop()
	{
		cycles += getNumCycles() - startCycles;
	}

	void ProfilerCPU::TimerPrecise::reset()
	{
		cycles = 0;
	}

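	// Reads the CPU timestamp counter. CPUID is executed first purely as a serializing instruction, so the
	// pipeline is drained and out-of-order execution cannot move surrounding instructions across the RDTSC read.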
	inline UINT64 ProfilerCPU::TimerPrecise::getNumCycles()
	{
#if BS_COMPILER == BS_COMPILER_GNUC || BS_COMPILER == BS_COMPILER_CLANG
		unsigned int a = 0;
		unsigned int b[4];
		__get_cpuid(a, &b[0], &b[1], &b[2], &b[3]);

#if BS_ARCH_TYPE == BS_ARCHITECTURE_x86_64
		UINT32 __a, __d;
		__asm__ __volatile__ ("rdtsc" : "=a" (__a), "=d" (__d));
		return (UINT64(__a) | UINT64(__d) << 32);
#else
		UINT64 x;
		__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
		return x;
#endif
#elif BS_COMPILER == BS_COMPILER_MSVC
		int a[4];
		int b = 0;
		__cpuid(a, b);
		return __rdtsc();
#else
		static_assert(false, "Unsupported compiler");
#endif
	}

	ProfilerCPU::ProfileData::ProfileData(FrameAlloc* alloc)
		:samples(alloc)
	{ }

	void ProfilerCPU::ProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void ProfilerCPU::ProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(ProfileSample(timer.time, numAllocs, numFrees));
	}

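	// Discards the most recently recorded sample and restarts the timer. The timer's accumulated time is
	// deliberately not reset, so the next endSample() extends the erased sample rather than starting from zero.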
	void ProfilerCPU::ProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}

	ProfilerCPU::PreciseProfileData::PreciseProfileData(FrameAlloc* alloc)
		:samples(alloc)
	{ }

	void ProfilerCPU::PreciseProfileData::beginSample()
	{
		memAllocs = MemoryCounter::getNumAllocs();
		memFrees = MemoryCounter::getNumFrees();

		timer.reset();
		timer.start();
	}

	void ProfilerCPU::PreciseProfileData::endSample()
	{
		timer.stop();

		UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs;
		UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees;

		samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees));
	}

	void ProfilerCPU::PreciseProfileData::resumeLastSample()
	{
		timer.start();
		samples.erase(samples.end() - 1);
	}

	BS_THREADLOCAL ProfilerCPU::ThreadInfo* ProfilerCPU::ThreadInfo::activeThread = nullptr;

	ProfilerCPU::ThreadInfo::ThreadInfo()
		:frameAlloc(1024 * 512)
	{

	}

	void ProfilerCPU::ThreadInfo::begin(const char* _name)
	{
		if(isActive)
		{
			LOGWRN("Profiler::beginThread called on a thread that was already being sampled");
			return;
		}

		if(rootBlock == nullptr)
			rootBlock = getBlock(_name);

		activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock);
		if (activeBlocks == nullptr)
			activeBlocks = frameAlloc.construct<Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>>
				(StdFrameAlloc<ActiveBlock>(&frameAlloc));

		activeBlocks->push(activeBlock);

		rootBlock->basic.beginSample();
		isActive = true;
	}

	void ProfilerCPU::ThreadInfo::end()
	{
		if(activeBlock.type == ActiveSamplingType::Basic)
			activeBlock.block->basic.endSample();
		else
			activeBlock.block->precise.endSample();

		activeBlocks->pop();

		if(!isActive)
			LOGWRN("Profiler::endThread called on a thread that isn't being sampled.");

		if (activeBlocks->size() > 0)
		{
			LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid.");

			while (activeBlocks->size() > 0)
			{
				ActiveBlock& curBlock = activeBlocks->top();
				if(curBlock.type == ActiveSamplingType::Basic)
					curBlock.block->basic.endSample();
				else
					curBlock.block->precise.endSample();

				activeBlocks->pop();
			}
		}

		isActive = false;
		activeBlock = ActiveBlock();

		frameAlloc.free(activeBlocks);
		activeBlocks = nullptr;
	}

	void ProfilerCPU::ThreadInfo::reset()
	{
		if(isActive)
			end();

		if(rootBlock != nullptr)
			releaseBlock(rootBlock);

		rootBlock = nullptr;
		frameAlloc.clear(); // Note: This never actually frees memory
	}

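	// Allocates a new block from the thread's frame allocator, along with a copy of the name string.
	// Both are released again in releaseBlock().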
	ProfilerCPU::ProfiledBlock* ProfilerCPU::ThreadInfo::getBlock(const char* name)
	{
		ProfiledBlock* block = frameAlloc.construct<ProfiledBlock>(&frameAlloc);
		block->name = (char*)frameAlloc.alloc(((UINT32)strlen(name) + 1) * sizeof(char));
		strcpy(block->name, name);

		return block;
	}

	void ProfilerCPU::ThreadInfo::releaseBlock(ProfiledBlock* block)
	{
		frameAlloc.free((UINT8*)block->name);
		frameAlloc.free(block);
	}

	ProfilerCPU::ProfiledBlock::ProfiledBlock(FrameAlloc* alloc)
		:basic(alloc), precise(alloc), children(alloc)
	{ }

	ProfilerCPU::ProfiledBlock::~ProfiledBlock()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		for(auto& child : children)
			thread->releaseBlock(child);

		children.clear();
	}

	ProfilerCPU::ProfiledBlock* ProfilerCPU::ProfiledBlock::findChild(const char* name) const
	{
		for(auto& child : children)
		{
			if(strcmp(child->name, name) == 0)
				return child;
		}

		return nullptr;
	}

	ProfilerCPU::ProfilerCPU()
	{
		// TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called,
		// and keep separate values per thread.
		estimateTimerOverhead();
	}

	ProfilerCPU::~ProfilerCPU()
	{
		reset();

		Lock lock(mThreadSync);

		for(auto& threadInfo : mActiveThreads)
			bs_delete<ThreadInfo, ProfilerAlloc>(threadInfo);
	}

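	// Typical usage (a sketch; the thread and sample names are illustrative):
	//     gProfilerCPU().beginThread("Worker");
	//     gProfilerCPU().beginSample("Update");
	//     // ... code being measured ...
	//     gProfilerCPU().endSample("Update");
	//     gProfilerCPU().endThread();
	//     CPUProfilerReport report = gProfilerCPU().generateReport();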
	void ProfilerCPU::beginThread(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
		{
			ThreadInfo::activeThread = bs_new<ThreadInfo, ProfilerAlloc>();
			thread = ThreadInfo::activeThread;

			{
				Lock lock(mThreadSync);

				mActiveThreads.push_back(thread);
			}
		}

		thread->begin(name);
	}

	void ProfilerCPU::endThread()
	{
		// Note: Deliberately no null check here, so the active sample can be closed as soon as possible
		ThreadInfo::activeThread->end();
	}

	void ProfilerCPU::beginSample(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread;
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock(name);

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block);
		thread->activeBlocks->push(thread->activeBlock);

		block->basic.beginSample();
	}

	void ProfilerCPU::endSample(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Precise)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise.");
			return;
		}

		if(strcmp(block->name, name) != 0)
		{
			LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + String(block->name) +
				"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->basic.endSample();

		thread->activeBlocks->pop();

		if (!thread->activeBlocks->empty())
			thread->activeBlock = thread->activeBlocks->top();
		else
			thread->activeBlock = ActiveBlock();
	}

	void ProfilerCPU::beginSamplePrecise(const char* name)
	{
		// Note: There is a (small) chance a context switch will happen during this measurement, in which case the
		// result will be skewed. Increasing thread priority might help. This is generally only a problem for code
		// that runs a long time (10-15+ ms, depending on the OS scheduler's quantum length).

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread; // Re-fetch, as beginThread just created the thread info
		}

		ProfiledBlock* parent = thread->activeBlock.block;
		ProfiledBlock* block = nullptr;

		if(parent != nullptr)
			block = parent->findChild(name);

		if(block == nullptr)
		{
			block = thread->getBlock(name);

			if(parent != nullptr)
				parent->children.push_back(block);
			else
				thread->rootBlock->children.push_back(block);
		}

		thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block);
		thread->activeBlocks->push(thread->activeBlock);

		block->precise.beginSample();
	}

	void ProfilerCPU::endSamplePrecise(const char* name)
	{
		ThreadInfo* thread = ThreadInfo::activeThread;
		ProfiledBlock* block = thread->activeBlock.block;

#if BS_DEBUG_MODE
		if(block == nullptr)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called.");
			return;
		}

		if(thread->activeBlock.type == ActiveSamplingType::Basic)
		{
			LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample.");
			return;
		}

		if (strcmp(block->name, name) != 0)
		{
			LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + String(block->name) +
				"\" but got \"" + String(name) + "\". Sampling data will not be valid.");
			return;
		}
#endif

		block->precise.endSample();

		thread->activeBlocks->pop();

		if (!thread->activeBlocks->empty())
			thread->activeBlock = thread->activeBlocks->top();
		else
			thread->activeBlock = ActiveBlock();
	}

	void ProfilerCPU::reset()
	{
		ThreadInfo* thread = ThreadInfo::activeThread;

		if(thread != nullptr)
			thread->reset();
	}

	CPUProfilerReport ProfilerCPU::generateReport()
	{
		CPUProfilerReport report;

		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr)
			return report;

		if(thread->isActive)
			thread->end();

		// We need to separate out basic and precise data and form two separate hierarchies
		if(thread->rootBlock == nullptr)
			return report;

		struct TempEntry
		{
			TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx)
				:parentBlock(_parentBlock), entryIdx(_entryIdx)
			{ }

			ProfiledBlock* parentBlock;
			UINT32 entryIdx;
			ProfilerVector<UINT32> childIndexes;
		};

		ProfilerVector<CPUProfilerBasicSamplingEntry> basicEntries;
		ProfilerVector<CPUProfilerPreciseSamplingEntry> preciseEntries;

		// Fill the flatHierarchy array so that, when iterated in reverse, children are always processed before their parents
		ProfilerStack<UINT32> todo;
		ProfilerVector<TempEntry> flatHierarchy;

		UINT32 entryIdx = 0;
		todo.push(entryIdx);
		flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx));

		entryIdx++;
		while(!todo.empty())
		{
			UINT32 curDataIdx = todo.top();
			ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock;

			todo.pop();

			for(auto& child : curBlock->children)
			{
				flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx);

				todo.push(entryIdx);
				flatHierarchy.push_back(TempEntry(child, entryIdx));

				entryIdx++;
			}
		}

		// Calculate sampling data for all entries
		basicEntries.resize(flatHierarchy.size());
		preciseEntries.resize(flatHierarchy.size());

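		// Entries are visited in reverse so that every child is processed before its parent, making child totals
		// available when computing pctOfParent and the aggregated overhead estimates below.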
		for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter)
		{
			TempEntry& curData = *iter;
			ProfiledBlock* curBlock = curData.parentBlock;

			CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx];
			CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx];

			// Calculate basic data
			entryBasic->data.name = String(curBlock->name);

			entryBasic->data.memAllocs = 0;
			entryBasic->data.memFrees = 0;
			entryBasic->data.totalTimeMs = 0.0;
			entryBasic->data.maxTimeMs = 0.0;
			for(auto& sample : curBlock->basic.samples)
			{
				entryBasic->data.totalTimeMs += sample.time;
				entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time);
				entryBasic->data.memAllocs += sample.numAllocs;
				entryBasic->data.memFrees += sample.numFrees;
			}

			entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size();

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls;

			double totalChildTime = 0.0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx];
				totalChildTime += childEntry->data.totalTimeMs;
				childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs);

				entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs;
			}

			entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs;
			entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs;

			entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime;

			if(entryBasic->data.numCalls > 0)
				entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls;

			entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead;

			// Calculate precise data
			entryPrecise->data.name = String(curBlock->name);

			entryPrecise->data.memAllocs = 0;
			entryPrecise->data.memFrees = 0;
			entryPrecise->data.totalCycles = 0;
			entryPrecise->data.maxCycles = 0;
			for(auto& sample : curBlock->precise.samples)
			{
				entryPrecise->data.totalCycles += sample.cycles;
				entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles);
				entryPrecise->data.memAllocs += sample.numAllocs;
				entryPrecise->data.memFrees += sample.numFrees;
			}

			entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size();

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls;

			UINT64 totalChildCycles = 0;
			for(auto& childIdx : curData.childIndexes)
			{
				CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx];
				totalChildCycles += childEntry->data.totalCycles;
				childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles;

				entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead;
			}

			entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles;
			entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles;

			entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles;

			if(entryPrecise->data.numCalls > 0)
				entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls;

			entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead;
		}

		// Prune empty basic entries
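		// An entry with zero recorded calls is dropped from the report, and its children are re-parented to the
		// nearest ancestor that was actually sampled.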
		ProfilerStack<UINT32> finalBasicHierarchyTodo;
		ProfilerStack<UINT32> parentBasicEntryIndexes;
		ProfilerVector<TempEntry> newBasicEntries;

		finalBasicHierarchyTodo.push(0);

		entryIdx = 0;
		parentBasicEntryIndexes.push(entryIdx);
		newBasicEntries.push_back(TempEntry(nullptr, entryIdx));

		entryIdx++;

		while(!finalBasicHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentBasicEntryIndexes.top();
			parentBasicEntryIndexes.pop();

			UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalBasicHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalBasicHierarchyTodo.push(childIdx);

				CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx];
				if(basicEntry.data.numCalls > 0)
				{
					newBasicEntries.push_back(TempEntry(nullptr, childIdx));
					newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentBasicEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentBasicEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newBasicEntries.size() > 0)
		{
			ProfilerVector<CPUProfilerBasicSamplingEntry*> finalBasicEntries;

			report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx];
			finalBasicEntries.push_back(&report.mBasicSamplingRootEntry);

			finalBasicHierarchyTodo.push(0);

			while(!finalBasicHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalBasicHierarchyTodo.top();
				finalBasicHierarchyTodo.pop();

				TempEntry& curEntry = newBasicEntries[curEntryIdx];

				CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx];

				basicEntry->childEntries.resize(curEntry.childIndexes.size());
				UINT32 idx = 0;

				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newBasicEntries[childIdx];
					basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx];

					finalBasicEntries.push_back(&(basicEntry->childEntries[idx]));
					finalBasicHierarchyTodo.push(childIdx);
					idx++;
				}
			}
		}

		// Prune empty precise entries
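		// Same pruning as above, applied to the precise (cycle-based) hierarchy.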
		ProfilerStack<UINT32> finalPreciseHierarchyTodo;
		ProfilerStack<UINT32> parentPreciseEntryIndexes;
		ProfilerVector<TempEntry> newPreciseEntries;

		finalPreciseHierarchyTodo.push(0);

		entryIdx = 0;
		parentPreciseEntryIndexes.push(entryIdx);
		newPreciseEntries.push_back(TempEntry(nullptr, entryIdx));

		entryIdx++;

		while(!finalPreciseHierarchyTodo.empty())
		{
			UINT32 parentEntryIdx = parentPreciseEntryIndexes.top();
			parentPreciseEntryIndexes.pop();

			UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
			TempEntry& curEntry = flatHierarchy[curEntryIdx];
			finalPreciseHierarchyTodo.pop();

			for(auto& childIdx : curEntry.childIndexes)
			{
				finalPreciseHierarchyTodo.push(childIdx);

				CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx];
				if(preciseEntry.data.numCalls > 0)
				{
					newPreciseEntries.push_back(TempEntry(nullptr, childIdx));
					newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx);

					parentPreciseEntryIndexes.push(entryIdx);

					entryIdx++;
				}
				else
					parentPreciseEntryIndexes.push(parentEntryIdx);
			}
		}

		if(newPreciseEntries.size() > 0)
		{
			ProfilerVector<CPUProfilerPreciseSamplingEntry*> finalPreciseEntries;

			report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx];
			finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry);

			finalPreciseHierarchyTodo.push(0);

			while(!finalPreciseHierarchyTodo.empty())
			{
				UINT32 curEntryIdx = finalPreciseHierarchyTodo.top();
				finalPreciseHierarchyTodo.pop();

				TempEntry& curEntry = newPreciseEntries[curEntryIdx];

				CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx];

				preciseEntry->childEntries.resize(curEntry.childIndexes.size());
				UINT32 idx = 0;

				for(auto& childIdx : curEntry.childIndexes)
				{
					TempEntry& childEntry = newPreciseEntries[childIdx];
					preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx];

					finalPreciseEntries.push_back(&(preciseEntry->childEntries[idx]));
					finalPreciseHierarchyTodo.push(childIdx);
					idx++;
				}
			}
		}

		return report;
	}

	void ProfilerCPU::estimateTimerOverhead()
	{
		// Get an idea of how long timer calls and RDTSC take
		const UINT32 reps = 1000, sampleReps = 20;

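		// The overhead fields start at a large sentinel value and keep the minimum across several tries, since any
		// individual run can be inflated by context switches or cache misses.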
		mBasicTimerOverhead = 1000000.0;
		mPreciseTimerOverhead = 1000000;
		for (UINT32 tries = 0; tries < 20; tries++)
		{
			Timer timer;
			for (UINT32 i = 0; i < reps; i++)
			{
				timer.start();
				timer.stop();
			}

			double avgTime = double(timer.time)/double(reps);
			if (avgTime < mBasicTimerOverhead)
				mBasicTimerOverhead = avgTime;

			TimerPrecise timerPrecise;
			for (UINT32 i = 0; i < reps; i++)
			{
				timerPrecise.start();
				timerPrecise.stop();
			}

			UINT64 avgCycles = timerPrecise.cycles/reps;
			if (avgCycles < mPreciseTimerOverhead)
				mPreciseTimerOverhead = avgCycles;
		}

		mBasicSamplingOverheadMs = 1000000.0;
		mPreciseSamplingOverheadMs = 1000000.0;
		mBasicSamplingOverheadCycles = 1000000;
		mPreciseSamplingOverheadCycles = 1000000;
		for (UINT32 tries = 0; tries < 3; tries++)
		{
			/************************************************************************/
			/*					AVERAGE TIME IN MS FOR BASIC SAMPLING				*/
			/************************************************************************/

			Timer timerA;
			timerA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the
			// sample already exists, and another where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample(("TestAvg#" + toString(i)).c_str());
				endSample(("TestAvg#" + toString(i)).c_str());
			}

			endThread();

			timerA.stop();

			reset();

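			// 10 repeated samples plus 5 unique samples per rep gives sampleReps * 15 begin/end pairs; the raw
			// timer overhead measured above is subtracted so only the sampling bookkeeping itself remains.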
			double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead;
			if (avgTimeBasic < mBasicSamplingOverheadMs)
				mBasicSamplingOverheadMs = avgTimeBasic;

			/************************************************************************/
			/*					AVERAGE CYCLES FOR BASIC SAMPLING					*/
			/************************************************************************/

			TimerPrecise timerPreciseA;
			timerPreciseA.start();

			beginThread("Main");

			// Two different cases that can affect performance: one where the
			// sample already exists, and another where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSample("TestAvg1");
				endSample("TestAvg1");
				beginSample("TestAvg2");
				endSample("TestAvg2");
				beginSample("TestAvg3");
				endSample("TestAvg3");
				beginSample("TestAvg4");
				endSample("TestAvg4");
				beginSample("TestAvg5");
				endSample("TestAvg5");
				beginSample("TestAvg6");
				endSample("TestAvg6");
				beginSample("TestAvg7");
				endSample("TestAvg7");
				beginSample("TestAvg8");
				endSample("TestAvg8");
				beginSample("TestAvg9");
				endSample("TestAvg9");
				beginSample("TestAvg10");
				endSample("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSample(("TestAvg#" + toString(i)).c_str());
				endSample(("TestAvg#" + toString(i)).c_str());
			}

			endThread();
			timerPreciseA.stop();

			reset();

			UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead;
			if (avgCyclesBasic < mBasicSamplingOverheadCycles)
				mBasicSamplingOverheadCycles = avgCyclesBasic;

			/************************************************************************/
			/*				AVERAGE TIME IN MS FOR PRECISE SAMPLING					*/
			/************************************************************************/

			Timer timerB;
			timerB.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the
			// sample already exists, and another where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
				endSamplePrecise(("TestAvg#" + toString(i)).c_str());
			}

			endThread();
			timerB.stop();

			reset();

			double avgTimesPrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
			if (avgTimesPrecise < mPreciseSamplingOverheadMs)
				mPreciseSamplingOverheadMs = avgTimesPrecise;

			/************************************************************************/
			/*					AVERAGE CYCLES FOR PRECISE SAMPLING					*/
			/************************************************************************/

			TimerPrecise timerPreciseB;
			timerPreciseB.start();
			beginThread("Main");

			// Two different cases that can affect performance: one where the
			// sample already exists, and another where a new one needs to be created
			for (UINT32 i = 0; i < sampleReps; i++)
			{
				beginSamplePrecise("TestAvg1");
				endSamplePrecise("TestAvg1");
				beginSamplePrecise("TestAvg2");
				endSamplePrecise("TestAvg2");
				beginSamplePrecise("TestAvg3");
				endSamplePrecise("TestAvg3");
				beginSamplePrecise("TestAvg4");
				endSamplePrecise("TestAvg4");
				beginSamplePrecise("TestAvg5");
				endSamplePrecise("TestAvg5");
				beginSamplePrecise("TestAvg6");
				endSamplePrecise("TestAvg6");
				beginSamplePrecise("TestAvg7");
				endSamplePrecise("TestAvg7");
				beginSamplePrecise("TestAvg8");
				endSamplePrecise("TestAvg8");
				beginSamplePrecise("TestAvg9");
				endSamplePrecise("TestAvg9");
				beginSamplePrecise("TestAvg10");
				endSamplePrecise("TestAvg10");
			}

			for (UINT32 i = 0; i < sampleReps * 5; i++)
			{
				beginSamplePrecise(("TestAvg#" + toString(i)).c_str());
				endSamplePrecise(("TestAvg#" + toString(i)).c_str());
			}

			endThread();
			timerPreciseB.stop();

			reset();

			UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5);
			if (avgCyclesPrecise < mPreciseSamplingOverheadCycles)
				mPreciseSamplingOverheadCycles = avgCyclesPrecise;
		}
	}

	ProfilerCPU& gProfilerCPU()
	{
		return ProfilerCPU::instance();
	}
}