1 | //************************************ bs::framework - Copyright 2018 Marko Pintera **************************************// |
2 | //*********** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ***********// |
3 | #include "Profiling/BsProfilerCPU.h" |
4 | #include "Debug/BsDebug.h" |
5 | #include "Platform/BsPlatform.h" |
6 | #include <chrono> |
7 | |
8 | #if BS_COMPILER == BS_COMPILER_MSVC |
9 | #include <intrin.h> |
10 | #endif |
11 | |
12 | #if BS_COMPILER == BS_COMPILER_GNUC || BS_COMPILER == BS_COMPILER_CLANG |
13 | #include "BsCpuid.h" |
14 | #endif |
15 | |
16 | #if BS_COMPILER == BS_COMPILER_CLANG |
17 | #if BS_PLATFORM == BS_PLATFORM_WIN32 |
18 | #include "intrin.h" |
19 | #else |
20 | #include <x86intrin.h> |
21 | #endif |
22 | #endif |
23 | |
24 | using namespace std::chrono; |
25 | |
26 | namespace bs |
27 | { |
28 | ProfilerCPU::Timer::Timer() |
29 | { |
30 | time = 0.0f; |
31 | } |
32 | |
33 | void ProfilerCPU::Timer::start() |
34 | { |
35 | startTime = getCurrentTime(); |
36 | } |
37 | |
38 | void ProfilerCPU::Timer::stop() |
39 | { |
40 | time += getCurrentTime() - startTime; |
41 | } |
42 | |
43 | void ProfilerCPU::Timer::reset() |
44 | { |
45 | time = 0.0f; |
46 | } |
47 | |
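	// Returns the current time from the high resolution clock, converted from nanoseconds to milliseconds.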
48 | inline double ProfilerCPU::Timer::getCurrentTime() const |
49 | { |
50 | high_resolution_clock::time_point timeNow = mHRClock.now(); |
51 | nanoseconds timeNowNs = timeNow.time_since_epoch(); |
52 | |
53 | return timeNowNs.count() * 0.000001; |
54 | } |
55 | |
56 | ProfilerCPU::TimerPrecise::TimerPrecise() |
57 | { |
58 | cycles = 0; |
59 | } |
60 | |
61 | void ProfilerCPU::TimerPrecise::start() |
62 | { |
63 | startCycles = getNumCycles(); |
64 | } |
65 | |
66 | void ProfilerCPU::TimerPrecise::stop() |
67 | { |
68 | cycles += getNumCycles() - startCycles; |
69 | } |
70 | |
71 | void ProfilerCPU::TimerPrecise::reset() |
72 | { |
73 | cycles = 0; |
74 | } |
75 | |
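	// Returns the current value of the CPU's timestamp counter. CPUID is issued first as a serializing
	// instruction, so earlier instructions cannot be reordered past the RDTSC read.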
76 | inline UINT64 ProfilerCPU::TimerPrecise::getNumCycles() |
77 | { |
78 | #if BS_COMPILER == BS_COMPILER_GNUC || BS_COMPILER == BS_COMPILER_CLANG |
79 | unsigned int a = 0; |
80 | unsigned int b[4]; |
81 | __get_cpuid(a, &b[0], &b[1], &b[2], &b[3]); |
82 | |
83 | #if BS_ARCH_TYPE == BS_ARCHITECTURE_x86_64 |
84 | UINT32 __a, __d; |
85 | __asm__ __volatile__ ("rdtsc" : "=a" (__a), "=d" (__d)); |
86 | return (UINT64(__a) | UINT64(__d) << 32); |
87 | #else |
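		// .byte 0x0f, 0x31 encodes the RDTSC instruction; the "=A" constraint reads the result from EDX:EAX on 32-bit x86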
88 | UINT64 x; |
89 | __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); |
90 | return x; |
91 | #endif |
92 | #elif BS_COMPILER == BS_COMPILER_MSVC |
93 | int a[4]; |
94 | int b = 0; |
95 | __cpuid(a, b); |
96 | return __rdtsc(); |
97 | #else |
		static_assert(false, "Unsupported compiler");
99 | #endif |
100 | } |
101 | |
102 | ProfilerCPU::ProfileData::ProfileData(FrameAlloc* alloc) |
103 | :samples(alloc) |
104 | { } |
105 | |
106 | void ProfilerCPU::ProfileData::beginSample() |
107 | { |
108 | memAllocs = MemoryCounter::getNumAllocs(); |
109 | memFrees = MemoryCounter::getNumFrees(); |
110 | |
111 | timer.reset(); |
112 | timer.start(); |
113 | } |
114 | |
115 | void ProfilerCPU::ProfileData::endSample() |
116 | { |
117 | timer.stop(); |
118 | |
119 | UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs; |
120 | UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees; |
121 | |
122 | samples.push_back(ProfileSample(timer.time, numAllocs, numFrees)); |
123 | } |
124 | |
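	// Restarts the timer without resetting its accumulated time and discards the previously recorded
	// sample, so that the next endSample() reports the combined duration.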
125 | void ProfilerCPU::ProfileData::resumeLastSample() |
126 | { |
127 | timer.start(); |
128 | samples.erase(samples.end() - 1); |
129 | } |
130 | |
131 | ProfilerCPU::PreciseProfileData::PreciseProfileData(FrameAlloc* alloc) |
132 | :samples(alloc) |
133 | { } |
134 | |
135 | void ProfilerCPU::PreciseProfileData::beginSample() |
136 | { |
137 | memAllocs = MemoryCounter::getNumAllocs(); |
138 | memFrees = MemoryCounter::getNumFrees(); |
139 | |
140 | timer.reset(); |
141 | timer.start(); |
142 | } |
143 | |
144 | void ProfilerCPU::PreciseProfileData::endSample() |
145 | { |
146 | timer.stop(); |
147 | |
148 | UINT64 numAllocs = MemoryCounter::getNumAllocs() - memAllocs; |
149 | UINT64 numFrees = MemoryCounter::getNumFrees() - memFrees; |
150 | |
151 | samples.push_back(PreciseProfileSample(timer.cycles, numAllocs, numFrees)); |
152 | } |
153 | |
154 | void ProfilerCPU::PreciseProfileData::resumeLastSample() |
155 | { |
156 | timer.start(); |
157 | samples.erase(samples.end() - 1); |
158 | } |
159 | |
160 | BS_THREADLOCAL ProfilerCPU::ThreadInfo* ProfilerCPU::ThreadInfo::activeThread = nullptr; |
161 | |
162 | ProfilerCPU::ThreadInfo::ThreadInfo() |
163 | :frameAlloc(1024 * 512) |
164 | { |
165 | |
166 | } |
167 | |
168 | void ProfilerCPU::ThreadInfo::begin(const char* _name) |
169 | { |
170 | if(isActive) |
171 | { |
172 | LOGWRN("Profiler::beginThread called on a thread that was already being sampled" ); |
173 | return; |
174 | } |
175 | |
176 | if(rootBlock == nullptr) |
177 | rootBlock = getBlock(_name); |
178 | |
179 | activeBlock = ActiveBlock(ActiveSamplingType::Basic, rootBlock); |
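		// Lazily construct the stack of active blocks using this thread's frame allocator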
180 | if (activeBlocks == nullptr) |
181 | activeBlocks = frameAlloc.construct<Stack<ActiveBlock, StdFrameAlloc<ActiveBlock>>> |
182 | (StdFrameAlloc<ActiveBlock>(&frameAlloc)); |
183 | |
184 | activeBlocks->push(activeBlock); |
185 | |
186 | rootBlock->basic.beginSample(); |
187 | isActive = true; |
188 | } |
189 | |
190 | void ProfilerCPU::ThreadInfo::end() |
191 | { |
		if(!isActive)
		{
			LOGWRN("ProfilerCPU::endThread called on a thread that isn't being sampled.");
			return;
		}

		if(activeBlock.type == ActiveSamplingType::Basic)
			activeBlock.block->basic.endSample();
		else
			activeBlock.block->precise.endSample();

		activeBlocks->pop();
201 | |
202 | if (activeBlocks->size() > 0) |
203 | { |
204 | LOGWRN("Profiler::endThread called but not all sample pairs were closed. Sampling data will not be valid." ); |
205 | |
206 | while (activeBlocks->size() > 0) |
207 | { |
208 | ActiveBlock& curBlock = activeBlocks->top(); |
209 | if(curBlock.type == ActiveSamplingType::Basic) |
210 | curBlock.block->basic.endSample(); |
211 | else |
212 | curBlock.block->precise.endSample(); |
213 | |
214 | activeBlocks->pop(); |
215 | } |
216 | } |
217 | |
218 | isActive = false; |
219 | activeBlock = ActiveBlock(); |
220 | |
221 | frameAlloc.free(activeBlocks); |
222 | activeBlocks = nullptr; |
223 | } |
224 | |
225 | void ProfilerCPU::ThreadInfo::reset() |
226 | { |
227 | if(isActive) |
228 | end(); |
229 | |
230 | if(rootBlock != nullptr) |
231 | releaseBlock(rootBlock); |
232 | |
233 | rootBlock = nullptr; |
234 | frameAlloc.clear(); // Note: This never actually frees memory |
235 | } |
236 | |
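	// Allocates a new profiled block from the frame allocator and copies the provided name into it.
	// The block stays valid until releaseBlock() is called or the allocator is cleared in reset().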
237 | ProfilerCPU::ProfiledBlock* ProfilerCPU::ThreadInfo::getBlock(const char* name) |
238 | { |
239 | ProfiledBlock* block = frameAlloc.construct<ProfiledBlock>(&frameAlloc); |
240 | block->name = (char*)frameAlloc.alloc(((UINT32)strlen(name) + 1) * sizeof(char)); |
241 | strcpy(block->name, name); |
242 | |
243 | return block; |
244 | } |
245 | |
246 | void ProfilerCPU::ThreadInfo::releaseBlock(ProfiledBlock* block) |
247 | { |
248 | frameAlloc.free((UINT8*)block->name); |
249 | frameAlloc.free(block); |
250 | } |
251 | |
252 | ProfilerCPU::ProfiledBlock::ProfiledBlock(FrameAlloc* alloc) |
253 | :basic(alloc), precise(alloc), children(alloc) |
254 | { } |
255 | |
256 | ProfilerCPU::ProfiledBlock::~ProfiledBlock() |
257 | { |
258 | ThreadInfo* thread = ThreadInfo::activeThread; |
259 | |
260 | for(auto& child : children) |
261 | thread->releaseBlock(child); |
262 | |
263 | children.clear(); |
264 | } |
265 | |
266 | ProfilerCPU::ProfiledBlock* ProfilerCPU::ProfiledBlock::findChild(const char* name) const |
267 | { |
268 | for(auto& child : children) |
269 | { |
270 | if(strcmp(child->name, name) == 0) |
271 | return child; |
272 | } |
273 | |
274 | return nullptr; |
275 | } |
276 | |
277 | ProfilerCPU::ProfilerCPU() |
278 | { |
279 | // TODO - We only estimate overhead on program start. It might be better to estimate it each time beginThread is called, |
280 | // and keep separate values per thread. |
281 | estimateTimerOverhead(); |
282 | } |
283 | |
284 | ProfilerCPU::~ProfilerCPU() |
285 | { |
286 | reset(); |
287 | |
288 | Lock lock(mThreadSync); |
289 | |
290 | for(auto& threadInfo : mActiveThreads) |
291 | bs_delete<ThreadInfo, ProfilerAlloc>(threadInfo); |
292 | } |
293 | |
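	// Minimal usage sketch: wrap the region of interest in matching begin/end calls on the thread
	// being measured (the sample name "MyOperation" below is arbitrary):
	//
	//   gProfilerCPU().beginThread("Main");
	//   gProfilerCPU().beginSample("MyOperation");
	//   // ... code being measured ...
	//   gProfilerCPU().endSample("MyOperation");
	//   gProfilerCPU().endThread();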
294 | void ProfilerCPU::beginThread(const char* name) |
295 | { |
296 | ThreadInfo* thread = ThreadInfo::activeThread; |
297 | if(thread == nullptr) |
298 | { |
299 | ThreadInfo::activeThread = bs_new<ThreadInfo, ProfilerAlloc>(); |
300 | thread = ThreadInfo::activeThread; |
301 | |
302 | { |
303 | Lock lock(mThreadSync); |
304 | |
305 | mActiveThreads.push_back(thread); |
306 | } |
307 | } |
308 | |
309 | thread->begin(name); |
310 | } |
311 | |
312 | void ProfilerCPU::endThread() |
313 | { |
		// The null check is deliberately skipped here so the sample can be ended as soon as possible
315 | ThreadInfo::activeThread->end(); |
316 | } |
317 | |
318 | void ProfilerCPU::beginSample(const char* name) |
319 | { |
320 | ThreadInfo* thread = ThreadInfo::activeThread; |
321 | if(thread == nullptr || !thread->isActive) |
322 | { |
323 | beginThread("Unknown" ); |
324 | thread = ThreadInfo::activeThread; |
325 | } |
326 | |
327 | ProfiledBlock* parent = thread->activeBlock.block; |
328 | ProfiledBlock* block = nullptr; |
329 | |
330 | if(parent != nullptr) |
331 | block = parent->findChild(name); |
332 | |
333 | if(block == nullptr) |
334 | { |
335 | block = thread->getBlock(name); |
336 | |
337 | if(parent != nullptr) |
338 | parent->children.push_back(block); |
339 | else |
340 | thread->rootBlock->children.push_back(block); |
341 | } |
342 | |
343 | thread->activeBlock = ActiveBlock(ActiveSamplingType::Basic, block); |
344 | thread->activeBlocks->push(thread->activeBlock); |
345 | |
346 | block->basic.beginSample(); |
347 | } |
348 | |
349 | void ProfilerCPU::endSample(const char* name) |
350 | { |
351 | ThreadInfo* thread = ThreadInfo::activeThread; |
352 | ProfiledBlock* block = thread->activeBlock.block; |
353 | |
354 | #if BS_DEBUG_MODE |
355 | if(block == nullptr) |
356 | { |
357 | LOGWRN("Mismatched CPUProfiler::endSample. No beginSample was called." ); |
358 | return; |
359 | } |
360 | |
361 | if(thread->activeBlock.type == ActiveSamplingType::Precise) |
362 | { |
363 | LOGWRN("Mismatched CPUProfiler::endSample. Was expecting Profiler::endSamplePrecise." ); |
364 | return; |
365 | } |
366 | |
367 | if(strcmp(block->name, name) != 0) |
368 | { |
369 | LOGWRN("Mismatched CPUProfiler::endSample. Was expecting \"" + String(block->name) + |
370 | "\" but got \"" + String(name) + "\". Sampling data will not be valid." ); |
371 | return; |
372 | } |
373 | #endif |
374 | |
375 | block->basic.endSample(); |
376 | |
377 | thread->activeBlocks->pop(); |
378 | |
379 | if (!thread->activeBlocks->empty()) |
380 | thread->activeBlock = thread->activeBlocks->top(); |
381 | else |
382 | thread->activeBlock = ActiveBlock(); |
383 | } |
384 | |
385 | void ProfilerCPU::beginSamplePrecise(const char* name) |
386 | { |
		// Note: There is a (small) possibility that a context switch happens during this measurement, in which case the
		// result will be skewed. Increasing thread priority might help. This is generally only a problem for code that
		// executes for a long time (10-15+ ms, depending on the OS quantum length).
389 | |
		ThreadInfo* thread = ThreadInfo::activeThread;
		if(thread == nullptr || !thread->isActive)
		{
			beginThread("Unknown");
			thread = ThreadInfo::activeThread;
		}
393 | |
394 | ProfiledBlock* parent = thread->activeBlock.block; |
395 | ProfiledBlock* block = nullptr; |
396 | |
397 | if(parent != nullptr) |
398 | block = parent->findChild(name); |
399 | |
400 | if(block == nullptr) |
401 | { |
402 | block = thread->getBlock(name); |
403 | |
404 | if(parent != nullptr) |
405 | parent->children.push_back(block); |
406 | else |
407 | thread->rootBlock->children.push_back(block); |
408 | } |
409 | |
410 | thread->activeBlock = ActiveBlock(ActiveSamplingType::Precise, block); |
411 | thread->activeBlocks->push(thread->activeBlock); |
412 | |
413 | block->precise.beginSample(); |
414 | } |
415 | |
416 | void ProfilerCPU::endSamplePrecise(const char* name) |
417 | { |
418 | ThreadInfo* thread = ThreadInfo::activeThread; |
419 | ProfiledBlock* block = thread->activeBlock.block; |
420 | |
421 | #if BS_DEBUG_MODE |
422 | if(block == nullptr) |
423 | { |
424 | LOGWRN("Mismatched Profiler::endSamplePrecise. No beginSamplePrecise was called." ); |
425 | return; |
426 | } |
427 | |
428 | if(thread->activeBlock.type == ActiveSamplingType::Basic) |
429 | { |
430 | LOGWRN("Mismatched CPUProfiler::endSamplePrecise. Was expecting Profiler::endSample." ); |
431 | return; |
432 | } |
433 | |
434 | if (strcmp(block->name, name) != 0) |
435 | { |
436 | LOGWRN("Mismatched Profiler::endSamplePrecise. Was expecting \"" + String(block->name) + |
437 | "\" but got \"" + String(name) + "\". Sampling data will not be valid." ); |
438 | return; |
439 | } |
440 | #endif |
441 | |
442 | block->precise.endSample(); |
443 | |
444 | thread->activeBlocks->pop(); |
445 | |
446 | if (!thread->activeBlocks->empty()) |
447 | thread->activeBlock = thread->activeBlocks->top(); |
448 | else |
449 | thread->activeBlock = ActiveBlock(); |
450 | } |
451 | |
452 | void ProfilerCPU::reset() |
453 | { |
454 | ThreadInfo* thread = ThreadInfo::activeThread; |
455 | |
456 | if(thread != nullptr) |
457 | thread->reset(); |
458 | } |
459 | |
460 | CPUProfilerReport ProfilerCPU::generateReport() |
461 | { |
462 | CPUProfilerReport report; |
463 | |
464 | ThreadInfo* thread = ThreadInfo::activeThread; |
465 | if(thread == nullptr) |
466 | return report; |
467 | |
468 | if(thread->isActive) |
469 | thread->end(); |
470 | |
471 | // We need to separate out basic and precise data and form two separate hierarchies |
472 | if(thread->rootBlock == nullptr) |
473 | return report; |
474 | |
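		// Temporary entry used while flattening the block hierarchy; pairs a profiled block with its
		// output index and records the indices of its children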
475 | struct TempEntry |
476 | { |
477 | TempEntry(ProfiledBlock* _parentBlock, UINT32 _entryIdx) |
478 | :parentBlock(_parentBlock), entryIdx(_entryIdx) |
479 | { } |
480 | |
481 | ProfiledBlock* parentBlock; |
482 | UINT32 entryIdx; |
483 | ProfilerVector<UINT32> childIndexes; |
484 | }; |
485 | |
486 | ProfilerVector<CPUProfilerBasicSamplingEntry> basicEntries; |
487 | ProfilerVector<CPUProfilerPreciseSamplingEntry> preciseEntries; |
488 | |
		// Fill the flatHierarchy array so that parents always come before their children. Iterating it
		// in reverse (below) then processes children before their parents.
490 | ProfilerStack<UINT32> todo; |
491 | ProfilerVector<TempEntry> flatHierarchy; |
492 | |
493 | UINT32 entryIdx = 0; |
494 | todo.push(entryIdx); |
495 | flatHierarchy.push_back(TempEntry(thread->rootBlock, entryIdx)); |
496 | |
497 | entryIdx++; |
498 | while(!todo.empty()) |
499 | { |
500 | UINT32 curDataIdx = todo.top(); |
501 | ProfiledBlock* curBlock = flatHierarchy[curDataIdx].parentBlock; |
502 | |
503 | todo.pop(); |
504 | |
505 | for(auto& child : curBlock->children) |
506 | { |
507 | flatHierarchy[curDataIdx].childIndexes.push_back(entryIdx); |
508 | |
509 | todo.push(entryIdx); |
510 | flatHierarchy.push_back(TempEntry(child, entryIdx)); |
511 | |
512 | entryIdx++; |
513 | } |
514 | } |
515 | |
516 | // Calculate sampling data for all entries |
517 | basicEntries.resize(flatHierarchy.size()); |
518 | preciseEntries.resize(flatHierarchy.size()); |
519 | |
520 | for(auto iter = flatHierarchy.rbegin(); iter != flatHierarchy.rend(); ++iter) |
521 | { |
522 | TempEntry& curData = *iter; |
523 | ProfiledBlock* curBlock = curData.parentBlock; |
524 | |
525 | CPUProfilerBasicSamplingEntry* entryBasic = &basicEntries[curData.entryIdx]; |
526 | CPUProfilerPreciseSamplingEntry* entryPrecise = &preciseEntries[curData.entryIdx]; |
527 | |
528 | // Calculate basic data |
529 | entryBasic->data.name = String(curBlock->name); |
530 | |
531 | entryBasic->data.memAllocs = 0; |
532 | entryBasic->data.memFrees = 0; |
533 | entryBasic->data.totalTimeMs = 0.0; |
534 | entryBasic->data.maxTimeMs = 0.0; |
535 | for(auto& sample : curBlock->basic.samples) |
536 | { |
537 | entryBasic->data.totalTimeMs += sample.time; |
538 | entryBasic->data.maxTimeMs = std::max(entryBasic->data.maxTimeMs, sample.time); |
539 | entryBasic->data.memAllocs += sample.numAllocs; |
540 | entryBasic->data.memFrees += sample.numFrees; |
541 | } |
542 | |
543 | entryBasic->data.numCalls = (UINT32)curBlock->basic.samples.size(); |
544 | |
545 | if(entryBasic->data.numCalls > 0) |
546 | entryBasic->data.avgTimeMs = entryBasic->data.totalTimeMs / entryBasic->data.numCalls; |
547 | |
548 | double totalChildTime = 0.0; |
549 | for(auto& childIdx : curData.childIndexes) |
550 | { |
551 | CPUProfilerBasicSamplingEntry* childEntry = &basicEntries[childIdx]; |
552 | totalChildTime += childEntry->data.totalTimeMs; |
553 | childEntry->data.pctOfParent = (float)(childEntry->data.totalTimeMs / entryBasic->data.totalTimeMs); |
554 | |
555 | entryBasic->data.estimatedOverheadMs += childEntry->data.estimatedOverheadMs; |
556 | } |
557 | |
558 | entryBasic->data.estimatedOverheadMs += curBlock->basic.samples.size() * mBasicSamplingOverheadMs; |
559 | entryBasic->data.estimatedOverheadMs += curBlock->precise.samples.size() * mPreciseSamplingOverheadMs; |
560 | |
561 | entryBasic->data.totalSelfTimeMs = entryBasic->data.totalTimeMs - totalChildTime; |
562 | |
563 | if(entryBasic->data.numCalls > 0) |
564 | entryBasic->data.avgSelfTimeMs = entryBasic->data.totalSelfTimeMs / entryBasic->data.numCalls; |
565 | |
566 | entryBasic->data.estimatedSelfOverheadMs = mBasicTimerOverhead; |
567 | |
568 | // Calculate precise data |
569 | entryPrecise->data.name = String(curBlock->name); |
570 | |
571 | entryPrecise->data.memAllocs = 0; |
572 | entryPrecise->data.memFrees = 0; |
573 | entryPrecise->data.totalCycles = 0; |
574 | entryPrecise->data.maxCycles = 0; |
575 | for(auto& sample : curBlock->precise.samples) |
576 | { |
577 | entryPrecise->data.totalCycles += sample.cycles; |
578 | entryPrecise->data.maxCycles = std::max(entryPrecise->data.maxCycles, sample.cycles); |
579 | entryPrecise->data.memAllocs += sample.numAllocs; |
580 | entryPrecise->data.memFrees += sample.numFrees; |
581 | } |
582 | |
583 | entryPrecise->data.numCalls = (UINT32)curBlock->precise.samples.size(); |
584 | |
585 | if(entryPrecise->data.numCalls > 0) |
586 | entryPrecise->data.avgCycles = entryPrecise->data.totalCycles / entryPrecise->data.numCalls; |
587 | |
588 | UINT64 totalChildCycles = 0; |
589 | for(auto& childIdx : curData.childIndexes) |
590 | { |
591 | CPUProfilerPreciseSamplingEntry* childEntry = &preciseEntries[childIdx]; |
592 | totalChildCycles += childEntry->data.totalCycles; |
593 | childEntry->data.pctOfParent = childEntry->data.totalCycles / (float)entryPrecise->data.totalCycles; |
594 | |
595 | entryPrecise->data.estimatedOverhead += childEntry->data.estimatedOverhead; |
596 | } |
597 | |
598 | entryPrecise->data.estimatedOverhead += curBlock->precise.samples.size() * mPreciseSamplingOverheadCycles; |
599 | entryPrecise->data.estimatedOverhead += curBlock->basic.samples.size() * mBasicSamplingOverheadCycles; |
600 | |
601 | entryPrecise->data.totalSelfCycles = entryPrecise->data.totalCycles - totalChildCycles; |
602 | |
603 | if(entryPrecise->data.numCalls > 0) |
604 | entryPrecise->data.avgSelfCycles = entryPrecise->data.totalSelfCycles / entryPrecise->data.numCalls; |
605 | |
606 | entryPrecise->data.estimatedSelfOverhead = mPreciseTimerOverhead; |
607 | } |
608 | |
609 | // Prune empty basic entries |
610 | ProfilerStack<UINT32> finalBasicHierarchyTodo; |
611 | ProfilerStack<UINT32> parentBasicEntryIndexes; |
612 | ProfilerVector<TempEntry> newBasicEntries; |
613 | |
614 | finalBasicHierarchyTodo.push(0); |
615 | |
616 | entryIdx = 0; |
617 | parentBasicEntryIndexes.push(entryIdx); |
618 | newBasicEntries.push_back(TempEntry(nullptr, entryIdx)); |
619 | |
620 | entryIdx++; |
621 | |
622 | while(!finalBasicHierarchyTodo.empty()) |
623 | { |
624 | UINT32 parentEntryIdx = parentBasicEntryIndexes.top(); |
625 | parentBasicEntryIndexes.pop(); |
626 | |
627 | UINT32 curEntryIdx = finalBasicHierarchyTodo.top(); |
628 | TempEntry& curEntry = flatHierarchy[curEntryIdx]; |
629 | finalBasicHierarchyTodo.pop(); |
630 | |
631 | for(auto& childIdx : curEntry.childIndexes) |
632 | { |
633 | finalBasicHierarchyTodo.push(childIdx); |
634 | |
635 | CPUProfilerBasicSamplingEntry& basicEntry = basicEntries[childIdx]; |
636 | if(basicEntry.data.numCalls > 0) |
637 | { |
638 | newBasicEntries.push_back(TempEntry(nullptr, childIdx)); |
639 | newBasicEntries[parentEntryIdx].childIndexes.push_back(entryIdx); |
640 | |
641 | parentBasicEntryIndexes.push(entryIdx); |
642 | |
643 | entryIdx++; |
644 | } |
645 | else |
646 | parentBasicEntryIndexes.push(parentEntryIdx); |
647 | } |
648 | } |
649 | |
650 | if(newBasicEntries.size() > 0) |
651 | { |
652 | ProfilerVector<CPUProfilerBasicSamplingEntry*> finalBasicEntries; |
653 | |
654 | report.mBasicSamplingRootEntry = basicEntries[newBasicEntries[0].entryIdx]; |
655 | finalBasicEntries.push_back(&report.mBasicSamplingRootEntry); |
656 | |
657 | finalBasicHierarchyTodo.push(0); |
658 | |
659 | while(!finalBasicHierarchyTodo.empty()) |
660 | { |
661 | UINT32 curEntryIdx = finalBasicHierarchyTodo.top(); |
662 | finalBasicHierarchyTodo.pop(); |
663 | |
664 | TempEntry& curEntry = newBasicEntries[curEntryIdx]; |
665 | |
666 | CPUProfilerBasicSamplingEntry* basicEntry = finalBasicEntries[curEntryIdx]; |
667 | |
668 | basicEntry->childEntries.resize(curEntry.childIndexes.size()); |
669 | UINT32 idx = 0; |
670 | |
671 | for(auto& childIdx : curEntry.childIndexes) |
672 | { |
673 | TempEntry& childEntry = newBasicEntries[childIdx]; |
674 | basicEntry->childEntries[idx] = basicEntries[childEntry.entryIdx]; |
675 | |
676 | finalBasicEntries.push_back(&(basicEntry->childEntries[idx])); |
677 | finalBasicHierarchyTodo.push(childIdx); |
678 | idx++; |
679 | } |
680 | } |
681 | } |
682 | |
683 | // Prune empty precise entries |
684 | ProfilerStack<UINT32> finalPreciseHierarchyTodo; |
685 | ProfilerStack<UINT32> parentPreciseEntryIndexes; |
686 | ProfilerVector<TempEntry> newPreciseEntries; |
687 | |
688 | finalPreciseHierarchyTodo.push(0); |
689 | |
690 | entryIdx = 0; |
691 | parentPreciseEntryIndexes.push(entryIdx); |
692 | newPreciseEntries.push_back(TempEntry(nullptr, entryIdx)); |
693 | |
694 | entryIdx++; |
695 | |
696 | while(!finalPreciseHierarchyTodo.empty()) |
697 | { |
698 | UINT32 parentEntryIdx = parentPreciseEntryIndexes.top(); |
699 | parentPreciseEntryIndexes.pop(); |
700 | |
701 | UINT32 curEntryIdx = finalPreciseHierarchyTodo.top(); |
702 | TempEntry& curEntry = flatHierarchy[curEntryIdx]; |
703 | finalPreciseHierarchyTodo.pop(); |
704 | |
705 | for(auto& childIdx : curEntry.childIndexes) |
706 | { |
707 | finalPreciseHierarchyTodo.push(childIdx); |
708 | |
709 | CPUProfilerPreciseSamplingEntry& preciseEntry = preciseEntries[childIdx]; |
710 | if(preciseEntry.data.numCalls > 0) |
711 | { |
712 | newPreciseEntries.push_back(TempEntry(nullptr, childIdx)); |
713 | newPreciseEntries[parentEntryIdx].childIndexes.push_back(entryIdx); |
714 | |
715 | parentPreciseEntryIndexes.push(entryIdx); |
716 | |
717 | entryIdx++; |
718 | } |
719 | else |
720 | parentPreciseEntryIndexes.push(parentEntryIdx); |
721 | } |
722 | } |
723 | |
724 | if(newPreciseEntries.size() > 0) |
725 | { |
726 | ProfilerVector<CPUProfilerPreciseSamplingEntry*> finalPreciseEntries; |
727 | |
728 | report.mPreciseSamplingRootEntry = preciseEntries[newPreciseEntries[0].entryIdx]; |
729 | finalPreciseEntries.push_back(&report.mPreciseSamplingRootEntry); |
730 | |
731 | finalPreciseHierarchyTodo.push(0); |
732 | |
733 | while(!finalPreciseHierarchyTodo.empty()) |
734 | { |
735 | UINT32 curEntryIdx = finalPreciseHierarchyTodo.top(); |
736 | finalPreciseHierarchyTodo.pop(); |
737 | |
738 | TempEntry& curEntry = newPreciseEntries[curEntryIdx]; |
739 | |
740 | CPUProfilerPreciseSamplingEntry* preciseEntry = finalPreciseEntries[curEntryIdx]; |
741 | |
742 | preciseEntry->childEntries.resize(curEntry.childIndexes.size()); |
743 | UINT32 idx = 0; |
744 | |
745 | for(auto& childIdx : curEntry.childIndexes) |
746 | { |
747 | TempEntry& childEntry = newPreciseEntries[childIdx]; |
748 | preciseEntry->childEntries[idx] = preciseEntries[childEntry.entryIdx]; |
749 | |
750 | finalPreciseEntries.push_back(&preciseEntry->childEntries.back()); |
751 | finalPreciseHierarchyTodo.push(childIdx); |
752 | idx++; |
753 | } |
754 | } |
755 | } |
756 | |
757 | return report; |
758 | } |
759 | |
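	// Measures the fixed cost of empty begin/end pairs over several rounds and keeps the minimum
	// average, on the assumption that the minimum is the closest estimate of pure overhead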
760 | void ProfilerCPU::estimateTimerOverhead() |
761 | { |
		// Get an idea of how long timer calls and RDTSC take
763 | const UINT32 reps = 1000, sampleReps = 20; |
764 | |
765 | mBasicTimerOverhead = 1000000.0; |
766 | mPreciseTimerOverhead = 1000000; |
767 | for (UINT32 tries = 0; tries < 20; tries++) |
768 | { |
769 | Timer timer; |
770 | for (UINT32 i = 0; i < reps; i++) |
771 | { |
772 | timer.start(); |
773 | timer.stop(); |
774 | } |
775 | |
776 | double avgTime = double(timer.time)/double(reps); |
777 | if (avgTime < mBasicTimerOverhead) |
778 | mBasicTimerOverhead = avgTime; |
779 | |
780 | TimerPrecise timerPrecise; |
781 | for (UINT32 i = 0; i < reps; i++) |
782 | { |
783 | timerPrecise.start(); |
784 | timerPrecise.stop(); |
785 | } |
786 | |
787 | UINT64 avgCycles = timerPrecise.cycles/reps; |
788 | if (avgCycles < mPreciseTimerOverhead) |
789 | mPreciseTimerOverhead = avgCycles; |
790 | } |
791 | |
792 | mBasicSamplingOverheadMs = 1000000.0; |
793 | mPreciseSamplingOverheadMs = 1000000.0; |
794 | mBasicSamplingOverheadCycles = 1000000; |
795 | mPreciseSamplingOverheadCycles = 1000000; |
796 | for (UINT32 tries = 0; tries < 3; tries++) |
797 | { |
798 | /************************************************************************/ |
799 | /* AVERAGE TIME IN MS FOR BASIC SAMPLING */ |
800 | /************************************************************************/ |
801 | |
802 | Timer timerA; |
803 | timerA.start(); |
804 | |
805 | beginThread("Main" ); |
806 | |
			// Two different cases can affect performance: one where the sample already exists and
			// one where a new sample needs to be created
809 | for (UINT32 i = 0; i < sampleReps; i++) |
810 | { |
811 | beginSample("TestAvg1" ); |
812 | endSample("TestAvg1" ); |
813 | beginSample("TestAvg2" ); |
814 | endSample("TestAvg2" ); |
815 | beginSample("TestAvg3" ); |
816 | endSample("TestAvg3" ); |
817 | beginSample("TestAvg4" ); |
818 | endSample("TestAvg4" ); |
819 | beginSample("TestAvg5" ); |
820 | endSample("TestAvg5" ); |
821 | beginSample("TestAvg6" ); |
822 | endSample("TestAvg6" ); |
823 | beginSample("TestAvg7" ); |
824 | endSample("TestAvg7" ); |
825 | beginSample("TestAvg8" ); |
826 | endSample("TestAvg8" ); |
827 | beginSample("TestAvg9" ); |
828 | endSample("TestAvg9" ); |
829 | beginSample("TestAvg10" ); |
830 | endSample("TestAvg10" ); |
831 | } |
832 | |
833 | for (UINT32 i = 0; i < sampleReps * 5; i++) |
834 | { |
835 | beginSample(("TestAvg#" + toString(i)).c_str()); |
836 | endSample(("TestAvg#" + toString(i)).c_str()); |
837 | } |
838 | |
839 | endThread(); |
840 | |
841 | timerA.stop(); |
842 | |
843 | reset(); |
844 | |
845 | double avgTimeBasic = double(timerA.time)/double(sampleReps * 10 + sampleReps * 5) - mBasicTimerOverhead; |
846 | if (avgTimeBasic < mBasicSamplingOverheadMs) |
847 | mBasicSamplingOverheadMs = avgTimeBasic; |
848 | |
849 | /************************************************************************/ |
850 | /* AVERAGE CYCLES FOR BASIC SAMPLING */ |
851 | /************************************************************************/ |
852 | |
853 | TimerPrecise timerPreciseA; |
854 | timerPreciseA.start(); |
855 | |
856 | beginThread("Main" ); |
857 | |
			// Two different cases can affect performance: one where the sample already exists and
			// one where a new sample needs to be created
860 | for (UINT32 i = 0; i < sampleReps; i++) |
861 | { |
862 | beginSample("TestAvg1" ); |
863 | endSample("TestAvg1" ); |
864 | beginSample("TestAvg2" ); |
865 | endSample("TestAvg2" ); |
866 | beginSample("TestAvg3" ); |
867 | endSample("TestAvg3" ); |
868 | beginSample("TestAvg4" ); |
869 | endSample("TestAvg4" ); |
870 | beginSample("TestAvg5" ); |
871 | endSample("TestAvg5" ); |
872 | beginSample("TestAvg6" ); |
873 | endSample("TestAvg6" ); |
874 | beginSample("TestAvg7" ); |
875 | endSample("TestAvg7" ); |
876 | beginSample("TestAvg8" ); |
877 | endSample("TestAvg8" ); |
878 | beginSample("TestAvg9" ); |
879 | endSample("TestAvg9" ); |
880 | beginSample("TestAvg10" ); |
881 | endSample("TestAvg10" ); |
882 | } |
883 | |
884 | for (UINT32 i = 0; i < sampleReps * 5; i++) |
885 | { |
886 | beginSample(("TestAvg#" + toString(i)).c_str()); |
887 | endSample(("TestAvg#" + toString(i)).c_str()); |
888 | } |
889 | |
890 | endThread(); |
891 | timerPreciseA.stop(); |
892 | |
893 | reset(); |
894 | |
895 | UINT64 avgCyclesBasic = timerPreciseA.cycles/(sampleReps * 10 + sampleReps * 5) - mPreciseTimerOverhead; |
896 | if (avgCyclesBasic < mBasicSamplingOverheadCycles) |
897 | mBasicSamplingOverheadCycles = avgCyclesBasic; |
898 | |
899 | /************************************************************************/ |
900 | /* AVERAGE TIME IN MS FOR PRECISE SAMPLING */ |
901 | /************************************************************************/ |
902 | |
903 | Timer timerB; |
904 | timerB.start(); |
905 | beginThread("Main" ); |
906 | |
			// Two different cases can affect performance: one where the sample already exists and
			// one where a new sample needs to be created
909 | for (UINT32 i = 0; i < sampleReps; i++) |
910 | { |
911 | beginSamplePrecise("TestAvg1" ); |
912 | endSamplePrecise("TestAvg1" ); |
913 | beginSamplePrecise("TestAvg2" ); |
914 | endSamplePrecise("TestAvg2" ); |
915 | beginSamplePrecise("TestAvg3" ); |
916 | endSamplePrecise("TestAvg3" ); |
917 | beginSamplePrecise("TestAvg4" ); |
918 | endSamplePrecise("TestAvg4" ); |
919 | beginSamplePrecise("TestAvg5" ); |
920 | endSamplePrecise("TestAvg5" ); |
921 | beginSamplePrecise("TestAvg6" ); |
922 | endSamplePrecise("TestAvg6" ); |
923 | beginSamplePrecise("TestAvg7" ); |
924 | endSamplePrecise("TestAvg7" ); |
925 | beginSamplePrecise("TestAvg8" ); |
926 | endSamplePrecise("TestAvg8" ); |
927 | beginSamplePrecise("TestAvg9" ); |
928 | endSamplePrecise("TestAvg9" ); |
929 | beginSamplePrecise("TestAvg10" ); |
930 | endSamplePrecise("TestAvg10" ); |
931 | } |
932 | |
933 | for (UINT32 i = 0; i < sampleReps * 5; i++) |
934 | { |
935 | beginSamplePrecise(("TestAvg#" + toString(i)).c_str()); |
936 | endSamplePrecise(("TestAvg#" + toString(i)).c_str()); |
937 | } |
938 | |
939 | endThread(); |
940 | timerB.stop(); |
941 | |
942 | reset(); |
943 | |
			double avgTimePrecise = timerB.time/(sampleReps * 10 + sampleReps * 5);
			if (avgTimePrecise < mPreciseSamplingOverheadMs)
				mPreciseSamplingOverheadMs = avgTimePrecise;
947 | |
948 | /************************************************************************/ |
949 | /* AVERAGE CYCLES FOR PRECISE SAMPLING */ |
950 | /************************************************************************/ |
951 | |
952 | TimerPrecise timerPreciseB; |
953 | timerPreciseB.start(); |
954 | beginThread("Main" ); |
955 | |
			// Two different cases can affect performance: one where the sample already exists and
			// one where a new sample needs to be created
958 | for (UINT32 i = 0; i < sampleReps; i++) |
959 | { |
960 | beginSamplePrecise("TestAvg1" ); |
961 | endSamplePrecise("TestAvg1" ); |
962 | beginSamplePrecise("TestAvg2" ); |
963 | endSamplePrecise("TestAvg2" ); |
964 | beginSamplePrecise("TestAvg3" ); |
965 | endSamplePrecise("TestAvg3" ); |
966 | beginSamplePrecise("TestAvg4" ); |
967 | endSamplePrecise("TestAvg4" ); |
968 | beginSamplePrecise("TestAvg5" ); |
969 | endSamplePrecise("TestAvg5" ); |
970 | beginSamplePrecise("TestAvg6" ); |
971 | endSamplePrecise("TestAvg6" ); |
972 | beginSamplePrecise("TestAvg7" ); |
973 | endSamplePrecise("TestAvg7" ); |
974 | beginSamplePrecise("TestAvg8" ); |
975 | endSamplePrecise("TestAvg8" ); |
976 | beginSamplePrecise("TestAvg9" ); |
977 | endSamplePrecise("TestAvg9" ); |
978 | beginSamplePrecise("TestAvg10" ); |
979 | endSamplePrecise("TestAvg10" ); |
980 | } |
981 | |
982 | for (UINT32 i = 0; i < sampleReps * 5; i++) |
983 | { |
984 | beginSamplePrecise(("TestAvg#" + toString(i)).c_str()); |
985 | endSamplePrecise(("TestAvg#" + toString(i)).c_str()); |
986 | } |
987 | |
988 | endThread(); |
989 | timerPreciseB.stop(); |
990 | |
991 | reset(); |
992 | |
993 | UINT64 avgCyclesPrecise = timerPreciseB.cycles/(sampleReps * 10 + sampleReps * 5); |
994 | if (avgCyclesPrecise < mPreciseSamplingOverheadCycles) |
995 | mPreciseSamplingOverheadCycles = avgCyclesPrecise; |
996 | } |
997 | } |
998 | |
999 | ProfilerCPU& gProfilerCPU() |
1000 | { |
1001 | return ProfilerCPU::instance(); |
1002 | } |
1003 | } |
1004 | |