1 | /* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. |
2 | |
3 | This program is free software; you can redistribute it and/or modify |
4 | it under the terms of the GNU General Public License as published by |
5 | the Free Software Foundation; version 2 of the License. |
6 | |
7 | This program is distributed in the hope that it will be useful, |
8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | GNU General Public License for more details. |
11 | |
12 | You should have received a copy of the GNU General Public License |
13 | along with this program; if not, write to the Free Software |
14 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
15 | |
16 | /* |
17 | rdtsc3 -- multi-platform timer code |
18 | pgulutzan@mysql.com, 2005-08-29 |
19 | modified 2008-11-02 |
20 | |
21 | Functions: |
22 | |
23 | my_timer_cycles ulonglong cycles |
24 | my_timer_nanoseconds ulonglong nanoseconds |
25 | my_timer_microseconds ulonglong "microseconds" |
26 | my_timer_milliseconds ulonglong milliseconds |
27 | my_timer_ticks ulonglong ticks |
28 | my_timer_init initialization / test |
29 | |
30 | We'll call the first 5 functions (the ones that return |
31 | a ulonglong) "my_timer_xxx" functions. |
32 | Each my_timer_xxx function returns a 64-bit timing value |
33 | since an arbitrary 'epoch' start. Since the only purpose |
34 | is to determine elapsed times, wall-clock time-of-day |
35 | is not known and not relevant. |
36 | |
37 | The my_timer_init function is necessary for initializing. |
38 | It returns information (underlying routine name, |
39 | frequency, resolution, overhead) about all my_timer_xxx |
40 | functions. A program should call my_timer_init once, |
41 | use the information to decide what my_timer_xxx function |
42 | to use, and subsequently call that function by function |
43 | pointer. |
44 | |
45 | A typical use would be: |
46 | my_timer_init() ... once, at program start |
47 | ... |
48 | time1= my_timer_xxx() ... time before start |
49 | [code that's timed] |
50 | time2= my_timer_xxx() ... time after end |
51 | elapsed_time= (time2 - time1) - overhead |
52 | */ |
53 | |
54 | #include "my_global.h" |
55 | #include "my_rdtsc.h" |
56 | |
57 | #if defined(_WIN32) |
58 | #include <stdio.h> |
59 | #include "windows.h" |
60 | #else |
61 | #include <stdio.h> |
62 | #endif |
63 | |
64 | #if !defined(_WIN32) |
65 | #if TIME_WITH_SYS_TIME |
66 | #include <sys/time.h> |
67 | #include <time.h> /* for clock_gettime */ |
68 | #else |
69 | #if HAVE_SYS_TIME_H |
70 | #include <sys/time.h> |
71 | #elif defined(HAVE_TIME_H) |
72 | #include <time.h> |
73 | #endif |
74 | #endif |
75 | #endif |
76 | |
77 | #if defined(HAVE_ASM_MSR_H) && defined(HAVE_RDTSCLL) |
78 | #include <asm/msr.h> /* for rdtscll */ |
79 | #endif |
80 | |
81 | #if defined(HAVE_SYS_TIMEB_H) && defined(HAVE_FTIME) |
82 | #include <sys/timeb.h> /* for ftime */ |
83 | #endif |
84 | |
85 | #if defined(HAVE_SYS_TIMES_H) && defined(HAVE_TIMES) |
86 | #include <sys/times.h> /* for times */ |
87 | #endif |
88 | |
89 | #if defined(__INTEL_COMPILER) && defined(__ia64__) && defined(HAVE_IA64INTRIN_H) |
90 | #include <ia64intrin.h> /* for __GetReg */ |
91 | #endif |
92 | |
93 | #if defined(__APPLE__) && defined(__MACH__) |
94 | #include <mach/mach_time.h> |
95 | #endif |
96 | |
97 | #if defined(__SUNPRO_CC) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7) |
98 | extern "C" ulonglong my_timer_cycles_il_sparc64(); |
99 | #elif defined(__SUNPRO_CC) && defined(_ILP32) && !defined(__SunOS_5_7) |
100 | extern "C" ulonglong my_timer_cycles_il_sparc32(); |
101 | #elif defined(__SUNPRO_CC) && defined(__i386) && defined(_ILP32) |
102 | extern "C" ulonglong my_timer_cycles_il_i386(); |
103 | #elif defined(__SUNPRO_CC) && defined(__x86_64) && defined(_LP64) |
104 | extern "C" ulonglong my_timer_cycles_il_x86_64(); |
105 | #elif defined(__SUNPRO_C) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7) |
106 | ulonglong my_timer_cycles_il_sparc64(); |
107 | #elif defined(__SUNPRO_C) && defined(_ILP32) && !defined(__SunOS_5_7) |
108 | ulonglong my_timer_cycles_il_sparc32(); |
109 | #elif defined(__SUNPRO_C) && defined(__i386) && defined(_ILP32) |
110 | ulonglong my_timer_cycles_il_i386(); |
111 | #elif defined(__SUNPRO_C) && defined(__x86_64) && defined(_LP64) |
112 | ulonglong my_timer_cycles_il_x86_64(); |
113 | #endif |
114 | |
115 | #if defined(__INTEL_COMPILER) |
116 | /* |
117 | icc warning #1011 is: |
118 | missing return statement at end of non-void function |
119 | */ |
120 | #pragma warning (disable:1011) |
121 | #endif |
122 | |
123 | /* |
124 | For cycles, we depend on RDTSC for x86 platforms, |
125 | or on time buffer (which is not really a cycle count |
126 | but a separate counter with less than nanosecond |
127 | resolution) for most PowerPC platforms, or on |
128 | gethrtime which is okay for hpux and solaris, or on |
129 | clock_gettime(CLOCK_SGI_CYCLE) for Irix platforms, |
130 | or on read_real_time for aix platforms. There is |
131 | nothing for Alpha platforms, they would be tricky. |
132 | |
133 | On the platforms that do not have a CYCLE timer, |
134 | "wait" events are initialized to use NANOSECOND instead of CYCLE |
135 | during performance_schema initialization (at the server startup). |
136 | |
137 | Linux performance monitor (see "man perf_event_open") can |
138 | provide cycle counter on the platforms that do not have |
139 | other kinds of cycle counters. But we don't use it so far. |
140 | |
141 | ARM notes |
142 | --------- |
  During tests on ARMv7 Debian, a perf_event_open() based cycle counter provided
  too low a frequency with too high an overhead:
145 | MariaDB [performance_schema]> SELECT * FROM performance_timers; |
146 | +-------------+-----------------+------------------+----------------+ |
147 | | TIMER_NAME | TIMER_FREQUENCY | TIMER_RESOLUTION | TIMER_OVERHEAD | |
148 | +-------------+-----------------+------------------+----------------+ |
149 | | CYCLE | 689368159 | 1 | 970 | |
150 | | NANOSECOND | 1000000000 | 1 | 308 | |
151 | | MICROSECOND | 1000000 | 1 | 417 | |
152 | | MILLISECOND | 1000 | 1000 | 407 | |
153 | | TICK | 127 | 1 | 612 | |
154 | +-------------+-----------------+------------------+----------------+ |
  Therefore, it was decided not to use perf_event_open() on ARM
156 | (i.e. go without CYCLE and have "wait" events use NANOSECOND by default). |
157 | */ |
158 | |
/*
  Read the platform's cycle (or cycle-like) counter.
  Exactly one of the #if branches below is compiled in; if no branch
  applies, the function returns 0 and my_timer_init() will then mark
  the CYCLE timer as unavailable.
*/
ulonglong my_timer_cycles(void)
{
#if defined(__GNUC__) && defined(__i386__)
  /* This works much better if compiled with "gcc -O3". */
  ulonglong result;
  __asm__ __volatile__ ("rdtsc" : "=A" (result)); /* "=A" = the edx:eax pair */
  return result;
#elif defined(__SUNPRO_C) && defined(__i386)
  /* rdtsc leaves its value in edx:eax, the 32-bit return-register pair,
     so there is no explicit return (see the warning #1011 note above). */
  __asm("rdtsc" );
#elif defined(__GNUC__) && defined(__x86_64__)
  ulonglong result;
  /* Combine the two 32-bit rdtsc halves: result = (rdx << 32) | rax. */
  __asm__ __volatile__ ("rdtsc\n\t" \
                        "shlq $32,%%rdx\n\t" \
                        "orq %%rdx,%%rax"
                        : "=a" (result) :: "%edx" );
  return result;
#elif defined(HAVE_ASM_MSR_H) && defined(HAVE_RDTSCLL)
  {
    ulonglong result;
    rdtscll(result);
    return result;
  }
#elif defined(_WIN32) && defined(_M_IX86)
  /* Value is left in edx:eax; no explicit return (see #1011 note above). */
  __asm {rdtsc};
#elif defined(_WIN64) && defined(_M_X64)
  /* For 64-bit Windows: unsigned __int64 __rdtsc(); */
  return __rdtsc();
#elif defined(__INTEL_COMPILER) && defined(__ia64__) && defined(HAVE_IA64INTRIN_H)
  return (ulonglong) __getReg(_IA64_REG_AR_ITC); /* (3116) */
#elif defined(__GNUC__) && defined(__ia64__)
  {
    /* Itanium interval time counter, register ar.itc. */
    ulonglong result;
    __asm __volatile__ ("mov %0=ar.itc" : "=r" (result));
    return result;
  }
#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (defined(__64BIT__) || defined(_ARCH_PPC64))
  {
    /* 64-bit PowerPC: one mftb reads the entire time base. */
    ulonglong result;
    __asm __volatile__ ("mftb %0" : "=r" (result));
    return result;
  }
#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (!defined(__64BIT__) && !defined(_ARCH_PPC64))
  {
    /*
      mftbu means "move from time-buffer-upper to result".
      The loop is saying: x1=upper, x2=lower, x3=upper,
      if x1!=x3 there was an overflow so repeat.
    */
    unsigned int x1, x2, x3;
    ulonglong result;
    for (;;)
    {
      __asm __volatile__ ( "mftbu %0" : "=r" (x1) );
      __asm __volatile__ ( "mftb %0" : "=r" (x2) );
      __asm __volatile__ ( "mftbu %0" : "=r" (x3) );
      if (x1 == x3) break;
    }
    /* Reassemble the stable upper half with the lower half. */
    result = x1;
    return ( result << 32 ) | x2;
  }
#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7)
  return (my_timer_cycles_il_sparc64());
#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(_ILP32) && !defined(__SunOS_5_7)
  return (my_timer_cycles_il_sparc32());
#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__i386) && defined(_ILP32)
  /* This is probably redundant for __SUNPRO_C. */
  return (my_timer_cycles_il_i386());
#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__x86_64) && defined(_LP64)
  return (my_timer_cycles_il_x86_64());
#elif defined(__GNUC__) && defined(__sparcv9) && defined(_LP64) && (__GNUC__>2)
  {
    /* SPARC v9 in 64-bit mode: read the %tick register directly. */
    ulonglong result;
    __asm __volatile__ ("rd %%tick,%0" : "=r" (result));
    return result;
  }
#elif defined(__GNUC__) && defined(__sparc__) && !defined(_LP64) && (__GNUC__>2)
  {
    /* 32-bit SPARC: %tick is 64-bit, so read it and shift the upper
       half into a second register; the union stitches the two 32-bit
       halves back into one 64-bit value. */
    union {
      ulonglong wholeresult;
      struct {
        ulong high;
        ulong low;
      } splitresult;
    } result;
    __asm __volatile__ ("rd %%tick,%1; srlx %1,32,%0" : "=r" (result.splitresult.high), "=r" (result.splitresult.low));
    return result.wholeresult;
  }
#elif defined(__sgi) && defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE)
  {
    /* Irix: CLOCK_SGI_CYCLE expressed as seconds + nanoseconds. */
    struct timespec tp;
    clock_gettime(CLOCK_SGI_CYCLE, &tp);
    return (ulonglong) tp.tv_sec * 1000000000 + (ulonglong) tp.tv_nsec;
  }
#elif defined(__GNUC__) && defined(__s390__)
  /* covers both s390 and s390x */
  {
    /* stck = "store clock", the S/390 TOD clock. */
    ulonglong result;
    __asm__ __volatile__ ("stck %0" : "=Q" (result) : : "cc" );
    return result;
  }
#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME)
  /* gethrtime may appear as either cycle or nanosecond counter */
  return (ulonglong) gethrtime();
#else
  return 0;
#endif
}
266 | |
267 | #if defined(__INTEL_COMPILER) |
268 | /* re-enable warning#1011 which was only for my_timer_cycles() */ |
269 | /* There may be an icc bug which means we must leave disabled. */ |
270 | #pragma warning (default:1011) |
271 | #endif |
272 | |
273 | /* |
274 | For nanoseconds, most platforms have nothing available that |
275 | (a) doesn't require bringing in a 40-kb librt.so library |
276 | (b) really has nanosecond resolution. |
277 | */ |
278 | |
/*
  Return a nanosecond counter where the platform offers one:
  AIX read_real_time, Solaris/HP-UX gethrtime,
  POSIX clock_gettime(CLOCK_REALTIME), or macOS mach_absolute_time.
  Returns 0 when no branch applies.
*/
ulonglong my_timer_nanoseconds(void)
{
#if defined(HAVE_READ_REAL_TIME)
  {
    timebasestruct_t tr;
    read_real_time(&tr, TIMEBASE_SZ);
    /* NOTE(review): assumes tb_high/tb_low already hold seconds and
       nanoseconds, i.e. no time_base_to_time() conversion is needed
       on this platform -- confirm against the AIX documentation. */
    return (ulonglong) tr.tb_high * 1000000000 + (ulonglong) tr.tb_low;
  }
#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME)
  /* SunOS 5.10+, Solaris, HP-UX: hrtime_t gethrtime(void) */
  return (ulonglong) gethrtime();
#elif defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_REALTIME)
  {
    struct timespec tp;
    clock_gettime(CLOCK_REALTIME, &tp);
    return (ulonglong) tp.tv_sec * 1000000000 + (ulonglong) tp.tv_nsec;
  }
#elif defined(__APPLE__) && defined(__MACH__)
  {
    ulonglong tm;
    /* One-time lazy fetch of the tick -> nanosecond conversion ratio. */
    static mach_timebase_info_data_t timebase_info= {0,0};
    if (timebase_info.denom == 0)
      (void) mach_timebase_info(&timebase_info);
    tm= mach_absolute_time();
    /* ticks * numer / denom = nanoseconds */
    return (tm * timebase_info.numer) / timebase_info.denom;
  }
#else
  return 0;
#endif
}
309 | |
310 | /* |
311 | For microseconds, gettimeofday() is available on |
312 | almost all platforms. On Windows we use |
313 | QueryPerformanceCounter which will usually tick over |
314 | 3.5 million times per second, and we don't throw |
315 | away the extra precision. (On Windows Server 2003 |
316 | the frequency is same as the cycle frequency.) |
317 | */ |
318 | |
/*
  Return a microsecond counter: gettimeofday() where available,
  else QueryPerformanceCounter() on Windows (whose unit is then
  1/frequency seconds rather than exactly one microsecond -- the
  actual frequency is recorded by my_timer_init()), else 0.
*/
ulonglong my_timer_microseconds(void)
{
#if defined(HAVE_GETTIMEOFDAY)
  {
    /* Retain the last good reading so a failing gettimeofday() still
       yields a usable, slightly advanced value.
       NOTE(review): last_value is an unsynchronized static; concurrent
       callers could race on it -- confirm this is acceptable. */
    static ulonglong last_value= 0;
    struct timeval tv;
    if (gettimeofday(&tv, NULL) == 0)
      last_value= (ulonglong) tv.tv_sec * 1000000 + (ulonglong) tv.tv_usec;
    else
    {
      /*
        There are reports that gettimeofday(2) can have intermittent failures
        on some platform, see for example Bug#36819.
        We are not trying again or looping, just returning the best value possible
        under the circumstances ...
      */
      last_value++;
    }
    return last_value;
  }
#elif defined(_WIN32)
  {
    /* QueryPerformanceCounter usually works with about 1/3 microsecond. */
    LARGE_INTEGER t_cnt;

    QueryPerformanceCounter(&t_cnt);
    return (ulonglong) t_cnt.QuadPart;
  }
#else
  return 0;
#endif
}
351 | |
352 | /* |
353 | For milliseconds, we use ftime() if it's supported |
354 | or time()*1000 if it's not. With modern versions of |
355 | Windows and with HP Itanium, resolution is 10-15 |
356 | milliseconds. |
357 | */ |
358 | |
/*
  Return a millisecond counter: ftime() if available, else
  time()*1000 (whole-second resolution), else the Windows system
  file time. Returns 0 when no branch applies.
*/
ulonglong my_timer_milliseconds(void)
{
#if defined(HAVE_SYS_TIMEB_H) && defined(HAVE_FTIME)
  /* ftime() is obsolete but maybe the platform is old */
  struct timeb ft;
  ftime(&ft);
  return (ulonglong)ft.time * 1000 + (ulonglong)ft.millitm;
#elif defined(HAVE_TIME)
  /* Whole seconds only; my_timer_init() pins resolution to 1000. */
  return (ulonglong) time(NULL) * 1000;
#elif defined(_WIN32)
  /* FILETIME counts 100-nanosecond units; /10000 converts to ms. */
  FILETIME ft;
  GetSystemTimeAsFileTime( &ft );
  return ((ulonglong)ft.dwLowDateTime +
          (((ulonglong)ft.dwHighDateTime) << 32))/10000;
#else
  return 0;
#endif
}
377 | |
378 | /* |
379 | For ticks, which we handle with times(), the frequency |
380 | is usually 100/second and the overhead is surprisingly |
381 | bad, sometimes even worse than gettimeofday's overhead. |
382 | */ |
383 | |
384 | ulonglong my_timer_ticks(void) |
385 | { |
386 | #if defined(HAVE_SYS_TIMES_H) && defined(HAVE_TIMES) |
387 | { |
388 | struct tms times_buf; |
389 | return (ulonglong) times(×_buf); |
390 | } |
391 | #elif defined(_WIN32) |
392 | return (ulonglong) GetTickCount(); |
393 | #else |
394 | return 0; |
395 | #endif |
396 | } |
397 | |
398 | /* |
399 | The my_timer_init() function and its sub-functions |
400 | have several loops which call timers. If there's |
401 | something wrong with a timer -- which has never |
402 | happened in tests -- we want the loop to end after |
403 | an arbitrary number of iterations, and my_timer_info |
404 | will show a discouraging result. The arbitrary |
405 | number is 1,000,000. |
406 | */ |
407 | #define MY_TIMER_ITERATIONS 1000000 |
408 | |
409 | /* |
410 | Calculate overhead. Called from my_timer_init(). |
411 | Usually best_timer_overhead = cycles.overhead or |
412 | nanoseconds.overhead, so returned amount is in |
413 | cycles or nanoseconds. We repeat the calculation |
414 | ten times, so that we can disregard effects of |
415 | caching or interrupts. Result is quite consistent |
416 | for cycles, at least. But remember it's a minimum. |
417 | */ |
418 | |
419 | static void my_timer_init_overhead(ulonglong *overhead, |
420 | ulonglong (*cycle_timer)(void), |
421 | ulonglong (*this_timer)(void), |
422 | ulonglong best_timer_overhead) |
423 | { |
424 | ulonglong time1, time2; |
425 | int i; |
426 | |
427 | /* *overhead, least of 20 calculations - cycles.overhead */ |
428 | for (i= 0, *overhead= 1000000000; i < 20; ++i) |
429 | { |
430 | time1= cycle_timer(); |
431 | this_timer(); /* rather than 'time_tmp= timer();' */ |
432 | time2= cycle_timer() - time1; |
433 | if (*overhead > time2) |
434 | *overhead= time2; |
435 | } |
436 | *overhead-= best_timer_overhead; |
437 | } |
438 | |
439 | /* |
440 | Calculate Resolution. Called from my_timer_init(). |
441 | If a timer goes up by jumps, e.g. 1050, 1075, 1100, ... |
442 | then the best resolution is the minimum jump, e.g. 25. |
443 | If it's always divisible by 1000 then it's just a |
444 | result of multiplication of a lower-precision timer |
445 | result, e.g. nanoseconds are often microseconds * 1000. |
446 | If the minimum jump is less than an arbitrary passed |
447 | figure (a guess based on maximum overhead * 2), ignore. |
448 | Usually we end up with nanoseconds = 1 because it's too |
449 | hard to detect anything <= 100 nanoseconds. |
450 | Often GetTickCount() has resolution = 15. |
451 | We don't check with ticks because they take too long. |
452 | */ |
453 | static ulonglong my_timer_init_resolution(ulonglong (*this_timer)(void), |
454 | ulonglong overhead_times_2) |
455 | { |
456 | ulonglong time1, time2; |
457 | ulonglong best_jump; |
458 | int i, jumps, divisible_by_1000, divisible_by_1000000; |
459 | |
460 | divisible_by_1000= divisible_by_1000000= 0; |
461 | best_jump= 1000000; |
462 | for (i= jumps= 0; jumps < 3 && i < MY_TIMER_ITERATIONS * 10; ++i) |
463 | { |
464 | time1= this_timer(); |
465 | time2= this_timer(); |
466 | time2-= time1; |
467 | if (time2) |
468 | { |
469 | ++jumps; |
470 | if (!(time2 % 1000)) |
471 | { |
472 | ++divisible_by_1000; |
473 | if (!(time2 % 1000000)) |
474 | ++divisible_by_1000000; |
475 | } |
476 | if (best_jump > time2) |
477 | best_jump= time2; |
478 | /* For milliseconds, one jump is enough. */ |
479 | if (overhead_times_2 == 0) |
480 | break; |
481 | } |
482 | } |
483 | if (jumps == 3) |
484 | { |
485 | if (jumps == divisible_by_1000000) |
486 | return 1000000; |
487 | if (jumps == divisible_by_1000) |
488 | return 1000; |
489 | } |
490 | if (best_jump > overhead_times_2) |
491 | return best_jump; |
492 | return 1; |
493 | } |
494 | |
495 | /* |
496 | Calculate cycle frequency by seeing how many cycles pass |
497 | in a 200-microsecond period. I tried with 10-microsecond |
498 | periods originally, and the result was often very wrong. |
499 | */ |
500 | |
501 | static ulonglong my_timer_init_frequency(MY_TIMER_INFO *mti) |
502 | { |
503 | int i; |
504 | ulonglong time1, time2, time3, time4; |
505 | time1= my_timer_cycles(); |
506 | time2= my_timer_microseconds(); |
507 | time3= time2; /* Avoids a Microsoft/IBM compiler warning */ |
508 | for (i= 0; i < MY_TIMER_ITERATIONS; ++i) |
509 | { |
510 | time3= my_timer_microseconds(); |
511 | if (time3 - time2 > 200) break; |
512 | } |
513 | time4= my_timer_cycles() - mti->cycles.overhead; |
514 | time4-= mti->microseconds.overhead; |
515 | return (mti->microseconds.frequency * (time4 - time1)) / (time3 - time2); |
516 | } |
517 | |
518 | /* |
519 | Call my_timer_init before the first call to my_timer_xxx(). |
520 | If something must be initialized, it happens here. |
521 | Set: what routine is being used e.g. "asm_x86" |
522 | Set: function, overhead, actual frequency, resolution. |
523 | */ |
524 | |
void my_timer_init(MY_TIMER_INFO *mti)
{
  ulonglong (*best_timer)(void);
  ulonglong best_timer_overhead;
  ulonglong time1, time2;
  int i;

  /* cycles */
  /* The #if ladder below must select the same branch as the one in
     my_timer_cycles(); keep the two ladders in sync. */
  mti->cycles.frequency= 1000000000;
#if defined(__GNUC__) && defined(__i386__)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86;
#elif defined(__SUNPRO_C) && defined(__i386)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86;
#elif defined(__GNUC__) && defined(__x86_64__)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86_64;
#elif defined(HAVE_ASM_MSR_H) && defined(HAVE_RDTSCLL)
  mti->cycles.routine= MY_TIMER_ROUTINE_RDTSCLL;
#elif defined(_WIN32) && defined(_M_IX86)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86_WIN;
#elif defined(_WIN64) && defined(_M_X64)
  mti->cycles.routine= MY_TIMER_ROUTINE_RDTSC;
#elif defined(__INTEL_COMPILER) && defined(__ia64__) && defined(HAVE_IA64INTRIN_H)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_IA64;
#elif defined(__GNUC__) && defined(__ia64__)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_IA64;
#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (defined(__64BIT__) || defined(_ARCH_PPC64))
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_PPC64;
#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (!defined(__64BIT__) && !defined(_ARCH_PPC64))
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_PPC;
#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_SPARC64;
#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(_ILP32) && !defined(__SunOS_5_7)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_SPARC32;
#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__i386) && defined(_ILP32)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_I386;
#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__x86_64) && defined(_LP64)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_X86_64;
#elif defined(__GNUC__) && defined(__sparcv9) && defined(_LP64) && (__GNUC__>2)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_GCC_SPARC64;
#elif defined(__GNUC__) && defined(__sparc__) && !defined(_LP64) && (__GNUC__>2)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_GCC_SPARC32;
#elif defined(__sgi) && defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE)
  mti->cycles.routine= MY_TIMER_ROUTINE_SGI_CYCLE;
#elif defined(__GNUC__) && defined(__s390__)
  mti->cycles.routine= MY_TIMER_ROUTINE_ASM_S390;
#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME)
  mti->cycles.routine= MY_TIMER_ROUTINE_GETHRTIME;
#else
  mti->cycles.routine= 0;
#endif

  /* Disable a timer entirely if no routine was selected or its first
     reading is zero (i.e. it does not work on this machine). The same
     check is applied to each of the five timers below. */
  if (!mti->cycles.routine || !my_timer_cycles())
  {
    mti->cycles.routine= 0;
    mti->cycles.resolution= 0;
    mti->cycles.frequency= 0;
    mti->cycles.overhead= 0;
  }

  /* nanoseconds */
  mti->nanoseconds.frequency= 1000000000; /* initial assumption */
#if defined(HAVE_READ_REAL_TIME)
  mti->nanoseconds.routine= MY_TIMER_ROUTINE_READ_REAL_TIME;
#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME)
  mti->nanoseconds.routine= MY_TIMER_ROUTINE_GETHRTIME;
#elif defined(HAVE_CLOCK_GETTIME)
  mti->nanoseconds.routine= MY_TIMER_ROUTINE_CLOCK_GETTIME;
#elif defined(__APPLE__) && defined(__MACH__)
  mti->nanoseconds.routine= MY_TIMER_ROUTINE_MACH_ABSOLUTE_TIME;
#else
  mti->nanoseconds.routine= 0;
#endif
  if (!mti->nanoseconds.routine || !my_timer_nanoseconds())
  {
    mti->nanoseconds.routine= 0;
    mti->nanoseconds.resolution= 0;
    mti->nanoseconds.frequency= 0;
    mti->nanoseconds.overhead= 0;
  }

  /* microseconds */
  mti->microseconds.frequency= 1000000; /* initial assumption */
#if defined(HAVE_GETTIMEOFDAY)
  mti->microseconds.routine= MY_TIMER_ROUTINE_GETTIMEOFDAY;
#elif defined(_WIN32)
  {
    LARGE_INTEGER li;
    /* Windows: typical frequency = 3579545, actually 1/3 microsecond. */
    if (!QueryPerformanceFrequency(&li))
      mti->microseconds.routine= 0;
    else
    {
      mti->microseconds.frequency= li.QuadPart;
      mti->microseconds.routine= MY_TIMER_ROUTINE_QUERYPERFORMANCECOUNTER;
    }
  }
#else
  mti->microseconds.routine= 0;
#endif
  if (!mti->microseconds.routine || !my_timer_microseconds())
  {
    mti->microseconds.routine= 0;
    mti->microseconds.resolution= 0;
    mti->microseconds.frequency= 0;
    mti->microseconds.overhead= 0;
  }

  /* milliseconds */
  mti->milliseconds.frequency= 1000; /* initial assumption */
#if defined(HAVE_SYS_TIMEB_H) && defined(HAVE_FTIME)
  mti->milliseconds.routine= MY_TIMER_ROUTINE_FTIME;
#elif defined(_WIN32)
  mti->milliseconds.routine= MY_TIMER_ROUTINE_GETSYSTEMTIMEASFILETIME;
#elif defined(HAVE_TIME)
  mti->milliseconds.routine= MY_TIMER_ROUTINE_TIME;
#else
  mti->milliseconds.routine= 0;
#endif
  if (!mti->milliseconds.routine || !my_timer_milliseconds())
  {
    mti->milliseconds.routine= 0;
    mti->milliseconds.resolution= 0;
    mti->milliseconds.frequency= 0;
    mti->milliseconds.overhead= 0;
  }

  /* ticks */
  mti->ticks.frequency= 100; /* permanent assumption */
#if defined(HAVE_SYS_TIMES_H) && defined(HAVE_TIMES)
  mti->ticks.routine= MY_TIMER_ROUTINE_TIMES;
#elif defined(_WIN32)
  mti->ticks.routine= MY_TIMER_ROUTINE_GETTICKCOUNT;
#else
  mti->ticks.routine= 0;
#endif
  if (!mti->ticks.routine || !my_timer_ticks())
  {
    mti->ticks.routine= 0;
    mti->ticks.resolution= 0;
    mti->ticks.frequency= 0;
    mti->ticks.overhead= 0;
  }

  /*
    Calculate overhead in terms of the timer that
    gives the best resolution: cycles or nanoseconds.
    I doubt it ever will be as bad as microseconds.
  */
  if (mti->cycles.routine)
    best_timer= &my_timer_cycles;
  else
  {
    if (mti->nanoseconds.routine)
    {
      best_timer= &my_timer_nanoseconds;
    }
    else
      best_timer= &my_timer_microseconds;
  }

  /* best_timer_overhead = least of 20 calculations */
  for (i= 0, best_timer_overhead= 1000000000; i < 20; ++i)
  {
    time1= best_timer();
    time2= best_timer() - time1;
    if (best_timer_overhead > time2)
      best_timer_overhead= time2;
  }
  /* Measure each working timer's call overhead relative to best_timer. */
  if (mti->cycles.routine)
    my_timer_init_overhead(&mti->cycles.overhead,
                           best_timer,
                           &my_timer_cycles,
                           best_timer_overhead);
  if (mti->nanoseconds.routine)
    my_timer_init_overhead(&mti->nanoseconds.overhead,
                           best_timer,
                           &my_timer_nanoseconds,
                           best_timer_overhead);
  if (mti->microseconds.routine)
    my_timer_init_overhead(&mti->microseconds.overhead,
                           best_timer,
                           &my_timer_microseconds,
                           best_timer_overhead);
  if (mti->milliseconds.routine)
    my_timer_init_overhead(&mti->milliseconds.overhead,
                           best_timer,
                           &my_timer_milliseconds,
                           best_timer_overhead);
  if (mti->ticks.routine)
    my_timer_init_overhead(&mti->ticks.overhead,
                           best_timer,
                           &my_timer_ticks,
                           best_timer_overhead);

  /*
    Calculate resolution for nanoseconds or microseconds
    or milliseconds, by seeing if it's always divisible
    by 1000, and by noticing how much jumping occurs.
    For ticks, just assume the resolution is 1.
  */
  if (mti->cycles.routine)
    mti->cycles.resolution= 1;
  if (mti->nanoseconds.routine)
    mti->nanoseconds.resolution=
    my_timer_init_resolution(&my_timer_nanoseconds, 20000);
  if (mti->microseconds.routine)
    mti->microseconds.resolution=
    my_timer_init_resolution(&my_timer_microseconds, 20);
  if (mti->milliseconds.routine)
  {
    /* time()*1000 can only ever advance in whole seconds. */
    if (mti->milliseconds.routine == MY_TIMER_ROUTINE_TIME)
      mti->milliseconds.resolution= 1000;
    else
      mti->milliseconds.resolution=
      my_timer_init_resolution(&my_timer_milliseconds, 0);
  }
  if (mti->ticks.routine)
    mti->ticks.resolution= 1;

  /*
    Calculate cycles frequency,
    if we have both a cycles routine and a microseconds routine.
    In tests, this usually results in a figure within 2% of
    what "cat /proc/cpuinfo" says.
    If the microseconds routine is QueryPerformanceCounter
    (i.e. it's Windows), and the microseconds frequency is >
    500,000,000 (i.e. it's Windows Server so it uses RDTSC)
    and the microseconds resolution is > 100 (i.e. dreadful),
    then calculate cycles frequency = microseconds frequency.
  */
  if (mti->cycles.routine
      && mti->microseconds.routine)
  {
    if (mti->microseconds.routine ==
        MY_TIMER_ROUTINE_QUERYPERFORMANCECOUNTER
        && mti->microseconds.frequency > 500000000
        && mti->microseconds.resolution > 100)
      mti->cycles.frequency= mti->microseconds.frequency;
    else
    {
      time1= my_timer_init_frequency(mti);
      /* Repeat once in case there was an interruption. */
      time2= my_timer_init_frequency(mti);
      if (time1 < time2) mti->cycles.frequency= time1;
      else mti->cycles.frequency= time2;
    }
  }

  /*
    Calculate milliseconds frequency =
    (cycles-frequency/#-of-cycles) * #-of-milliseconds,
    if we have both a milliseconds routine and a cycles
    routine.
    This will be inaccurate if milliseconds resolution > 1.
    This is probably only useful when testing new platforms.
  */
  if (mti->milliseconds.routine
      && mti->milliseconds.resolution < 1000
      && mti->microseconds.routine
      && mti->cycles.routine)
  {
    ulonglong time3, time4;
    time1= my_timer_cycles();
    time2= my_timer_milliseconds();
    time3= time2; /* Avoids a Microsoft/IBM compiler warning */
    /* Busy-wait until more than 10 milliseconds pass (bounded loop). */
    for (i= 0; i < MY_TIMER_ITERATIONS * 1000; ++i)
    {
      time3= my_timer_milliseconds();
      if (time3 - time2 > 10) break;
    }
    time4= my_timer_cycles();
    mti->milliseconds.frequency=
    (mti->cycles.frequency * (time3 - time2)) / (time4 - time1);
  }

  /*
    Calculate ticks.frequency =
    (cycles-frequency / #-of-cycles) * #-of-ticks,
    if we have both a ticks routine and a cycles
    routine.
    This is probably only useful when testing new platforms.
  */
  if (mti->ticks.routine
      && mti->microseconds.routine
      && mti->cycles.routine)
  {
    ulonglong time3, time4;
    time1= my_timer_cycles();
    time2= my_timer_ticks();
    time3= time2; /* Avoids a Microsoft/IBM compiler warning */
    /* Busy-wait until more than 10 ticks pass (bounded loop). */
    for (i= 0; i < MY_TIMER_ITERATIONS * 1000; ++i)
    {
      time3= my_timer_ticks();
      if (time3 - time2 > 10) break;
    }
    time4= my_timer_cycles();
    mti->ticks.frequency=
    (mti->cycles.frequency * (time3 - time2)) / (time4 - time1);
  }
}
825 | |
826 | /* |
827 | Additional Comments |
828 | ------------------- |
829 | |
830 | This is for timing, i.e. finding out how long a piece of code |
831 | takes. If you want time of day matching a wall clock, the |
832 | my_timer_xxx functions won't help you. |
833 | |
834 | The best timer is the one with highest frequency, lowest |
835 | overhead, and resolution=1. The my_timer_info() routine will tell |
836 | you at runtime which timer that is. Usually it will be |
837 | my_timer_cycles() but be aware that, although it's best, |
838 | it has possible flaws and dangers. Depending on platform: |
839 | - The frequency might change. We don't test for this. It |
840 | happens on laptops for power saving, and on blade servers |
841 | for avoiding overheating. |
842 | - The overhead that my_timer_init() returns is the minimum. |
843 | In fact it could be slightly greater because of caching or |
844 | because you call the routine by address, as recommended. |
845 | It could be hugely greater if there's an interrupt. |
846 | - The x86 cycle counter, RDTSC doesn't "serialize". That is, |
847 | if there is out-of-order execution, rdtsc might be processed |
848 | after an instruction that logically follows it. |
849 | (We could force serialization, but that would be slower.) |
850 | - It is possible to set a flag which renders RDTSC |
851 | inoperative. Somebody responsible for the kernel |
852 | of the operating system would have to make this |
853 | decision. For the platforms we've tested with, there's |
854 | no such problem. |
855 | - With a multi-processor arrangement, it's possible |
856 | to get the cycle count from one processor in |
857 | thread X, and the cycle count from another processor |
858 | in thread Y. They may not always be in synch. |
859 | - You can't depend on a cycle counter being available for |
860 | all platforms. On Alphas, the |
861 | cycle counter is only 32-bit, so it would overflow quickly, |
862 | so we don't bother with it. On platforms that we haven't |
863 | tested, there might be some if/endif combination that we |
864 | didn't expect, or some assembler routine that we didn't |
865 | supply. |
866 | |
867 | The recommended way to use the timer routines is: |
868 | 1. Somewhere near the beginning of the program, call |
869 | my_timer_init(). This should only be necessary once, |
870 | although you can call it again if you think that the |
871 | frequency has changed. |
872 | 2. Determine the best timer based on frequency, resolution, |
873 | overhead -- all things that my_timer_init() returns. |
874 | Preserve the address of the timer and the my_timer_init |
875 | results in an easily-accessible place. |
876 | 3. Instrument the code section that you're monitoring, thus: |
877 | time1= my_timer_xxx(); |
878 | Instrumented code; |
879 | time2= my_timer_xxx(); |
880 | elapsed_time= (time2 - time1) - overhead; |
881 | If the timer is always on, then overhead is always there, |
882 | so don't subtract it. |
883 | 4. Save the elapsed time, or add it to a totaller. |
884 | 5. When all timing processes are complete, transfer the |
885 | saved / totalled elapsed time to permanent storage. |
886 | Optionally you can convert cycles to microseconds at |
887 | this point. (Don't do so every time you calculate |
888 | elapsed_time! That would waste time and lose precision!) |
889 | For converting cycles to microseconds, use the frequency |
890 | that my_timer_init() returns. You'll also need to convert |
891 | if the my_timer_microseconds() function is the Windows |
892 | function QueryPerformanceCounter(), since that's sometimes |
893 | a counter with precision slightly better than microseconds. |
894 | |
895 | Since we recommend calls by function pointer, we supply |
896 | no inline functions. |
897 | |
898 | Some comments on the many candidate routines for timing ... |
899 | |
900 | clock() -- We don't use because it would overflow frequently. |
901 | |
902 | clock_gettime() -- In tests, clock_gettime often had |
903 | resolution = 1000. |
904 | |
905 | ftime() -- A "man ftime" says: "This function is obsolete. |
906 | Don't use it." On every platform that we tested, if ftime() |
907 | was available, then so was gettimeofday(), and gettimeofday() |
908 | overhead was always at least as good as ftime() overhead. |
909 | |
910 | gettimeofday() -- available on most platforms, though not |
911 | on Windows. There is a hardware timer (sometimes a Programmable |
912 | Interrupt Timer or "PIT") (sometimes a "HPET") used for |
913 | interrupt generation. When it interrupts (a "tick" or "jiffy", |
914 | typically 1 centisecond) it sets xtime. For gettimeofday, a |
915 | Linux kernel routine usually gets xtime and then gets rdtsc |
916 | to get elapsed nanoseconds since the last tick. On Red Hat |
917 | Enterprise Linux 3, there was once a bug which caused the |
918 | resolution to be 1000, i.e. one centisecond. We never check |
919 | for time-zone change. |
920 | |
921 | getnstimeofday() -- something to watch for in future Linux |
922 | |
923 | do_gettimeofday() -- exists on Linux but not for "userland" |
924 | |
925 | get_cycles() -- a multi-platform function, worth watching |
926 | in future Linux versions. But we found platform-specific |
927 | functions which were better documented in operating-system |
928 | manuals. And get_cycles() can fail or return a useless |
929 | 32-bit number. It might be available on some platforms, |
930 | such as arm, which we didn't test. Using |
931 | "include <linux/timex.h>" or "include <asm/timex.h>" |
932 | can lead to autoconf or compile errors, depending on system. |
933 | |
934 | rdtsc, __rdtsc, rdtscll: available for x86 with Linux, BSD, |
935 | Solaris, Windows. See "possible flaws and dangers" comments. |
936 | |
937 | times(): what we use for ticks. Should just read the last |
938 | (xtime) tick count, therefore should be fast, but usually |
939 | isn't. |
940 | |
941 | GetTickCount(): we use this for my_timer_ticks() on |
942 | Windows. Actually it really is a tick counter, so resolution |
943 | >= 10 milliseconds unless you have a very old Windows version. |
944 | With Windows 95 or 98 or ME, timeGetTime() has better resolution than |
945 | GetTickCount (1ms rather than 55ms). But with Windows NT or XP or 2000, |
946 | they're both getting from a variable in the Process Environment Block |
947 | (PEB), and the variable is set by the programmable interrupt timer, so |
948 | the resolution is the same (usually 10-15 milliseconds). Also timeGetTime |
949 | is slower on old machines: |
950 | http://www.doumo.jp/aon-java/jsp/postgretips/tips.jsp?tips=74. |
951 | Also timeGetTime requires linking winmm.lib. |
952 | Therefore we use GetTickCount. |
953 | It will overflow every 49 days because the return is 32-bit. |
954 | There is also a GetTickCount64 but it requires Vista or Windows Server 2008. |
955 | (As for GetSystemTimeAsFileTime, its precision is spurious, it |
956 | just reads the tick variable like the other functions do. |
957 | However, we don't expect it to overflow every 49 days, so we |
958 | will prefer it for my_timer_milliseconds().) |
959 | |
960 | QueryPerformanceCounter() we use this for my_timer_microseconds() |
961 | on Windows. 1-PIT-tick (often 1/3-microsecond). Usually reads |
962 | the PIT so it's slow. On some Windows variants, uses RDTSC. |
963 | |
964 | GetLocalTime() this is available on Windows but we don't use it. |
965 | |
966 | getclock(): documented for Alpha, but not found during tests. |
967 | |
968 | mach_absolute_time() and UpTime() are recommended for Apple. |
969 | Initially they weren't tried, because asm_ppc seems to do the job. |
970 | But now we use mach_absolute_time for nanoseconds. |
971 | |
972 | Any clock-based timer can be affected by NTP (the ntpd program), |
973 | which means: |
974 | - full-second correction can occur for leap second |
975 | - tiny corrections can occur approximately every 11 minutes |
976 | (but I think they only affect the RTC which isn't the PIT). |
977 | |
978 | We define "precision" as "frequency" and "high precision" is |
979 | "frequency better than 1 microsecond". We define "resolution" |
980 | as a synonym for "granularity". We define "accuracy" as |
981 | "closeness to the truth" as established by some authoritative |
982 | clock, but we can't measure accuracy. |
983 | |
984 | Do not expect any of our timers to be monotonic; we |
985 | won't guarantee that they return constantly-increasing |
986 | unique numbers. |
987 | |
988 | We tested with AIX, Solaris (x86 + Sparc), Linux (x86 + |
989 | Itanium), Windows, 64-bit Windows, QNX, FreeBSD, HPUX, |
990 | Irix, Mac. We didn't test with SCO. |
991 | |
992 | */ |
993 | |
994 | |