1 | /* benchmark.h |
2 | */ |
3 | |
4 | #ifndef BENCHMARKS_INCLUDE_BENCHMARK_H_ |
5 | #define BENCHMARKS_INCLUDE_BENCHMARK_H_ |
6 | #include <roaring/portability.h> |
7 | #include <time.h> |
8 | |
/*
 * CLOBBER_MEMORY: when ROARING_INLINE_ASM is defined, expands to an empty
 * inline-assembly statement that lists "memory" as clobbered; otherwise it
 * expands to nothing. Intended to be written as a statement: CLOBBER_MEMORY;
 */
#if !defined(ROARING_INLINE_ASM)
#define CLOBBER_MEMORY
#else
#define CLOBBER_MEMORY __asm volatile("" : : : /* pretend to clobber */ "memory")
#endif
14 | |
15 | #if defined(IS_X64) && defined(ROARING_INLINE_ASM) |
/*
 * RDTSC_START(cycles): x86-64 inline-assembly variant.
 * Executes cpuid followed by rdtsc, copies EDX (upper half) and EAX (lower
 * half) into two 32-bit locals and stores the combined 64-bit value in
 * `cycles`. rax, rbx, rcx and rdx are declared clobbered.
 * NOTE(review): cpuid is presumably here to serialize execution before the
 * counter is read -- confirm against the vendor benchmarking guidance.
 */
#define RDTSC_START(cycles)                                          \
    do {                                                             \
        register unsigned tsc_upper, tsc_lower;                      \
        __asm volatile(                                              \
            "cpuid\n\t"                                              \
            "rdtsc\n\t"                                              \
            "mov %%edx, %[upper]\n\t"                                \
            "mov %%eax, %[lower]\n\t"                                \
            : [upper] "=r"(tsc_upper), [lower] "=r"(tsc_lower)       \
            :                                                        \
            : "%rax", "%rbx", "%rcx", "%rdx");                       \
        (cycles) = (uint64_t)tsc_lower | ((uint64_t)tsc_upper << 32); \
    } while (0)
27 | |
/*
 * RDTSC_FINAL(cycles): x86-64 inline-assembly variant.
 * Executes rdtscp, copies EDX (upper half) and EAX (lower half) into two
 * 32-bit locals, then executes cpuid, and stores the combined 64-bit value
 * in `cycles`. rax, rbx, rcx and rdx are declared clobbered.
 * NOTE(review): the trailing cpuid presumably keeps later instructions from
 * starting before the counter has been read -- confirm.
 */
#define RDTSC_FINAL(cycles)                                          \
    do {                                                             \
        register unsigned tsc_upper, tsc_lower;                      \
        __asm volatile(                                              \
            "rdtscp\n\t"                                             \
            "mov %%edx, %[upper]\n\t"                                \
            "mov %%eax, %[lower]\n\t"                                \
            "cpuid\n\t"                                              \
            : [upper] "=r"(tsc_upper), [lower] "=r"(tsc_lower)       \
            :                                                        \
            : "%rax", "%rbx", "%rcx", "%rdx");                       \
        (cycles) = (uint64_t)tsc_lower | ((uint64_t)tsc_upper << 32); \
    } while (0)
39 | |
40 | #elif defined(__linux__) && defined(__GLIBC__) |
41 | |
42 | #include <time.h> |
43 | #ifdef CLOCK_THREAD_CPUTIME_ID |
/*
 * RDTSC_START(cycles): clock_gettime variant.
 * Reads CLOCK_THREAD_CPUTIME_ID and stores the reading in `cycles` as a
 * nanosecond count (seconds * 1e9 + nanoseconds). The return value of
 * clock_gettime is not checked.
 */
#define RDTSC_START(cycles)                                          \
    do {                                                             \
        struct timespec rdtsc_stamp_;                                \
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &rdtsc_stamp_);       \
        (cycles) = rdtsc_stamp_.tv_nsec +                            \
                   rdtsc_stamp_.tv_sec * UINT64_C(1000000000);       \
    } while (0)
50 | |
/*
 * RDTSC_FINAL(cycles): clock_gettime variant paired with the
 * CLOCK_THREAD_CPUTIME_ID flavour of RDTSC_START.
 * Stores the current thread CPU time in `cycles` as a nanosecond count
 * (seconds * 1e9 + nanoseconds).
 *
 * It must read the same clock as RDTSC_START: callers subtract the start
 * reading from this one, and mixing the thread CPU clock with the wall
 * clock would make that difference meaningless. The return value of
 * clock_gettime is not checked.
 */
#define RDTSC_FINAL(cycles)                                            \
    do {                                                               \
        struct timespec ts;                                            \
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);                   \
        (cycles) = ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec;      \
    } while (0)
57 | |
58 | #elif defined(CLOCK_REALTIME) // #ifdef CLOCK_THREAD_CPUTIME_ID |
/*
 * RDTSC_START(cycles): wall-clock variant.
 * Reads CLOCK_REALTIME and stores the reading in `cycles` as a nanosecond
 * count (seconds * 1e9 + nanoseconds). The return value of clock_gettime is
 * not checked.
 */
#define RDTSC_START(cycles)                                          \
    do {                                                             \
        struct timespec rdtsc_stamp_;                                \
        clock_gettime(CLOCK_REALTIME, &rdtsc_stamp_);                \
        (cycles) = rdtsc_stamp_.tv_nsec +                            \
                   rdtsc_stamp_.tv_sec * UINT64_C(1000000000);       \
    } while (0)
65 | |
/*
 * RDTSC_FINAL(cycles): wall-clock variant.
 * Reads CLOCK_REALTIME and stores the reading in `cycles` as a nanosecond
 * count (seconds * 1e9 + nanoseconds). The return value of clock_gettime is
 * not checked.
 */
#define RDTSC_FINAL(cycles)                                          \
    do {                                                             \
        struct timespec rdtsc_stamp_;                                \
        clock_gettime(CLOCK_REALTIME, &rdtsc_stamp_);                \
        (cycles) = rdtsc_stamp_.tv_nsec +                            \
                   rdtsc_stamp_.tv_sec * UINT64_C(1000000000);       \
    } while (0)
72 | |
73 | #else |
/*
 * RDTSC_START(cycles): clock() variant.
 * Stores the value returned by the standard clock() function in `cycles`;
 * the unit is therefore clock ticks (CLOCKS_PER_SEC per second).
 */
#define RDTSC_START(cycles)                                  \
    do {                                                     \
        const clock_t rdtsc_clock_ticks_ = clock();          \
        (cycles) = rdtsc_clock_ticks_;                       \
    } while (0)
78 | |
/*
 * RDTSC_FINAL(cycles): clock() variant.
 * Stores the value returned by the standard clock() function in `cycles`;
 * the unit is therefore clock ticks (CLOCKS_PER_SEC per second).
 */
#define RDTSC_FINAL(cycles)                                  \
    do {                                                     \
        const clock_t rdtsc_clock_ticks_ = clock();          \
        (cycles) = rdtsc_clock_ticks_;                       \
    } while (0)
83 | |
84 | #endif // #ifdef CLOCK_THREAD_CPUTIME_ID |
85 | |
86 | #else |
87 | |
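/**
 * Generic fallback: used when neither the x86-64 inline-assembly rdtsc path
 * nor the Linux/glibc clock_gettime path applies. Timings are taken with
 * the standard C clock() function instead.
 */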
91 | #include <time.h> |
92 | |
/*
 * RDTSC_START(cycles): generic clock() fallback.
 * Stores the value returned by the standard clock() function in `cycles`;
 * the unit is therefore clock ticks (CLOCKS_PER_SEC per second).
 */
#define RDTSC_START(cycles)                                  \
    do {                                                     \
        const clock_t rdtsc_clock_ticks_ = clock();          \
        (cycles) = rdtsc_clock_ticks_;                       \
    } while (0)
97 | |
/*
 * RDTSC_FINAL(cycles): generic clock() fallback.
 * Stores the value returned by the standard clock() function in `cycles`;
 * the unit is therefore clock ticks (CLOCKS_PER_SEC per second).
 */
#define RDTSC_FINAL(cycles)                                  \
    do {                                                     \
        const clock_t rdtsc_clock_ticks_ = clock();          \
        (cycles) = rdtsc_clock_ticks_;                       \
    } while (0)
102 | |
103 | #endif |
104 | |
/*
 * BEST_TIME(test, answer, repeat, size)
 *
 * Times the expression `test` and prints the best (smallest) measured cost
 * per operation, labelled "cycles per operation".
 *
 *   test   - expression to benchmark; evaluated once per iteration and also
 *            stringified to label the output line.
 *   answer - value `test` is expected to produce; if any evaluation differs,
 *            " [ERROR]" is appended to the report.
 *   repeat - number of timed evaluations. If it is not positive, nothing is
 *            measured and the initial maximum value is reported.
 *   size   - number of operations that one evaluation of `test` represents.
 *
 * Elapsed counts are in whatever unit RDTSC_START/RDTSC_FINAL report.
 * Every argument is parenthesized in the expansion so that operators inside
 * it cannot regroup (e.g. passing `x & 1` as `test`), and the locals carry a
 * `bt_` prefix and `_` suffix so they do not shadow names used by `test`.
 * The expansion calls printf and fflush, so <stdio.h> must be visible where
 * the macro is used.
 */
#define BEST_TIME(test, answer, repeat, size)                              \
    do {                                                                   \
        printf("%s: ", #test);                                             \
        fflush(NULL);                                                      \
        uint64_t bt_start_, bt_final_, bt_diff_;                           \
        uint64_t bt_min_diff_ = (uint64_t)-1;                              \
        int bt_wrong_answer_ = 0;                                          \
        for (int bt_iter_ = 0; bt_iter_ < (repeat); bt_iter_++) {          \
            CLOBBER_MEMORY;                                                \
            RDTSC_START(bt_start_);                                        \
            if ((test) != (answer)) {                                      \
                bt_wrong_answer_ = 1;                                      \
            }                                                              \
            RDTSC_FINAL(bt_final_);                                        \
            bt_diff_ = (bt_final_ - bt_start_);                            \
            if (bt_diff_ < bt_min_diff_) {                                 \
                bt_min_diff_ = bt_diff_;                                   \
            }                                                              \
        }                                                                  \
        uint64_t bt_ops_ = (uint64_t)(size);                               \
        float bt_cycle_per_op_ = (bt_min_diff_) / (float)bt_ops_;          \
        printf(" %.2f cycles per operation", bt_cycle_per_op_);            \
        if (bt_wrong_answer_) {                                            \
            printf(" [ERROR]");                                            \
        }                                                                  \
        printf("\n");                                                      \
        fflush(NULL);                                                      \
    } while (0)
133 | |
/*
 * BEST_TIME_PRE_ARRAY(base, test, pre, testvalues, nbrtestvalues)
 *
 * Like BEST_TIME, but reports an average instead of a minimum: for each of
 * the `nbrtestvalues` entries of `testvalues` it calls pre(base) (untimed),
 * then times test(base, testvalues[j]), and finally prints the summed
 * elapsed count divided by `nbrtestvalues` as "cycles per operation".
 *
 *   base          - first argument handed to both `pre` and `test`.
 *   test          - function (or function-like macro) being timed.
 *   pre           - function (or function-like macro) run on `base` before
 *                   each timed call.
 *   testvalues    - array of second arguments for `test`.
 *   nbrtestvalues - number of entries of `testvalues` to use.
 *
 * The running total is a uint64_t so that it cannot truncate or overflow the
 * way an int accumulator would once the total exceeds INT_MAX. `test` and
 * `pre` are deliberately left unparenthesized at their call sites so that
 * function-like macros can be passed. Elapsed counts are in whatever unit
 * RDTSC_START/RDTSC_FINAL report. The expansion calls printf and fflush, so
 * <stdio.h> must be visible where the macro is used.
 */
#define BEST_TIME_PRE_ARRAY(base, test, pre, testvalues, nbrtestvalues)    \
    do {                                                                   \
        printf("%s %s: ", #test, #pre);                                    \
        fflush(NULL);                                                      \
        uint64_t bta_start_, bta_final_, bta_diff_;                        \
        uint64_t bta_sum_ = 0;                                             \
        for (size_t bta_j_ = 0; bta_j_ < (nbrtestvalues); bta_j_++) {      \
            pre(base);                                                     \
            CLOBBER_MEMORY;                                                \
            RDTSC_START(bta_start_);                                       \
            test(base, (testvalues)[bta_j_]);                              \
            RDTSC_FINAL(bta_final_);                                       \
            bta_diff_ = (bta_final_ - bta_start_);                         \
            bta_sum_ += bta_diff_;                                         \
        }                                                                  \
        uint64_t bta_count_ = (uint64_t)(nbrtestvalues);                   \
        float bta_cycle_per_op_ = bta_sum_ / (float)bta_count_;            \
        printf(" %.2f cycles per operation", bta_cycle_per_op_);           \
        printf("\n");                                                      \
        fflush(NULL);                                                      \
    } while (0)
160 | |
161 | #endif /* BENCHMARKS_INCLUDE_BENCHMARK_H_ */ |
162 | |