1// Benchmark quantization specific functions on synthetic data
2
#include "ggml.h"
#include "ggml-cpu.h"

#undef NDEBUG
#include <algorithm>
#include <assert.h>
#include <functional>
#include <math.h>
#include <memory>
#include <stdexcept>
#include <stdint.h>
#include <stdio.h>
#include <string>
#include <vector>
15
16#if defined(_MSC_VER)
17#pragma warning(disable: 4244 4267) // possible loss of data
18#endif
19
// Alignment in bytes requested for the benchmark buffers; also the largest
// value accepted for --alignment-offset.
#define MAX_ALIGNMENT 64
// Number of values the reported cycle counts are normalized to ("cycles/32 vals").
#define QK 32
// Untimed calls made before the timed iterations start.
#define WARMUP 5
// Default number of timed calls; overridden by -i / --iterations.
#define ITERATIONS 10
// Largest value accepted for -i / --iterations.
#define MAX_ITERATIONS 100000000

// Predefined test sizes, in number of float values (selected by -3 / -4, L1_SIZE is the default).
// Each product is parenthesized so the macro expands as a single value inside
// larger expressions (e.g. `L2_SIZE / L1_SIZE`).
#define L1_SIZE  (32*128)
#define L2_SIZE  (32*2048)
#define L3_SIZE  (32*20480)
#define MEM_SIZE (32*2048000)
30
31struct quantize_perf_params {
32 std::vector<std::string> include_types;
33 std::vector<size_t> test_sizes;
34 size_t alignment_offset = 0;
35 bool op_quantize_row_q_reference = false;
36 bool op_quantize_row_q = false;
37 bool op_dequantize_row_q = false;
38 bool op_quantize_row_q_dot = false;
39 bool op_vec_dot_q = false;
40 int64_t iterations = ITERATIONS;
41};
42
// cpu_cycles(): source of the values behind the "cycles" lines of the report.
#if defined(__x86_64__) || defined(__i386__)

#include <x86intrin.h>
// Returns the current reading of the x86 time-stamp counter.
inline int64_t cpu_cycles() {
// Rough way to detect new-ish CPUs
#ifdef __POPCNT__
    unsigned int aux;  // written by __rdtscp, not used afterwards
    const auto ticks = __rdtscp(&aux);
    return ticks;
#else
    const auto ticks = __rdtsc();
    return ticks;
#endif
}

#else

// Other architectures: no counter is read, every "reading" is 0.
#define cpu_cycles() 0

#endif
61
62
// Fills dst[0..n) with synthetic data: element i becomes 0.1 + 2*cos(i + offset).
// `offset` shifts the phase so that two buffers can hold different sequences.
static void generate_data(float offset, size_t n, float * dst) {
    float * const stop = dst + n;
    size_t idx = 0;
    for (float * out = dst; out != stop; ++out, ++idx) {
        *out = 0.1 + 2*cosf(idx + offset);
    }
}
69
// Converts `bytes` processed in `usecs` microseconds into GB/s,
// where one GB is 1024*1024*1024 bytes.
static float gigabytes_per_second(size_t bytes, int64_t usecs) {
    const float bytes_per_usec = bytes / (float) usecs;
    const float bytes_per_sec  = bytes_per_usec * 1000000;
    return bytes_per_sec / (1024*1024*1024);
}
73
74static void * align_with_offset(void * ptr, int offset) {
75 size_t dummy_size = MAX_ALIGNMENT * 4;
76 return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr&: ptr, space&: dummy_size) + offset;
77}
78
79static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
80 int64_t min_time_us = INT64_MAX;
81 int64_t total_time_us = 0;
82 int64_t min_time_cycles = INT64_MAX;
83 int64_t total_time_cycles = 0;
84
85 for (int i = 0; i < WARMUP; i++) {
86 func();
87 }
88
89 for (int i = 0; i < iterations; i++) {
90 const int64_t start_time = ggml_time_us();
91 const int64_t start_cycles = cpu_cycles();
92
93 func();
94
95 const int64_t end_cycles = cpu_cycles();
96 const int64_t end_time = ggml_time_us();
97
98 total_time_cycles += end_cycles - start_cycles;
99 min_time_cycles = std::min(a: min_time_cycles, b: end_cycles - start_cycles);
100 total_time_us += end_time - start_time;
101 min_time_us = std::min(a: min_time_us, b: end_time - start_time);
102 }
103
104 printf(format: " min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size);
105 printf(format: " avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations));
106 printf(format: " float32 throughput : %9.2f GB/s\n", gigabytes_per_second(bytes: 4 * size * iterations, usecs: total_time_us));
107 printf(format: " quantized throughput : %9.2f GB/s\n", gigabytes_per_second(bytes: q_size * iterations, usecs: total_time_us));
108}
109
110static void usage(char * argv[]) {
111 printf(format: "Benchmark quantization specific functions on synthetic data\n");
112 printf(format: "\n");
113 printf(format: "usage: %s [options]\n", argv[0]);
114 printf(format: "\n");
115 printf(format: "options: (default)\n");
116 printf(format: " -h, --help show this help message and exit\n");
117 printf(format: " --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
118 printf(format: " -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
119 printf(format: " -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
120 printf(format: " --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
121 printf(format: " quantize_row_q_dot, vec_dot_q (all)\n");
122 printf(format: " --type TYPE set test type as");
123 for (int i = 0; i < GGML_TYPE_COUNT; i++) {
124 ggml_type type = (ggml_type) i;
125 const auto * qfns = ggml_get_type_traits(type);
126 const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
127 if (ggml_type_name(type) != NULL) {
128 if (qfns_cpu->from_float && qfns->to_float) {
129 printf(format: " %s", ggml_type_name(type));
130 }
131 }
132 }
133 printf(format: " (all)\n");
134 printf(format: " --alignment-offset OFFSET\n");
135 printf(format: " set alignment offset as OFFSET (0)\n");
136 printf(format: " -i NUM, --iterations NUM\n");
137 printf(format: " set test iteration number (%d)\n", ITERATIONS);
138}
139
140int main(int argc, char * argv[]) {
141 quantize_perf_params params {};
142
143 // read command line
144
145 bool invalid_param = false;
146 std::string arg;
147 for (int i = 1; i < argc; i++) {
148 arg = argv[i];
149
150 if (arg == "--size") {
151 if (++i >= argc) {
152 invalid_param = true;
153 break;
154 }
155 size_t size = std::stoi(str: argv[i]);
156 if (size % 32 != 0) {
157 fprintf(stderr, format: "error: size %zu not divisible by 32\n", size);
158 invalid_param = true;
159 break;
160 }
161 params.test_sizes.push_back(x: size);
162 } else if (arg == "-3") {
163 // quick select sizes that probably fit in CPU caches
164 params.test_sizes.push_back(L1_SIZE);
165 params.test_sizes.push_back(L2_SIZE);
166 params.test_sizes.push_back(L3_SIZE);
167 } else if (arg == "-4") {
168 // quick select cache sizes + memory
169 params.test_sizes.push_back(L1_SIZE);
170 params.test_sizes.push_back(L2_SIZE);
171 params.test_sizes.push_back(L3_SIZE);
172 params.test_sizes.push_back(MEM_SIZE);
173 } else if (arg == "--op") {
174 if (++i >= argc) {
175 invalid_param = true;
176 break;
177 }
178 std::string op {argv[i]};
179 if (op == "quantize_row_q_reference") {
180 params.op_quantize_row_q_reference = true;
181 } else if (op == "quantize_row_q") {
182 params.op_quantize_row_q = true;
183 } else if (op == "dequantize_row_q") {
184 params.op_dequantize_row_q = true;
185 } else if (op == "quantize_row_q_dot") {
186 params.op_quantize_row_q_dot = true;
187 } else if (op == "vec_dot_q") {
188 params.op_vec_dot_q = true;
189 } else {
190 invalid_param = true;
191 break;
192 }
193 } else if (arg == "--type") {
194 if (++i >= argc) {
195 invalid_param = true;
196 break;
197 }
198 params.include_types.push_back(x: argv[i]);
199 } else if (arg == "--alignment-offset") {
200 if (++i >= argc) {
201 invalid_param = true;
202 break;
203 }
204 int alignment = std::stoi(str: argv[i]);
205 if (alignment < 0 || alignment > MAX_ALIGNMENT) {
206 fprintf(stderr, format: "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
207 invalid_param = true;
208 break;
209 }
210 params.alignment_offset = alignment;
211 } else if ((arg == "-i") || (arg == "--iterations")) {
212 if (++i >= argc) {
213 invalid_param = true;
214 break;
215 }
216 int number = std::stoi(str: argv[i]);
217 if (number < 0 || number > MAX_ITERATIONS) {
218 fprintf(stderr, format: "error: iterations must be less than %d\n", MAX_ITERATIONS);
219 invalid_param = true;
220 break;
221 }
222 params.iterations = number;
223 } else if ((arg == "-h") || (arg == "--help")) {
224 usage(argv);
225 return 1;
226 } else {
227 fprintf(stderr, format: "error: unknown argument: %s\n", arg.c_str());
228 return 1;
229 }
230 }
231 if (invalid_param) {
232 fprintf(stderr, format: "error: invalid parameter for argument: %s\n", arg.c_str());
233 return 1;
234 }
235
236 if (params.test_sizes.empty()) {
237 params.test_sizes.push_back(L1_SIZE);
238 }
239 if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
240 params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
241 }
242
243 std::sort(first: params.test_sizes.begin(), last: params.test_sizes.end());
244 size_t largest = params.test_sizes.back();
245
246 std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
247 std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
248 std::vector<uint8_t> test_q1_v (largest*4 + MAX_ALIGNMENT*2);
249 std::vector<uint8_t> test_q2_v (largest*4 + MAX_ALIGNMENT*2);
250 std::vector<uint8_t> test_out_v (largest*4 + MAX_ALIGNMENT*2);
251
252 float * test_data1 = (float *) align_with_offset(ptr: test_data1_v.data(), offset: params.alignment_offset);
253 float * test_data2 = (float *) align_with_offset(ptr: test_data2_v.data(), offset: params.alignment_offset);
254 float * test_q1 = (float *) align_with_offset(ptr: test_q1_v.data(), offset: params.alignment_offset);
255 float * test_q2 = (float *) align_with_offset(ptr: test_q2_v.data(), offset: params.alignment_offset);
256 float * test_out = (float *) align_with_offset(ptr: test_out_v.data(), offset: params.alignment_offset);
257
258 generate_data(offset: 0, n: largest, dst: test_data1);
259 generate_data(offset: 1, n: largest, dst: test_data2);
260
261 int64_t iterations = params.iterations;
262
263 ggml_cpu_init();
264
265 for (int i = 0; i < GGML_TYPE_COUNT; i++) {
266 ggml_type type = (ggml_type) i;
267 const auto * qfns = ggml_get_type_traits(type);
268 const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
269 if (!params.include_types.empty() && ggml_type_name(type) && std::find(first: params.include_types.begin(), last: params.include_types.end(), val: ggml_type_name(type)) == params.include_types.end()) {
270 continue;
271 }
272
273 if (qfns_cpu->from_float && qfns->to_float) {
274 printf(format: "%s\n", ggml_type_name(type));
275
276 ggml_quantize_init(type);
277
278 if (params.op_quantize_row_q_reference) {
279 printf(format: " quantize_row_q_reference\n");
280 for (size_t size : params.test_sizes) {
281 printf(format: " %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
282 auto quantize_fn = [&](void) -> float {
283 qfns->from_float_ref(test_data1, test_q1, size);
284 return test_q1[0];
285 };
286 size_t quantized_size = ggml_row_size(type, ne: size);
287 benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
288 }
289 printf(format: "\n");
290 }
291
292 if (params.op_quantize_row_q) {
293 printf(format: " quantize_row_q\n");
294 for (size_t size : params.test_sizes) {
295 printf(format: " %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
296 auto quantize_fn = [&](void) -> float {
297 qfns_cpu->from_float(test_data1, test_q1, size);
298 return test_q1[0];
299 };
300 size_t quantized_size = ggml_row_size(type, ne: size);
301 benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
302 }
303 printf(format: "\n");
304 }
305
306 if (params.op_dequantize_row_q) {
307 printf(format: " dequantize_row_q\n");
308 qfns_cpu->from_float(test_data1, test_q1, largest);
309 for (size_t size : params.test_sizes) {
310 printf(format: " %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
311 auto quantize_fn = [&](void) -> float {
312 qfns->to_float(test_q1, test_out, size);
313 return test_out[0];
314 };
315 size_t quantized_size = ggml_row_size(type, ne: size);
316 benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
317 }
318 printf(format: "\n");
319 }
320
321 if (params.op_quantize_row_q_dot) {
322 printf(format: " quantize_row_q_dot\n");
323 for (size_t size : params.test_sizes) {
324 printf(format: " %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
325 auto quantize_fn = [&](void) -> float {
326 const auto * vdot = ggml_get_type_traits_cpu(type: qfns_cpu->vec_dot_type);
327 vdot->from_float(test_data1, test_q1, size);
328 return test_q1[0];
329 };
330 size_t quantized_size = ggml_row_size(type, ne: size);
331 benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
332 }
333 printf(format: "\n");
334 }
335
336 if (params.op_vec_dot_q) {
337 printf(format: " vec_dot_q\n");
338 qfns_cpu->from_float(test_data1, test_q1, largest);
339 qfns_cpu->from_float(test_data2, test_q2, largest);
340 for (size_t size : params.test_sizes) {
341 printf(format: " %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
342 auto quantize_fn = [&](void) -> float {
343 float result;
344 qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
345 return result;
346 };
347 size_t quantized_size = ggml_row_size(type, ne: size);
348 benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
349 }
350 printf(format: "\n");
351 }
352 }
353 }
354
355 return 0;
356}
357