test-quantize-perf.cpp source code [llama.cpp/tests/test-quantize-perf.cpp]

1	// Benchmark quantization specific functions on synthetic data
2
3	#include "ggml.h"
4	#include "ggml-cpu.h"
5
6	#undef NDEBUG
#include <algorithm>
#include <assert.h>
#include <exception>
#include <functional>
#include <math.h>
#include <memory>
#include <stdint.h>
#include <stdio.h>
#include <string>
#include <vector>
15
16	#if defined(_MSC_VER)
17	#pragma warning(disable: 4244 4267) // possible loss of data
18	#endif
19
// Benchmark tuning constants.
#define MAX_ALIGNMENT 64          // base alignment of the test buffers and upper bound for --alignment-offset
#define QK 32                     // number of values the printed cycle counts are normalised to
#define WARMUP 5                  // untimed calls made before measuring
#define ITERATIONS 10             // default number of timed calls
#define MAX_ITERATIONS 100000000  // upper bound accepted for --iterations

// Preset test sizes, in number of float values. The products are parenthesised
// so the macros expand correctly inside larger expressions (e.g. x / L1_SIZE).
#define L1_SIZE  (32*128)
#define L2_SIZE  (32*2048)
#define L3_SIZE  (32*20480)
#define MEM_SIZE (32*2048000)
30
31	struct quantize_perf_params {
32	std::vector<std::string> include_types;
33	std::vector<size_t> test_sizes;
34	size_t alignment_offset = `0`;
35	bool op_quantize_row_q_reference = false;
36	bool op_quantize_row_q = false;
37	bool op_dequantize_row_q = false;
38	bool op_quantize_row_q_dot = false;
39	bool op_vec_dot_q = false;
40	int64_t iterations = ITERATIONS;
41	};
42
#if defined(__x86_64__) || defined(__i386__)

#include <x86intrin.h>
// Returns the current CPU cycle counter as read by the rdtsc/rdtscp intrinsics.
inline int64_t cpu_cycles() {
// Rough way to detect new-ish CPUs
#ifdef __POPCNT__
    unsigned int dummy;
    return __rdtscp(&dummy);
#else
    return __rdtsc();
#endif
}

#else

// No cycle counter is read on other architectures: always report 0.
// Defined as a function (not a macro) so both variants share the same return type.
inline int64_t cpu_cycles() {
    return 0;
}

#endif
61
62
// Generate synthetic data: dst[i] = 0.1 + 2*cos(i + offset) for i in [0, n).
// dst must have room for at least n floats; nothing is written when n == 0.
static void generate_data(float offset, size_t n, float * dst) {
    for (size_t i = 0; i < n; i++) {
        dst[i] = 0.1 + 2*cosf(i + offset);
    }
}
69
// Converts `bytes` processed in `usecs` microseconds into GiB per second
// (1 GiB = 1024*1024*1024 bytes). The arithmetic is done in single precision.
static float gigabytes_per_second(size_t bytes, int64_t usecs) {
    return bytes / (float) usecs * 1000000 / (1024*1024*1024);
}
73
74	static void * align_with_offset(void * ptr, int offset) {
75	size_t dummy_size = MAX_ALIGNMENT * `4`;
76	return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr&: ptr, space&: dummy_size) + offset;
77	}
78
79	static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
80	int64_t min_time_us = INT64_MAX;
81	int64_t total_time_us = `0`;
82	int64_t min_time_cycles = INT64_MAX;
83	int64_t total_time_cycles = `0`;
84
85	for (int i = `0`; i < WARMUP; i++) {
86	func ();
87	}
88
89	for (int i = `0`; i < iterations; i++) {
90	const int64_t start_time = ggml_time_us();
91	const int64_t start_cycles = cpu_cycles();
92
93	func ();
94
95	const int64_t end_cycles = cpu_cycles();
96	const int64_t end_time = ggml_time_us();
97
98	total_time_cycles += end_cycles - start_cycles;
99	min_time_cycles = std::min(a: min_time_cycles, b: end_cycles - start_cycles);
100	total_time_us += end_time - start_time;
101	min_time_us = std::min(a: min_time_us, b: end_time - start_time);
102	}
103
104	printf(format: " min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size);
105	printf(format: " avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations));
106	printf(format: " float32 throughput : %9.2f GB/s\n", gigabytes_per_second(bytes: `4` * size * iterations, usecs: total_time_us));
107	printf(format: " quantized throughput : %9.2f GB/s\n", gigabytes_per_second(bytes: q_size * iterations, usecs: total_time_us));
108	}
109
110	static void usage(char * argv[]) {
111	printf(format: "Benchmark quantization specific functions on synthetic data\n");
112	printf(format: "\n");
113	printf(format: "usage: %s [options]\n", argv[`0`]);
114	printf(format: "\n");
115	printf(format: "options: (default)\n");
116	printf(format: " -h, --help show this help message and exit\n");
117	printf(format: " --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
118	printf(format: " -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
119	printf(format: " -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
120	printf(format: " --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
121	printf(format: " quantize_row_q_dot, vec_dot_q (all)\n");
122	printf(format: " --type TYPE set test type as");
123	for (int i = `0`; i < GGML_TYPE_COUNT; i++) {
124	ggml_type type = (ggml_type) i;
125	const auto * qfns = ggml_get_type_traits(type);
126	const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
127	if (ggml_type_name(type) != NULL) {
128	if (qfns_cpu->from_float && qfns->to_float) {
129	printf(format: " %s", ggml_type_name(type));
130	}
131	}
132	}
133	printf(format: " (all)\n");
134	printf(format: " --alignment-offset OFFSET\n");
135	printf(format: " set alignment offset as OFFSET (0)\n");
136	printf(format: " -i NUM, --iterations NUM\n");
137	printf(format: " set test iteration number (%d)\n", ITERATIONS);
138	}
139
140	int main(int argc, char * argv[]) {
141	quantize_perf_params params {};
142
143	// read command line
144
145	bool invalid_param = false;
146	std::string arg;
147	for (int i = `1`; i < argc; i++) {
148	arg = argv[i];
149
150	if (arg == "--size") {
151	if (++i >= argc) {
152	invalid_param = true;
153	break;
154	}
155	size_t size = std::stoi(str: argv[i]);
156	if (size % `32` != `0`) {
157	fprintf(stderr, format: "error: size %zu not divisible by 32\n", size);
158	invalid_param = true;
159	break;
160	}
161	params.test_sizes.push_back(x: size);
162	} else if (arg == "-3") {
163	// quick select sizes that probably fit in CPU caches
164	params.test_sizes.push_back(L1_SIZE);
165	params.test_sizes.push_back(L2_SIZE);
166	params.test_sizes.push_back(L3_SIZE);
167	} else if (arg == "-4") {
168	// quick select cache sizes + memory
169	params.test_sizes.push_back(L1_SIZE);
170	params.test_sizes.push_back(L2_SIZE);
171	params.test_sizes.push_back(L3_SIZE);
172	params.test_sizes.push_back(MEM_SIZE);
173	} else if (arg == "--op") {
174	if (++i >= argc) {
175	invalid_param = true;
176	break;
177	}
178	std::string op {argv[i]};
179	if (op == "quantize_row_q_reference") {
180	params.op_quantize_row_q_reference = true;
181	} else if (op == "quantize_row_q") {
182	params.op_quantize_row_q = true;
183	} else if (op == "dequantize_row_q") {
184	params.op_dequantize_row_q = true;
185	} else if (op == "quantize_row_q_dot") {
186	params.op_quantize_row_q_dot = true;
187	} else if (op == "vec_dot_q") {
188	params.op_vec_dot_q = true;
189	} else {
190	invalid_param = true;
191	break;
192	}
193	} else if (arg == "--type") {
194	if (++i >= argc) {
195	invalid_param = true;
196	break;
197	}
198	params.include_types.push_back(x: argv[i]);
199	} else if (arg == "--alignment-offset") {
200	if (++i >= argc) {
201	invalid_param = true;
202	break;
203	}
204	int alignment = std::stoi(str: argv[i]);
205	if (alignment < `0` \|\| alignment > MAX_ALIGNMENT) {
206	fprintf(stderr, format: "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
207	invalid_param = true;
208	break;
209	}
210	params.alignment_offset = alignment;
211	} else if ((arg == "-i") \|\| (arg == "--iterations")) {
212	if (++i >= argc) {
213	invalid_param = true;
214	break;
215	}
216	int number = std::stoi(str: argv[i]);
217	if (number < `0` \|\| number > MAX_ITERATIONS) {
218	fprintf(stderr, format: "error: iterations must be less than %d\n", MAX_ITERATIONS);
219	invalid_param = true;
220	break;
221	}
222	params.iterations = number;
223	} else if ((arg == "-h") \|\| (arg == "--help")) {
224	usage(argv);
225	return `1`;
226	} else {
227	fprintf(stderr, format: "error: unknown argument: %s\n", arg.c_str());
228	return `1`;
229	}
230	}
231	if (invalid_param) {
232	fprintf(stderr, format: "error: invalid parameter for argument: %s\n", arg.c_str());
233	return `1`;
234	}
235
236	if (params.test_sizes.empty()) {
237	params.test_sizes.push_back(L1_SIZE);
238	}
239	if (!(params.op_quantize_row_q_reference \|\| params.op_quantize_row_q \|\| params.op_dequantize_row_q \|\| params.op_quantize_row_q_dot \|\| params.op_vec_dot_q)) {
240	params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
241	}
242
243	std::sort(first: params.test_sizes.begin(), last: params.test_sizes.end());
244	size_t largest = params.test_sizes.back();
245
246	std::vector<uint8_t> test_data1_v(largest`4` + MAX_ALIGNMENT`2`);
247	std::vector<uint8_t> test_data2_v(largest`4` + MAX_ALIGNMENT`2`);
248	std::vector<uint8_t> test_q1_v (largest`4` + MAX_ALIGNMENT`2`);
249	std::vector<uint8_t> test_q2_v (largest`4` + MAX_ALIGNMENT`2`);
250	std::vector<uint8_t> test_out_v (largest`4` + MAX_ALIGNMENT`2`);
251
252	float * test_data1 = (float *) align_with_offset(ptr: test_data1_v.data(), offset: params.alignment_offset);
253	float * test_data2 = (float *) align_with_offset(ptr: test_data2_v.data(), offset: params.alignment_offset);
254	float * test_q1 = (float *) align_with_offset(ptr: test_q1_v.data(), offset: params.alignment_offset);
255	float * test_q2 = (float *) align_with_offset(ptr: test_q2_v.data(), offset: params.alignment_offset);
256	float * test_out = (float *) align_with_offset(ptr: test_out_v.data(), offset: params.alignment_offset);
257
258	generate_data(offset: `0`, n: largest, dst: test_data1);
259	generate_data(offset: `1`, n: largest, dst: test_data2);
260
261	int64_t iterations = params.iterations;
262
263	ggml_cpu_init();
264
265	for (int i = `0`; i < GGML_TYPE_COUNT; i++) {
266	ggml_type type = (ggml_type) i;
267	const auto * qfns = ggml_get_type_traits(type);
268	const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
269	if (!params.include_types.empty() && ggml_type_name(type) && std::find(first: params.include_types.begin(), last: params.include_types.end(), val: ggml_type_name(type)) == params.include_types.end()) {
270	continue;
271	}
272
273	if (qfns_cpu->from_float && qfns->to_float) {
274	printf(format: "%s\n", ggml_type_name(type));
275
276	ggml_quantize_init(type);
277
278	if (params.op_quantize_row_q_reference) {
279	printf(format: " quantize_row_q_reference\n");
280	for (size_t size : params.test_sizes) {
281	printf(format: " %zu values (%.2f MB)\n", size, `4`size/(float)(`1024``1024`));
282	auto quantize_fn = [&](void) -> float {
283	qfns->from_float_ref(test_data1, test_q1, size);
284	return test_q1[`0`];
285	};
286	size_t quantized_size = ggml_row_size(type, ne: size);
287	benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
288	}
289	printf(format: "\n");
290	}
291
292	if (params.op_quantize_row_q) {
293	printf(format: " quantize_row_q\n");
294	for (size_t size : params.test_sizes) {
295	printf(format: " %zu values (%.2f MB)\n", size, `4`size/(float)(`1024``1024`));
296	auto quantize_fn = [&](void) -> float {
297	qfns_cpu->from_float(test_data1, test_q1, size);
298	return test_q1[`0`];
299	};
300	size_t quantized_size = ggml_row_size(type, ne: size);
301	benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
302	}
303	printf(format: "\n");
304	}
305
306	if (params.op_dequantize_row_q) {
307	printf(format: " dequantize_row_q\n");
308	qfns_cpu->from_float(test_data1, test_q1, largest);
309	for (size_t size : params.test_sizes) {
310	printf(format: " %zu values (%.2f MB)\n", size, `4`size/(float)(`1024``1024`));
311	auto quantize_fn = [&](void) -> float {
312	qfns->to_float(test_q1, test_out, size);
313	return test_out[`0`];
314	};
315	size_t quantized_size = ggml_row_size(type, ne: size);
316	benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
317	}
318	printf(format: "\n");
319	}
320
321	if (params.op_quantize_row_q_dot) {
322	printf(format: " quantize_row_q_dot\n");
323	for (size_t size : params.test_sizes) {
324	printf(format: " %zu values (%.2f MB)\n", size, `4`size/(float)(`1024``1024`));
325	auto quantize_fn = [&](void) -> float {
326	const auto * vdot = ggml_get_type_traits_cpu(type: qfns_cpu->vec_dot_type);
327	vdot->from_float(test_data1, test_q1, size);
328	return test_q1[`0`];
329	};
330	size_t quantized_size = ggml_row_size(type, ne: size);
331	benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
332	}
333	printf(format: "\n");
334	}
335
336	if (params.op_vec_dot_q) {
337	printf(format: " vec_dot_q\n");
338	qfns_cpu->from_float(test_data1, test_q1, largest);
339	qfns_cpu->from_float(test_data2, test_q2, largest);
340	for (size_t size : params.test_sizes) {
341	printf(format: " %zu values (%.2f MB)\n", size, `4`size/(float)(`1024``1024`));
342	auto quantize_fn = [&](void) -> float {
343	float result;
344	qfns_cpu->vec_dot(size, &result, `0`, test_q1, `0`, test_q2, `0`, `1`);
345	return result;
346	};
347	size_t quantized_size = ggml_row_size(type, ne: size);
348	benchmark_function(size, q_size: quantized_size, iterations, func: quantize_fn);
349	}
350	printf(format: "\n");
351	}
352	}
353	}
354
355	return `0`;
356	}
357

Browse the source code of llama.cpp/tests/test-quantize-perf.cpp