#include "ggml.h"
#include "ggml-cpu.h"
#include "llama.h"
#include "common.h"

#include "../src/llama-model.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <numeric>
#include <regex>
#include <string>
#include <vector>
#include <thread>
#include <mutex>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

struct quantize_stats_params {
    std::string model = DEFAULT_MODEL_PATH;
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
    bool reference = false;
    std::vector<std::string> include_layers;
    std::vector<std::string> exclude_layers;
    std::vector<enum ggml_type> include_types;
};

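// histogram of absolute quantization error: 150 buckets of width 0.03/150 = 0.0002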
constexpr size_t HISTOGRAM_BUCKETS = 150;
constexpr double HISTOGRAM_RANGE = 0.03;

struct error_stats {
    size_t num_samples;
    double total_error;
    double max_error;
    uint64_t error_histogram[HISTOGRAM_BUCKETS];
};

static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
    quantize_stats_params params;
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "  -r, --reference\n");
    fprintf(stderr, "                        use reference implementation (default: false)\n");
    fprintf(stderr, "  -v, --verbose\n");
    fprintf(stderr, "                        verbose output (default: false)\n");
    fprintf(stderr, "  -p, --per-layer-stats\n");
    fprintf(stderr, "                        print stats per layer (default: false)\n");
    fprintf(stderr, "  --histogram\n");
    fprintf(stderr, "                        print error histogram (default: false)\n");
    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
    fprintf(stderr, "                        only test layers matching pattern\n");
    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
    fprintf(stderr, "                        exclude layers matching pattern\n");
    fprintf(stderr, "  -t TYPE, --type TYPE\n");
    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
    fprintf(stderr, "  -n N, --num-threads N\n");
    fprintf(stderr, "                        number of threads to use (default: hardware concurrency)\n");
    fprintf(stderr, "\n");
}

// Check if a layer is included/excluded by command line
static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
    for (const auto& excluded : params.exclude_layers) {
        if (std::regex_search(layer, std::regex(excluded))) {
            return false;
        }
    }
    for (const auto& included : params.include_layers) {
        if (std::regex_search(layer, std::regex(included))) {
            return true;
        }
    }
    return params.include_layers.empty();
}

// Update error statistics given vectors with the before/after result of quantization
static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
    for (int64_t i = 0; i < nelements; i++) {
        double diff = input[i] - output[i];
        stats.total_error += diff * diff;
        stats.max_error = fmax(fabs(diff), stats.max_error);
        stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
    }
    stats.num_samples += nelements;
}

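// Merge the error statistics accumulated in `from` (e.g. by a worker thread or for a single layer) into `into`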
static void combine_error_stats(error_stats & into, const error_stats & from) {
    into.num_samples += from.num_samples;
    into.total_error += from.total_error;
    if (from.max_error > into.max_error) into.max_error = from.max_error;
    for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
}

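// Estimate an upper bound for the given error quantile from the histogram:
// returns the upper edge of the bucket where the cumulative count first reaches it, or INFINITY if it never does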
static double find_quantile(const error_stats & stats, double quantile) {
    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);

    double accum = 0;
    for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
        accum += stats.error_histogram[i];
        if (accum >= sum*quantile) {
            return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
        }
    }
    return INFINITY;
}

static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
    double median = find_quantile(stats, .5);
    double pct95 = find_quantile(stats, .95);
    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
    if (print_histogram) {
        printf("Error distribution:\n");
        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
            if (i == HISTOGRAM_BUCKETS - 1) upper = INFINITY;
            printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
        }
    }
}

// copied from ggml.h - verify that we can access this as a flat array
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return
        tensor->nb[0] == ggml_type_size(tensor->type) &&
        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}

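// Quantize and dequantize one contiguous chunk of a layer and accumulate the round-trip error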
static void test_roundtrip_on_chunk(
    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
    float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
) {
    if (layer->type == GGML_TYPE_F16) {
        for (int i = 0; i < chunk_size; i++) {
            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
        }
    } else {
        input_scratch = ggml_get_data_f32(layer) + offset;
    }

    if (use_reference) {
        qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
    } else {
        qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
    }
    qfns.to_float(quantized_scratch, output_scratch, chunk_size);

    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
}


// Run quantization function for a single layer and update error stats
static void test_roundtrip_on_layer(
    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
    const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
    std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
) {
    assert(tensor_is_contiguous(layer));
    error_stats layer_error {};
    uint64_t nelements = ggml_nelements(layer);

    float* input_scratch_ptr = nullptr;
    if (layer->type == GGML_TYPE_F16) {
        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
        input_scratch_ptr = input_scratch.data();
    }
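    // assume 4 bytes per element is enough for the quantized data (no quantized type is larger than f32)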
    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
    if (output_scratch.size() < nelements) output_scratch.resize(nelements);

    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
    int chunk_size = 32*512;
    int num_chunks = (nelements + chunk_size - 1)/chunk_size;

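    // small tensors are processed in a single call; larger ones are split into chunks that worker threads claim via a shared counter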
    if (num_chunks < 2 || max_thread < 2) {
        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
                output_scratch.data(), print_layer_stats ? layer_error : total_error);
    } else {
        auto & stats = print_layer_stats ? layer_error : total_error;
        std::mutex mutex;
        uint64_t counter = 0;
        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
                &quantized_scratch, &output_scratch, chunk_size] () {
            error_stats local_stats {};
            while (true) {
                std::unique_lock<std::mutex> lock(mutex);
                uint64_t offset = counter; counter += chunk_size;
                if (offset >= nelements) {
                    combine_error_stats(stats, local_stats);
                    break;
                }
                lock.unlock();
                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
                test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
            }
        };
        int nthread = std::min(num_chunks, max_thread);
        std::vector<std::thread> workers(nthread-1);
        for (auto& w : workers) w = std::thread(compute);
        compute();
        for (auto& w : workers) w.join();
    }

    if (print_layer_stats) {
        print_error_stats(name, layer_error, false);
        combine_error_stats(total_error, layer_error);
    }
}

int main(int argc, char ** argv) {
    ggml_time_init();

    quantize_stats_params params;

    // read command line

    int max_thread = 0;
    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

        if (arg == "-h" || arg == "--help") {
            quantize_stats_print_usage(argc, argv);
            exit(0);
        } else if (arg == "-r" || arg == "--reference") {
            params.reference = true;
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else if (arg == "-p" || arg == "--per-layer-stats") {
            params.per_layer_stats = true;
        } else if (arg == "--histogram") {
            params.print_histogram = true;
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.model = argv[i];
        } else if (arg == "-l" || arg == "--include-layer") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.include_layers.emplace_back(argv[i]);
        } else if (arg == "-L" || arg == "--exclude-layer") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.exclude_layers.emplace_back(argv[i]);
        } else if (arg == "-t" || arg == "--type") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            int j;
            for (j = 0; j < GGML_TYPE_COUNT; ++j) {
                const auto * name = ggml_type_name((ggml_type) j);
                if (name && strcmp(argv[i], name) == 0) break;
            }
            if (j < GGML_TYPE_COUNT) {
                params.include_types.push_back((ggml_type) j);
            } else {
                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                invalid_param = true;
            }
        } else if (arg == "-n" || arg == "--num-threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            max_thread = atoi(argv[i]);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            quantize_stats_print_usage(argc, argv);
            return 1;
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        quantize_stats_print_usage(argc, argv);
        return 1;
    }

    print_build_info();

    // load the model
    fprintf(stderr, "Loading model\n");

    const int64_t t_main_start_us = ggml_time_us();
    llama_model * model;
    llama_context * ctx;

    {
        auto mparams = llama_model_default_params();
        mparams.use_mlock = false;

        model = llama_model_load_from_file(params.model.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
            return 1;
        }

        auto cparams = llama_context_default_params();
        cparams.n_ctx = 256;

        ctx = llama_init_from_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
            llama_model_free(model);
            return 1;
        }
    }

    const auto & tensors = llama_internal_get_tensor_map(model);

    // check layer tensors
    int included_layers = 0;
    int64_t max_nelements = 0;
    bool is_f16 = false;
    for (const auto & kv_tensor : tensors) {
        if (!layer_included(params, kv_tensor.first)) {
            continue;
        }
        if (params.verbose) {
            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
        }
        if (kv_tensor.second->type == GGML_TYPE_F16) {
            is_f16 = true;
        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
            llama_free(ctx);
            llama_model_free(model);
            return 1;
        }
        included_layers++;
        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
    }

    if (is_f16) {
        printf("note: source model is f16\n");
    }
    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
    // allocate scratch space
    std::vector<float> input_scratch;
    std::vector<char> quantized_scratch;
    std::vector<float> output_scratch;

    // loop through quantization types
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        const ggml_type type = (ggml_type) i;
        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
            continue;
        }
        const auto * qfns = ggml_get_type_traits(type);
        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
        if (qfns_cpu->from_float && qfns->to_float) {
            if (params.verbose) {
                printf("testing %s ...\n", ggml_type_name(type));
            }

            ggml_quantize_init(type);

            error_stats global_stats {};

            for (const auto & kv_tensor : tensors) {
                if (!layer_included(params, kv_tensor.first)) {
                    continue;
                }
                if (params.verbose) {
                    printf("  %s ...\n", kv_tensor.first.c_str());
                }
                std::string layer_name { ggml_type_name(type) };
                layer_name += "::" + kv_tensor.first;
                test_roundtrip_on_layer(
                        layer_name,
                        params.per_layer_stats,
                        *qfns, *qfns_cpu,
                        params.reference,
                        kv_tensor.second,
                        input_scratch,
                        quantized_scratch,
                        output_scratch,
                        global_stats,
                        max_thread
                );
            }

            print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
        }
    }


    llama_free(ctx);
    llama_model_free(model);
    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
    }

    return 0;
}