imatrix.cpp source code [llama.cpp/tools/imatrix/imatrix.cpp]

1	#include "arg.h"
2	#include "common.h"
3	#include "log.h"
4	#include "llama.h"
5	#include "gguf.h"
6
7	#include <algorithm>
8	#include <chrono>
9	#include <cmath>
10	#include <cstdio>
11	#include <cstring>
12	#include <ctime>
13	#include <thread>
14	#include <mutex>
15	#include <vector>
16	#include <fstream>
17	#include <unordered_map>
18	#include <map>
19	#include <regex>
20	#include <numeric>
21
22	#if defined(_MSC_VER)
23	#pragma warning(disable: 4244 4267) // possible loss of data
24	#endif
25
26	static void print_usage(int, char ** argv) {
27	LOG("\nexample usage:\n");
28	LOG("\n %s \\\n"
29	" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n"
30	" [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
31	" [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
32	" [--show-statistics] [...]\n" , argv[`0`]);
33	LOG("\n");
34	}
35
// GGUF metadata keys written by save_imatrix() and read back by load_imatrix().
static constexpr const char * LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";    // array of dataset paths
static constexpr const char * LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count"; // number of chunks processed
static constexpr const char * LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";  // n_ctx / n_parallel at save time
39
// Accumulators kept per weight tensor while collecting the importance matrix.
struct Stats {
    // running sums of squared activations; one row of values per entry in `counts`
    std::vector<float>   values;
    // how many activation rows were accumulated into each row of `values`
    std::vector<int64_t> counts;
};
44
45	struct tensor_statistics {
46	std::string tensor;
47	Stats stats;
48	float total_sqract = `0.0f`;
49	float mean_sqract = `0.0f`;
50	float max_sqract = `0.0f`;
51	float min_sqract = `0.0f`;
52	int elements = `0`;
53	float stddev = `0.0f`;
54	float active = `0.0f`;
55	float entropy = `0.0f`;
56	float zd = `0.0f`;
57	float cossim = `0.0f`;
58	};
59
60	class IMatrixCollector {
61	public:
62	IMatrixCollector() = default;
63	void set_params(common_params params) { m_params = std::move(params); }
64	bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
65	void save_imatrix_legacy(int32_t ncall = -`1`) const;
66	void save_imatrix(int32_t n_chunk = -`1`) const;
67	bool load_imatrix_legacy(const char * fname);
68	bool load_imatrix(const char * file_name);
69	const std::unordered_map<std::string, Stats> & get_mstats() const { return m_stats; }
70	private:
71	std::unordered_map<std::string, Stats> m_stats;
72	common_params m_params;
73	std::mutex m_mutex;
74	std::vector<std::string> m_datasets;
75	int32_t m_last_chunk = `0`;
76	std::vector<char> m_src1_data;
77	std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
78	};
79
// Strips the backend prefix and the trailing suffix from a tensor name:
//   CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
// The result is the text between the first '#' and the next '#'. If there is
// only one '#', everything after it is returned; without any '#', the name is
// returned unchanged.
static std::string filter_tensor_name(const char * name) {
    const char * first_hash = strchr(name, '#');
    if (first_hash == nullptr) {
        return name;
    }

    const char * begin       = first_hash + 1;
    const char * second_hash = strchr(begin, '#');

    return second_hash == nullptr ? std::string(begin) : std::string(begin, second_hash);
}
98
// Splits a dotted tensor name into its layer index and tensor kind.
//   layer  <- the component following the first "blk" component
//   tensor <- the component preceding the first "weight" component (that is not the first component)
// Outputs that are still empty afterwards fall back to "-" (layer) and the
// whole input (tensor). Existing contents of the outputs are only overwritten
// when a match is found.
static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) {
    std::vector<std::string> parts;
    std::istringstream splitter(input);
    for (std::string piece; std::getline(splitter, piece, '.'); ) {
        parts.push_back(piece);
    }

    const auto blk_it = std::find(parts.begin(), parts.end(), "blk");
    if (blk_it != parts.end() && std::next(blk_it) != parts.end()) {
        layer = *std::next(blk_it);
    }

    // a leading "weight" component has nothing in front of it, so start searching at index 1
    if (parts.size() > 1) {
        const auto weight_it = std::find(std::next(parts.begin()), parts.end(), "weight");
        if (weight_it != parts.end()) {
            tensor = *std::prev(weight_it);
        }
    }

    if (tensor.empty()) {
        tensor = input;
    }
    if (layer.empty()) {
        layer = "-";
    }
}
127
128	static void compute_statistics(std::vector<tensor_statistics> & tstats, const std::string & name, const Stats & e) {
129	if (e.values.size() % e.counts.size() != `0`) {
130	LOG_ERR("%s: activation size mismatch for tensor %s (%zu vs %zu)\n", __func__, name.c_str(), e.counts.size(), e.values.size());
131	return;
132	}
133	if (e.counts.empty()) {
134	LOG_ERR("%s: there are no activations for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str());
135	return;
136	}
137
138	const int n_mat = e.counts.size();
139	const int row_size = e.values.size() / n_mat;
140
141	std::vector<float> activations;
142	activations.reserve(n: e.values.size());
143
144	for (int i = `0`; i < n_mat; ++i) {
145	for (int j = `0`; j < row_size; ++j) {
146	activations.push_back(x: e.values [i*row_size + j] / e.counts [i]);
147	}
148	}
149
150	const float act_total = std::accumulate(first: activations.begin(), last: activations.end(), init: `0.0f`);
151	const float act_max = *std::max_element(first: activations.begin(), last: activations.end());
152	const float act_min = *std::min_element(first: activations.begin(), last: activations.end());
153	const float act_mean = act_total / activations.size();
154	const float act_sqr_total = std::inner_product(first1: activations.begin(), last1: activations.end(), first2: activations.begin(), init: `0.0f`);
155	const float act_var = (act_sqr_total / activations.size()) - (act_mean * act_mean);
156	const float act_dev = std::sqrt(x: std::max(a: `0.0f`, b: act_var));
157	float threshold = `1e-5f`;
158	const int inactive_count = std::count_if(first: activations.begin(), last: activations.end(),
159	pred: [threshold](const float v) { return fabsf(x: v) <= threshold; });
160	const float active_ratio = `1` - static_cast<float>(inactive_count) / activations.size();
161
162	float entropy = `0`;
163	if (act_total > `0`) {
164	for (const auto act : activations) {
165	if (const float p = act / act_total; p > `0`) {
166	entropy -= p * std::log2(x: p);
167	}
168	}
169	}
170
171	int z_score = `0`;
172	if (act_dev > `0.0f`) {
173	for (const auto act : activations) {
174	if (const float p = (act - act_mean) / act_dev; p > `1`) {
175	z_score++;
176	}
177	}
178	}
179
180	auto & ts = tstats.emplace_back();
181	ts.tensor = name;
182	ts.stats = e;
183	ts.total_sqract = act_total;
184	ts.mean_sqract = act_mean;
185	ts.max_sqract = act_max;
186	ts.min_sqract = act_min;
187	ts.elements = static_cast<int>(activations.size());
188	ts.stddev = act_dev;
189	ts.active = active_ratio;
190	ts.entropy = entropy;
191	ts.zd = static_cast<float>(z_score) / ts.elements;
192	}
193
194	static void compute_cossim(std::vector<tensor_statistics> & tstats) {
195	static const std::regex pattern(R"(blk\.(\d+)\.)");
196	for (auto & ts : tstats) {
197	if (std::smatch match; std::regex_search(s: ts.tensor, m&: match, e: pattern)) {
198	const int blk = std::stoi(str: match [`1`]);
199	std::string tname(ts.tensor);
200	tname.replace(pos: match.position(sub: `1`), n: match.length(sub: `1`), str: std::to_string(val: blk-`1`));
201	auto prev = std::find_if(first: tstats.begin(), last: tstats.end(),
202	pred: [tname](const tensor_statistics & t) { return t.tensor == tname; });
203	if (prev != tstats.end()) {
204	const float dp = std::inner_product(first1: ts.stats.values.begin(), last1: ts.stats.values.end(),
205	first2: prev ->stats.values.begin(), init: `0.0f`);
206	const float curr_mag = std::sqrt(x: std::inner_product(first1: ts.stats.values.begin(), last1: ts.stats.values.end(),
207	first2: ts.stats.values.begin(), init: `0.0f`));
208	const float prev_mag = std::sqrt(x: std::inner_product(first1: prev ->stats.values.begin(), last1: prev ->stats.values.end(),
209	first2: prev ->stats.values.begin(), init: `0.0f`));
210	const float cs = dp / (curr_mag * prev_mag);
211	ts.cossim = cs;
212	}
213	} else {
214	ts.cossim = `0`;
215	}
216	}
217	}
218
219	bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
220	GGML_UNUSED(user_data);
221
222	const struct ggml_tensor * src0 = t->src[`0`];
223	const struct ggml_tensor * src1 = t->src[`1`];
224	std::string wname = filter_tensor_name(name: src0->name);
225
226	const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
227
228	// when ask is true, the scheduler wants to know if we are interested in data from this tensor
229	// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
230	if (ask) {
231	if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
232	if (t->op != GGML_OP_MUL_MAT) return false;
233	// why are small batches ignored (<16 tokens)?
234	if (src1->ne[`1`] < `16` \|\| src1->type != GGML_TYPE_F32) return false;
235	if (!(wname.substr(pos: `0`, n: `4`) == "blk." \|\| (m_params.process_output && wname == "output.weight"))) return false;
236	return true;
237	}
238
239	std::lock_guard<std::mutex> lock(m_mutex);
240
241	// copy the data from the GPU memory if needed
242	const bool is_host = ggml_backend_buffer_is_host(buffer: src1->buffer);
243
244	if (!is_host) {
245	const size_t src1_nbytes = ggml_nbytes(tensor: src1);
246	m_src1_data.resize(new_size: src1_nbytes);
247	ggml_backend_tensor_get(tensor: src1, data: m_src1_data.data(), offset: `0`, size: src1_nbytes);
248	}
249
250	const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
251	GGML_ASSERT(src1->nb[`0`] == ggml_element_size(src1));
252
253	// this has been adapted to the new format of storing merged experts in a single 3d tensor
254	// ref: https://github.com/ggml-org/llama.cpp/pull/6387
255	if (t->op == GGML_OP_MUL_MAT_ID) {
256	// ids -> [n_experts_used, n_tokens]
257	// src1 -> [cols, n_expert_used, n_tokens]
258	const ggml_tensor * ids = t->src[`2`];
259	const int64_t n_as = src0->ne[`2`];
260	const int64_t n_ids = ids->ne[`0`];
261
262	// the top-k selected expert ids are stored in the ids tensor
263	// for simplicity, always copy ids to host, because it is small
264	// take into account that ids is not contiguous!
265
266	GGML_ASSERT(ids->ne[`1`] == src1->ne[`2`]);
267
268	// the extra dimension would need to be stored somewhere to be reflected in the imatrix file
269	if (ggml_nrows(tensor: src1) != src1->ne[`1`] * src1->ne[`2`]) {
270	LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
271	GGML_ASSERT(false);
272	}
273
274	m_ids.resize(new_size: ggml_nbytes(tensor: ids));
275	ggml_backend_tensor_get(tensor: ids, data: m_ids.data(), offset: `0`, size: ggml_nbytes(tensor: ids));
276
277	auto & e = m_stats [wname];
278
279	if (e.counts.size() == `1` && n_as > `1`) {
280	// broadcast, when loading an old imatrix
281	e.counts.resize(new_size: n_as, x: e.counts [`0`]);
282	}
283	if (e.values.empty()) {
284	e.values.resize(new_size: src1->ne[`0`]*n_as, x: `0`);
285	e.counts.resize(new_size: n_as, x: `0`);
286	}
287	else if (e.values.size() != (size_t)src1->ne[`0`]*n_as) {
288	LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[`0`]*n_as));
289	exit(status: `1`); //GGML_ABORT("fatal error");
290	}
291	else if (e.counts.size() != (size_t)n_as) {
292	LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as);
293	exit(status: `1`); //GGML_ABORT("fatal error");
294	}
295	LOG_DBGV(`2`, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[`0`], (int)src1->ne[`2`], (int)src1->type);
296	// loop over all possible experts, regardless if they are used or not in the batch
297	for (int64_t ex = `0`; ex < n_as; ++ex) {
298	size_t e_start = ex*src1->ne[`0`];
299
300	for (int64_t idx = `0`; idx < n_ids; ++idx) {
301	for (int64_t row = `0`; row < src1->ne[`2`]; ++row) {
302	const int excur = (const* int32_t ) (m_ids.data() + rowids->nb[`1`] + idx*ids->nb[`0`]);
303
304	GGML_ASSERT(excur >= `0` && excur < n_as); // sanity check
305
306	if (excur != ex) continue;
307
308	const int64_t i11 = idx % src1->ne[`1`];
309	const int64_t i12 = row;
310	const float * x = (const float )(data + i11src1->nb[`1`] + i12*src1->nb[`2`]);
311
312	e.counts [ex]++;
313
314	for (int64_t j = `0`; j < src1->ne[`0`]; ++j) {
315	e.values [e_start + j] += x[j] * x[j];
316	if (!std::isfinite(x: (float)e.values [e_start + j])) {
317	LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
318	exit(status: `1`);
319	}
320	}
321	}
322	}
323	const int32_t n_chunk = e.counts [ex] / chunk_size;
324	if (n_chunk > m_last_chunk) {
325	const int32_t chunk_step = n_chunk - m_last_chunk;
326	m_last_chunk = n_chunk;
327	if ((m_last_chunk % m_params.n_out_freq) / chunk_step == `0`) {
328	save_imatrix();
329	}
330	if (m_params.n_save_freq > `0` && (m_last_chunk % m_params.n_save_freq) / chunk_step == `0`) {
331	save_imatrix(n_chunk: m_last_chunk);
332	}
333	}
334	}
335	} else {
336	auto & e = m_stats [wname];
337	const int64_t n_mat = src0->ne[`2`] * src0->ne[`3`];
338
339	// use a single count per dense tensor
340	// (necessary when merging older GGUF-imatrix files with 3d tensors)
341	if (e.counts.size() > `1`) {
342	bool all_equal = true;
343	for (size_t i = `1`; i < e.counts.size(); ++i) {
344	if (e.counts [`0`] != e.counts [i]) {
345	all_equal = false;
346	break;
347	}
348	}
349	if (all_equal) {
350	e.counts.resize(new_size: `1`);
351	}
352	}
353	if (e.values.empty()) {
354	e.values.resize(new_size: src1->ne[`0`] * n_mat, x: `0`);
355	e.counts.resize(new_size: `1`, x: `0`);
356	}
357	else if (e.values.size() != (size_t)(src1->ne[`0`] * n_mat)) {
358	LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[`0`] * n_mat));
359	exit(status: `1`); //GGML_ABORT("fatal error");
360	}
361	LOG_DBGV(`2`, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[`0`], (int)src1->ne[`1`], (int)src1->ne[`2`], (int)src1->type);
362
363	for (int64_t i3 = `0`; i3 < src1->ne[`3`]; ++i3) {
364	for (int64_t i2 = `0`; i2 < src1->ne[`2`]; ++i2) {
365	// handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D
366	const int64_t mat_id = (i3 % src0->ne[`3`]) * src0->ne[`2`] + (i2 % src0->ne[`2`]);
367	const int64_t mat_start = mat_id * src1->ne[`0`];
368
369	for (int64_t row = `0`; row < src1->ne[`1`]; ++row) {
370	const float * x = (const float ) (data + row src1->nb[`1`] + i2 * src1->nb[`2`] + i3 * src1->nb[`3`]);
371	for (int64_t j = `0`; j < src1->ne[`0`]; ++j) {
372	e.values [mat_start + j] += x[j] * x[j];
373	if (!std::isfinite(x: (float)e.values [j])) {
374	LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str());
375	exit(status: `1`);
376	}
377	}
378	}
379	}
380	}
381	// only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT
382	for (size_t i = `0`; i < e.counts.size(); ++i) {
383	e.counts [i] += ggml_nrows(tensor: src1) / n_mat;
384	const int32_t n_chunk = e.counts [i] / chunk_size;
385	if (n_chunk > m_last_chunk) {
386	const int32_t chunk_step = n_chunk - m_last_chunk;
387	m_last_chunk = n_chunk;
388	if ((m_last_chunk % m_params.n_out_freq) / chunk_step == `0`) {
389	save_imatrix();
390	}
391	if (m_params.n_save_freq > `0` && (m_last_chunk % m_params.n_save_freq) / chunk_step == `0`) {
392	save_imatrix(n_chunk: m_last_chunk);
393	}
394	}
395	}
396	}
397
398	return true;
399	}
400
401	void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
402	auto fname = m_params.out_file;
403
404	if (ncall > `0`) {
405	fname += ".at_";
406	fname += std::to_string(val: ncall);
407	}
408
409	// warn when writing imatrix entries that do not have full data
410	// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
411
412	int n_entries = `0`;
413	std::vector<std::string> to_store;
414
415	bool is_first = true; // for printing
416	for (const auto & kv : m_stats) {
417	const int n_all = kv.second.counts.size();
418
419	if (n_all == `0`) {
420	continue;
421	}
422
423	int n_zeros = `0`;
424	for (const int c : kv.second.counts) {
425	if (c == `0`) {
426	n_zeros++;
427	}
428	}
429
430	if (n_zeros != `0` && is_first) {
431	LOG_INF("\n");
432	is_first = false;
433	}
434
435	if (n_zeros == n_all) {
436	LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
437	continue;
438	}
439
440	if (n_zeros > `0`) {
441	LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), `100.0f` * (n_all - n_zeros) / n_all);
442	}
443
444	n_entries++;
445	to_store.push_back(x: kv.first);
446	}
447
448	if (to_store.size() < m_stats.size()) {
449	LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
450	}
451
452	// deterministic tensor name order
453	std::sort(first: to_store.begin(), last: to_store.end());
454
455	const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
456
457	std::ofstream out(fname, std::ios::binary);
458	out.write(s: (const char ) &n_entries, n: sizeof*(n_entries));
459	for (const auto & name : to_store) {
460	const auto & stat = m_stats.at(k: name);
461	const int32_t len = name.size();
462	out.write(s: (const char ) &len, n: sizeof*(len));
463	out.write(s: name.c_str(), n: len);
464	// ceiling division to avoid accidental zeros
465	const int32_t ncall = (*std::max_element(first: stat.counts.begin(), last: stat.counts.end()) + (chunk_size - `1`)) / chunk_size;
466	out.write(s: (const char ) &ncall, n: sizeof*(ncall));
467	const int32_t nval = stat.values.size();
468	const int32_t nmat = stat.counts.size();
469	out.write(s: (const char ) &nval, n: sizeof*(nval));
470	if (nval > `0` && nmat > `0`) {
471	std::vector<float> tmp(nval);
472	for (int32_t i = `0`; i < nval; i++) {
473	float count = static_cast<float>(stat.counts [i / (nval / nmat)]);
474	float value = stat.values [i];
475	if (count == `0.0f`) {
476	// store 1 for partial data
477	value = `1.0f`;
478	count = `1.0f`;
479	}
480	tmp [i] = (value / count) * static_cast<float>(ncall);
481	}
482	out.write(s: (const char ) tmp.data(), n: nval sizeof(float));
483	}
484	}
485
486	// Write the number of call the matrix was computed with
487	out.write(s: (const char ) &m_last_chunk, n: sizeof*(m_last_chunk));
488
489	// Write the input filename at the end of the file to later on specify it in quantize
490	{
491	const char * dataset_file = m_params.prompt_file.c_str();
492	int32_t len = m_params.prompt_file.size();
493	// When there is no prompt but there were other imatrix files loaded, use the last dataset
494	if (m_params.prompt_file.empty() && !m_datasets.empty()) {
495	const std::string & dataset_str = m_datasets [m_datasets.size() - `1`];
496	dataset_file = dataset_str.c_str();
497	len = dataset_str.size();
498	}
499	out.write(s: (const char ) &len, n: sizeof*(len));
500	out.write(s: dataset_file, n: len);
501	}
502
503	LOGV(`1`, "\n");
504	LOG_DBGV(`1`, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
505	}
506
507	void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
508	auto fname = m_params.out_file;
509	int8_t use_legacy_format = m_params.imat_dat;
510
511	if (use_legacy_format > `0`) {
512	this->save_imatrix_legacy(ncall: n_chunk);
513	return;
514	}
515	// only warn when `--output-format gguf` is not specified
516	if (use_legacy_format == `0` && !string_ends_with(str: fname, suffix: ".gguf")) {
517	LOG_WRN("\n%s: saving imatrix using GGUF format with a different suffix than .gguf\n", __func__);
518	LOG_WRN("%s: if you want the previous imatrix format, use --output-format dat\n", __func__);
519	}
520
521	if (n_chunk > `0`) {
522	fname += ".at_";
523	fname += std::to_string(val: n_chunk);
524	}
525
526	// write imatrix entries even if they don't have full data. (can be corrected when reading)
527	// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
528
529	std::vector<std::string> to_store;
530	size_t data_size = `0`;
531
532	bool is_first = true; // for printing
533	for (const auto & kv : m_stats) {
534	const int n_all = kv.second.counts.size();
535
536	int n_zeros = `0`;
537	for (const auto c : kv.second.counts) {
538	if (c == `0`) {
539	n_zeros++;
540	}
541	}
542
543	if (n_zeros != `0` && is_first) {
544	LOG_INF("\n");
545	is_first = false;
546	}
547
548	if (n_zeros > `0`) {
549	LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), `100.0f` * (n_all - n_zeros) / n_all);
550	}
551
552	to_store.push_back(x: kv.first);
553	data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
554	data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
555	}
556
557	// deterministic tensor name order
558	std::sort(first: to_store.begin(), last: to_store.end());
559
560	struct ggml_init_params params = {
561	/ .mem_size = / data_size,
562	/ .mem_buffer = / NULL,
563	/ .no_alloc = / false,
564	};
565	struct ggml_context * ctx = ggml_init(params);
566	struct gguf_context * ctx_gguf = gguf_init_empty();
567
568	{
569	std::vector<const char *> datasets;
570	datasets.reserve(n: m_datasets.size() + `1`);
571	for (size_t i = `0`; i < m_datasets.size(); ++i) {
572	datasets.push_back(x: m_datasets [i].c_str());
573	}
574	if (!m_params.prompt_file.empty()) {
575	datasets.push_back(x: m_params.prompt_file.c_str());
576	}
577
578	gguf_set_val_str(ctx: ctx_gguf, key: "general.type", val: "imatrix");
579	// Write the dataset paths
580	gguf_set_arr_str(ctx: ctx_gguf, key: LLM_KV_IMATRIX_DATASETS, data: datasets.data(), n: datasets.size());
581	// Write the number of chunks the matrix was computed with
582	gguf_set_val_u32(ctx: ctx_gguf, key: LLM_KV_IMATRIX_CHUNK_COUNT, val: m_last_chunk);
583	gguf_set_val_u32(ctx: ctx_gguf, key: LLM_KV_IMATRIX_CHUNK_SIZE, val: m_params.n_ctx / m_params.n_parallel);
584	}
585
586	for (const auto & name : to_store) {
587	const auto & stat = m_stats.at(k: name);
588	const int32_t nval = (int32_t) stat.values.size();
589	const int32_t nmat = (int32_t) stat.counts.size();
590	if (nval > `0` && nmat > `0`) {
591	struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, type: GGML_TYPE_F32, ne0: nval / nmat, ne1: nmat);
592	struct ggml_tensor * counts = ggml_new_tensor_2d(ctx, type: GGML_TYPE_F32, ne0: `1`, ne1: nmat);
593	ggml_format_name(tensor: in_sum2, fmt: "%s.in_sum2", name.c_str());
594	ggml_format_name(tensor: counts, fmt: "%s.counts", name.c_str());
595
596	for (int32_t j = `0`; j < nval; ++j) {
597	((float ) in_sum2->data)[j] = (float*) stat.values [j];
598	}
599	for (int32_t j = `0`; j < nmat; ++j) {
600	((float ) counts->data)[j] = (float*) stat.counts [j];
601	}
602
603	gguf_add_tensor(ctx: ctx_gguf, tensor: in_sum2);
604	gguf_add_tensor(ctx: ctx_gguf, tensor: counts);
605	}
606	}
607
608	gguf_write_to_file(ctx: ctx_gguf, fname: fname.c_str(), only_meta: false);
609
610	LOGV(`1`, "\n");
611	LOG_DBGV(`1`, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
612
613	gguf_free(ctx: ctx_gguf);
614	ggml_free(ctx);
615	}
616
617	bool IMatrixCollector::load_imatrix_legacy(const char * fname) {
618	std::ifstream in(fname, std::ios::binary);
619	if (!in) {
620	LOG_ERR("%s: failed to open %s\n", __func__, fname);
621	return false;
622	}
623	int n_entries;
624	in.read(s: (char ) &n_entries, n: sizeof*(n_entries));
625	if (in.fail() \|\| n_entries < `1`) {
626	LOG_ERR("%s: no data in file %s\n", __func__, fname);
627	return false;
628	}
629	// Guess the chunk size because it's not stored in the file
630	const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
631
632	for (int i = `0`; i < n_entries; ++i) {
633	int32_t len = `0`;
634	in.read(s: (char ) &len, n: sizeof*(len));
635	std::vector<char> name_as_vec(len + `1`);
636	in.read(s: (char *) name_as_vec.data(), n: len);
637	if (in.fail()) {
638	LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + `1`, fname);
639	return false;
640	}
641	name_as_vec [len] = `0`;
642	std::string name{ name_as_vec.data() };
643	auto & e = m_stats [std::move(name)];
644	int32_t ncall = `0`;
645	in.read(s: (char ) &ncall, n: sizeof*(ncall));
646	int32_t nval = `0`;
647	in.read(s: (char ) &nval, n: sizeof*(nval));
648	if (in.fail() \|\| nval < `1`) {
649	LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
650	m_stats = {};
651	return false;
652	}
653
654	if (e.values.empty()) {
655	e.values.resize(new_size: nval, x: `0.0f`);
656	e.counts.resize(new_size: `1`, x: `0`);
657	}
658
659	std::vector<float> tmp(nval);
660	in.read(s: (char ) tmp.data(), n: nval sizeof(float));
661	if (in.fail()) {
662	LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
663	m_stats = {};
664	return false;
665	}
666
667	// Recreate the state as expected by save_imatrix(), and correct for weighted sum.
668	for (int i = `0`; i < nval; i++) {
669	e.values [i] += tmp [i] * chunk_size;
670	}
671	// The legacy format doesn't distinguish the counts for different experts
672	for (size_t j = `0`; j < e.counts.size(); ++j) {
673	e.counts [j] += ncall * chunk_size;
674	}
675	}
676
677	{
678	// TODO: extract into its own method; this is also used by the GGUF-based format
679	// Calculate the last chunk count
680	int64_t max_count = `0`;
681	for (const auto & stats : m_stats) {
682	for (int64_t count : stats.second.counts) {
683	if (count > max_count) {
684	max_count = count;
685	}
686	}
687	}
688	m_last_chunk = max_count / (chunk_size);
689	}
690
691	{
692	// Read the number of calls the matrix was computed with
693	int32_t n_calls;
694	in.read(s: (char ) &n_calls, n: sizeof*(n_calls));
695	// ignore it because it's not important
696	}
697
698	// Read the dataset path to include it when writing to GGUF
699	if (!in.fail()){
700	int32_t len = `0`;
701	in.read(s: (char ) &len, n: sizeof*(len));
702	if (!in.fail()) {
703	std::vector<char> dataset;
704	dataset.resize(new_size: len + `1`, x: `0`);
705	in.read(s: dataset.data(), n: len);
706	if (!in.fail()) {
707	m_datasets.push_back(x: dataset.data());
708	}
709	}
710	}
711
712	return true;
713	}
714
715	// Using GGUF as the file format, for greater extensibility
716	bool IMatrixCollector::load_imatrix(const char * file_name) {
717	struct ggml_context * ctx = nullptr;
718	struct gguf_init_params meta_gguf_params = {
719	/ .no_alloc = / false, // the data is needed
720	/ .ctx = / &ctx,
721	};
722	struct gguf_context * ctx_gguf = gguf_init_from_file(fname: file_name, params: meta_gguf_params);
723	if (!ctx_gguf) {
724	return this->load_imatrix_legacy(fname: file_name);
725	}
726	const int32_t n_entries = gguf_get_n_tensors(ctx: ctx_gguf);
727	if (n_entries < `1`) {
728	LOG_ERR("%s: no data in file %s\n", __func__, file_name);
729	gguf_free(ctx: ctx_gguf);
730	ggml_free(ctx);
731	return false;
732	}
733
734	const int64_t datasets_key = gguf_find_key(ctx: ctx_gguf, key: LLM_KV_IMATRIX_DATASETS);
735	if (datasets_key != -`1` && gguf_get_arr_type(ctx: ctx_gguf, key_id: datasets_key) == GGUF_TYPE_STRING) {
736	const int64_t n = gguf_get_arr_n(ctx: ctx_gguf, key_id: datasets_key);
737	m_datasets.reserve(n: m_datasets.size() + n);
738	for (int64_t i = `0`; i < n; ++i) {
739	m_datasets.push_back(x: gguf_get_arr_str(ctx: ctx_gguf, key_id: datasets_key, i));
740	}
741	}
742
743	const std::string in_sum2_suffix{ ".in_sum2" };
744	const std::string counts_suffix{ ".counts" };
745
746	// Could re-use m_stats instead, but this allows
747	// checking for completeness of each* loaded imatrix file*
748	// and also makes it easier to re-use a similar implementation in quantize.cpp
749	// Using an ordered map to get a deterministic iteration order.
750	std::map<std::string, std::pair<struct ggml_tensor , struct* ggml_tensor *>> sums_counts_for;
751
752	for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, tensor: cur)) {
753	std::string name = cur->name;
754
755	if (name.empty()) { continue; }
756
757	if (string_remove_suffix(str&: name, suffix: in_sum2_suffix)) {
758	// in_sum2
759	sums_counts_for [std::move(name)].first = cur;
760	} else if (string_remove_suffix(str&: name, suffix: counts_suffix)) {
761	// counts
762	sums_counts_for [std::move(name)].second = cur;
763	} else {
764	// ignore other tensors
765	}
766	}
767
768	for (const auto & sc : sums_counts_for) {
769	const std::string & name = sc.first;
770	const struct ggml_tensor * in_sum2 = sc.second.first;
771	const struct ggml_tensor * counts = sc.second.second;
772
773	if (!in_sum2 \|\| !counts) {
774	LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
775	gguf_free(ctx: ctx_gguf);
776	ggml_free(ctx);
777	return false;
778	}
779
780	auto & e = m_stats [name];
781
782	int64_t nval = ggml_nelements(tensor: in_sum2);
783	if (e.values.empty()) {
784	e.values.resize(new_size: nval, x: `0.0f`);
785	} else if ((size_t) nval != e.values.size()) {
786	LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size());
787	gguf_free(ctx: ctx_gguf);
788	ggml_free(ctx);
789	return false;
790	}
791
792	int64_t ncounts = ggml_nelements(tensor: counts);
793	if (e.counts.empty()) {
794	e.counts.resize(new_size: ncounts, x: `0`);
795	} else if (e.counts.size() == `1` && ncounts > `1`) {
796	// broadcast, when loading an old imatrix
797	e.counts.resize(new_size: ncounts, x: e.counts [`0`]);
798	} else if ((size_t) ncounts != e.counts.size()) {
799	LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size());
800	gguf_free(ctx: ctx_gguf);
801	ggml_free(ctx);
802	return false;
803	}
804
805	// Recreate the state as expected by save_imatrix()
806	for (int64_t j = `0`; j < nval; j++) {
807	e.values [j] += ((const float *) in_sum2->data)[j];
808	}
809	for (int64_t j = `0`; j < ncounts; j++) {
810	e.counts [j] += std::lround(x: ((const float *) counts->data)[j]);
811	}
812	}
813
814	// TODO: extract into its own method; this is also used by the legacy format
815	// Calculate the last chunk count
816	int64_t max_count = `0`;
817	for (const auto & stats : m_stats) {
818	for (int64_t count : stats.second.counts) {
819	if (count > max_count) {
820	max_count = count;
821	}
822	}
823	}
824	m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel);
825
826	gguf_free(ctx: ctx_gguf);
827	ggml_free(ctx);
828	return true;
829	}
830
831	static IMatrixCollector g_collector;
832
833	static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
834	return g_collector.collect_imatrix(t, ask, user_data);
835	}
836
// Result of log_softmax() for one target token.
struct results_log_softmax {
    double log_softmax; // log-probability of the target token
    float  logit;       // raw logit of the target token
    float  prob;        // probability of the target token
};
842
// Converts logits to probabilities.
//
//   logits - input values; may be empty
//
// Returns a vector of the same size whose elements sum to 1 (an empty vector
// for empty input). The maximum logit is subtracted before exponentiation for
// numerical stability.
static std::vector<float> softmax(const std::vector<float> & logits) {
    std::vector<float> probs(logits.size());
    if (logits.empty()) {
        // nothing to normalize; also avoids reading logits[0] out of bounds
        return probs;
    }

    const float max_logit = *std::max_element(logits.begin(), logits.end());

    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        // Subtract the maximum logit value from the current logit value for numerical stability
        const float exp_logit = expf(logits[i] - max_logit);
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
    for (float & p : probs) {
        p /= sum_exp;
    }
    return probs;
}
862
863	static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
864	float max_logit = logits[`0`];
865	for (int i = `1`; i < n_vocab; ++i) {
866	max_logit = std::max(a: max_logit, b: logits[i]);
867	}
868	double sum_exp = `0.0`;
869	for (int i = `0`; i < n_vocab; ++i) {
870	sum_exp += expf(x: logits[i] - max_logit);
871	}
872	return {.log_softmax: logits[tok] - max_logit - log(x: sum_exp), .logit: logits[tok], .prob: expf(x: logits[tok] - max_logit) / (float) sum_exp};
873	}
874
875	static void process_logits(
876	int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
877	double & nll, double & nll2, float * logit_history, float * prob_history) {
878	std::mutex mutex;
879	int counter = `0`;
880	auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
881	double local_nll = `0`;
882	double local_nll2 = `0`;
883	while (true) {
884	std::unique_lock<std::mutex> lock(mutex);
885	int i = counter++;
886	if (i >= n_token) {
887	nll += local_nll; nll2 += local_nll2;
888	break;
889	}
890	lock.unlock();
891	const results_log_softmax results = log_softmax(n_vocab, logits: logits + i*n_vocab, tok: tokens[i+`1`]);
892	const double v = -results.log_softmax;
893	local_nll += v;
894	local_nll2 += v*v;
895
896	logit_history[i] = results.logit;
897	prob_history[i] = results.prob;
898	}
899	};
900	for (auto & w : workers) {
901	w = std::thread (compute);
902	}
903	compute ();
904	for (auto & w : workers) {
905	w.join();
906	}
907	}
908
909	static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
910	const llama_model * model = llama_get_model(ctx);
911	const llama_vocab * vocab = llama_model_get_vocab(model);
912
913	const bool add_bos = llama_vocab_get_add_bos(vocab);
914
915	GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
916
917	auto tim1 = std::chrono::high_resolution_clock::now();
918	LOG_INF("%s: tokenizing the input ..\n", __func__);
919
920	std::vector<llama_token> tokens = common_tokenize(ctx, text: params.prompt, add_special: true, parse_special: params.parse_special);
921
922	auto tim2 = std::chrono::high_resolution_clock::now();
923	LOG_INF("%s: tokenization took %g ms\n",__func__,`1e-3`*std::chrono::duration_cast<std::chrono::microseconds>(tim2 -tim1).count());
924
925	if (params.i_chunk > `0`) {
926	if (size_t((params.i_chunk + `2`)*n_ctx) >= tokens.size()) {
927	LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
928	return false;
929	}
930	LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
931	tokens.erase(first: tokens.begin(), last: tokens.begin() + params.i_chunk*n_ctx);
932	}
933
934	if (int(tokens.size()) < `2`*n_ctx) {
935	LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, `2`*n_ctx, n_ctx);
936	LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
937	return false;
938	}
939
940	std::vector<float> logit_history;
941	std::vector<float> prob_history;
942
943	if (params.compute_ppl) {
944	logit_history.resize(new_size: tokens.size());
945	prob_history.resize(new_size: tokens.size());
946	}
947
948	const int n_chunk_max = tokens.size() / n_ctx;
949
950	const int n_chunk = params.n_chunks < `0` ? n_chunk_max : std::min(a: params.n_chunks, b: n_chunk_max);
951	const int n_vocab = llama_vocab_n_tokens(vocab);
952	const int n_batch = params.n_batch;
953
954	int count = `0`;
955	double nll = `0.0`;
956	double nll2 = `0.0`;
957
958	const int num_batches = (n_ctx + n_batch - `1`) / n_batch;
959	const int n_seq = std::max(a: `1`, b: n_batch / n_ctx);
960
961	GGML_ASSERT(n_batch < n_ctx \|\| n_batch % n_ctx == `0`);
962	GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
963
964	llama_batch batch = llama_batch_init(n_tokens: std::min(a: n_batch, b: n_ctx*n_seq), embd: `0`, n_seq_max: `1`);
965
966	std::vector<float> logits;
967	if (params.compute_ppl && num_batches > `1`) {
968	logits.reserve(n: (size_t)n_ctx * n_vocab);
969	}
970
971	LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
972
973	std::vector<std::thread> workers(std::thread::hardware_concurrency() - `1`);
974
975	for (int i = `0`; i < n_chunk; i += n_seq) {
976	const int start = i * n_ctx;
977	const int end = start + n_ctx;
978
979	const int n_seq_batch = std::min(a: n_seq, b: n_chunk - i);
980
981	const auto t_start = std::chrono::high_resolution_clock::now();
982
983	// clear the KV cache
984	llama_memory_clear(mem: llama_get_memory(ctx), data: true);
985
986	for (int j = `0`; j < num_batches; ++j) {
987	const int batch_start = start + j * n_batch;
988	const int batch_size = std::min(a: end - batch_start, b: n_batch);
989
990	// clear the batch
991	common_batch_clear(batch);
992
993	for (int seq = `0`; seq < n_seq_batch; seq++) {
994	int seq_start = batch_start + seq*n_ctx;
995
996	// save original token and restore it after eval
997	const auto token_org = tokens [seq_start];
998
999	// add BOS token for the first batch of each chunk
1000	if (add_bos && j == `0`) {
1001	tokens [seq_start] = llama_vocab_bos(vocab);
1002	}
1003	for (int k = `0`; k < batch_size; ++k) {
1004	// NOTE: specifying all logits to get activations for the output.weight tensor
1005	// and also for the perplexity calculation.
1006	// TODO: only get outputs when (params.process_output \|\| params.compute_ppl)
1007	// (not possible when this skips FFN computation of the last layer)
1008	common_batch_add(batch, id: tokens [seq_start + k], pos: jn_batch + k, seq_ids: { seq }, logits: true*);
1009	}
1010
1011	// restore the original token in case it was set to BOS
1012	tokens [seq_start] = token_org;
1013	}
1014
1015	if (llama_decode(ctx, batch)) {
1016	LOG_ERR("%s : failed to eval\n", __func__);
1017	llama_batch_free(batch);
1018	return false;
1019	}
1020
1021	if (params.compute_ppl && num_batches > `1`) {
1022	const auto * batch_logits = llama_get_logits(ctx);
1023	logits.insert(position: logits.end(), first: batch_logits, last: batch_logits + batch_size * n_vocab);
1024	}
1025	}
1026
1027
1028	if (i == `0`) {
1029	llama_synchronize(ctx);
1030	const auto t_end = std::chrono::high_resolution_clock::now();
1031	const float t_total = std::chrono::duration<float>(t_end - t_start).count();
1032	LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
1033	int total_seconds = (int)(t_total * n_chunk / n_seq);
1034	if (total_seconds >= `60`*`60`) {
1035	LOG("%d hours ", total_seconds / (`60`*`60`));
1036	total_seconds = total_seconds % (`60`*`60`);
1037	}
1038	LOG("%.2f minutes\n", total_seconds / `60.0`);
1039	}
1040
1041	if (params.compute_ppl) {
1042	const int first = n_ctx/`2`;
1043	for (int seq = `0`; seq < n_seq_batch; seq++) {
1044	const float * all_logits = num_batches > `1` ? logits.data() : llama_get_logits_ith(ctx, i: seq*n_ctx);
1045
1046	llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
1047
1048	process_logits(n_vocab, logits: all_logits + first*n_vocab,
1049	tokens: tokens_data, n_token: n_ctx - `1` - first,
1050	workers, nll, nll2,
1051	logit_history: logit_history.data() + start + seq*n_ctx + first,
1052	prob_history: prob_history.data() + start + seq*n_ctx + first);
1053
1054	count += n_ctx - first - `1`;
1055
1056	LOG("[%d]%.4lf,", i + seq + `1`, std::exp(nll / count));
1057	}
1058	fflush(stdout);
1059
1060	logits.clear();
1061	}
1062	}
1063
1064	LOG("\n");
1065
1066	if (params.compute_ppl) {
1067	nll2 /= count;
1068	nll /= count;
1069	const double ppl = exp(x: nll);
1070	nll2 -= nll * nll;
1071	if (nll2 > `0`) {
1072	nll2 = sqrt(x: nll2/(count-`1`));
1073	LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
1074	} else {
1075	LOG("Unexpected negative standard deviation of log(prob)\n");
1076	}
1077	}
1078
1079	llama_batch_free(batch);
1080
1081	return true;
1082	}
1083
1084	static bool show_statistics(const common_params & params) {
1085	std::vector<tensor_statistics> ts;
1086	if (params.in_files.empty() \|\| params.in_files.size() > `1`) {
1087	LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n");
1088	return false;
1089	}
1090	if (g_collector.load_imatrix(file_name: params.in_files [`0`].c_str())) {
1091	for (const auto & [name, stats] :g_collector.get_mstats()) {
1092	compute_statistics(tstats&: ts, name, e: stats);
1093	}
1094	} else {
1095	LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[`0`].c_str());
1096	return false;
1097	}
1098	if (!ts.empty()) {
1099	compute_cossim(tstats&: ts);
1100	} else {
1101	LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[`0`].c_str());
1102	return false;
1103	}
1104
1105	struct tensor_comparer {
1106	bool operator()(const tensor_statistics & a, const tensor_statistics & b) const {
1107	std::string layer, name_a, name_b;
1108	;
1109	process_tensor_name(input: a.tensor, layer, tensor&: name_a);
1110	process_tensor_name(input: b.tensor, layer, tensor&: name_b);
1111	return name_a < name_b \|\| (name_a == name_b && a.total_sqract > b.total_sqract);
1112	}
1113	};
1114	std::sort(first: ts.begin(), last: ts.end(), comp: tensor_comparer());
1115
1116	struct weighted_stats {
1117	float weighted_bias = `0.0f`;
1118	float weighted_zd = `0.0f`;
1119	float weighted_cossim = `0.0f`;
1120	int total_elements = `0`;
1121	};
1122	std::map<int, weighted_stats> ws;
1123
1124	LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[`0`].c_str(), static_cast<int>(ts.size()));
1125	LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", " Tensor", " Σ(Act²)",
1126	" Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD",
1127	" CosSim");
1128	LOG_INF(
1129	"=============================================================================================================="
1130	"===========================================================\n");
1131	for (const auto & tstat : ts) {
1132	std::string layer, name;
1133	process_tensor_name(input: tstat.tensor, layer, tensor&: name);
1134
1135	int blk;
1136	try {
1137	blk = std::stoi(str: layer);
1138	} catch (const std::exception & e) {
1139	blk = -`1`; // not a block layer
1140	}
1141
1142	LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n",
1143	layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract,
1144	tstat.stddev, tstat.active * `100.0f`, tstat.elements, tstat.entropy,
1145	`100.0f` * (tstat.entropy / std::log2(tstat.elements)), `100.0f` * tstat.zd, tstat.cossim);
1146
1147	const float weighted_bias = tstat.elements * tstat.total_sqract;
1148	const float weighted_zd = tstat.elements * tstat.zd;
1149	const float weighted_cossim = tstat.elements * tstat.cossim;
1150
1151	if (ws.find(x: blk) != ws.end()) {
1152	ws [blk].weighted_bias += weighted_bias;
1153	ws [blk].weighted_zd += weighted_zd;
1154	ws [blk].weighted_cossim += weighted_cossim;
1155	ws [blk].total_elements += tstat.elements;
1156	} else {
1157	weighted_stats temp_ws;
1158	temp_ws.weighted_bias = weighted_bias;
1159	temp_ws.weighted_zd = weighted_zd;
1160	temp_ws.weighted_cossim = weighted_cossim;
1161	temp_ws.total_elements = tstat.elements;
1162	ws [blk] = temp_ws;
1163	}
1164	}
1165
1166	const int layers = std::count_if(first: ws.begin(), last: ws.end(), pred: [](const auto & kv) { return kv.first >= `0`; });
1167	LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers);
1168	LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Act²)", " μZD", "μCosSim");
1169	LOG_INF("================================================\n");
1170	for (const auto & [first, second] : ws) {
1171	const auto & layer = first;
1172	const auto & stats = second;
1173
1174	if (stats.total_elements == `0`) {
1175	continue;
1176	}
1177
1178	if (layer >= `0`) {
1179	const float bias = stats.weighted_bias / stats.total_elements;
1180	const float zd = stats.weighted_zd / stats.total_elements;
1181	const float cossim = stats.weighted_cossim / stats.total_elements;
1182
1183	LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, `100.0f` * zd, cossim);
1184	}
1185	}
1186	LOG_INF("\n");
1187
1188	return true;
1189	}
1190
1191	int main(int argc, char ** argv) {
1192	common_params params;
1193
1194	params.out_file = "imatrix.gguf";
1195
1196	params.n_ctx = `512`;
1197	params.escape = false;
1198
1199	if (!common_params_parse(argc, argv, params, ex: LLAMA_EXAMPLE_IMATRIX, print_usage)) {
1200	return `1`;
1201	}
1202
1203	if (params.show_statistics) {
1204	if (!show_statistics(params)) {
1205	return `1`;
1206	}
1207	return `0`;
1208	}
1209
1210	common_init();
1211
1212	const int32_t n_ctx = params.n_ctx;
1213
1214	if (n_ctx <= `0`) {
1215	LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__);
1216	return `1`;
1217	}
1218
1219	{
1220	const int32_t n_seq = std::max(a: `1`, b: params.n_batch / n_ctx);
1221	const int32_t n_kv = n_seq * n_ctx;
1222
1223	params.n_parallel = n_seq;
1224	params.n_ctx = n_kv;
1225
1226	params.n_batch = std::min(a: params.n_batch, b: n_kv);
1227	}
1228
1229	g_collector.set_params(params);
1230
1231	for (const auto & in_file : params.in_files) {
1232	LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
1233	if (!g_collector.load_imatrix(file_name: in_file.c_str())) {
1234	LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
1235	return `1`;
1236	}
1237	}
1238
1239	if (params.prompt.empty()) {
1240	LOG_INF("No prompt provided; combining precomputed matrices only.\n");
1241
1242	if (params.in_files.empty()) {
1243	LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
1244	return `1`;
1245	}
1246
1247	if (params.in_files.size() == `1`) {
1248	LOG_INF("%s : saving imatrix to '%s'\n", __func__, params.out_file.c_str());
1249	} else if (params.in_files.size() > `1`) {
1250	LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
1251	}
1252
1253	g_collector.save_imatrix();
1254
1255	return `0`;
1256	}
1257
1258	llama_backend_init();
1259	llama_numa_init(numa: params.numa);
1260
1261	// pass the callback to the backend scheduler
1262	// it will be executed for each node during the graph computation
1263	params.cb_eval = ik_collect_imatrix;
1264	params.cb_eval_user_data = NULL;
1265	params.warmup = false;
1266
1267	// init
1268	common_init_result llama_init = common_init_from_params(params);
1269
1270	llama_model * model = llama_init.model.get();
1271	llama_context * ctx = llama_init.context.get();
1272
1273	if (model == nullptr \|\| ctx == nullptr) {
1274	LOG_ERR("%s : failed to init\n", __func__);
1275	return `1`;
1276	}
1277
1278	const int n_ctx_train = llama_model_n_ctx_train(model);
1279	if (params.n_ctx > n_ctx_train) {
1280	LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
1281	__func__, n_ctx_train, params.n_ctx);
1282	}
1283
1284	// print system information
1285	{
1286	LOG_INF("\n");
1287	LOG_INF("%s\n", common_params_get_system_info(params).c_str());
1288	}
1289
1290	if (!compute_imatrix(ctx, params, n_ctx)) {
1291	return `1`;
1292	}
1293
1294	g_collector.save_imatrix();
1295
1296	LOG("\n");
1297	llama_perf_context_print(ctx);
1298
1299	llama_backend_free();
1300
1301	return `0`;
1302	}
1303

// Browse the source code of llama.cpp/tools/imatrix/imatrix.cpp