#include "common.h"
#include "llama.h"
#include "gguf.h"

#include <cstdio>
#include <cstring>
#include <vector>
#include <string>
#include <unordered_map>
#include <map>
#include <fstream>
#include <cmath>
#include <cctype>
#include <algorithm>

struct quant_option {
    std::string name;
    llama_ftype ftype;
    std::string desc;
};

static const std::vector<quant_option> QUANT_OPTIONS = {
    { "Q4_0",      LLAMA_FTYPE_MOSTLY_Q4_0,      " 4.34G, +0.4685 ppl @ Llama-3-8B", },
    { "Q4_1",      LLAMA_FTYPE_MOSTLY_Q4_1,      " 4.78G, +0.4511 ppl @ Llama-3-8B", },
    { "MXFP4_MOE", LLAMA_FTYPE_MOSTLY_MXFP4_MOE, " MXFP4 MoE", },
    { "Q5_0",      LLAMA_FTYPE_MOSTLY_Q5_0,      " 5.21G, +0.1316 ppl @ Llama-3-8B", },
    { "Q5_1",      LLAMA_FTYPE_MOSTLY_Q5_1,      " 5.65G, +0.1062 ppl @ Llama-3-8B", },
    { "IQ2_XXS",   LLAMA_FTYPE_MOSTLY_IQ2_XXS,   " 2.06 bpw quantization", },
    { "IQ2_XS",    LLAMA_FTYPE_MOSTLY_IQ2_XS,    " 2.31 bpw quantization", },
    { "IQ2_S",     LLAMA_FTYPE_MOSTLY_IQ2_S,     " 2.5 bpw quantization", },
    { "IQ2_M",     LLAMA_FTYPE_MOSTLY_IQ2_M,     " 2.7 bpw quantization", },
    { "IQ1_S",     LLAMA_FTYPE_MOSTLY_IQ1_S,     " 1.56 bpw quantization", },
    { "IQ1_M",     LLAMA_FTYPE_MOSTLY_IQ1_M,     " 1.75 bpw quantization", },
    { "TQ1_0",     LLAMA_FTYPE_MOSTLY_TQ1_0,     " 1.69 bpw ternarization", },
    { "TQ2_0",     LLAMA_FTYPE_MOSTLY_TQ2_0,     " 2.06 bpw ternarization", },
    { "Q2_K",      LLAMA_FTYPE_MOSTLY_Q2_K,      " 2.96G, +3.5199 ppl @ Llama-3-8B", },
    { "Q2_K_S",    LLAMA_FTYPE_MOSTLY_Q2_K_S,    " 2.96G, +3.1836 ppl @ Llama-3-8B", },
    { "IQ3_XXS",   LLAMA_FTYPE_MOSTLY_IQ3_XXS,   " 3.06 bpw quantization", },
    { "IQ3_S",     LLAMA_FTYPE_MOSTLY_IQ3_S,     " 3.44 bpw quantization", },
    { "IQ3_M",     LLAMA_FTYPE_MOSTLY_IQ3_M,     " 3.66 bpw quantization mix", },
    { "Q3_K",      LLAMA_FTYPE_MOSTLY_Q3_K_M,    "alias for Q3_K_M", },
    { "IQ3_XS",    LLAMA_FTYPE_MOSTLY_IQ3_XS,    " 3.3 bpw quantization", },
    { "Q3_K_S",    LLAMA_FTYPE_MOSTLY_Q3_K_S,    " 3.41G, +1.6321 ppl @ Llama-3-8B", },
    { "Q3_K_M",    LLAMA_FTYPE_MOSTLY_Q3_K_M,    " 3.74G, +0.6569 ppl @ Llama-3-8B", },
    { "Q3_K_L",    LLAMA_FTYPE_MOSTLY_Q3_K_L,    " 4.03G, +0.5562 ppl @ Llama-3-8B", },
    { "IQ4_NL",    LLAMA_FTYPE_MOSTLY_IQ4_NL,    " 4.50 bpw non-linear quantization", },
    { "IQ4_XS",    LLAMA_FTYPE_MOSTLY_IQ4_XS,    " 4.25 bpw non-linear quantization", },
    { "Q4_K",      LLAMA_FTYPE_MOSTLY_Q4_K_M,    "alias for Q4_K_M", },
    { "Q4_K_S",    LLAMA_FTYPE_MOSTLY_Q4_K_S,    " 4.37G, +0.2689 ppl @ Llama-3-8B", },
    { "Q4_K_M",    LLAMA_FTYPE_MOSTLY_Q4_K_M,    " 4.58G, +0.1754 ppl @ Llama-3-8B", },
    { "Q5_K",      LLAMA_FTYPE_MOSTLY_Q5_K_M,    "alias for Q5_K_M", },
    { "Q5_K_S",    LLAMA_FTYPE_MOSTLY_Q5_K_S,    " 5.21G, +0.1049 ppl @ Llama-3-8B", },
    { "Q5_K_M",    LLAMA_FTYPE_MOSTLY_Q5_K_M,    " 5.33G, +0.0569 ppl @ Llama-3-8B", },
    { "Q6_K",      LLAMA_FTYPE_MOSTLY_Q6_K,      " 6.14G, +0.0217 ppl @ Llama-3-8B", },
    { "Q8_0",      LLAMA_FTYPE_MOSTLY_Q8_0,      " 7.96G, +0.0026 ppl @ Llama-3-8B", },
    { "F16",       LLAMA_FTYPE_MOSTLY_F16,       "14.00G, +0.0020 ppl @ Mistral-7B", },
    { "BF16",      LLAMA_FTYPE_MOSTLY_BF16,      "14.00G, -0.0050 ppl @ Mistral-7B", },
    { "F32",       LLAMA_FTYPE_ALL_F32,          "26.00G @ 7B", },
    // Note: COPY shares ftype 0 with F32 and must come after it, so a numeric ftype of 0 resolves to F32 rather than COPY.
    { "COPY",      LLAMA_FTYPE_ALL_F32,          "only copy tensors, no quantizing", },
};

// Quantization types. Changes to this struct must be replicated in llama-quant.cpp
struct tensor_quantization {
    std::string name;
    ggml_type quant = GGML_TYPE_COUNT;
};

static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE      = "quantize.imatrix.file";
static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET   = "quantize.imatrix.dataset";
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS  = "quantize.imatrix.chunks_count";

// TODO: share with imatrix.cpp
static const char * const LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";

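// Case-insensitive string equality, so type names match whether spelled "q4_k_m" or "Q4_K_M".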
static bool striequals(const char * a, const char * b) {
    while (*a && *b) {
        if (std::tolower(*a) != std::tolower(*b)) {
            return false;
        }
        a++; b++;
    }
    return *a == *b;
}

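// Maps a user-supplied type string to a llama_ftype, accepting either a name from
// QUANT_OPTIONS (case-insensitive) or its numeric enum value.
// On success, writes the canonical name to ftype_str_out.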
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
    std::string ftype_str;

    for (auto ch : ftype_str_in) {
        ftype_str.push_back(std::toupper(ch));
    }
    for (const auto & it : QUANT_OPTIONS) {
        if (striequals(it.name.c_str(), ftype_str.c_str())) {
            ftype = it.ftype;
            ftype_str_out = it.name;
            return true;
        }
    }
    try {
        int ftype_int = std::stoi(ftype_str);
        for (const auto & it : QUANT_OPTIONS) {
            if (it.ftype == ftype_int) {
                ftype = it.ftype;
                ftype_str_out = it.name;
                return true;
            }
        }
    }
    catch (...) {
        // stoi failed
    }
    return false;
}

[[noreturn]]
static void usage(const char * executable) {
    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
    printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16-bit or 32-bit\n");
    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --exclude-weights tensor_name: do not use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
    printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
    printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
    printf("  --prune-layers L0,L1,L2...: comma-separated list of layer numbers to prune from the model\n");
    printf("      Advanced option to remove all tensors from the given layers\n");
    printf("  --keep-split: will generate quantized model in the same shards as input\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
    printf("\nAllowed quantization types:\n");
    for (const auto & it : QUANT_OPTIONS) {
        if (it.name != "COPY") {
            printf("  %2d  or  ", it.ftype);
        } else {
            printf("          ");
        }
        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
    }
    exit(1);
}

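// Loads the old (pre-GGUF) binary imatrix format:
//   int32 n_entries, then per entry: int32 name_len, name bytes, int32 ncall,
//   int32 nval, and nval floats (accumulated sums, divided by ncall on load).
// Newer legacy files append int32 last_call, int32 dataset_len and the dataset name.
// Returns the number of chunks/calls the matrix was computed on.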
static int load_legacy_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
    if (!in) {
        printf("%s: failed to open %s\n", __func__, imatrix_file.c_str());
        exit(1);
    }
    int n_entries;
    in.read((char *)&n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
        exit(1);
    }
    for (int i = 0; i < n_entries; ++i) {
        int len; in.read((char *)&len, sizeof(len));
        std::vector<char> name_as_vec(len+1);
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
            exit(1);
        }
        name_as_vec[len] = 0;
        std::string name{name_as_vec.data()};
        auto & e = imatrix_data[name];
        int ncall;
        in.read((char *)&ncall, sizeof(ncall));
        int nval;
        in.read((char *)&nval, sizeof(nval));
        if (in.fail() || nval < 1) {
            printf("%s: failed reading number of values for entry %d\n", __func__, i+1);
            imatrix_data = {};
            exit(1);
        }
        e.resize(nval);
        in.read((char *)e.data(), nval*sizeof(float));
        if (in.fail()) {
            printf("%s: failed reading data for entry %d\n", __func__, i+1);
            imatrix_data = {};
            exit(1);
        }
        if (ncall > 0) {
            // stored values are accumulated sums; convert to per-call averages
            for (auto & v : e) {
                v /= ncall;
            }
        }

        if (getenv("LLAMA_TRACE")) {
            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
        }
    }

    // the latest legacy imatrix version stores the dataset filename at the end of the file
    int m_last_call = 0;
    if (in.peek() != EOF) {
        in.read((char *)&m_last_call, sizeof(m_last_call));
        int dataset_len;
        in.read((char *)&dataset_len, sizeof(dataset_len));
        std::vector<char> dataset_as_vec(dataset_len);
        in.read(dataset_as_vec.data(), dataset_len);
        imatrix_datasets.resize(1);
        imatrix_datasets[0].assign(dataset_as_vec.begin(), dataset_as_vec.end());
        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_datasets[0].c_str());
    }
    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
    return m_last_call;
}

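// Loads the GGUF-based imatrix format. Each tracked tensor is stored as a pair of
// tensors named "<name>.in_sum2" (sums of squared activations) and "<name>.counts"
// (activations seen per row); their element-wise ratio gives the mean squared
// activation that guides quantization. Falls back to the legacy loader when the
// file is not valid GGUF.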
static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
    struct ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false, // the data is needed
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(imatrix_file.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
        fprintf(stderr, "%s: imatrix file '%s' is using old format\n", __func__, imatrix_file.c_str());
        return load_legacy_imatrix(imatrix_file, imatrix_datasets, imatrix_data);
    }
    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
    if (n_entries < 1) {
        fprintf(stderr, "%s: no data in file %s\n", __func__, imatrix_file.c_str());
        gguf_free(ctx_gguf);
        ggml_free(ctx);
        exit(1);
    }

    const int dataset_idx     = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
    const int chunk_count_idx = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT);
    const int chunk_size_idx  = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE);
    if (dataset_idx < 0 || chunk_count_idx < 0 || chunk_size_idx < 0) {
        fprintf(stderr, "%s: missing imatrix metadata in file %s\n", __func__, imatrix_file.c_str());
        gguf_free(ctx_gguf);
        ggml_free(ctx);
        exit(1);
    }

    const uint32_t chunk_size = gguf_get_val_u32(ctx_gguf, chunk_size_idx);

    const std::string sums_suffix{ ".in_sum2" };
    const std::string counts_suffix{ ".counts" };

    // Using an ordered map to get a deterministic iteration order.
    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;

    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string name = cur->name;

        if (name.empty()) { continue; }

        if (string_remove_suffix(name, sums_suffix)) {
            // in_sum2
            sums_counts_for[std::move(name)].first = cur;
        } else if (string_remove_suffix(name, counts_suffix)) {
            // counts
            sums_counts_for[std::move(name)].second = cur;
        } else {
            // ignore other tensors
        }
    }

    for (const auto & sc : sums_counts_for) {
        const std::string        & name   = sc.first;
        const struct ggml_tensor * sums   = sc.second.first;
        const struct ggml_tensor * counts = sc.second.second;

        if (!sums || !counts) {
            fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            exit(1);
        }

        const int64_t ne0 = sums->ne[0];
        const int64_t ne1 = sums->ne[1];

        auto & e = imatrix_data[name];
        e.resize(ggml_nelements(sums));
        float max_count = 0.0f;
        for (int64_t j = 0; j < ne1; ++j) {
            const float count = ((const float *) counts->data)[j];
            if (count > 0.0f) {
                for (int64_t i = 0; i < ne0; ++i) {
                    e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
                }
            } else {
                // Partial imatrix data, this tensor never got any input during calibration
                for (int64_t i = 0; i < ne0; ++i) {
                    e[j*ne0 + i] = 1;
                }
            }
            if (count > max_count) {
                max_count = count;
            }
        }
        if (getenv("LLAMA_TRACE")) {
            printf("%s: loaded data (size = %6d, n_tokens = %6d, n_chunks = %6d) for '%s'\n", __func__, int(e.size()), int(max_count), int(max_count / chunk_size), name.c_str());
        }
    }

    int m_last_chunk = gguf_get_val_u32(ctx_gguf, chunk_count_idx);

    int64_t n_datasets = gguf_get_arr_n(ctx_gguf, dataset_idx);
    imatrix_datasets.reserve(n_datasets);
    for (int64_t i = 0; i < n_datasets; ++i) {
        imatrix_datasets.push_back(gguf_get_arr_str(ctx_gguf, dataset_idx, i));
    }
    printf("%s: imatrix datasets=['%s'", __func__, imatrix_datasets[0].c_str());
    for (size_t i = 1; i < imatrix_datasets.size(); ++i) {
        printf(", '%s'", imatrix_datasets[i].c_str());
    }
    printf("]\n");

    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_chunk);

    gguf_free(ctx_gguf);
    ggml_free(ctx);

    return m_last_chunk;
}

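// Loads the imatrix (if any) and applies the --include-weights / --exclude-weights
// filters by substring match on the tensor names. Returns the number of chunks the
// matrix was computed on, or -1 if no imatrix file was given.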
static int prepare_imatrix(const std::string & imatrix_file,
        std::vector<std::string> & imatrix_datasets,
        const std::vector<std::string> & included_weights,
        const std::vector<std::string> & excluded_weights,
        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
    int m_last_call = -1;
    if (!imatrix_file.empty()) {
        m_last_call = load_imatrix(imatrix_file, imatrix_datasets, imatrix_data);
    }
    if (imatrix_data.empty()) {
        return m_last_call;
    }
    if (!excluded_weights.empty()) {
        for (const auto & name : excluded_weights) {
            for (auto it = imatrix_data.begin(); it != imatrix_data.end();) {
                auto pos = it->first.find(name);
                if (pos != std::string::npos) {
                    it = imatrix_data.erase(it);
                } else {
                    ++it;
                }
            }
        }
    }
    if (!included_weights.empty()) {
        std::unordered_map<std::string, std::vector<float>> tmp;
        for (const auto & name : included_weights) {
            for (auto & e : imatrix_data) {
                auto pos = e.first.find(name);
                if (pos != std::string::npos) {
                    tmp.emplace(std::move(e));
                }
            }
        }
        imatrix_data = std::move(tmp);
    }
    if (!imatrix_data.empty()) {
        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
    }
    return m_last_call;
}

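// Resolves a ggml type name (e.g. "q8_0", case-insensitive) to its ggml_type;
// returns GGML_TYPE_COUNT when the name is unknown.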
static ggml_type parse_ggml_type(const char * arg) {
    for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
        auto type = (ggml_type)i;
        const auto * name = ggml_type_name(type);
        if (name && striequals(name, arg)) {
            return type;
        }
    }
    fprintf(stderr, "\n%s: invalid ggml_type '%s'\n\n", __func__, arg);
    return GGML_TYPE_COUNT;
}

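// Parses one --tensor-type argument of the form TENSOR=TYPE (e.g. "attn_q=q8_0")
// and appends it to tensor_type; the tensor name is normalized to lowercase.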
static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
    const char * sep = strchr(data, '=');
    if (sep == nullptr) {
        printf("\n%s: malformed tensor type '%s'\n\n", __func__, data);
        return false;
    }

    const size_t tn_len = sep - data;
    if (tn_len == 0) {
        printf("\n%s: missing tensor name\n\n", __func__);
        return false;
    }
    if (const size_t qt_len = strlen(sep); qt_len == 1) {
        printf("\n%s: missing quantization type\n\n", __func__);
        return false;
    }

    std::string tn(data, tn_len);
    std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
    sep++;
    tensor_quantization tqz;
    tqz.name = tn;
    tqz.quant = parse_ggml_type(sep);
    // validate before storing: the old order read tqz.quant after std::move(tqz)
    // and left an invalid entry in tensor_type on failure
    if (tqz.quant == GGML_TYPE_COUNT) {
        printf("\n%s: invalid quantization type '%s'\n\n", __func__, sep);
        return false;
    }
    tensor_type.emplace_back(std::move(tqz));

    return true;
}

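// Parses the --prune-layers argument: a comma-separated list of non-negative
// layer ids, which is sorted and de-duplicated before use.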
static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers) {
    if (!data) {
        printf("\n%s: no layer pruning ids provided\n\n", __func__);
        return false;
    }

    const auto block_ids = string_split<std::string>(data, ',');
    for (const auto & block_id : block_ids) {
        int id;
        try {
            id = std::stoi(block_id);
        } catch (...) {
            id = -1;
        }
        if (id < 0) {
            printf("\n%s: invalid layer id '%s'\n\n", __func__, block_id.c_str());
            return false;
        }
        prune_layers.emplace_back(id);
    }

    std::sort(prune_layers.begin(), prune_layers.end());
    prune_layers.erase(std::unique(prune_layers.begin(), prune_layers.end()), prune_layers.end());
    return true;
}

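// Illustrative invocations (file names are placeholders):
//   llama-quantize ggml-model-f32.gguf ggml-model-Q4_K_M.gguf Q4_K_M 8
//   llama-quantize --imatrix imatrix.gguf ggml-model-f32.gguf IQ2_XS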
int main(int argc, char ** argv) {
    if (argc < 3) {
        usage(argv[0]);
    }

    llama_model_quantize_params params = llama_model_quantize_default_params();

    int arg_idx = 1;
    std::string imatrix_file;
    std::vector<std::string> included_weights, excluded_weights;
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<tensor_quantization> tensor_types;
    std::vector<int> prune_layers;

    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
            params.quantize_output_tensor = false;
        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
            if (arg_idx < argc-1) {
                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
                if (params.output_tensor_type == GGML_TYPE_COUNT) {
                    usage(argv[0]);
                }
            } else {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
            if (arg_idx < argc-1) {
                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
                if (params.token_embedding_type == GGML_TYPE_COUNT) {
                    usage(argv[0]);
                }
            } else {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--tensor-type") == 0) {
            if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
            if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
            if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
            params.allow_requantize = true;
        } else if (strcmp(argv[arg_idx], "--pure") == 0) {
            params.pure = true;
        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
            if (arg_idx < argc-1) {
                imatrix_file = argv[++arg_idx];
            } else {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
            if (arg_idx < argc-1) {
                included_weights.emplace_back(argv[++arg_idx]);
            } else {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
            if (arg_idx < argc-1) {
                excluded_weights.emplace_back(argv[++arg_idx]);
            } else {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
            params.keep_split = true;
        } else {
            usage(argv[0]);
        }
    }

    if (argc - arg_idx < 2) {
        printf("%s: bad arguments\n", argv[0]);
        usage(argv[0]);
    }
    if (!included_weights.empty() && !excluded_weights.empty()) {
        usage(argv[0]);
    }

    std::vector<std::string> imatrix_datasets;
    std::unordered_map<std::string, std::vector<float>> imatrix_data;
    int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
    if (!imatrix_data.empty()) {
        params.imatrix = &imatrix_data;
        {
            llama_model_kv_override kvo;
            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
            kvo.val_str[127] = '\0';
            kv_overrides.emplace_back(std::move(kvo));
        }
        if (!imatrix_datasets.empty()) {
            llama_model_kv_override kvo;
            // TODO: list multiple datasets when there is more than one
            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
            strncpy(kvo.val_str, imatrix_datasets[0].c_str(), 127);
            kvo.val_str[127] = '\0';
            kv_overrides.emplace_back(std::move(kvo));
        }

        {
            llama_model_kv_override kvo;
            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
            kvo.val_i64 = imatrix_data.size();
            kv_overrides.emplace_back(std::move(kvo));
        }

        if (m_last_call > 0) {
            llama_model_kv_override kvo;
            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
            kvo.val_i64 = m_last_call;
            kv_overrides.emplace_back(std::move(kvo));
        }
    }
    if (!kv_overrides.empty()) {
        // the override list is terminated by an entry with an empty key
        kv_overrides.emplace_back();
        kv_overrides.back().key[0] = 0;
        params.kv_overrides = &kv_overrides;
    }
    if (!tensor_types.empty()) {
        params.tensor_types = &tensor_types;
    }
    if (!prune_layers.empty()) {
        params.prune_layers = &prune_layers;
    }

    llama_backend_init();

    // parse the remaining positional arguments: input model, optional output name, ftype
    const std::string fname_inp = argv[arg_idx];
    arg_idx++;
    std::string fname_out;

    std::string ftype_str;
    std::string suffix = ".gguf";
    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        std::string fpath;
        const size_t pos = fname_inp.find_last_of("/\\");
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1);
        }

        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
        fname_out = fpath + "ggml-model-" + ftype_str;
        if (!params.keep_split) {
            fname_out += suffix;
        }
        arg_idx++;
        if (ftype_str == "COPY") {
            params.only_copy = true;
        }
    } else {
        fname_out = argv[arg_idx];
        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
        }
        arg_idx++;

        if (argc <= arg_idx) {
            fprintf(stderr, "%s: missing ftype\n", __func__);
            return 1;
        }
        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[arg_idx]);
            return 1;
        }
        if (ftype_str == "COPY") {
            params.only_copy = true;
        }
        arg_idx++;
    }

    // parse nthreads
    if (argc > arg_idx) {
        try {
            params.nthread = std::stoi(argv[arg_idx]);
        }
        catch (const std::exception & e) {
            fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
            return 1;
        }
    }

    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  ||
         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
        fprintf(stderr, "\n==========================================================================================================\n");
        fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
        fprintf(stderr, "==========================================================================================================\n\n\n");
        return 1;
    }

    print_build_info();

    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
    if (params.nthread > 0) {
        fprintf(stderr, " using %d threads", params.nthread);
    }
    fprintf(stderr, "\n");

    const int64_t t_main_start_us = llama_time_us();

    int64_t t_quantize_us = 0;

    // load and quantize the model
    {
        const int64_t t_start_us = llama_time_us();

        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), &params)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }

        t_quantize_us = llama_time_us() - t_start_us;
    }

    // report timing
    {
        const int64_t t_main_end_us = llama_time_us();

        printf("\n");
        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
        printf("%s: total time    = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
    }

    llama_backend_free();

    return 0;
}