#include "llama-quant.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <regex>
#include <thread>
#include <unordered_map>

// Quantization types. Changes to this struct must be replicated in quantize.cpp
struct tensor_quantization {
    std::string name;
    ggml_type quant = GGML_TYPE_COUNT;
};

static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}

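// Maps "blk.<N>." tensor names onto a contiguous block numbering after layer pruning.
// Pruned blocks map to an empty string so callers can skip them; kept blocks are renumbered
// in order. For example, pruning block 2 of a 5-block model yields 0->0, 1->1, 2->"", 3->2, 4->3.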
static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
    if (prune.empty()) {
        return orig_name;
    }

    static const std::regex pattern(R"(blk\.(\d+)\.)");
    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
        const int blk = std::stoi(match[1]);
        std::string new_name = orig_name;

        if (mapped.count(blk)) {
            // Already mapped, do nothing
        } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
            mapped[blk] = "";
        } else if (blk < prune.front()) {
            mapped[blk] = std::to_string(blk);
            next_id = blk + 1;
        } else {
            mapped[blk] = std::to_string(next_id);
            ++next_id;
        }

        return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
    }

    return orig_name;
}

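// Inverse of remap_layer for imatrix lookups: given a remapped tensor name, recover the
// original block index so that entries in the imatrix (keyed by the original names) can
// still be found. Aborts if the block has no mapping, which would indicate a pruning bug.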
static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
    if (mapped.empty()) {
        return orig_name;
    }

    static const std::regex pattern(R"(blk\.(\d+)\.)");
    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
        const std::string blk(match[1]);
        std::string new_name = orig_name;

        for (const auto & p : mapped) {
            if (p.second == blk) {
                LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
                return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
            }
        }
        GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
    }

    return orig_name;
}

struct quantize_state_impl {
    const llama_model & model;
    const llama_model_quantize_params * params;

    int n_attention_wv = 0;
    int n_ffn_down = 0;
    int n_ffn_gate = 0;
    int n_ffn_up = 0;
    int i_attention_wv = 0;
    int i_ffn_down = 0;
    int i_ffn_gate = 0;
    int i_ffn_up = 0;

    int n_k_quantized = 0;
    int n_fallback = 0;

    bool has_imatrix = false;

    // used to figure out if a model shares tok_embd with the output weight
    bool has_output = false;

    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
    {}
};

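// Dequantizes (or converts from F16/BF16) a tensor into a temporary F32 buffer.
// With nthread >= 2 the work is split into whole blocks per thread; the last thread
// also picks up any remainder blocks.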
static void llama_tensor_dequantize_impl(
    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
) {
    if (output.size() < nelements) {
        output.resize(nelements);
    }
    float * f32_output = (float *) output.data();

    const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
    if (ggml_is_quantized(tensor->type)) {
        if (qtype->to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
    } else if (tensor->type != GGML_TYPE_F16 &&
               tensor->type != GGML_TYPE_BF16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }

    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
        } else if (tensor->type == GGML_TYPE_BF16) {
            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype->to_float(tensor->data, f32_output, nelements);
        } else {
            GGML_ABORT("fatal error"); // unreachable
        }
        return;
    }

    size_t block_size;
    if (tensor->type == GGML_TYPE_F16 ||
        tensor->type == GGML_TYPE_BF16) {
        block_size = 1;
    } else {
        block_size = (size_t)ggml_blck_size(tensor->type);
    }

    size_t block_size_bytes = ggml_type_size(tensor->type);

    GGML_ASSERT(nelements % block_size == 0);
    size_t nblocks = nelements / block_size;
    size_t blocks_per_thread = nblocks / nthread;
    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

    size_t in_buff_offs = 0;
    size_t out_buff_offs = 0;

    for (int tnum = 0; tnum < nthread; tnum++) {
        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
            if (typ == GGML_TYPE_F16) {
                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
            } else if (typ == GGML_TYPE_BF16) {
                ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
            } else {
                qtype->to_float(inbuf, outbuf, nels);
            }
        };
        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
        in_buff_offs += thr_block_bytes;
        out_buff_offs += thr_elems;
    }
    for (auto & w : workers) { w.join(); }
    workers.clear();
}

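// Chooses the concrete ggml_type for one tensor given the requested ftype, applying
// per-tensor heuristics (attention V/K, ffn_down, output, token embeddings, etc.) and
// falling back to a compatible type when the row size is not divisible by the block size.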
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
    const std::string name = ggml_get_name(tensor);

    // TODO: avoid hardcoded tensor names - use the TN_* constants
    const llm_arch arch = qs.model.arch;
    const auto tn = LLM_TN(arch);

    auto use_more_bits = [](int i_layer, int n_layers) -> bool {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    };
    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
        if (n_expert > 1) {
            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
            // for getting the current layer as I initially thought, and we need to resort to parsing the
            // tensor name.
            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
            }
            if (i_layer < 0 || i_layer >= n_layer) {
                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
            }
        }
        return std::make_pair(i_layer, n_layer);
    };

    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
    // with the quantization of the output tensor
    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
            new_type = qs.params->output_tensor_type;
        } else {
            const int64_t nx = tensor->ne[0];
            const int64_t qk_k = ggml_blck_size(new_type);

            if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
                new_type = GGML_TYPE_Q8_0;
            }
            else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                new_type = GGML_TYPE_Q8_0;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q5_K;
            }
            else if (new_type != GGML_TYPE_Q8_0) {
                new_type = GGML_TYPE_Q6_K;
            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
        // MoE tensors -> MXFP4
        // other tensors -> Q8_0
        if (tensor->ne[2] > 1) {
            new_type = GGML_TYPE_MXFP4;
        } else {
            new_type = GGML_TYPE_Q8_0;
        }
    } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
            new_type = qs.params->token_embedding_type;
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q2_K;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
                new_type = GGML_TYPE_Q4_K;
            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            ++qs.i_attention_wv;
        }
        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (name.find("ffn_down") != std::string::npos) {
            if (qs.i_ffn_down < qs.n_ffn_down/8) {
                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            }
            ++qs.i_ffn_down;
        }
        else if (name.find("attn_output.weight") != std::string::npos) {
            if (qs.model.hparams.n_expert == 8) {
                new_type = GGML_TYPE_Q5_K;
            } else {
                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
            }
        }
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
        if (qs.model.type == LLM_TYPE_70B) {
            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
            // nearly negligible increase in model size by quantizing this tensor with more bits:
            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
        }
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        ++qs.i_attention_wv;
    } else if (name.find("attn_k.weight") != std::string::npos) {
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("attn_q.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("ffn_down") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
            }
        }
        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
                && qs.has_imatrix && i_layer < n_layer/8) {
            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
        }
        ++qs.i_ffn_down;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
            if (qs.model.hparams.n_expert == 8) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
                    new_type = GGML_TYPE_Q5_K;
                }
            } else {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
            }
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
        }
    }
    else if (name.find("attn_qkv.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
    else if (name.find("ffn_gate") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_gate;
    }
    else if (name.find("ffn_up") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_up;
    }

    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // This can be used to reduce the size of the Q5_K_S model.
    // The associated PPL increase is fully in line with the size reduction
    //else {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
    //}
    bool convert_incompatible_tensor = false;
    {
        const int64_t nx = tensor->ne[0];
        const int64_t ny = tensor->ne[1];
        const int64_t qk_k = ggml_blck_size(new_type);

        if (nx % qk_k != 0) {
            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
            convert_incompatible_tensor = true;
        } else {
            ++qs.n_k_quantized;
        }
    }

    if (convert_incompatible_tensor) {
        switch (new_type) {
            case GGML_TYPE_TQ1_0:
            case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ2_S:
            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_IQ3_S:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_IQ1_M:
            case GGML_TYPE_Q2_K:
            case GGML_TYPE_Q3_K:
            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
        }
        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
            new_type = GGML_TYPE_F16;
        }
        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
        ++qs.n_fallback;
    }

    return new_type;
}

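// Quantizes nrows x n_per_row F32 values into new_data. With nthread >= 2, a shared
// row counter protected by a mutex hands out chunks of rows to the workers; each chunk
// is validated with ggml_validate_row_data before its size is added to the total.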
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
    if (nthread < 2) {
        // single-thread
        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
            throw std::runtime_error("quantized data validation failed");
        }
        return new_size;
    }

    std::mutex mutex;
    int64_t counter = 0;
    size_t new_size = 0;
    bool valid = true;
    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
            nrows, n_per_row, imatrix]() {
        const int64_t nrows_per_chunk = chunk_size / n_per_row;
        size_t local_size = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int64_t first_row = counter; counter += nrows_per_chunk;
            if (first_row >= nrows) {
                if (local_size > 0) {
                    new_size += local_size;
                }
                break;
            }
            lock.unlock();
            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
            local_size += this_size;

            // validate the quantized data
            const size_t row_size = ggml_row_size(new_type, n_per_row);
            void * this_data = (char *) new_data + first_row * row_size;
            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
                std::unique_lock<std::mutex> lock(mutex);
                valid = false;
                break;
            }
        }
    };
    for (int it = 0; it < nthread - 1; ++it) {
        workers.emplace_back(compute);
    }
    compute();
    for (auto & w : workers) { w.join(); }
    workers.clear();
    if (!valid) {
        throw std::runtime_error("quantized data validation failed");
    }
    return new_size;
}

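// Top-level quantization driver: maps the requested ftype to a default ggml_type, loads
// the input model's metadata, copies and optionally overrides the GGUF KV pairs, applies
// layer pruning if requested, then streams every tensor through dequantization and
// re-quantization while writing the output file(s).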
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
    ggml_type default_type;
    llama_ftype ftype = params->ftype;

    switch (params->ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;

        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;

        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
        case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  default_type = GGML_TYPE_Q3_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  default_type = GGML_TYPE_Q4_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
        case LLAMA_FTYPE_MOSTLY_TQ1_0:   default_type = GGML_TYPE_TQ1_0;   break;
        case LLAMA_FTYPE_MOSTLY_TQ2_0:   default_type = GGML_TYPE_TQ2_0;   break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ1_M:   default_type = GGML_TYPE_IQ1_M;   break;
        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }

    int nthread = params->nthread;

    if (nthread <= 0) {
        nthread = std::thread::hardware_concurrency();
    }

    // mmap consistently increases speed on Linux, and also increases speed on Windows with
    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
    constexpr bool use_mmap = true;
#else
    constexpr bool use_mmap = false;
#endif

    llama_model_kv_override * kv_overrides = nullptr;
    if (params->kv_overrides) {
        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
        kv_overrides = v->data();
    }

    std::vector<std::string> splits = {};
    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());

    model.load_arch   (ml);
    model.load_hparams(ml);
    model.load_stats  (ml);

    quantize_state_impl qs(model, params);

    if (params->only_copy) {
        ftype = ml.ftype;
    }
    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
    if (params->imatrix) {
        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
        if (imatrix_data) {
            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
            qs.has_imatrix = true;
            // check imatrix for nans or infs
            for (const auto & kv : *imatrix_data) {
                for (float f : kv.second) {
                    if (!std::isfinite(f)) {
                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
                    }
                }
            }
        }
    }

    const size_t align = GGUF_DEFAULT_ALIGNMENT;
    gguf_context_ptr ctx_out { gguf_init_empty() };

    std::vector<int> prune_list = {};
    if (params->prune_layers) {
        prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
    }

    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out.get(), ml.meta.get());
    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV

    // Remove split metadata
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());

    if (params->kv_overrides) {
        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
        for (const auto & o : overrides) {
            if (o.key[0] == 0) break;
            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
                gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
                // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
                gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
                gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
                gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
            } else {
                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
            }
        }
    }

    std::map<int, std::string> mapped;
    int blk_id = 0;
    int pruned_attention_w = 0;

    // make a list of weights
    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
    tensors.reserve(ml.weights_map.size());
    for (const auto & it : ml.weights_map) {
        const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
        if (remapped_name.empty()) {
            if (it.first.find("attn_v.weight") != std::string::npos ||
                it.first.find("attn_qkv.weight") != std::string::npos ||
                it.first.find("attn_kv_b.weight") != std::string::npos) {
                pruned_attention_w++;
            }
            LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
            continue;
        } else if (remapped_name != it.first) {
            ggml_set_name(it.second.tensor, remapped_name.c_str());
            LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
        }
        tensors.push_back(&it.second);
    }
    if (!prune_list.empty()) {
        gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
    }

    // keep_split requires that the weights are sorted by split index
    if (params->keep_split) {
        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
            if (a->idx == b->idx) {
                return a->offs < b->offs;
            }
            return a->idx < b->idx;
        });
    }

    bool is_clip_model = false;
    for (const auto * it : tensors) {
        const struct ggml_tensor * tensor = it->tensor;

        const std::string name = ggml_get_name(tensor);

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight") != std::string::npos ||
            name.find("attn_qkv.weight") != std::string::npos ||
            name.find("attn_kv_b.weight") != std::string::npos) {
            ++qs.n_attention_wv;
        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
        }

        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
    }

    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

    // sanity checks for models that have attention layers
    if (qs.n_attention_wv != 0 && !is_clip_model)
    {
        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
        // attention layers have a non-zero number of kv heads
        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
        if (llama_model_has_encoder(&model)) {
            // now n_attn_layer is the number of attention layers in the encoder
            // for each decoder block, there are 2 attention layers
            n_attn_layer += 2 * model.hparams.dec_n_layer;
        }
        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
    }

    size_t total_size_org = 0;
    size_t total_size_new = 0;

    std::vector<std::thread> workers;
    workers.reserve(nthread);

    int idx = 0;

    std::vector<no_init<uint8_t>> read_data;
    std::vector<no_init<uint8_t>> work;
    std::vector<no_init<float>> f32_conv_buf;

    uint16_t n_split = 1;

    // Assume split index is continuous
    if (params->keep_split) {
        for (const auto * it : tensors) {
            n_split = std::max(uint16_t(it->idx + 1), n_split);
        }
    }
    std::vector<gguf_context_ptr> ctx_outs(n_split);
    ctx_outs[0] = std::move(ctx_out);

    // populate the original tensors so we get an initial meta data
    for (const auto * it : tensors) {
        uint16_t i_split = params->keep_split ? it->idx : 0;
        ggml_tensor * tensor = it->tensor;
        if (!ctx_outs[i_split]) {
            ctx_outs[i_split].reset(gguf_init_empty());
        }
        gguf_add_tensor(ctx_outs[i_split].get(), tensor);
    }

    // Set split info if needed
    if (n_split > 1) {
        for (size_t i = 0; i < ctx_outs.size(); ++i) {
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
        }
    }

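    // Output files are written in two passes: new_ofstream() reserves a zero-filled
    // placeholder for the GGUF metadata, the tensor data is streamed after it, and
    // close_ofstream() seeks back to the start to write the finalized metadata.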
    int cur_split = -1;
    std::ofstream fout;
    auto close_ofstream = [&]() {
        // Write metadata and close file handler
        if (fout.is_open()) {
            fout.seekp(0);
            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
            gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
            fout.write((const char *) data.data(), data.size());
            fout.close();
        }
    };
    auto new_ofstream = [&](int index) {
        cur_split = index;
        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
        std::string fname = fname_out;
        if (params->keep_split) {
            std::vector<char> split_path(llama_path_max(), 0);
            llama_split_path(split_path.data(), split_path.size(), fname_out.c_str(), cur_split, n_split);
            fname = std::string(split_path.data());
        }

        fout = std::ofstream(fname, std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
        // placeholder for the meta data
        ::zeros(fout, meta_size);
    };

    const auto tn = LLM_TN(model.arch);
    new_ofstream(0);
    for (const auto * it : tensors) {
        const auto & weight = *it;
        ggml_tensor * tensor = weight.tensor;
        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
            new_ofstream(weight.idx);
        }

        const std::string name = ggml_get_name(tensor);

        if (!ml.use_mmap) {
            if (read_data.size() < ggml_nbytes(tensor)) {
                read_data.resize(ggml_nbytes(tensor));
            }
            tensor->data = read_data.data();
        }
        ml.load_data_for(tensor);

        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
                ++idx, ml.n_tensors,
                ggml_get_name(tensor),
                llama_format_tensor_shape(tensor).c_str(),
                ggml_type_name(tensor->type));

        // This used to be a regex, but <regex> has an extreme cost to compile times.
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

        // quantize only 2D and 3D tensors (experts)
        quantize &= (ggml_n_dims(tensor) >= 2);

        // do not quantize norm tensors
        quantize &= name.find("_norm.weight") == std::string::npos;

        quantize &= params->quantize_output_tensor || name != "output.weight";
        quantize &= !params->only_copy;

        // do not quantize expert gating tensors
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;

        // these are very small (e.g. 4x4)
        quantize &= name.find("altup") == std::string::npos;
        quantize &= name.find("laurel") == std::string::npos;

        // these are not too big, so keep them as they are
        quantize &= name.find("per_layer_model_proj") == std::string::npos;

        // do not quantize positional embeddings and token types (BERT)
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

        // do not quantize Mamba's small yet 2D weights
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
        quantize &= name.find("shortconv.conv.weight") == std::string::npos;

        // do not quantize RWKV's small yet 2D weights
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
        quantize &= name.find("time_mix_w0.weight") == std::string::npos;
        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_v0.weight") == std::string::npos;
        quantize &= name.find("time_mix_v1.weight") == std::string::npos;
        quantize &= name.find("time_mix_v2.weight") == std::string::npos;
        quantize &= name.find("time_mix_a0.weight") == std::string::npos;
        quantize &= name.find("time_mix_a1.weight") == std::string::npos;
        quantize &= name.find("time_mix_a2.weight") == std::string::npos;
        quantize &= name.find("time_mix_g1.weight") == std::string::npos;
        quantize &= name.find("time_mix_g2.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

        // do not quantize specific multimodal tensors
        quantize &= name.find(".position_embd.") == std::string::npos;

        ggml_type new_type;
        void * new_data;
        size_t new_size;

        if (quantize) {
            new_type = default_type;

            // get more optimal quantization type based on the tensor shape, layer, etc.
            if (!params->pure && ggml_is_quantized(default_type)) {
                int fallback = qs.n_fallback;
                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
                // unless the user specifies a type, and the tensor geometry will not require fallback quantization
                if (params->tensor_types && qs.n_fallback - fallback == 0) {
                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                    const std::string tensor_name(tensor->name);
                    for (const auto & [tname, qtype] : tensor_types) {
                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                            if (qtype != new_type) {
                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
                                new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
                            }
                        }
                    }
                }
            }
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
            }
            if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                new_type = params->output_tensor_type;
            }

            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
            quantize = tensor->type != new_type;
        }

        if (!quantize) {
            new_type = tensor->type;
            new_data = tensor->data;
            new_size = ggml_nbytes(tensor);
            LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
        } else {
            const int64_t nelements = ggml_nelements(tensor);

            const float * imatrix = nullptr;
            if (imatrix_data) {
                auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
                if (it == imatrix_data->end()) {
                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
                } else {
                    if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
                        imatrix = it->second.data();
                    } else {
                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
                                int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);

                        // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
                        // this is a significant error and it may be a good idea to abort the process if this happens,
                        // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
                        // tok_embd should be ignored in this case, since it always causes this warning
                        if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
                            throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
                                    int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
                        }
                    }
                }
            }
            if ((new_type == GGML_TYPE_IQ2_XXS ||
                 new_type == GGML_TYPE_IQ2_XS ||
                 new_type == GGML_TYPE_IQ2_S ||
                 new_type == GGML_TYPE_IQ1_S ||
                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                LLAMA_LOG_ERROR("\n\n============================================================\n");
                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
                LLAMA_LOG_ERROR("============================================================\n\n");
                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
            }

            float * f32_data;

            if (tensor->type == GGML_TYPE_F32) {
                f32_data = (float *) tensor->data;
            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
            } else {
                llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
                f32_data = (float *) f32_conv_buf.data();
            }

            LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
            fflush(stdout);

            if (work.size() < (size_t)nelements * 4) {
                work.resize(nelements * 4); // upper bound on size
            }
            new_data = work.data();

            const int64_t n_per_row = tensor->ne[0];
            const int64_t nrows = tensor->ne[1];

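            // each worker chunk covers a whole number of rows and at least min_chunk_size elements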
            static const int64_t min_chunk_size = 32 * 512;
            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;

            // quantize each expert separately since they have different importance matrices
            new_size = 0;
            for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
                const float * f32_data_03 = f32_data + i03 * nelements_matrix;
                void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;

                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);

                // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
#if 0
                if (new_type == GGML_TYPE_MXFP4) {
                    auto * x = f32_data_03;

                    //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
                    std::vector<float> deq(nrows*n_per_row);
                    const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
                    qtype->to_float(new_data_03, deq.data(), deq.size());

                    double err = 0.0f;
                    for (int i = 0; i < (int) deq.size(); ++i) {
                        err += fabsf(deq[i] - x[i]);
                        //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
                        if (deq[i] != x[i]) {
                            LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
                        }
                    }
                    //LLAMA_LOG_INFO("err = %f\n", err);
                    GGML_ASSERT(err == 0.00000);
                }
#endif
            }
            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
        }
        total_size_org += ggml_nbytes(tensor);
        total_size_new += new_size;

        // update the gguf meta data as we go
        gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
        GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);

        // write tensor data + padding
        fout.write((const char *) new_data, new_size);
        zeros(fout, GGML_PAD(new_size, align) - new_size);
    }
    close_ofstream();

    LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
    LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);

    if (qs.n_fallback > 0) {
        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
    }
}

//
// interface implementation
//

llama_model_quantize_params llama_model_quantize_default_params() {
    llama_model_quantize_params result = {
        /*.nthread                =*/ 0,
        /*.ftype                  =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type     =*/ GGML_TYPE_COUNT,
        /*.token_embedding_type   =*/ GGML_TYPE_COUNT,
        /*.allow_requantize       =*/ false,
        /*.quantize_output_tensor =*/ true,
        /*.only_copy              =*/ false,
        /*.pure                   =*/ false,
        /*.keep_split             =*/ false,
        /*.imatrix                =*/ nullptr,
        /*.kv_overrides           =*/ nullptr,
        /*.tensor_types           =*/ nullptr,
        /*.prune_layers           =*/ nullptr
    };

    return result;
}

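// Example (illustrative sketch, not taken from this file): quantizing a GGUF model to
// Q4_K_M with 8 threads using the default parameters. The file names are placeholders.
//
//   llama_model_quantize_params qparams = llama_model_quantize_default_params();
//   qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;
//   qparams.nthread = 8;
//   if (llama_model_quantize("model-f16.gguf", "model-Q4_K_M.gguf", &qparams) != 0) {
//       fprintf(stderr, "quantization failed\n");
//   }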
uint32_t llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
        const llama_model_quantize_params * params) {
    try {
        llama_model_quantize_impl(fname_inp, fname_out, params);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
        return 1;
    }

    return 0;
}