#include "llama-quant.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <regex>
#include <thread>
#include <unordered_map>

// Quantization types. Changes to this struct must be replicated in quantize.cpp
struct tensor_quantization {
    std::string name;
    ggml_type quant = GGML_TYPE_COUNT;
};

static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}

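// Maps "blk.<N>." tensor names onto a contiguous block numbering after layer pruning.
// Pruned blocks map to an empty string so callers can skip them; kept blocks are renumbered
// in order. For example, pruning block 2 of a 5-block model yields 0->0, 1->1, 2->"", 3->2, 4->3.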
static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
    if (prune.empty()) {
        return orig_name;
    }

    static const std::regex pattern(R"(blk\.(\d+)\.)");
    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
        const int blk = std::stoi(match[1]);
        std::string new_name = orig_name;

        if (mapped.count(blk)) {
            // Already mapped, do nothing
        } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
            mapped[blk] = "";
        } else if (blk < prune.front()) {
            mapped[blk] = std::to_string(blk);
            next_id = blk + 1;
        } else {
            mapped[blk] = std::to_string(next_id);
            ++next_id;
        }

        return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
    }

    return orig_name;
}

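// Inverse of remap_layer for imatrix lookups: given a remapped tensor name, recover the
// original block index so that entries in the imatrix (keyed by the original names) can
// still be found. Aborts if the block has no mapping, which would indicate a pruning bug.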
static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
    if (mapped.empty()) {
        return orig_name;
    }

    static const std::regex pattern(R"(blk\.(\d+)\.)");
    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
        const std::string blk(match[1]);
        std::string new_name = orig_name;

        for (const auto & p : mapped) {
            if (p.second == blk) {
                LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
                return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
            }
        }
        GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
    }

    return orig_name;
}

struct quantize_state_impl {
    const llama_model & model;
    const llama_model_quantize_params * params;

    int n_attention_wv = 0;
    int n_ffn_down = 0;
    int n_ffn_gate = 0;
    int n_ffn_up = 0;
    int i_attention_wv = 0;
    int i_ffn_down = 0;
    int i_ffn_gate = 0;
    int i_ffn_up = 0;

    int n_k_quantized = 0;
    int n_fallback = 0;

    bool has_imatrix = false;

    // used to figure out if a model shares tok_embd with the output weight
    bool has_output = false;

    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
    {}
};

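// Dequantizes (or converts from F16/BF16) a tensor into a temporary F32 buffer.
// With nthread >= 2 the work is split into whole blocks per thread; the last thread
// also picks up any remainder blocks.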
static void llama_tensor_dequantize_impl(
    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
) {
    if (output.size() < nelements) {
        output.resize(nelements);
    }
    float * f32_output = (float *) output.data();

    const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
    if (ggml_is_quantized(tensor->type)) {
        if (qtype->to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
    } else if (tensor->type != GGML_TYPE_F16 &&
               tensor->type != GGML_TYPE_BF16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }

    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
        } else if (tensor->type == GGML_TYPE_BF16) {
            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype->to_float(tensor->data, f32_output, nelements);
        } else {
            GGML_ABORT("fatal error"); // unreachable
        }
        return;
    }

    size_t block_size;
    if (tensor->type == GGML_TYPE_F16 ||
        tensor->type == GGML_TYPE_BF16) {
        block_size = 1;
    } else {
        block_size = (size_t)ggml_blck_size(tensor->type);
    }

    size_t block_size_bytes = ggml_type_size(tensor->type);

    GGML_ASSERT(nelements % block_size == 0);
    size_t nblocks = nelements / block_size;
    size_t blocks_per_thread = nblocks / nthread;
    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

    size_t in_buff_offs = 0;
    size_t out_buff_offs = 0;

    for (int tnum = 0; tnum < nthread; tnum++) {
        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
            if (typ == GGML_TYPE_F16) {
                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
            } else if (typ == GGML_TYPE_BF16) {
                ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
            } else {
                qtype->to_float(inbuf, outbuf, nels);
            }
        };
        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
        in_buff_offs += thr_block_bytes;
        out_buff_offs += thr_elems;
    }
    for (auto & w : workers) { w.join(); }
    workers.clear();
}

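// Chooses the concrete ggml_type for one tensor given the requested ftype, applying
// per-tensor heuristics (attention V/K, ffn_down, output, token embeddings, etc.) and
// falling back to a compatible type when the row size is not divisible by the block size.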
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
    const std::string name = ggml_get_name(tensor);

    // TODO: avoid hardcoded tensor names - use the TN_* constants
    const llm_arch arch = qs.model.arch;
    const auto tn = LLM_TN(arch);

    auto use_more_bits = [](int i_layer, int n_layers) -> bool {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    };
    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
        if (n_expert > 1) {
            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
            // for getting the current layer as I initially thought, and we need to resort to parsing the
            // tensor name.
            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
            }
            if (i_layer < 0 || i_layer >= n_layer) {
                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
            }
        }
        return std::make_pair(i_layer, n_layer);
    };

    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
    // with the quantization of the output tensor
    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
            new_type = qs.params->output_tensor_type;
        } else {
            const int64_t nx = tensor->ne[0];
            const int64_t qk_k = ggml_blck_size(new_type);

            if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
                new_type = GGML_TYPE_Q8_0;
            }
            else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                new_type = GGML_TYPE_Q8_0;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q5_K;
            }
            else if (new_type != GGML_TYPE_Q8_0) {
                new_type = GGML_TYPE_Q6_K;
            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
        // MoE tensors -> MXFP4
        // other tensors -> Q8_0
        if (tensor->ne[2] > 1) {
            new_type = GGML_TYPE_MXFP4;
        } else {
            new_type = GGML_TYPE_Q8_0;
        }
    } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
            new_type = qs.params->token_embedding_type;
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q2_K;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
                new_type = GGML_TYPE_Q4_K;
            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            ++qs.i_attention_wv;
        }
        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (name.find("ffn_down") != std::string::npos) {
            if (qs.i_ffn_down < qs.n_ffn_down/8) {
                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            }
            ++qs.i_ffn_down;
        }
        else if (name.find("attn_output.weight") != std::string::npos) {
            if (qs.model.hparams.n_expert == 8) {
                new_type = GGML_TYPE_Q5_K;
            } else {
                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
            }
        }
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
        if (qs.model.type == LLM_TYPE_70B) {
            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
            // nearly negligible increase in model size by quantizing this tensor with more bits:
            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
        }
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        ++qs.i_attention_wv;
    } else if (name.find("attn_k.weight") != std::string::npos) {
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("attn_q.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("ffn_down") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
            }
        }
        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
                && qs.has_imatrix && i_layer < n_layer/8) {
            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
        }
        ++qs.i_ffn_down;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
            if (qs.model.hparams.n_expert == 8) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
                    new_type = GGML_TYPE_Q5_K;
                }
            } else {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
            }
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
        }
    }
    else if (name.find("attn_qkv.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
    else if (name.find("ffn_gate") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_gate;
    }
    else if (name.find("ffn_up") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_up;
    }

    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // This can be used to reduce the size of the Q5_K_S model.
    // The associated PPL increase is fully in line with the size reduction
    //else {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
    //}
    bool convert_incompatible_tensor = false;
    {
        const int64_t nx = tensor->ne[0];
        const int64_t ny = tensor->ne[1];
        const int64_t qk_k = ggml_blck_size(new_type);

        if (nx % qk_k != 0) {
            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
            convert_incompatible_tensor = true;
        } else {
            ++qs.n_k_quantized;
        }
    }

    if (convert_incompatible_tensor) {
        switch (new_type) {
            case GGML_TYPE_TQ1_0:
            case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ2_S:
            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_IQ3_S:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_IQ1_M:
            case GGML_TYPE_Q2_K:
            case GGML_TYPE_Q3_K:
            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
        }
        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
            new_type = GGML_TYPE_F16;
        }
        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
        ++qs.n_fallback;
    }

    return new_type;
}

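// Quantizes nrows x n_per_row F32 values into new_data. With nthread >= 2, a shared
// row counter protected by a mutex hands out chunks of rows to the workers; each chunk
// is validated with ggml_validate_row_data before its size is added to the total.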
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
    if (nthread < 2) {
        // single-thread
        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
            throw std::runtime_error("quantized data validation failed");
        }
        return new_size;
    }

    std::mutex mutex;
    int64_t counter = 0;
    size_t new_size = 0;
    bool valid = true;
    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
            nrows, n_per_row, imatrix]() {
        const int64_t nrows_per_chunk = chunk_size / n_per_row;
        size_t local_size = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int64_t first_row = counter; counter += nrows_per_chunk;
            if (first_row >= nrows) {
                if (local_size > 0) {
                    new_size += local_size;
                }
                break;
            }
            lock.unlock();
            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
            local_size += this_size;

            // validate the quantized data
            const size_t row_size = ggml_row_size(new_type, n_per_row);
            void * this_data = (char *) new_data + first_row * row_size;
            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
                std::unique_lock<std::mutex> lock(mutex);
                valid = false;
                break;
            }
        }
    };
    for (int it = 0; it < nthread - 1; ++it) {
        workers.emplace_back(compute);
    }
    compute();
    for (auto & w : workers) { w.join(); }
    workers.clear();
    if (!valid) {
        throw std::runtime_error("quantized data validation failed");
    }
    return new_size;
}

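// Top-level quantization driver: maps the requested ftype to a default ggml_type, loads
// the input model's metadata, copies and optionally overrides the GGUF KV pairs, applies
// layer pruning if requested, then streams every tensor through dequantization and
// re-quantization while writing the output file(s).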
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
    ggml_type default_type;
    llama_ftype ftype = params->ftype;

    switch (params->ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;

        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;

        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
        case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  default_type = GGML_TYPE_Q3_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  default_type = GGML_TYPE_Q4_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
        case LLAMA_FTYPE_MOSTLY_TQ1_0:   default_type = GGML_TYPE_TQ1_0;   break;
        case LLAMA_FTYPE_MOSTLY_TQ2_0:   default_type = GGML_TYPE_TQ2_0;   break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ1_M:   default_type = GGML_TYPE_IQ1_M;   break;
        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }

    int nthread = params->nthread;

    if (nthread <= 0) {
        nthread = std::thread::hardware_concurrency();
    }

    // mmap consistently increases speed on Linux, and also increases speed on Windows with
    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
    constexpr bool use_mmap = true;
#else
    constexpr bool use_mmap = false;
#endif

    llama_model_kv_override * kv_overrides = nullptr;
    if (params->kv_overrides) {
        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
        kv_overrides = v->data();
    }

    std::vector<std::string> splits = {};
    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());

    model.load_arch   (ml);
    model.load_hparams(ml);
    model.load_stats  (ml);

    quantize_state_impl qs(model, params);

    if (params->only_copy) {
        ftype = ml.ftype;
    }
    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
    if (params->imatrix) {
        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
        if (imatrix_data) {
            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
            qs.has_imatrix = true;
            // check imatrix for nans or infs
            for (const auto & kv : *imatrix_data) {
                for (float f : kv.second) {
                    if (!std::isfinite(f)) {
                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
                    }
                }
            }
        }
    }

    const size_t align = GGUF_DEFAULT_ALIGNMENT;
    gguf_context_ptr ctx_out { gguf_init_empty() };

    std::vector<int> prune_list = {};
    if (params->prune_layers) {
        prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
    }

    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out.get(), ml.meta.get());
    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV

    // Remove split metadata
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());

    if (params->kv_overrides) {
        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
        for (const auto & o : overrides) {
            if (o.key[0] == 0) break;
            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
                gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
                // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
                gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
                gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
                gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
            } else {
                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
            }
        }
    }

    std::map<int, std::string> mapped;
    int blk_id = 0;
    int pruned_attention_w = 0;

    // make a list of weights
    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
    tensors.reserve(ml.weights_map.size());
    for (const auto & it : ml.weights_map) {
        const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
        if (remapped_name.empty()) {
            if (it.first.find("attn_v.weight") != std::string::npos ||
                it.first.find("attn_qkv.weight") != std::string::npos ||
                it.first.find("attn_kv_b.weight") != std::string::npos) {
                pruned_attention_w++;
            }
            LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
            continue;
        } else if (remapped_name != it.first) {
            ggml_set_name(it.second.tensor, remapped_name.c_str());
            LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
        }
        tensors.push_back(&it.second);
    }
    if (!prune_list.empty()) {
        gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
    }

    // keep_split requires that the weights are sorted by split index
    if (params->keep_split) {
        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
            if (a->idx == b->idx) {
                return a->offs < b->offs;
            }
            return a->idx < b->idx;
        });
    }

    bool is_clip_model = false;
    for (const auto * it : tensors) {
        const struct ggml_tensor * tensor = it->tensor;

        const std::string name = ggml_get_name(tensor);

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight") != std::string::npos ||
            name.find("attn_qkv.weight") != std::string::npos ||
            name.find("attn_kv_b.weight") != std::string::npos) {
            ++qs.n_attention_wv;
        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
        }

        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
    }

    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

    // sanity checks for models that have attention layers
    if (qs.n_attention_wv != 0 && !is_clip_model)
    {
        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
        // attention layers have a non-zero number of kv heads
        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
        if (llama_model_has_encoder(&model)) {
            // now n_attn_layer is the number of attention layers in the encoder
            // for each decoder block, there are 2 attention layers
            n_attn_layer += 2 * model.hparams.dec_n_layer;
        }
        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
    }

    size_t total_size_org = 0;
    size_t total_size_new = 0;

    std::vector<std::thread> workers;
    workers.reserve(nthread);

    int idx = 0;

    std::vector<no_init<uint8_t>> read_data;
    std::vector<no_init<uint8_t>> work;
    std::vector<no_init<float>> f32_conv_buf;

    uint16_t n_split = 1;

    // Assume split index is continuous
    if (params->keep_split) {
        for (const auto * it : tensors) {
            n_split = std::max(uint16_t(it->idx + 1), n_split);
        }
    }
    std::vector<gguf_context_ptr> ctx_outs(n_split);
    ctx_outs[0] = std::move(ctx_out);

    // populate the original tensors so we get an initial meta data
    for (const auto * it : tensors) {
        uint16_t i_split = params->keep_split ? it->idx : 0;
        ggml_tensor * tensor = it->tensor;
        if (!ctx_outs[i_split]) {
            ctx_outs[i_split].reset(gguf_init_empty());
        }
        gguf_add_tensor(ctx_outs[i_split].get(), tensor);
    }

    // Set split info if needed
    if (n_split > 1) {
        for (size_t i = 0; i < ctx_outs.size(); ++i) {
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
        }
    }

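    // Output files are written in two passes: new_ofstream() reserves a zero-filled
    // placeholder for the GGUF metadata, the tensor data is streamed after it, and
    // close_ofstream() seeks back to the start to write the finalized metadata.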
    int cur_split = -1;
    std::ofstream fout;
    auto close_ofstream = [&]() {
        // Write metadata and close file handler
        if (fout.is_open()) {
            fout.seekp(0);
            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
            gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
            fout.write((const char *) data.data(), data.size());
            fout.close();
        }
    };
    auto new_ofstream = [&](int index) {
        cur_split = index;
        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
        std::string fname = fname_out;
        if (params->keep_split) {
            std::vector<char> split_path(llama_path_max(), 0);
            llama_split_path(split_path.data(), split_path.size(), fname_out.c_str(), cur_split, n_split);
            fname = std::string(split_path.data());
        }

        fout = std::ofstream(fname, std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
        // placeholder for the meta data
        ::zeros(fout, meta_size);
    };

    const auto tn = LLM_TN(model.arch);
    new_ofstream(0);
    for (const auto * it : tensors) {
        const auto & weight = *it;
        ggml_tensor * tensor = weight.tensor;
        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
            new_ofstream(weight.idx);
        }

        const std::string name = ggml_get_name(tensor);

        if (!ml.use_mmap) {
            if (read_data.size() < ggml_nbytes(tensor)) {
                read_data.resize(ggml_nbytes(tensor));
            }
            tensor->data = read_data.data();
        }
        ml.load_data_for(tensor);

        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
                ++idx, ml.n_tensors,
                ggml_get_name(tensor),
                llama_format_tensor_shape(tensor).c_str(),
                ggml_type_name(tensor->type));

        // This used to be a regex, but <regex> has an extreme cost to compile times.
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

        // quantize only 2D and 3D tensors (experts)
        quantize &= (ggml_n_dims(tensor) >= 2);

        // do not quantize norm tensors
        quantize &= name.find("_norm.weight") == std::string::npos;

        quantize &= params->quantize_output_tensor || name != "output.weight";
        quantize &= !params->only_copy;

        // do not quantize expert gating tensors
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;

        // these are very small (e.g. 4x4)
        quantize &= name.find("altup") == std::string::npos;
        quantize &= name.find("laurel") == std::string::npos;

        // these are not too big, so keep them as they are
        quantize &= name.find("per_layer_model_proj") == std::string::npos;

        // do not quantize positional embeddings and token types (BERT)
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

        // do not quantize Mamba's small yet 2D weights
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
        quantize &= name.find("shortconv.conv.weight") == std::string::npos;

        // do not quantize RWKV's small yet 2D weights
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
        quantize &= name.find("time_mix_w0.weight") == std::string::npos;
        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_v0.weight") == std::string::npos;
        quantize &= name.find("time_mix_v1.weight") == std::string::npos;
        quantize &= name.find("time_mix_v2.weight") == std::string::npos;
        quantize &= name.find("time_mix_a0.weight") == std::string::npos;
        quantize &= name.find("time_mix_a1.weight") == std::string::npos;
        quantize &= name.find("time_mix_a2.weight") == std::string::npos;
        quantize &= name.find("time_mix_g1.weight") == std::string::npos;
        quantize &= name.find("time_mix_g2.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

        // do not quantize specific multimodal tensors
        quantize &= name.find(".position_embd.") == std::string::npos;

        ggml_type new_type;
        void * new_data;
        size_t new_size;

        if (quantize) {
            new_type = default_type;

            // get more optimal quantization type based on the tensor shape, layer, etc.
            if (!params->pure && ggml_is_quantized(default_type)) {
                int fallback = qs.n_fallback;
                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
                // unless the user specifies a type, and the tensor geometry will not require fallback quantization
                if (params->tensor_types && qs.n_fallback - fallback == 0) {
                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                    const std::string tensor_name(tensor->name);
                    for (const auto & [tname, qtype] : tensor_types) {
                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                            if (qtype != new_type) {
                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
                                new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
                            }
                        }
                    }
                }
            }
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
            }
            if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                new_type = params->output_tensor_type;
            }

            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
            quantize = tensor->type != new_type;
        }

        if (!quantize) {
            new_type = tensor->type;
            new_data = tensor->data;
            new_size = ggml_nbytes(tensor);
            LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
        } else {
            const int64_t nelements = ggml_nelements(tensor);

            const float * imatrix = nullptr;
            if (imatrix_data) {
                auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
                if (it == imatrix_data->end()) {
                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
                } else {
                    if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
                        imatrix = it->second.data();
                    } else {
                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
                                int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);

                        // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
                        // this is a significant error and it may be a good idea to abort the process if this happens,
                        // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
                        // tok_embd should be ignored in this case, since it always causes this warning
                        if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
                            throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
                                    int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
                        }
                    }
                }
            }
            if ((new_type == GGML_TYPE_IQ2_XXS ||
                 new_type == GGML_TYPE_IQ2_XS ||
                 new_type == GGML_TYPE_IQ2_S ||
                 new_type == GGML_TYPE_IQ1_S ||
                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                LLAMA_LOG_ERROR("\n\n============================================================\n");
                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
                LLAMA_LOG_ERROR("============================================================\n\n");
                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
            }

            float * f32_data;

            if (tensor->type == GGML_TYPE_F32) {
                f32_data = (float *) tensor->data;
            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
            } else {
                llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
                f32_data = (float *) f32_conv_buf.data();
            }

            LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
            fflush(stdout);

            if (work.size() < (size_t)nelements * 4) {
                work.resize(nelements * 4); // upper bound on size
            }
            new_data = work.data();

            const int64_t n_per_row = tensor->ne[0];
            const int64_t nrows = tensor->ne[1];

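            // each worker chunk covers a whole number of rows and at least min_chunk_size elements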
            static const int64_t min_chunk_size = 32 * 512;
            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;

            // quantize each expert separately since they have different importance matrices
            new_size = 0;
            for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
                const float * f32_data_03 = f32_data + i03 * nelements_matrix;
                void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;

                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);

                // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
#if 0
                if (new_type == GGML_TYPE_MXFP4) {
                    auto * x = f32_data_03;

                    //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
                    std::vector<float> deq(nrows*n_per_row);
                    const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
                    qtype->to_float(new_data_03, deq.data(), deq.size());

                    double err = 0.0f;
                    for (int i = 0; i < (int) deq.size(); ++i) {
                        err += fabsf(deq[i] - x[i]);
                        //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
                        if (deq[i] != x[i]) {
                            LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
                        }
                    }
                    //LLAMA_LOG_INFO("err = %f\n", err);
                    GGML_ASSERT(err == 0.00000);
                }
#endif
            }
            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
        }
        total_size_org += ggml_nbytes(tensor);
        total_size_new += new_size;

        // update the gguf meta data as we go
        gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
        GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);

        // write tensor data + padding
        fout.write((const char *) new_data, new_size);
        zeros(fout, GGML_PAD(new_size, align) - new_size);
    }
    close_ofstream();

    LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
    LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);

    if (qs.n_fallback > 0) {
        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
    }
}

//
// interface implementation
//

llama_model_quantize_params llama_model_quantize_default_params() {
    llama_model_quantize_params result = {
        /*.nthread                =*/ 0,
        /*.ftype                  =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type     =*/ GGML_TYPE_COUNT,
        /*.token_embedding_type   =*/ GGML_TYPE_COUNT,
        /*.allow_requantize       =*/ false,
        /*.quantize_output_tensor =*/ true,
        /*.only_copy              =*/ false,
        /*.pure                   =*/ false,
        /*.keep_split             =*/ false,
        /*.imatrix                =*/ nullptr,
        /*.kv_overrides           =*/ nullptr,
        /*.tensor_types           =*/ nullptr,
        /*.prune_layers           =*/ nullptr
    };

    return result;
}

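// Example (illustrative sketch, not taken from this file): quantizing a GGUF model to
// Q4_K_M with 8 threads using the default parameters. The file names are placeholders.
//
//   llama_model_quantize_params qparams = llama_model_quantize_default_params();
//   qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;
//   qparams.nthread = 8;
//   if (llama_model_quantize("model-f16.gguf", "model-Q4_K_M.gguf", &qparams) != 0) {
//       fprintf(stderr, "quantization failed\n");
//   }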
uint32_t llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
        const llama_model_quantize_params * params) {
    try {
        llama_model_quantize_impl(fname_inp, fname_out, params);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
        return 1;
    }

    return 0;
}