#include "llama-quant.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <regex>
#include <thread>
#include <unordered_map>

// Quantization types. Changes to this struct must be replicated in quantize.cpp
struct tensor_quantization {
    std::string name;
    ggml_type quant = GGML_TYPE_COUNT;
};

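// write n zero bytes to the stream; used below both as a placeholder for the GGUF metadata
// (which is only written once the final tensor sizes are known) and to pad tensor data to
// the required alignment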
static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}

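// when pruning layers, rewrite the block index in a tensor name ("blk.N.") so that the remaining
// blocks stay contiguous; pruned blocks map to an empty string, which callers treat as "drop this
// tensor". `mapped` accumulates the old->new index mapping and `next_id` is the next index to
// hand out. Illustrative example (assuming blocks are visited in ascending order): pruning {2, 3}
// leaves blk.0/blk.1 unchanged and renames blk.4 -> blk.2.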
static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
    if (prune.empty()) {
        return orig_name;
    }

    static const std::regex pattern(R"(blk\.(\d+)\.)");
    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
        const int blk = std::stoi(match[1]);
        std::string new_name = orig_name;

        if (mapped.count(blk)) {
            // Already mapped, do nothing
        } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
            mapped[blk] = "";
        } else if (blk < prune.front()) {
            mapped[blk] = std::to_string(blk);
            next_id = blk + 1;
        } else {
            mapped[blk] = std::to_string(next_id);
            ++next_id;
        }

        return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
    }

    return orig_name;
}

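// inverse of remap_layer for imatrix lookups: given a tensor name whose block index may already
// have been renumbered, restore the original index so its imatrix entry can still be found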
static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
    if (mapped.empty()) {
        return orig_name;
    }

    static const std::regex pattern(R"(blk\.(\d+)\.)");
    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
        const std::string blk(match[1]);
        std::string new_name = orig_name;

        for (const auto & p : mapped) {
            if (p.second == blk) {
                LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
                return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
            }
        }
        GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
    }

    return orig_name;
}

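// per-run bookkeeping for the type-selection heuristics: totals and running indices of
// attention/FFN tensors, fallback statistics, and whether an imatrix / output tensor is present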
struct quantize_state_impl {
    const llama_model & model;
    const llama_model_quantize_params * params;

    int n_attention_wv = 0;
    int n_ffn_down = 0;
    int n_ffn_gate = 0;
    int n_ffn_up = 0;
    int i_attention_wv = 0;
    int i_ffn_down = 0;
    int i_ffn_gate = 0;
    int i_ffn_up = 0;

    int n_k_quantized = 0;
    int n_fallback = 0;

    bool has_imatrix = false;

    // used to figure out if a model shares tok_embd with the output weight
    bool has_output = false;

    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
    {}
};

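// dequantize/convert a tensor to F32 into `output`; with nthread >= 2 the work is split into
// whole quantization blocks and distributed across worker threads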
static void llama_tensor_dequantize_impl(
    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
) {
    if (output.size() < nelements) {
        output.resize(nelements);
    }
    float * f32_output = (float *) output.data();

    const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
    if (ggml_is_quantized(tensor->type)) {
        if (qtype->to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
    } else if (tensor->type != GGML_TYPE_F16 &&
               tensor->type != GGML_TYPE_BF16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }

    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
        } else if (tensor->type == GGML_TYPE_BF16) {
            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype->to_float(tensor->data, f32_output, nelements);
        } else {
            GGML_ABORT("fatal error"); // unreachable
        }
        return;
    }

    size_t block_size;
    if (tensor->type == GGML_TYPE_F16 ||
        tensor->type == GGML_TYPE_BF16) {
        block_size = 1;
    } else {
        block_size = (size_t)ggml_blck_size(tensor->type);
    }

    size_t block_size_bytes = ggml_type_size(tensor->type);

    GGML_ASSERT(nelements % block_size == 0);
    size_t nblocks = nelements / block_size;
    size_t blocks_per_thread = nblocks / nthread;
    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

    size_t in_buff_offs = 0;
    size_t out_buff_offs = 0;

    for (int tnum = 0; tnum < nthread; tnum++) {
        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
            if (typ == GGML_TYPE_F16) {
                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
            } else if (typ == GGML_TYPE_BF16) {
                ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
            } else {
                qtype->to_float(inbuf, outbuf, nels);
            }
        };
        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
        in_buff_offs += thr_block_bytes;
        out_buff_offs += thr_elems;
    }
    for (auto & w : workers) { w.join(); }
    workers.clear();
}

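// heuristic per-tensor type selection: starting from the requested default type, adjust for
// output/embedding tensors, attention vs FFN weights, GQA and MoE models, etc., and fall back
// to a compatible type when the row size is not divisible by the block size of the chosen type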
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
    const std::string name = ggml_get_name(tensor);

    // TODO: avoid hardcoded tensor names - use the TN_* constants
    const llm_arch arch = qs.model.arch;
    const auto tn = LLM_TN(arch);

    auto use_more_bits = [](int i_layer, int n_layers) -> bool {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    };
    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
        if (n_expert > 1) {
            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
            // for getting the current layer as I initially thought, and we need to resort to parsing the
            // tensor name.
            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
            }
            if (i_layer < 0 || i_layer >= n_layer) {
                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
            }
        }
        return std::make_pair(i_layer, n_layer);
    };

    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
    // with the quantization of the output tensor
    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
            new_type = qs.params->output_tensor_type;
        } else {
            const int64_t nx = tensor->ne[0];
            const int64_t qk_k = ggml_blck_size(new_type);

            if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
                new_type = GGML_TYPE_Q8_0;
            }
            else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                new_type = GGML_TYPE_Q8_0;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M  ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q5_K;
            }
            else if (new_type != GGML_TYPE_Q8_0) {
                new_type = GGML_TYPE_Q6_K;
            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
        // MoE tensors -> MXFP4
        // other tensors -> Q8_0
        if (tensor->ne[2] > 1) {
            new_type = GGML_TYPE_MXFP4;
        } else {
            new_type = GGML_TYPE_Q8_0;
        }
    } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
            new_type = qs.params->token_embedding_type;
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q2_K;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
                new_type = GGML_TYPE_Q4_K;
            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            ++qs.i_attention_wv;
        }
        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (name.find("ffn_down") != std::string::npos) {
            if (qs.i_ffn_down < qs.n_ffn_down/8) {
                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            }
            ++qs.i_ffn_down;
        }
        else if (name.find("attn_output.weight") != std::string::npos) {
            if (qs.model.hparams.n_expert == 8) {
                new_type = GGML_TYPE_Q5_K;
            } else {
                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
            }
        }
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
        if (qs.model.type == LLM_TYPE_70B) {
            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
            // nearly negligible increase in model size by quantizing this tensor with more bits:
            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
        }
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        ++qs.i_attention_wv;
    } else if (name.find("attn_k.weight") != std::string::npos) {
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("attn_q.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("ffn_down") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
            }
        }
        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
                && qs.has_imatrix && i_layer < n_layer/8) {
            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
        }
        ++qs.i_ffn_down;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
            if (qs.model.hparams.n_expert == 8) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL  ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S   ||
                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
                    new_type = GGML_TYPE_Q5_K;
                }
            } else {
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
            }
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
        }
    }
    else if (name.find("attn_qkv.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
    else if (name.find("ffn_gate") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_gate;
    }
    else if (name.find("ffn_up") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_up;
    }

    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // This can be used to reduce the size of the Q5_K_S model.
    // The associated PPL increase is fully in line with the size reduction
    //else {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
    //}
    bool convert_incompatible_tensor = false;
    {
        const int64_t nx = tensor->ne[0];
        const int64_t ny = tensor->ne[1];
        const int64_t qk_k = ggml_blck_size(new_type);

        if (nx % qk_k != 0) {
            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
            convert_incompatible_tensor = true;
        } else {
            ++qs.n_k_quantized;
        }
    }

    if (convert_incompatible_tensor) {
        switch (new_type) {
            case GGML_TYPE_TQ1_0:
            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ2_S:
            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_IQ3_S:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_IQ1_M:
            case GGML_TYPE_Q2_K:
            case GGML_TYPE_Q3_K:
            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
        }
        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
            new_type = GGML_TYPE_F16;
        }
        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
        ++qs.n_fallback;
    }

    return new_type;
}

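// quantize nrows x n_per_row floats from f32_data into new_data and return the number of bytes
// written; with nthread >= 2, rows are claimed in chunks from a shared counter and each chunk
// is validated with ggml_validate_row_data before being accepted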
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
    if (nthread < 2) {
        // single-thread
        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
            throw std::runtime_error("quantized data validation failed");
        }
        return new_size;
    }

    std::mutex mutex;
    int64_t counter = 0;
    size_t new_size = 0;
    bool valid = true;
    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
            nrows, n_per_row, imatrix]() {
        const int64_t nrows_per_chunk = chunk_size / n_per_row;
        size_t local_size = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int64_t first_row = counter; counter += nrows_per_chunk;
            if (first_row >= nrows) {
                if (local_size > 0) {
                    new_size += local_size;
                }
                break;
            }
            lock.unlock();
            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
            local_size += this_size;

            // validate the quantized data
            const size_t row_size = ggml_row_size(new_type, n_per_row);
            void * this_data = (char *) new_data + first_row * row_size;
            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
                std::unique_lock<std::mutex> lock(mutex);
                valid = false;
                break;
            }
        }
    };
    for (int it = 0; it < nthread - 1; ++it) {
        workers.emplace_back(compute);
    }
    compute();
    for (auto & w : workers) { w.join(); }
    workers.clear();
    if (!valid) {
        throw std::runtime_error("quantized data validation failed");
    }
    return new_size;
}

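// main quantization driver: map the requested ftype to a default ggml type, load the input GGUF
// metadata, apply KV overrides and optional layer pruning, then stream every tensor through
// (optional) dequantization, per-tensor type selection and quantization, writing one or more
// output splits as it goes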
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
    ggml_type default_type;
    llama_ftype ftype = params->ftype;

    switch (params->ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;

        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;

        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
        case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  default_type = GGML_TYPE_Q3_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  default_type = GGML_TYPE_Q4_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
        case LLAMA_FTYPE_MOSTLY_TQ1_0:   default_type = GGML_TYPE_TQ1_0;   break;
        case LLAMA_FTYPE_MOSTLY_TQ2_0:   default_type = GGML_TYPE_TQ2_0;   break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ1_M:   default_type = GGML_TYPE_IQ1_M;   break;
        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }

    int nthread = params->nthread;

    if (nthread <= 0) {
        nthread = std::thread::hardware_concurrency();
    }

    // mmap consistently increases speed on Linux, and also increases speed on Windows with
    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
    constexpr bool use_mmap = true;
#else
    constexpr bool use_mmap = false;
#endif

    llama_model_kv_override * kv_overrides = nullptr;
    if (params->kv_overrides) {
        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
        kv_overrides = v->data();
    }

    std::vector<std::string> splits = {};
    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());

    model.load_arch   (ml);
    model.load_hparams(ml);
    model.load_stats  (ml);

    quantize_state_impl qs(model, params);

    if (params->only_copy) {
        ftype = ml.ftype;
    }
    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
    if (params->imatrix) {
        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
        if (imatrix_data) {
            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n", int(imatrix_data->size()));
            qs.has_imatrix = true;
            // check imatrix for nans or infs
            for (const auto & kv : *imatrix_data) {
                for (float f : kv.second) {
                    if (!std::isfinite(f)) {
                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
                    }
                }
            }
        }
    }

    const size_t align = GGUF_DEFAULT_ALIGNMENT;
    gguf_context_ptr ctx_out { gguf_init_empty() };

    std::vector<int> prune_list = {};
    if (params->prune_layers) {
        prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
    }

    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out.get(), ml.meta.get());
    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV

    // Remove split metadata
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());

    if (params->kv_overrides) {
        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
        for (const auto & o : overrides) {
            if (o.key[0] == 0) break;
            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
                gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
                // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
                gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
                gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
                gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
            } else {
                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
            }
        }
    }

    std::map<int, std::string> mapped;
    int blk_id = 0;
    int pruned_attention_w = 0;

    // make a list of weights
    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
    tensors.reserve(ml.weights_map.size());
    for (const auto & it : ml.weights_map) {
        const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
        if (remapped_name.empty()) {
            if (it.first.find("attn_v.weight") != std::string::npos ||
                it.first.find("attn_qkv.weight") != std::string::npos ||
                it.first.find("attn_kv_b.weight") != std::string::npos) {
                pruned_attention_w++;
            }
            LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
            continue;
        } else if (remapped_name != it.first) {
            ggml_set_name(it.second.tensor, remapped_name.c_str());
            LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
        }
        tensors.push_back(&it.second);
    }
    if (!prune_list.empty()) {
        gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
    }

    // keep_split requires that the weights are sorted by split index
    if (params->keep_split) {
        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
            if (a->idx == b->idx) {
                return a->offs < b->offs;
            }
            return a->idx < b->idx;
        });
    }

    bool is_clip_model = false;
    for (const auto * it : tensors) {
        const struct ggml_tensor * tensor = it->tensor;

        const std::string name = ggml_get_name(tensor);

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight") != std::string::npos ||
            name.find("attn_qkv.weight") != std::string::npos ||
            name.find("attn_kv_b.weight") != std::string::npos) {
            ++qs.n_attention_wv;
        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
        }

        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
    }

    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

    // sanity checks for models that have attention layers
    if (qs.n_attention_wv != 0 && !is_clip_model)
    {
        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
        // attention layers have a non-zero number of kv heads
        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
        if (llama_model_has_encoder(&model)) {
            // now n_attn_layer is the number of attention layers in the encoder
            // for each decoder block, there are 2 attention layers
            n_attn_layer += 2 * model.hparams.dec_n_layer;
        }
        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
    }

    size_t total_size_org = 0;
    size_t total_size_new = 0;

    std::vector<std::thread> workers;
    workers.reserve(nthread);

    int idx = 0;

    std::vector<no_init<uint8_t>> read_data;
    std::vector<no_init<uint8_t>> work;
    std::vector<no_init<float>> f32_conv_buf;

    uint16_t n_split = 1;

    // Assume split index is continuous
    if (params->keep_split) {
        for (const auto * it : tensors) {
            n_split = std::max(uint16_t(it->idx + 1), n_split);
        }
    }
    std::vector<gguf_context_ptr> ctx_outs(n_split);
    ctx_outs[0] = std::move(ctx_out);

    // populate the original tensors so we get an initial meta data
    for (const auto * it : tensors) {
        uint16_t i_split = params->keep_split ? it->idx : 0;
        ggml_tensor * tensor = it->tensor;
        if (!ctx_outs[i_split]) {
            ctx_outs[i_split].reset(gguf_init_empty());
        }
        gguf_add_tensor(ctx_outs[i_split].get(), tensor);
    }

    // Set split info if needed
    if (n_split > 1) {
        for (size_t i = 0; i < ctx_outs.size(); ++i) {
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
        }
    }

    int cur_split = -1;
    std::ofstream fout;
    auto close_ofstream = [&]() {
        // Write metadata and close file handler
        if (fout.is_open()) {
            fout.seekp(0);
            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
            gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
            fout.write((const char *) data.data(), data.size());
            fout.close();
        }
    };
    auto new_ofstream = [&](int index) {
        cur_split = index;
        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
        std::string fname = fname_out;
        if (params->keep_split) {
            std::vector<char> split_path(llama_path_max(), 0);
            llama_split_path(split_path.data(), split_path.size(), fname_out.c_str(), cur_split, n_split);
            fname = std::string(split_path.data());
        }

        fout = std::ofstream(fname, std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
        // placeholder for the meta data
        ::zeros(fout, meta_size);
    };

    const auto tn = LLM_TN(model.arch);
    new_ofstream(0);
    for (const auto * it : tensors) {
        const auto & weight = *it;
        ggml_tensor * tensor = weight.tensor;
        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
            new_ofstream(weight.idx);
        }

        const std::string name = ggml_get_name(tensor);

        if (!ml.use_mmap) {
            if (read_data.size() < ggml_nbytes(tensor)) {
                read_data.resize(ggml_nbytes(tensor));
            }
            tensor->data = read_data.data();
        }
        ml.load_data_for(tensor);

        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
               ++idx, ml.n_tensors,
               ggml_get_name(tensor),
               llama_format_tensor_shape(tensor).c_str(),
               ggml_type_name(tensor->type));

        // This used to be a regex, but <regex> has an extreme cost to compile times.
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

        // quantize only 2D and 3D tensors (experts)
        quantize &= (ggml_n_dims(tensor) >= 2);

        // do not quantize norm tensors
        quantize &= name.find("_norm.weight") == std::string::npos;

        quantize &= params->quantize_output_tensor || name != "output.weight";
        quantize &= !params->only_copy;

        // do not quantize expert gating tensors
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;

        // these are very small (e.g. 4x4)
        quantize &= name.find("altup") == std::string::npos;
        quantize &= name.find("laurel") == std::string::npos;

        // these are not too big so keep them as it is
        quantize &= name.find("per_layer_model_proj") == std::string::npos;

        // do not quantize positional embeddings and token types (BERT)
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

        // do not quantize Mamba's small yet 2D weights
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
        quantize &= name.find("shortconv.conv.weight") == std::string::npos;

        // do not quantize RWKV's small yet 2D weights
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
        quantize &= name.find("time_mix_w0.weight") == std::string::npos;
        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_v0.weight") == std::string::npos;
        quantize &= name.find("time_mix_v1.weight") == std::string::npos;
        quantize &= name.find("time_mix_v2.weight") == std::string::npos;
        quantize &= name.find("time_mix_a0.weight") == std::string::npos;
        quantize &= name.find("time_mix_a1.weight") == std::string::npos;
        quantize &= name.find("time_mix_a2.weight") == std::string::npos;
        quantize &= name.find("time_mix_g1.weight") == std::string::npos;
        quantize &= name.find("time_mix_g2.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

        // do not quantize specific multimodal tensors
        quantize &= name.find(".position_embd.") == std::string::npos;

        ggml_type new_type;
        void * new_data;
        size_t new_size;

        if (quantize) {
            new_type = default_type;

            // get more optimal quantization type based on the tensor shape, layer, etc.
            if (!params->pure && ggml_is_quantized(default_type)) {
                int fallback = qs.n_fallback;
                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
                // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
                if (params->tensor_types && qs.n_fallback - fallback == 0) {
                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                    const std::string tensor_name(tensor->name);
                    for (const auto & [tname, qtype] : tensor_types) {
                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                            if (qtype != new_type) {
                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
                                new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
                            }
                        }
                    }
                }
            }
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
            }
            if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                new_type = params->output_tensor_type;
            }

            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
            quantize = tensor->type != new_type;
        }

        if (!quantize) {
            new_type = tensor->type;
            new_data = tensor->data;
            new_size = ggml_nbytes(tensor);
            LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
        } else {
            const int64_t nelements = ggml_nelements(tensor);

            const float * imatrix = nullptr;
            if (imatrix_data) {
                auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
                if (it == imatrix_data->end()) {
                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
                } else {
                    if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
                        imatrix = it->second.data();
                    } else {
                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
                                int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);

                        // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
                        // this is a significant error and it may be good idea to abort the process if this happens,
                        // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
                        // tok_embd should be ignored in this case, since it always causes this warning
                        if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
                            throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
                                    int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
                        }
                    }
                }
            }
            if ((new_type == GGML_TYPE_IQ2_XXS ||
                 new_type == GGML_TYPE_IQ2_XS  ||
                 new_type == GGML_TYPE_IQ2_S   ||
                 new_type == GGML_TYPE_IQ1_S   ||
                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                LLAMA_LOG_ERROR("\n\n============================================================\n");
                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
                LLAMA_LOG_ERROR("============================================================\n\n");
                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
            }

            float * f32_data;

            if (tensor->type == GGML_TYPE_F32) {
                f32_data = (float *) tensor->data;
            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
            } else {
                llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
                f32_data = (float *) f32_conv_buf.data();
            }

            LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
            fflush(stdout);

            if (work.size() < (size_t)nelements * 4) {
                work.resize(nelements * 4); // upper bound on size
            }
            new_data = work.data();

            const int64_t n_per_row = tensor->ne[0];
            const int64_t nrows = tensor->ne[1];

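            // each chunk covers a whole number of rows and at least min_chunk_size elements;
            // for short rows, several rows are grouped into one chunk to keep per-chunk overhead small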
            static const int64_t min_chunk_size = 32 * 512;
            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;

            // quantize each expert separately since they have different importance matrices
            new_size = 0;
            for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
                const float * f32_data_03 = f32_data + i03 * nelements_matrix;
                void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;

                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);

                // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
#if 0
                if (new_type == GGML_TYPE_MXFP4) {
                    auto * x = f32_data_03;

                    //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
                    std::vector<float> deq(nrows*n_per_row);
                    const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
                    qtype->to_float(new_data_03, deq.data(), deq.size());

                    double err = 0.0f;
                    for (int i = 0; i < (int) deq.size(); ++i) {
                        err += fabsf(deq[i] - x[i]);
                        //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
                        if (deq[i] != x[i]) {
                            LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
                        }
                    }
                    //LLAMA_LOG_INFO("err = %f\n", err);
                    GGML_ASSERT(err == 0.00000);
                }
#endif
            }
            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
        }
        total_size_org += ggml_nbytes(tensor);
        total_size_new += new_size;

        // update the gguf meta data as we go
        gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
        GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);

        // write tensor data + padding
        fout.write((const char *) new_data, new_size);
        zeros(fout, GGML_PAD(new_size, align) - new_size);
    }
    close_ofstream();

    LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
    LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);

    if (qs.n_fallback > 0) {
        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
    }
}

//
// interface implementation
//

llama_model_quantize_params llama_model_quantize_default_params() {
    llama_model_quantize_params result = {
        /*.nthread                 =*/ 0,
        /*.ftype                   =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type      =*/ GGML_TYPE_COUNT,
        /*.token_embedding_type    =*/ GGML_TYPE_COUNT,
        /*.allow_requantize        =*/ false,
        /*.quantize_output_tensor  =*/ true,
        /*.only_copy               =*/ false,
        /*.pure                    =*/ false,
        /*.keep_split              =*/ false,
        /*.imatrix                 =*/ nullptr,
        /*.kv_overrides            =*/ nullptr,
        /*.tensor_types            =*/ nullptr,
        /*.prune_layers            =*/ nullptr
    };

    return result;
}

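// Illustrative usage from a caller linking against libllama (a minimal sketch; the file names
// below are placeholders, not part of this file):
//
//     llama_model_quantize_params params = llama_model_quantize_default_params();
//     params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;
//     params.nthread = 8;
//     if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) != 0) {
//         // quantization failed; the error has already been logged
//     }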
uint32_t llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
        const llama_model_quantize_params * params) {
    try {
        llama_model_quantize_impl(fname_inp, fname_out, params);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
        return 1;
    }

    return 0;
}
