#include "ggml.h"
#include "gguf.h"

#include "llama.h"
#include "common.h"
#include "log.h"

#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
#include <algorithm>
#include <string>

// GGUF keys & tensor names.

#define KV_GENERAL_ARCHITECTURE          "general.architecture"
#define KV_GENERAL_NAME                  "general.name"

#define KV_TOKENIZER_MODEL               "tokenizer.ggml.model"
#define KV_TOKENIZER_LIST                "tokenizer.ggml.tokens"
#define KV_TOKENIZER_TOKEN_TYPE          "tokenizer.ggml.token_type"
#define KV_TOKENIZER_SCORES              "tokenizer.ggml.scores"
#define KV_TOKENIZER_BOS_ID              "tokenizer.ggml.bos_token_id"
#define KV_TOKENIZER_EOS_ID              "tokenizer.ggml.eos_token_id"
#define KV_TOKENIZER_UNK_ID              "tokenizer.ggml.unknown_token_id"
#define KV_TOKENIZER_SEP_ID              "tokenizer.ggml.seperator_token_id"
#define KV_TOKENIZER_PAD_ID              "tokenizer.ggml.padding_token_id"
#define KV_TOKENIZER_HF_JSON             "tokenizer.huggingface.json"

#define KV_CONTEXT_LENGTH                "llama.context_length"
#define KV_EMBEDDING_LENGTH              "llama.embedding_length"
#define KV_BLOCK_COUNT                   "llama.block_count"
#define KV_FEED_FORWARD_LENGTH           "llama.feed_forward_length"
#define KV_ATTENTION_HEAD_COUNT          "llama.attention.head_count"
#define KV_ATTENTION_HEAD_COUNT_KV       "llama.attention.head_count_kv"
#define KV_ATTENTION_LAYERNORM_RMS_EPS   "llama.attention.layer_norm_rms_epsilon"
#define KV_ROPE_DIMENSION_COUNT          "llama.rope.dimension_count"

#define TN_TOKEN_EMBD  "token_embd.weight"
#define TN_OUTPUT_NORM "output_norm.weight"
#define TN_OUTPUT      "output.weight"
#define TN_ATTN_NORM   "blk.%d.attn_norm.weight"
#define TN_ATTN_Q      "blk.%d.attn_q.weight"
#define TN_ATTN_K      "blk.%d.attn_k.weight"
#define TN_ATTN_V      "blk.%d.attn_v.weight"
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
#define TN_FFN_NORM    "blk.%d.ffn_norm.weight"
#define TN_FFN_GATE    "blk.%d.ffn_gate.weight"
#define TN_FFN_DOWN    "blk.%d.ffn_down.weight"
#define TN_FFN_UP      "blk.%d.ffn_up.weight"

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define LLAMA_FILE_MAGIC_GGJT      0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3

#define TOKENIZER_NAME "llama"
#define UNKNOWN_TOKEN_ID 0
#define BOS_TOKEN_ID 1
#define EOS_TOKEN_ID 2

//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
    int dim;        // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers;   // number of layers
    int n_heads;    // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size; // vocabulary size, usually 256 (byte-level)
    int seq_len;    // max sequence length
} Config;
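
// llama2.c encodes "shared classifier weights" in the sign of vocab_size:
// a positive value means the output classifier reuses the token embedding table,
// a negative value means a separate wcls tensor is stored after the other weights
// (see main(), where the sign is checked and vocab_size is made positive again).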

struct TransformerWeights {
    // token embedding table
    std::vector<float> token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
    std::vector<float> rms_ffn_weight; // (layer, dim)
    // weights for matmuls
    std::vector<float> wq; // (layer, dim, dim)
    std::vector<float> wk; // (layer, dim, dim)
    std::vector<float> wv; // (layer, dim, dim)
    std::vector<float> wo; // (layer, dim, dim)
    // weights for ffn
    std::vector<float> w1; // (layer, hidden_dim, dim)
    std::vector<float> w2; // (layer, dim, hidden_dim)
    std::vector<float> w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    std::vector<float> rms_final_weight; // (dim,)
    // freq_cis for RoPE relative positional embeddings
    // std::vector<float> freq_cis_real; // (seq_len, dim/2)
    // std::vector<float> freq_cis_imag; // (seq_len, dim/2)
    // (optional) classifier weights for the logits, on the last layer
    std::vector<float> wcls;
};

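// Allocate buffers for all llama2.c weights. When the checkpoint uses grouped-query /
// multi-query attention (0 < n_kv_heads < n_heads), the key and value projections shrink
// by a factor of n_multiqueries = n_heads / n_kv_heads.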
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
    try {
        w->token_embedding_table.resize(p->vocab_size * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);

        w->rms_att_weight.resize(p->n_layers * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n", __func__, p->n_layers, p->dim, p->n_layers * p->dim);

        w->rms_ffn_weight.resize(p->n_layers * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n", __func__, p->n_layers, p->dim, p->n_layers * p->dim);

        w->wq.resize(p->n_layers * p->dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n", __func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n", __func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n", __func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wo.resize(p->n_layers * p->dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n", __func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n", __func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n", __func__, p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n", __func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->rms_final_weight.resize(p->dim);
        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n", __func__, p->dim);

        if (shared_weights) {
            w->wcls = {};
        } else {
            w->wcls.resize(p->vocab_size * p->dim);
            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);
        }
    }
    catch (std::length_error &) {
        die("Invalid configuration. Failed to allocate memory for weights");
    }
}

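// Read the weights from a llama2.c checkpoint into the buffers allocated above.
// The tensors appear in the file in this fixed order: token_embedding_table,
// rms_att_weight, wq, wk, wv, wo, rms_ffn_weight, w1, w2, w3, rms_final_weight,
// then the (skipped) freq_cis tables and, if not shared, the classifier weights wcls.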
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
    if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
    if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
    if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
    if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
    if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
    if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
    if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
    if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
    if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
    if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
    if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;

    // Skip freq_cis_real & freq_cis_imag
    int head_size = p->dim / p->n_heads;
    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);

    if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;

    // Check we didn't forget to read anything
    auto curr = ftell(f);
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
        return 1;
    }

    return 0;
}

static void print_sample_weights(TransformerWeights *w){
    LOG_INF("----- Quick print of first of the weight values of all the variables\n");
    LOG_INF("%f\n", w->token_embedding_table[0]);
    LOG_INF("%f\n", w->rms_att_weight[0]);
    LOG_INF("%f\n", w->rms_ffn_weight[0]);

    LOG_INF("%f\n", w->wq[0]);
    LOG_INF("%f\n", w->wk[0]);
    LOG_INF("%f\n", w->wv[0]);
    LOG_INF("%f\n", w->wo[0]);
    LOG_INF("%f\n", w->w1[0]);
    LOG_INF("%f\n", w->w2[0]);
    LOG_INF("%f\n", w->w3[0]);
    LOG_INF("%f\n", w->rms_final_weight[0]);
    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.

struct my_llama_vocab {
    using id    = int32_t;
    using token = std::string;
    using ttype = llama_token_type;

    struct token_data {
        token text;
        float score;
        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
    std::vector<token_data> id_to_token;
};

struct my_llama_hparams {
    uint32_t n_vocab   = 32000;
    uint32_t n_ctx     = 512;   // this is provided as user input?
    uint32_t n_embd    = 4096;
    uint32_t n_ff      = 11008;
    uint32_t n_mult    = 4;
    uint32_t n_head    = 32;
    uint32_t n_head_kv = 32;
    uint32_t n_layer   = 32;
    uint32_t n_rot     = 64;

    bool operator!=(const my_llama_hparams& other) const {
        return memcmp(this, &other, sizeof(my_llama_hparams));
    }
};

struct my_llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct my_llama_model {
    struct ggml_context * ctx = NULL;

    std::string name;

    my_llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<my_llama_layer> layers;

    uint32_t train_its = 0;
    uint32_t train_samples = 0;
    uint32_t train_tokens = 0;
};

struct train_params {
    const char * fn_vocab_model;
    const char * fn_llama2c_model;
    const char * fn_llama2c_output_model;
    const char * fn_train_data;
    const char * fn_checkpoint_in;
    const char * fn_checkpoint_out;
    const char * fn_model_out;

    uint32_t seed;

    int n_ctx;
    int n_embd;
    int n_mult;
    int n_head;
    int n_layer;
    int n_rotmax;

    int n_threads;
    int n_batch;
    int n_examples;
    int n_predict;

    int print_info_interval;
    int print_details_interval;

    bool samples_start_after_nl;
    bool use_adam;
    bool use_flash;
    bool use_scratch;

    // only adam
    int   warmup;
    int   cos_decay_steps;
    float cos_decay_restart;
    float cos_decay_alpha;

    int   lbfgs_n_iter;
    int   adam_n_iter;
    float adam_alpha;
    float adam_decay;

    int mem_model_gb;
    int mem_compute_gb;
    int mem_compute0_gb;
    int mem_compute1_gb;
};
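
// Note: train_params is carried over from the original training example. This converter
// only reads fn_vocab_model, fn_llama2c_model, fn_llama2c_output_model, n_ctx, n_rotmax
// and mem_model_gb; the remaining fields are given defaults but are otherwise unused here.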

static void print_params(struct my_llama_hparams * params) {
    LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
    LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
    LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
    LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
    LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
    LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
    LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
    LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
}

static void print_tensor_info(const struct ggml_context * ctx) {
    for (auto * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        LOG_INF("%s: Allocating ", __func__);
        int64_t total = 1;
        int i = 0;
        for (; i < ggml_n_dims(t); ++i) {
            if (i > 0) { LOG_INF("x "); }
            LOG_INF("[%" PRId64 "] ", t->ne[i]);
            total *= t->ne[i];
        }
        if (i > 1) { LOG_INF("= [%" PRId64 "] ", total); }
        LOG_INF("float space for %s\n", ggml_get_name(t));
    }
}

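// Allocate the ggml tensors of the output model according to hparams. Tensors are given
// the legacy "layers.N.*" names here; save_as_llama_model() later renames them to the
// GGUF "blk.N.*" convention (the TN_* patterns above) before writing the file.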
static void init_model(struct my_llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

    const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;

    const uint32_t n_ff = hparams.n_ff;
    struct ggml_context * ctx = model->ctx;

    model->train_its = 0;
    model->train_samples = 0;
    model->train_tokens = 0;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);

    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
    ggml_set_name(model->output,         "output.weight");

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);

        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());

        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());

        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());

        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
    }

    print_tensor_info(ctx);
}

static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = get_f32_2d(probs, k, i);
        LOG(" %f", p);
    }
    LOG("\n");
}

static void print_matrix(struct ggml_tensor * probs) {
    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
            LOG(" %.2f", p);
        }
        LOG("\n");
    }
}

struct my_llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    my_llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            size = 0;
        } else {
            seek(0, SEEK_END);
            size = tell();
            seek(0, SEEK_SET);
        }
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        GGML_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        GGML_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            die_fmt("fread failed: %s", strerror(errno));
        }
        if (ret != 1) {
            die("unexpectedly reached end of file");
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::float_t read_f32() {
        std::float_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    ~my_llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

static bool is_ggml_file(const char * filename) {
    my_llama_file file(filename, "rb");
    if (file.size < 4) {
        return false;
    }
    std::string magic = file.read_string(4);
    return magic == GGUF_MAGIC;
}

static std::string llama_escape_whitespaces(const std::string & text) {
    std::ostringstream out;
    for (char c : text) {
        // "\xe2\x96\x81" is UTF-8 for U+2581 ("▁"), the marker SentencePiece uses for whitespace
        if (c == ' ') out << "\xe2\x96\x81";
        else out << c;
    }
    return out.str();
}

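// Load the vocabulary either from a GGUF model file (tokens, scores and token types are read
// from the tokenizer.* KV arrays) or, failing that, from a llama2.c tokenizer.bin-style file:
// a max_token_length u32 followed by, per token, a float score, a u32 length and the raw bytes.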
static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
    if (is_ggml_file(filename)) {
        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ false,
            /*.ctx      = */ &ctx_data,
        };

        struct gguf_context * ctx = gguf_init_from_file(filename, params);
        GGML_ASSERT(ctx != NULL);

        const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
        GGML_ASSERT(model_idx >= 0);
        std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
        GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);

        const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
        GGML_ASSERT(token_idx >= 0);

        const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
        GGML_ASSERT(score_idx >= 0);
        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);

        const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
        GGML_ASSERT(toktype_idx >= 0);
        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);

        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
        if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
            die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
        }

        vocab->id_to_token.resize(n_vocab);

        for (uint32_t i = 0; i < n_vocab; i++) {
            std::string word = gguf_get_arr_str(ctx, token_idx, i);

            vocab->token_to_id[word] = i;

            auto & token_data = vocab->id_to_token[i];
            token_data.text  = std::move(word);
            token_data.score = scores[i];
            token_data.type  = (llama_token_type) toktypes[i];
        }
        ggml_free(ctx_data);
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
        my_llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
        }
        const int n_vocab = config->vocab_size;
        /* uint32_t max_token_length = */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
        for (my_llama_vocab::id id = 0; id < n_vocab; ++id) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
            std::string text = file.read_string(len);

            unsigned char byte_val;
            my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
            if (id == UNKNOWN_TOKEN_ID) {
                text = "<unk>";
                type = LLAMA_TOKEN_TYPE_UNKNOWN;
            } else if (id == BOS_TOKEN_ID) {
                text = "<s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (id == EOS_TOKEN_ID) {
                text = "</s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (text.empty()) {
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
                // Text of byte tokens is already in the expected format.
                type = LLAMA_TOKEN_TYPE_BYTE;
            } else {
                type = LLAMA_TOKEN_TYPE_NORMAL;
            }
            text = llama_escape_whitespaces(text);

            vocab->id_to_token[id].text  = text;
            vocab->id_to_token[id].score = score;
            vocab->id_to_token[id].type  = type;
            vocab->token_to_id.emplace(text, id);
        }
    }
}

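// Copy a flat float array from the llama2.c checkpoint into a ggml tensor, element by
// element. ggml_unravel_index() maps the flat index to tensor coordinates, so the same
// helper works for 1-D and 2-D tensors; no transposition is performed.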
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int size = 1;
    for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
        size *= gg_weights->ne[dim];
    }
    for (int ct = 0; ct < size; ++ct) {
        int64_t i0 = 0; int64_t i1 = 0;
        int64_t i2 = 0; int64_t i3 = 0;
        ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
        ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
    }
}

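// Convert the llama2.c weights into the ggml tensors of the model, write the tokenizer
// and hyperparameter KV pairs, rename the tensors to the GGUF naming scheme (TN_*) and
// write everything to a GGUF file.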
static void save_as_llama_model(
    struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
    // convert AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
    // float*                   -> struct ggml_tensor
    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
    convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());

    convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
    //print_row(model->norm, 0);

    // for rms-att-weight
    int row_length = model->hparams.n_embd;
    int n_ff = model->hparams.n_ff;

    const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
        auto & layer = model->layers[i];
        // 1d
        convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
        convert_weights_ak_to_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);

        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
        convert_weights_ak_to_gg(layer.wq            , &w->wq[i*row_length*row_length]);
        convert_weights_ak_to_gg(layer.wo            , &w->wo[i*row_length*row_length]);
        // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length/n_multiqueries]);
        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length/n_multiqueries]);

        convert_weights_ak_to_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
        convert_weights_ak_to_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
        convert_weights_ak_to_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
    }

    struct gguf_context * ctx = gguf_init_empty();

    std::vector<const char*> tokens;
    std::vector<float> scores;
    std::vector<llama_token_type> token_types;
    for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
        tokens.push_back(token_data.text.c_str());
        scores.push_back(token_data.score);
        token_types.push_back(token_data.type);
    }
    gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());

    gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);

    gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
    gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");

    // special tokens
    gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);

    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);

    // write tensors
    ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
    gguf_add_tensor(ctx, model->tok_embeddings);

    ggml_set_name(model->norm, TN_OUTPUT_NORM);
    gguf_add_tensor(ctx, model->norm);

    ggml_set_name(model->output, TN_OUTPUT);
    gguf_add_tensor(ctx, model->output);

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_format_name(layer.wq, TN_ATTN_Q, i);
        gguf_add_tensor(ctx, layer.wq);

        ggml_format_name(layer.wk, TN_ATTN_K, i);
        gguf_add_tensor(ctx, layer.wk);

        ggml_format_name(layer.wv, TN_ATTN_V, i);
        gguf_add_tensor(ctx, layer.wv);

        ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
        gguf_add_tensor(ctx, layer.wo);

        ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
        gguf_add_tensor(ctx, layer.attention_norm);

        ggml_format_name(layer.w1, TN_FFN_GATE, i);
        gguf_add_tensor(ctx, layer.w1);

        ggml_format_name(layer.w2, TN_FFN_DOWN, i);
        gguf_add_tensor(ctx, layer.w2);

        ggml_format_name(layer.w3, TN_FFN_UP, i);
        gguf_add_tensor(ctx, layer.w3);

        ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
        gguf_add_tensor(ctx, layer.ffn_norm);
    }

    gguf_write_to_file(ctx, filename, false);
    gguf_free(ctx);
}

static struct train_params get_default_train_params() {
    struct train_params params;
    params.fn_vocab_model          = "models/7B/ggml-model-f16.gguf";
    params.fn_llama2c_output_model = "ak_llama_model.bin";
    params.fn_train_data           = "shakespeare.txt";
    params.fn_checkpoint_in        = "checkpoint.bin";
    params.fn_checkpoint_out       = "checkpoint.bin";
    params.fn_model_out            = "ggml-checkpoint-f32.bin";

    params.seed = -1;

    params.n_ctx    = 128;
    params.n_embd   = 256;
    params.n_mult   = 256;
    params.n_head   = 8;
    params.n_layer  = 16;
    params.n_rotmax = 64;

    params.n_threads  = 6;
    params.n_batch    = 8;
    params.n_examples = 8;
    params.n_predict  = 1024;

    params.print_info_interval    = 1;
    params.print_details_interval = 2;

    params.samples_start_after_nl = false;
    params.use_adam               = true;
    params.use_flash              = false;
    params.use_scratch            = true;

    // only adam
    params.warmup            = 100;
    params.cos_decay_steps   = 1000;
    params.cos_decay_restart = 1.1f;
    params.cos_decay_alpha   = 0.0f;

    params.lbfgs_n_iter = 16;
    params.adam_n_iter  = 16;
    params.adam_alpha   = 1e-3f;
    params.adam_decay   = 1e-3f;

    params.mem_model_gb    = 2;
    params.mem_compute_gb  = 24;
    params.mem_compute0_gb = 8;
    params.mem_compute1_gb = 2;

    return params;
}

static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                     show this help message and exit\n");
    fprintf(stderr, "  --copy-vocab-from-model FNAME  path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
    fprintf(stderr, "  --llama2c-model FNAME          [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
    fprintf(stderr, "  --llama2c-output-model FNAME   model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
    fprintf(stderr, "\n");
}
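
// Example invocation (binary and file names are illustrative):
//
//   ./convert-llama2c-to-ggml \
//       --copy-vocab-from-model ./tokenizer.bin \
//       --llama2c-model ./stories42M.bin \
//       --llama2c-output-model ./stories42M.gguf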

static bool params_parse(int argc, char ** argv, struct train_params * params) {
    bool invalid_param = false;
    bool reqd_param_found = false;
    std::string arg;
    struct train_params default_params = get_default_train_params();
    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "--copy-vocab-from-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_vocab_model = argv[i];
        } else if (arg == "--llama2c-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            reqd_param_found = true;
            params->fn_llama2c_model = argv[i];
        } else if (arg == "--llama2c-output-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_llama2c_output_model = argv[i];
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv, &default_params);
            exit(0);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argc, argv, &default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (!reqd_param_found){
        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
        print_usage(argc, argv, &default_params);
        exit(1);
    }

    return true;
}

static std::string basename(const std::string &path) {
    size_t pos = path.find_last_of("/\\");
    if (pos == std::string::npos) {
        return path;
    }
    return path.substr(pos + 1);
}

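// Conversion pipeline: read the llama2.c config header and weights, load the vocabulary,
// allocate the ggml model tensors, copy the weights over and write the result as a GGUF file.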
int main(int argc, char ** argv) {
    common_init();

    struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }

    Config config;
    TransformerWeights weights = {};
    {
        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
        FILE * file = fopen(params.fn_llama2c_model, "rb");
        if (!file) {
            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        // read in the config header
        if (fread(&config, sizeof(Config), 1, file) != 1) {
            LOG_ERR("%s: Unable to read llama2c config from %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        auto shared_weights = config.vocab_size > 0;
        config.vocab_size = abs(config.vocab_size);

        // read in the Transformer weights
        alloc_weights(&weights, &config, shared_weights);
        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
            LOG_ERR("%s: Unable to initialize transformer weights from %s!", __func__, params.fn_llama2c_model);
            return 1;
        }
        fclose(file);
    }

    struct my_llama_vocab vocab;
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
    model.hparams.n_vocab   = config.vocab_size; //llama_vocab_n_vocab(lctx);
    model.hparams.n_ctx     = params.n_ctx;
    model.hparams.n_embd    = config.dim; //params.n_embd;
    model.hparams.n_ff      = config.hidden_dim;
    model.hparams.n_mult    = 32; //params.n_mult;
    model.hparams.n_head    = config.n_heads; //params.n_head;
    model.hparams.n_head_kv = config.n_kv_heads;
    model.hparams.n_layer   = config.n_layers; //params.n_layer;
    model.hparams.n_rot     = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);

    print_params(&model.hparams);

    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
    lcparams.mem_buffer = NULL;
    lcparams.no_alloc   = false;

    model.ctx = ggml_init(lcparams);

    init_model(&model);
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

    LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
}
| 942 | |