#include "ggml.h"
#include "gguf.h"

#include "llama.h"
#include "common.h"
#include "log.h"

#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
#include <algorithm>
#include <string>

// GGUF keys & tensor names.

#define KV_GENERAL_ARCHITECTURE "general.architecture"
#define KV_GENERAL_NAME "general.name"

#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"

#define KV_CONTEXT_LENGTH "llama.context_length"
#define KV_EMBEDDING_LENGTH "llama.embedding_length"
#define KV_BLOCK_COUNT "llama.block_count"
#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"

#define TN_TOKEN_EMBD "token_embd.weight"
#define TN_OUTPUT_NORM "output_norm.weight"
#define TN_OUTPUT "output.weight"
#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
#define TN_ATTN_Q "blk.%d.attn_q.weight"
#define TN_ATTN_K "blk.%d.attn_k.weight"
#define TN_ATTN_V "blk.%d.attn_v.weight"
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
#define TN_FFN_UP "blk.%d.ffn_up.weight"

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3

#define TOKENIZER_NAME "llama"
#define UNKNOWN_TOKEN_ID 0
#define BOS_TOKEN_ID 1
#define EOS_TOKEN_ID 2

//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
    int dim;        // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers;   // number of layers
    int n_heads;    // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size; // vocabulary size, usually 256 (byte-level)
    int seq_len;    // max sequence length
} Config;

struct TransformerWeights {
    // token embedding table
    std::vector<float> token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
    std::vector<float> rms_ffn_weight; // (layer, dim)
    // weights for matmuls
    std::vector<float> wq; // (layer, dim, dim)
    std::vector<float> wk; // (layer, dim, dim)
    std::vector<float> wv; // (layer, dim, dim)
    std::vector<float> wo; // (layer, dim, dim)
    // weights for ffn
    std::vector<float> w1; // (layer, hidden_dim, dim)
    std::vector<float> w2; // (layer, dim, hidden_dim)
    std::vector<float> w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    std::vector<float> rms_final_weight; // (dim,)
    // freq_cis for RoPE relative positional embeddings
    // std::vector<float> freq_cis_real; // (seq_len, dim/2)
    // std::vector<float> freq_cis_imag; // (seq_len, dim/2)
    // (optional) classifier weights for the logits, on the last layer
    std::vector<float> wcls;
};

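// Resize the weight buffers according to the llama2.c Config. When the checkpoint uses
// grouped-query / multi-query attention (n_kv_heads < n_heads), the K and V projections are
// smaller than the Q projection by a factor of n_heads / n_kv_heads (n_multiqueries below).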
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
    try {
        w->token_embedding_table.resize(p->vocab_size * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);

        w->rms_att_weight.resize(p->n_layers * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n", __func__, p->n_layers, p->dim, p->n_layers * p->dim);

        w->rms_ffn_weight.resize(p->n_layers * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n", __func__, p->n_layers, p->dim, p->n_layers * p->dim);

        w->wq.resize(p->n_layers * p->dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n", __func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n", __func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n", __func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wo.resize(p->n_layers * p->dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n", __func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n", __func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n", __func__, p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n", __func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->rms_final_weight.resize(p->dim);
        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n", __func__, p->dim);

        if (shared_weights) {
            w->wcls = {};
        } else {
            w->wcls.resize(p->vocab_size * p->dim);
            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);
        }
    }
    catch (std::length_error &) {
        die("Invalid configuration. Failed to allocate memory for weights");
    }
}

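// Read the weights from a llama2.c checkpoint. The Config header has already been consumed by the
// caller; the float tensors follow in a fixed order, then the (unused) freq_cis RoPE tables, and
// finally the classifier weights, which are only present when they are not shared with the embeddings.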
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
    if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
    if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
    if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
    if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
    if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
    if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
    if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
    if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
    if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
    if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
    if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;

    // Skip freq_cis_real & freq_cis_imag
    int head_size = p->dim / p->n_heads;
    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);

    if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;

    // Check we didn't forget to read anything
    auto curr = ftell(f);
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
        return 1;
    }

    return 0;
}

static void print_sample_weights(TransformerWeights *w){
    LOG_INF("----- Quick print of the first weight value of each variable\n");
    LOG_INF("%f\n", w->token_embedding_table[0]);
    LOG_INF("%f\n", w->rms_att_weight[0]);
    LOG_INF("%f\n", w->rms_ffn_weight[0]);

    LOG_INF("%f\n", w->wq[0]);
    LOG_INF("%f\n", w->wk[0]);
    LOG_INF("%f\n", w->wv[0]);
    LOG_INF("%f\n", w->wo[0]);
    LOG_INF("%f\n", w->w1[0]);
    LOG_INF("%f\n", w->w2[0]);
    LOG_INF("%f\n", w->w3[0]);
    LOG_INF("%f\n", w->rms_final_weight[0]);
    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.

struct my_llama_vocab {
    using id = int32_t;
    using token = std::string;
    using ttype = llama_token_type;

    struct token_data {
        token text;
        float score;
        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
    std::vector<token_data> id_to_token;
};

struct my_llama_hparams {
    uint32_t n_vocab   = 32000;
    uint32_t n_ctx     = 512; // this is provided as user input?
    uint32_t n_embd    = 4096;
    uint32_t n_ff      = 11008;
    uint32_t n_mult    = 4;
    uint32_t n_head    = 32;
    uint32_t n_head_kv = 32;
    uint32_t n_layer   = 32;
    uint32_t n_rot     = 64;

    bool operator!=(const my_llama_hparams& other) const {
        return memcmp(this, &other, sizeof(my_llama_hparams));
    }
};

struct my_llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct my_llama_model {
    struct ggml_context * ctx = NULL;

    std::string name;

    my_llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<my_llama_layer> layers;

    uint32_t train_its = 0;
    uint32_t train_samples = 0;
    uint32_t train_tokens = 0;
};

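// train_params retains many fields from the original training tooling; this converter only reads
// the vocab/model file paths plus n_ctx, n_rotmax and mem_model_gb. The remaining fields are only
// assigned defaults below and are not otherwise used here.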
struct train_params {
    const char * fn_vocab_model;
    const char * fn_llama2c_model;
    const char * fn_llama2c_output_model;
    const char * fn_train_data;
    const char * fn_checkpoint_in;
    const char * fn_checkpoint_out;
    const char * fn_model_out;

    uint32_t seed;

    int n_ctx;
    int n_embd;
    int n_mult;
    int n_head;
    int n_layer;
    int n_rotmax;

    int n_threads;
    int n_batch;
    int n_examples;
    int n_predict;

    int print_info_interval;
    int print_details_interval;

    bool samples_start_after_nl;
    bool use_adam;
    bool use_flash;
    bool use_scratch;

    // only adam
    int warmup;
    int cos_decay_steps;
    float cos_decay_restart;
    float cos_decay_alpha;

    int lbfgs_n_iter;
    int adam_n_iter;
    float adam_alpha;
    float adam_decay;

    int mem_model_gb;
    int mem_compute_gb;
    int mem_compute0_gb;
    int mem_compute1_gb;
};

static void print_params(struct my_llama_hparams * params) {
    LOG_INF("%s: n_vocab:   %u\n", __func__, params->n_vocab);
    LOG_INF("%s: n_ctx:     %u\n", __func__, params->n_ctx);
    LOG_INF("%s: n_embd:    %u\n", __func__, params->n_embd);
    LOG_INF("%s: n_mult:    %u\n", __func__, params->n_mult);
    LOG_INF("%s: n_head:    %u\n", __func__, params->n_head);
    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
    LOG_INF("%s: n_ff:      %u\n", __func__, params->n_ff);
    LOG_INF("%s: n_layer:   %u\n", __func__, params->n_layer);
    LOG_INF("%s: n_rot:     %u\n", __func__, params->n_rot);
}

static void print_tensor_info(const struct ggml_context * ctx) {
    for (auto * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        LOG_INF("%s: Allocating ", __func__);
        int64_t total = 1;
        int i = 0;
        for (; i < ggml_n_dims(t); ++i) {
            if (i > 0) { LOG_INF("x "); }
            LOG_INF("[%" PRId64 "] ", t->ne[i]);
            total *= t->ne[i];
        }
        if (i > 1) { LOG_INF("= [%" PRId64 "] ", total); }
        LOG_INF("float space for %s\n", ggml_get_name(t));
    }
}

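// Create the ggml tensors for the converted model. Note that for ggml_new_tensor_2d the first
// dimension (ne0) is the row length, so e.g. wk is n_embd wide and n_embd / n_multiqueries tall
// when the source model uses grouped-query attention.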
static void init_model(struct my_llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

    const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;

    const uint32_t n_ff = hparams.n_ff;
    struct ggml_context * ctx = model->ctx;

    model->train_its = 0;
    model->train_samples = 0;
    model->train_tokens = 0;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);

    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
    ggml_set_name(model->output,         "output.weight");

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);

        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());

        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());

        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());

        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
    }

    print_tensor_info(ctx);
}

static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = get_f32_2d(probs, k, i);
        LOG(" %f", p);
    }
    LOG("\n");
}

static void print_matrix(struct ggml_tensor * probs) {
    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
            LOG(" %.2f", p);
        }
        LOG("\n");
    }
}

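// Minimal file wrapper used for both the GGUF vocab file and the llama2.c tokenizer file: it keeps
// the FILE * around, records the total size on open, and uses 64-bit seek/tell on Windows.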
struct my_llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    my_llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            size = 0;
        } else {
            seek(0, SEEK_END);
            size = tell();
            seek(0, SEEK_SET);
        }
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        GGML_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        GGML_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            die_fmt("fread failed: %s", strerror(errno));
        }
        if (ret != 1) {
            die("unexpectedly reached end of file");
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }
    std::float_t read_f32() {
        std::float_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    ~my_llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

static bool is_ggml_file(const char * filename) {
    my_llama_file file(filename, "rb");
    if (file.size < 4) {
        return false;
    }
    std::string magic = file.read_string(4);
    return magic == GGUF_MAGIC;
}

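// SentencePiece-style whitespace escaping: spaces are replaced with U+2581 ("\xe2\x96\x81") so that
// plain-text tokens from the llama2.c vocab match the representation expected by the llama tokenizer.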
static std::string llama_escape_whitespaces(const std::string & text) {
    std::ostringstream out;
    for (char c : text) {
        if (c == ' ') out << "\xe2\x96\x81";
        else out << c;
    }
    return out.str();
}

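// Load the vocabulary either from an existing GGUF model (copying tokens, scores and token types)
// or from a llama2.c tokenizer file, whose layout is: max_token_length (u32), then for each token
// a score (f32), a length (u32) and the raw token bytes.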
static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
    if (is_ggml_file(filename)) {
        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ false,
            /*.ctx      = */ &ctx_data,
        };

        struct gguf_context * ctx = gguf_init_from_file(filename, params);
        GGML_ASSERT(ctx != NULL);

        const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
        GGML_ASSERT(model_idx >= 0);
        std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
        GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);

        const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
        GGML_ASSERT(token_idx >= 0);

        const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
        GGML_ASSERT(score_idx >= 0);
        const float * scores = (const float *) gguf_get_arr_data(ctx, score_idx);

        const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
        GGML_ASSERT(toktype_idx >= 0);
        const int * toktypes = (const int *) gguf_get_arr_data(ctx, toktype_idx);

        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
        if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
            die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
        }

        vocab->id_to_token.resize(n_vocab);

        for (uint32_t i = 0; i < n_vocab; i++) {
            std::string word = gguf_get_arr_str(ctx, token_idx, i);

            vocab->token_to_id[word] = i;

            auto & token_data = vocab->id_to_token[i];
            token_data.text  = std::move(word);
            token_data.score = scores[i];
            token_data.type  = (llama_token_type) toktypes[i];
        }
        ggml_free(ctx_data);
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
        my_llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
        }
        const int n_vocab = config->vocab_size;
        /* uint32_t max_token_length = */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
        for (my_llama_vocab::id id = 0; id < n_vocab; ++id) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
            std::string text = file.read_string(len);

            unsigned char byte_val;
            my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
            if (id == UNKNOWN_TOKEN_ID) {
                text = "<unk>";
                type = LLAMA_TOKEN_TYPE_UNKNOWN;
            } else if (id == BOS_TOKEN_ID) {
                text = "<s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (id == EOS_TOKEN_ID) {
                text = "</s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (text.empty()) {
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
                // Text of byte tokens is already in the expected format.
                type = LLAMA_TOKEN_TYPE_BYTE;
            } else {
                type = LLAMA_TOKEN_TYPE_NORMAL;
            }
            text = llama_escape_whitespaces(text);

            vocab->id_to_token[id].text  = text;
            vocab->id_to_token[id].score = score;
            vocab->id_to_token[id].type  = type;
            vocab->token_to_id.emplace(text, id);
        }
    }
}

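// Copy a flat llama2.c (AK) float array into a ggml tensor of the same total element count.
// Going through ggml_unravel_index / ggml_set_f32_nd keeps the copy correct for whatever strides
// the destination tensor has, at the cost of writing one element at a time.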
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int size = 1;
    for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
        size *= gg_weights->ne[dim];
    }
    for (int ct = 0; ct < size; ++ct) {
        int64_t i0 = 0; int64_t i1 = 0;
        int64_t i2 = 0; int64_t i3 = 0;
        ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
        ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
    }
}

static void save_as_llama_model(
    struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights * w, const char * filename
) {
    // convert AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
    // float* -> struct ggml_tensor
    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
    convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());

    convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
    //print_row(model->norm, 0);

    // for rms-att-weight
    int row_length = model->hparams.n_embd;
    int n_ff = model->hparams.n_ff;

    const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];
        // 1d
        convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
        convert_weights_ak_to_gg(layer.ffn_norm,       &w->rms_ffn_weight[i*row_length]);

        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
        convert_weights_ak_to_gg(layer.wq, &w->wq[i*row_length*row_length]);
        convert_weights_ak_to_gg(layer.wo, &w->wo[i*row_length*row_length]);
        // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
        convert_weights_ak_to_gg(layer.wk, &w->wk[i*row_length*row_length/n_multiqueries]);
        convert_weights_ak_to_gg(layer.wv, &w->wv[i*row_length*row_length/n_multiqueries]);

        convert_weights_ak_to_gg(layer.w1, &w->w1[i*row_length*n_ff]);
        convert_weights_ak_to_gg(layer.w2, &w->w2[i*n_ff*row_length]);
        convert_weights_ak_to_gg(layer.w3, &w->w3[i*row_length*n_ff]);
    }

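    // With all tensors converted, write a GGUF file: the vocabulary, special token ids and llama
    // hyperparameters go in as key/value metadata, then each tensor is renamed to the standard
    // blk.N.* / output* names and added to the file.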
    struct gguf_context * ctx = gguf_init_empty();

    std::vector<const char*> tokens;
    std::vector<float> scores;
    std::vector<llama_token_type> token_types;
    for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
        tokens.push_back(token_data.text.c_str());
        scores.push_back(token_data.score);
        token_types.push_back(token_data.type);
    }
    gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());

    gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);

    gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
    gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");

    // special tokens
    gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);

    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);

    // write tensors
    ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
    gguf_add_tensor(ctx, model->tok_embeddings);

    ggml_set_name(model->norm, TN_OUTPUT_NORM);
    gguf_add_tensor(ctx, model->norm);

    ggml_set_name(model->output, TN_OUTPUT);
    gguf_add_tensor(ctx, model->output);

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_format_name(layer.wq, TN_ATTN_Q, i);
        gguf_add_tensor(ctx, layer.wq);

        ggml_format_name(layer.wk, TN_ATTN_K, i);
        gguf_add_tensor(ctx, layer.wk);

        ggml_format_name(layer.wv, TN_ATTN_V, i);
        gguf_add_tensor(ctx, layer.wv);

        ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
        gguf_add_tensor(ctx, layer.wo);

        ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
        gguf_add_tensor(ctx, layer.attention_norm);

        ggml_format_name(layer.w1, TN_FFN_GATE, i);
        gguf_add_tensor(ctx, layer.w1);

        ggml_format_name(layer.w2, TN_FFN_DOWN, i);
        gguf_add_tensor(ctx, layer.w2);

        ggml_format_name(layer.w3, TN_FFN_UP, i);
        gguf_add_tensor(ctx, layer.w3);

        ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
        gguf_add_tensor(ctx, layer.ffn_norm);
    }

    gguf_write_to_file(ctx, filename, false);
    gguf_free(ctx);
}

static struct train_params get_default_train_params() {
    struct train_params params;
    params.fn_vocab_model          = "models/7B/ggml-model-f16.gguf";
    params.fn_llama2c_output_model = "ak_llama_model.bin";
    params.fn_train_data           = "shakespeare.txt";
    params.fn_checkpoint_in        = "checkpoint.bin";
    params.fn_checkpoint_out       = "checkpoint.bin";
    params.fn_model_out            = "ggml-checkpoint-f32.bin";

    params.seed = -1;

    params.n_ctx    = 128;
    params.n_embd   = 256;
    params.n_mult   = 256;
    params.n_head   = 8;
    params.n_layer  = 16;
    params.n_rotmax = 64;

    params.n_threads  = 6;
    params.n_batch    = 8;
    params.n_examples = 8;
    params.n_predict  = 1024;

    params.print_info_interval    = 1;
    params.print_details_interval = 2;

    params.samples_start_after_nl = false;
    params.use_adam               = true;
    params.use_flash              = false;
    params.use_scratch            = true;

    // only adam
    params.warmup            = 100;
    params.cos_decay_steps   = 1000;
    params.cos_decay_restart = 1.1f;
    params.cos_decay_alpha   = 0.0f;

    params.lbfgs_n_iter = 16;
    params.adam_n_iter  = 16;
    params.adam_alpha   = 1e-3f;
    params.adam_decay   = 1e-3f;

    params.mem_model_gb    = 2;
    params.mem_compute_gb  = 24;
    params.mem_compute0_gb = 8;
    params.mem_compute1_gb = 2;

    return params;
}

static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
    fprintf(stderr, "  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
    fprintf(stderr, "\n");
}

static bool params_parse(int argc, char ** argv, struct train_params * params) {
    bool invalid_param = false;
    bool reqd_param_found = false;
    std::string arg;
    struct train_params default_params = get_default_train_params();
    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "--copy-vocab-from-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_vocab_model = argv[i];
        } else if (arg == "--llama2c-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            reqd_param_found = true;
            params->fn_llama2c_model = argv[i];
        } else if (arg == "--llama2c-output-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_llama2c_output_model = argv[i];
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv, &default_params);
            exit(0);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argc, argv, &default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (!reqd_param_found) {
        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
        print_usage(argc, argv, &default_params);
        exit(1);
    }

    return true;
}

static std::string basename(const std::string & path) {
    size_t pos = path.find_last_of("/\\");
    if (pos == std::string::npos) {
        return path;
    }
    return path.substr(pos + 1);
}

int main(int argc, char ** argv) {
    common_init();

    struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }

    Config config;
    TransformerWeights weights = {};
    {
        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
        FILE * file = fopen(params.fn_llama2c_model, "rb");
        if (!file) {
            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        // read in the config header
        if (fread(&config, sizeof(Config), 1, file) != 1) {
            LOG_ERR("%s: Unable to read llama2c config from %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
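        // llama2.c encodes weight sharing in the sign of vocab_size: a positive value means the
        // output classifier shares the token embedding table, a negative value means separate
        // classifier weights (wcls) are stored at the end of the checkpoint.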
        auto shared_weights = config.vocab_size > 0;
        config.vocab_size = abs(config.vocab_size);

        // read in the Transformer weights
        alloc_weights(&weights, &config, shared_weights);
        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
            LOG_ERR("%s: Unable to initialize transformer weights from %s!", __func__, params.fn_llama2c_model);
            return 1;
        }
        fclose(file);
    }

    struct my_llama_vocab vocab;
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
    model.hparams.n_vocab   = config.vocab_size; //llama_vocab_n_vocab(lctx);
    model.hparams.n_ctx     = params.n_ctx;
    model.hparams.n_embd    = config.dim; //params.n_embd;
    model.hparams.n_ff      = config.hidden_dim;
    model.hparams.n_mult    = 32; //params.n_mult;
    model.hparams.n_head    = config.n_heads; //params.n_head;
    model.hparams.n_head_kv = config.n_kv_heads;
    model.hparams.n_layer   = config.n_layers; //params.n_layer;
    model.hparams.n_rot     = std::min((uint32_t) params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);

    print_params(&model.hparams);

    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
    lcparams.mem_buffer = NULL;
    lcparams.no_alloc   = false;

    model.ctx = ggml_init(lcparams);

    init_model(&model);
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

    LOG_INF("%s: Saving llama2.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
}