#include "ggml.h"
#include "gguf.h"

#include "llama.h"
#include "common.h"
#include "log.h"

#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
#include <algorithm>
#include <string>

// GGUF keys & tensor names.

#define KV_GENERAL_ARCHITECTURE "general.architecture"
#define KV_GENERAL_NAME "general.name"

#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"

#define KV_CONTEXT_LENGTH "llama.context_length"
#define KV_EMBEDDING_LENGTH "llama.embedding_length"
#define KV_BLOCK_COUNT "llama.block_count"
#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"

#define TN_TOKEN_EMBD "token_embd.weight"
#define TN_OUTPUT_NORM "output_norm.weight"
#define TN_OUTPUT "output.weight"
#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
#define TN_ATTN_Q "blk.%d.attn_q.weight"
#define TN_ATTN_K "blk.%d.attn_k.weight"
#define TN_ATTN_V "blk.%d.attn_v.weight"
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
#define TN_FFN_UP "blk.%d.ffn_up.weight"

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3

#define TOKENIZER_NAME "llama"
#define UNKNOWN_TOKEN_ID 0
#define BOS_TOKEN_ID 1
#define EOS_TOKEN_ID 2

//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
    int dim;        // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers;   // number of layers
    int n_heads;    // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size; // vocabulary size, usually 256 (byte-level)
    int seq_len;    // max sequence length
} Config;

struct TransformerWeights {
    // token embedding table
    std::vector<float> token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
    std::vector<float> rms_ffn_weight; // (layer, dim)
    // weights for matmuls
    std::vector<float> wq; // (layer, dim, dim)
    std::vector<float> wk; // (layer, dim, dim)
    std::vector<float> wv; // (layer, dim, dim)
    std::vector<float> wo; // (layer, dim, dim)
    // weights for ffn
    std::vector<float> w1; // (layer, hidden_dim, dim)
    std::vector<float> w2; // (layer, dim, hidden_dim)
    std::vector<float> w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    std::vector<float> rms_final_weight; // (dim,)
    // freq_cis for RoPE relative positional embeddings
    // std::vector<float> freq_cis_real; // (seq_len, dim/2)
    // std::vector<float> freq_cis_imag; // (seq_len, dim/2)
    // (optional) classifier weights for the logits, on the last layer
    std::vector<float> wcls;
};

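// Resize the weight buffers according to the llama2.c Config. When the checkpoint uses
// grouped-query / multi-query attention (n_kv_heads < n_heads), the K and V projections are
// smaller than the Q projection by a factor of n_heads / n_kv_heads (n_multiqueries below).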
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
    try {
        w->token_embedding_table.resize(p->vocab_size * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);

        w->rms_att_weight.resize(p->n_layers * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n", __func__, p->n_layers, p->dim, p->n_layers * p->dim);

        w->rms_ffn_weight.resize(p->n_layers * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n", __func__, p->n_layers, p->dim, p->n_layers * p->dim);

        w->wq.resize(p->n_layers * p->dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n", __func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n", __func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n", __func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wo.resize(p->n_layers * p->dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n", __func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n", __func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n", __func__, p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n", __func__, p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->rms_final_weight.resize(p->dim);
        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n", __func__, p->dim);

        if (shared_weights) {
            w->wcls = {};
        } else {
            w->wcls.resize(p->vocab_size * p->dim);
            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);
        }
    }
    catch (std::length_error &) {
        die("Invalid configuration. Failed to allocate memory for weights");
    }
}

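// Read the weights from a llama2.c checkpoint. The Config header has already been consumed by the
// caller; the float tensors follow in a fixed order, then the (unused) freq_cis RoPE tables, and
// finally the classifier weights, which are only present when they are not shared with the embeddings.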
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
    if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
    if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
    if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
    if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
    if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
    if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
    if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
    if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
    if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
    if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
    if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;

    // Skip freq_cis_real & freq_cis_imag
    int head_size = p->dim / p->n_heads;
    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);

    if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;

    // Check we didn't forget to read anything
    auto curr = ftell(f);
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
        return 1;
    }

    return 0;
}

static void print_sample_weights(TransformerWeights *w){
    LOG_INF("----- Quick print of the first weight value of each variable\n");
    LOG_INF("%f\n", w->token_embedding_table[0]);
    LOG_INF("%f\n", w->rms_att_weight[0]);
    LOG_INF("%f\n", w->rms_ffn_weight[0]);

    LOG_INF("%f\n", w->wq[0]);
    LOG_INF("%f\n", w->wk[0]);
    LOG_INF("%f\n", w->wv[0]);
    LOG_INF("%f\n", w->wo[0]);
    LOG_INF("%f\n", w->w1[0]);
    LOG_INF("%f\n", w->w2[0]);
    LOG_INF("%f\n", w->w3[0]);
    LOG_INF("%f\n", w->rms_final_weight[0]);
    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.

struct my_llama_vocab {
    using id = int32_t;
    using token = std::string;
    using ttype = llama_token_type;

    struct token_data {
        token text;
        float score;
        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
    std::vector<token_data> id_to_token;
};

struct my_llama_hparams {
    uint32_t n_vocab   = 32000;
    uint32_t n_ctx     = 512; // this is provided as user input?
    uint32_t n_embd    = 4096;
    uint32_t n_ff      = 11008;
    uint32_t n_mult    = 4;
    uint32_t n_head    = 32;
    uint32_t n_head_kv = 32;
    uint32_t n_layer   = 32;
    uint32_t n_rot     = 64;

    bool operator!=(const my_llama_hparams& other) const {
        return memcmp(this, &other, sizeof(my_llama_hparams));
    }
};

struct my_llama_layer {
    // normalization
    struct ggml_tensor * attention_norm;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;

    // normalization
    struct ggml_tensor * ffn_norm;

    // ff
    struct ggml_tensor * w1;
    struct ggml_tensor * w2;
    struct ggml_tensor * w3;
};

struct my_llama_model {
    struct ggml_context * ctx = NULL;

    std::string name;

    my_llama_hparams hparams;

    struct ggml_tensor * tok_embeddings;

    struct ggml_tensor * norm;
    struct ggml_tensor * output;

    std::vector<my_llama_layer> layers;

    uint32_t train_its = 0;
    uint32_t train_samples = 0;
    uint32_t train_tokens = 0;
};

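// train_params retains many fields from the original training tooling; this converter only reads
// the vocab/model file paths plus n_ctx, n_rotmax and mem_model_gb. The remaining fields are only
// assigned defaults below and are not otherwise used here.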
struct train_params {
    const char * fn_vocab_model;
    const char * fn_llama2c_model;
    const char * fn_llama2c_output_model;
    const char * fn_train_data;
    const char * fn_checkpoint_in;
    const char * fn_checkpoint_out;
    const char * fn_model_out;

    uint32_t seed;

    int n_ctx;
    int n_embd;
    int n_mult;
    int n_head;
    int n_layer;
    int n_rotmax;

    int n_threads;
    int n_batch;
    int n_examples;
    int n_predict;

    int print_info_interval;
    int print_details_interval;

    bool samples_start_after_nl;
    bool use_adam;
    bool use_flash;
    bool use_scratch;

    // only adam
    int warmup;
    int cos_decay_steps;
    float cos_decay_restart;
    float cos_decay_alpha;

    int lbfgs_n_iter;
    int adam_n_iter;
    float adam_alpha;
    float adam_decay;

    int mem_model_gb;
    int mem_compute_gb;
    int mem_compute0_gb;
    int mem_compute1_gb;
};

static void print_params(struct my_llama_hparams * params) {
    LOG_INF("%s: n_vocab:   %u\n", __func__, params->n_vocab);
    LOG_INF("%s: n_ctx:     %u\n", __func__, params->n_ctx);
    LOG_INF("%s: n_embd:    %u\n", __func__, params->n_embd);
    LOG_INF("%s: n_mult:    %u\n", __func__, params->n_mult);
    LOG_INF("%s: n_head:    %u\n", __func__, params->n_head);
    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
    LOG_INF("%s: n_ff:      %u\n", __func__, params->n_ff);
    LOG_INF("%s: n_layer:   %u\n", __func__, params->n_layer);
    LOG_INF("%s: n_rot:     %u\n", __func__, params->n_rot);
}

static void print_tensor_info(const struct ggml_context * ctx) {
    for (auto * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        LOG_INF("%s: Allocating ", __func__);
        int64_t total = 1;
        int i = 0;
        for (; i < ggml_n_dims(t); ++i) {
            if (i > 0) { LOG_INF("x "); }
            LOG_INF("[%" PRId64 "] ", t->ne[i]);
            total *= t->ne[i];
        }
        if (i > 1) { LOG_INF("= [%" PRId64 "] ", total); }
        LOG_INF("float space for %s\n", ggml_get_name(t));
    }
}

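// Create the ggml tensors for the converted model. Note that for ggml_new_tensor_2d the first
// dimension (ne0) is the row length, so e.g. wk is n_embd wide and n_embd / n_multiqueries tall
// when the source model uses grouped-query attention.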
static void init_model(struct my_llama_model * model) {
    const auto & hparams = model->hparams;

    const uint32_t n_embd  = hparams.n_embd;
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

    const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;

    const uint32_t n_ff = hparams.n_ff;
    struct ggml_context * ctx = model->ctx;

    model->train_its = 0;
    model->train_samples = 0;
    model->train_tokens = 0;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);

    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
    ggml_set_name(model->output,         "output.weight");

    model->layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];

        std::string layers_i = "layers." + std::to_string(i);

        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);

        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());

        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());

        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());

        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
    }

    print_tensor_info(ctx);
}

static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
    return *ptr;
}

static void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = get_f32_2d(probs, k, i);
        LOG(" %f", p);
    }
    LOG("\n");
}

static void print_matrix(struct ggml_tensor * probs) {
    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
            LOG(" %.2f", p);
        }
        LOG("\n");
    }
}

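// Minimal file wrapper used for both the GGUF vocab file and the llama2.c tokenizer file: it keeps
// the FILE * around, records the total size on open, and uses 64-bit seek/tell on Windows.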
struct my_llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    my_llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            size = 0;
        } else {
            seek(0, SEEK_END);
            size = tell();
            seek(0, SEEK_SET);
        }
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        GGML_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        GGML_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            die_fmt("fread failed: %s", strerror(errno));
        }
        if (ret != 1) {
            die("unexpectedly reached end of file");
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }
    std::float_t read_f32() {
        std::float_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    ~my_llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

static bool is_ggml_file(const char * filename) {
    my_llama_file file(filename, "rb");
    if (file.size < 4) {
        return false;
    }
    std::string magic = file.read_string(4);
    return magic == GGUF_MAGIC;
}

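// SentencePiece-style whitespace escaping: spaces are replaced with U+2581 ("\xe2\x96\x81") so that
// plain-text tokens from the llama2.c vocab match the representation expected by the llama tokenizer.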
static std::string llama_escape_whitespaces(const std::string & text) {
    std::ostringstream out;
    for (char c : text) {
        if (c == ' ') out << "\xe2\x96\x81";
        else out << c;
    }
    return out.str();
}

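// Load the vocabulary either from an existing GGUF model (copying tokens, scores and token types)
// or from a llama2.c tokenizer file, whose layout is: max_token_length (u32), then for each token
// a score (f32), a length (u32) and the raw token bytes.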
static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
    if (is_ggml_file(filename)) {
        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ false,
            /*.ctx      = */ &ctx_data,
        };

        struct gguf_context * ctx = gguf_init_from_file(filename, params);
        GGML_ASSERT(ctx != NULL);

        const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
        GGML_ASSERT(model_idx >= 0);
        std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
        GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);

        const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
        GGML_ASSERT(token_idx >= 0);

        const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
        GGML_ASSERT(score_idx >= 0);
        const float * scores = (const float *) gguf_get_arr_data(ctx, score_idx);

        const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
        GGML_ASSERT(toktype_idx >= 0);
        const int * toktypes = (const int *) gguf_get_arr_data(ctx, toktype_idx);

        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
        if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
            die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
        }

        vocab->id_to_token.resize(n_vocab);

        for (uint32_t i = 0; i < n_vocab; i++) {
            std::string word = gguf_get_arr_str(ctx, token_idx, i);

            vocab->token_to_id[word] = i;

            auto & token_data = vocab->id_to_token[i];
            token_data.text  = std::move(word);
            token_data.score = scores[i];
            token_data.type  = (llama_token_type) toktypes[i];
        }
        ggml_free(ctx_data);
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
        my_llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
        }
        const int n_vocab = config->vocab_size;
        /* uint32_t max_token_length = */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
        for (my_llama_vocab::id id = 0; id < n_vocab; ++id) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
            std::string text = file.read_string(len);

            unsigned char byte_val;
            my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
            if (id == UNKNOWN_TOKEN_ID) {
                text = "<unk>";
                type = LLAMA_TOKEN_TYPE_UNKNOWN;
            } else if (id == BOS_TOKEN_ID) {
                text = "<s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (id == EOS_TOKEN_ID) {
                text = "</s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (text.empty()) {
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
                // Text of byte tokens is already in the expected format.
                type = LLAMA_TOKEN_TYPE_BYTE;
            } else {
                type = LLAMA_TOKEN_TYPE_NORMAL;
            }
            text = llama_escape_whitespaces(text);

            vocab->id_to_token[id].text  = text;
            vocab->id_to_token[id].score = score;
            vocab->id_to_token[id].type  = type;
            vocab->token_to_id.emplace(text, id);
        }
    }
}

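// Copy a flat llama2.c (AK) float array into a ggml tensor of the same total element count.
// Going through ggml_unravel_index / ggml_set_f32_nd keeps the copy correct for whatever strides
// the destination tensor has, at the cost of writing one element at a time.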
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int size = 1;
    for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
        size *= gg_weights->ne[dim];
    }
    for (int ct = 0; ct < size; ++ct) {
        int64_t i0 = 0; int64_t i1 = 0;
        int64_t i2 = 0; int64_t i3 = 0;
        ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
        ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
    }
}

static void save_as_llama_model(
    struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights * w, const char * filename
) {
    // convert AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
    // float* -> struct ggml_tensor
    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
    convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());

    convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
    //print_row(model->norm, 0);

    // for rms-att-weight
    int row_length = model->hparams.n_embd;
    int n_ff = model->hparams.n_ff;

    const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];
        // 1d
        convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
        convert_weights_ak_to_gg(layer.ffn_norm,       &w->rms_ffn_weight[i*row_length]);

        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
        convert_weights_ak_to_gg(layer.wq, &w->wq[i*row_length*row_length]);
        convert_weights_ak_to_gg(layer.wo, &w->wo[i*row_length*row_length]);
        // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
        convert_weights_ak_to_gg(layer.wk, &w->wk[i*row_length*row_length/n_multiqueries]);
        convert_weights_ak_to_gg(layer.wv, &w->wv[i*row_length*row_length/n_multiqueries]);

        convert_weights_ak_to_gg(layer.w1, &w->w1[i*row_length*n_ff]);
        convert_weights_ak_to_gg(layer.w2, &w->w2[i*n_ff*row_length]);
        convert_weights_ak_to_gg(layer.w3, &w->w3[i*row_length*n_ff]);
    }

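    // With all tensors converted, write a GGUF file: the vocabulary, special token ids and llama
    // hyperparameters go in as key/value metadata, then each tensor is renamed to the standard
    // blk.N.* / output* names and added to the file.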
    struct gguf_context * ctx = gguf_init_empty();

    std::vector<const char*> tokens;
    std::vector<float> scores;
    std::vector<llama_token_type> token_types;
    for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
        tokens.push_back(token_data.text.c_str());
        scores.push_back(token_data.score);
        token_types.push_back(token_data.type);
    }
    gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());

    gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);

    gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
    gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");

    // special tokens
    gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);

    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);

    // write tensors
    ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
    gguf_add_tensor(ctx, model->tok_embeddings);

    ggml_set_name(model->norm, TN_OUTPUT_NORM);
    gguf_add_tensor(ctx, model->norm);

    ggml_set_name(model->output, TN_OUTPUT);
    gguf_add_tensor(ctx, model->output);

    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];

        ggml_format_name(layer.wq, TN_ATTN_Q, i);
        gguf_add_tensor(ctx, layer.wq);

        ggml_format_name(layer.wk, TN_ATTN_K, i);
        gguf_add_tensor(ctx, layer.wk);

        ggml_format_name(layer.wv, TN_ATTN_V, i);
        gguf_add_tensor(ctx, layer.wv);

        ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
        gguf_add_tensor(ctx, layer.wo);

        ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
        gguf_add_tensor(ctx, layer.attention_norm);

        ggml_format_name(layer.w1, TN_FFN_GATE, i);
        gguf_add_tensor(ctx, layer.w1);

        ggml_format_name(layer.w2, TN_FFN_DOWN, i);
        gguf_add_tensor(ctx, layer.w2);

        ggml_format_name(layer.w3, TN_FFN_UP, i);
        gguf_add_tensor(ctx, layer.w3);

        ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
        gguf_add_tensor(ctx, layer.ffn_norm);
    }

    gguf_write_to_file(ctx, filename, false);
    gguf_free(ctx);
}

static struct train_params get_default_train_params() {
    struct train_params params;
    params.fn_vocab_model          = "models/7B/ggml-model-f16.gguf";
    params.fn_llama2c_output_model = "ak_llama_model.bin";
    params.fn_train_data           = "shakespeare.txt";
    params.fn_checkpoint_in        = "checkpoint.bin";
    params.fn_checkpoint_out       = "checkpoint.bin";
    params.fn_model_out            = "ggml-checkpoint-f32.bin";

    params.seed = -1;

    params.n_ctx    = 128;
    params.n_embd   = 256;
    params.n_mult   = 256;
    params.n_head   = 8;
    params.n_layer  = 16;
    params.n_rotmax = 64;

    params.n_threads  = 6;
    params.n_batch    = 8;
    params.n_examples = 8;
    params.n_predict  = 1024;

    params.print_info_interval    = 1;
    params.print_details_interval = 2;

    params.samples_start_after_nl = false;
    params.use_adam               = true;
    params.use_flash              = false;
    params.use_scratch            = true;

    // only adam
    params.warmup            = 100;
    params.cos_decay_steps   = 1000;
    params.cos_decay_restart = 1.1f;
    params.cos_decay_alpha   = 0.0f;

    params.lbfgs_n_iter = 16;
    params.adam_n_iter  = 16;
    params.adam_alpha   = 1e-3f;
    params.adam_decay   = 1e-3f;

    params.mem_model_gb    = 2;
    params.mem_compute_gb  = 24;
    params.mem_compute0_gb = 8;
    params.mem_compute1_gb = 2;

    return params;
}

static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
    fprintf(stderr, "  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
    fprintf(stderr, "\n");
}

static bool params_parse(int argc, char ** argv, struct train_params * params) {
    bool invalid_param = false;
    bool reqd_param_found = false;
    std::string arg;
    struct train_params default_params = get_default_train_params();
    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "--copy-vocab-from-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_vocab_model = argv[i];
        } else if (arg == "--llama2c-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            reqd_param_found = true;
            params->fn_llama2c_model = argv[i];
        } else if (arg == "--llama2c-output-model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_llama2c_output_model = argv[i];
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv, &default_params);
            exit(0);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argc, argv, &default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (!reqd_param_found) {
        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
        print_usage(argc, argv, &default_params);
        exit(1);
    }

    return true;
}

static std::string basename(const std::string & path) {
    size_t pos = path.find_last_of("/\\");
    if (pos == std::string::npos) {
        return path;
    }
    return path.substr(pos + 1);
}

int main(int argc, char ** argv) {
    common_init();

    struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }

    Config config;
    TransformerWeights weights = {};
    {
        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
        FILE * file = fopen(params.fn_llama2c_model, "rb");
        if (!file) {
            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        // read in the config header
        if (fread(&config, sizeof(Config), 1, file) != 1) {
            LOG_ERR("%s: Unable to read llama2c config from %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
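        // llama2.c encodes weight sharing in the sign of vocab_size: a positive value means the
        // output classifier shares the token embedding table, a negative value means separate
        // classifier weights (wcls) are stored at the end of the checkpoint.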
        auto shared_weights = config.vocab_size > 0;
        config.vocab_size = abs(config.vocab_size);

        // read in the Transformer weights
        alloc_weights(&weights, &config, shared_weights);
        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
            LOG_ERR("%s: Unable to initialize transformer weights from %s!", __func__, params.fn_llama2c_model);
            return 1;
        }
        fclose(file);
    }

    struct my_llama_vocab vocab;
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
    model.hparams.n_vocab   = config.vocab_size; //llama_vocab_n_vocab(lctx);
    model.hparams.n_ctx     = params.n_ctx;
    model.hparams.n_embd    = config.dim; //params.n_embd;
    model.hparams.n_ff      = config.hidden_dim;
    model.hparams.n_mult    = 32; //params.n_mult;
    model.hparams.n_head    = config.n_heads; //params.n_head;
    model.hparams.n_head_kv = config.n_kv_heads;
    model.hparams.n_layer   = config.n_layers; //params.n_layer;
    model.hparams.n_rot     = std::min((uint32_t) params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);

    print_params(&model.hparams);

    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
    lcparams.mem_buffer = NULL;
    lcparams.no_alloc   = false;

    model.ctx = ggml_init(lcparams);

    init_model(&model);
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

    LOG_INF("%s: Saving llama2.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
}