#include "ggml.h"
#include "gguf.h"

#include "arg.h"
#include "common.h"
#include "llama.h"
#include "pca.hpp"
#include "mean.hpp"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>


//////////////////////////////////////////////////
// utils

template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    std::string ret;
    for (; begin != end; ++begin) {
        ret += common_token_to_piece(ctx, *begin);
    }

    return ret;
}

static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
    printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
    printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
    printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
    printf("\n");
}

//////////////////////////////////////////////////


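// Capture pipeline, briefly: main() sets params.cb_eval to cb_eval (defined further down),
// so during llama_decode() each layer's output tensor ("l_out") is copied into this struct.
// One positive and one negative prompt are evaluated back to back, then calc_diff() turns
// the two sets of activations into per-layer difference matrices that are later fed to the
// PCA or mean step.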
// cb_eval is reused for each pair of positive/negative prompts
struct callback_data {
    ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered

    int n_layers = 0;
    int n_tokens = 0;
    bool is_eval_pos = true;

    // each element of the vector corresponds to one layer
    std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may be different for each layer

    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
    void save_tensor_for_layer(struct ggml_tensor * t) {
        GGML_ASSERT(t->type == GGML_TYPE_F32);

        if (ctx_ggml == nullptr) {
            // alloc a new ctx_ggml if needed
            struct ggml_init_params params_ggml = {
                /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc =*/ true,
            };
            ctx_ggml = ggml_init(params_ggml);
        }

        // copy tensor data
        auto n_bytes = ggml_nbytes(t);
        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
        ggml_set_name(t_layer, ggml_get_name(t));
        //print_debug_tensor(t_layer);

        if (is_eval_pos) {
            v_pos.push_back(t_layer);
        } else {
            v_neg.push_back(t_layer);
        }
    }

    // calculate diff (v_pos - v_neg) and place the result back into v_pos
    // all-zero rows in the diff tensor will also be removed
    // NOTE: the final layer is ignored. we only have (n_layers - 1) layers to process
    std::vector<struct ggml_tensor *> calc_diff() {
        for (size_t il = 0; il < v_pos.size(); il++) {
            float * a = (float *) v_pos[il]->data;
            float * b = (float *) v_neg[il]->data;
            size_t n_elem = ggml_nelements(v_pos[il]);
            for (size_t j = 0; j < n_elem; j++) {
                a[j] -= b[j];
            }
            //print_debug_tensor(v_pos[il]);
            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
            v_diff_filtered.push_back(diff_filtered);
        }
        return v_diff_filtered; // for convenience, we return the resulting std::vector
    }

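    // Note: rows where the positive and negative activations are (near) identical become
    // all-zero after the subtraction above; they carry no directional information, so
    // filter_nonzero_rows() drops them before the rows are handed to the PCA / mean step.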
    // delete all-zero rows from a given 2D tensor
    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
        //printf("filter_nonzero_rows\n");
        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
            // check if the given row contains only (near) zero elements
            int n_cols = t->ne[0]; // hint: should be equal to n_embd
            for (int col = 0; col < n_cols; ++col) {
                if (std::fabs(ggml_get_f32_nd(t, col, row, 0, 0)) > eps) {
                    return false;
                }
            }
            return true;
        };
        std::vector<int> rows_to_copy; // the indices of the non-zero rows (to be copied into rows of diff_filtered)
        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
            if (!is_row_all_zeros(a, i_row, 1e-6)) {
                rows_to_copy.push_back(i_row);
            }
        }

        // get "n_nonzero_rows" for the output "diff_filtered"
        int n_nonzero_rows = rows_to_copy.size();
        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
        int n_embd = a->ne[0];
        GGML_ASSERT(n_nonzero_rows > 0);

        // diff_filtered: [n_embd, n_nonzero_rows]
        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));

        // copy the non-zero rows
        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
            int src_row = rows_to_copy[dest_row];
            for (int i = 0; i < n_embd; i++) {
                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
            }
        }

        //print_debug_tensor(diff_filtered);

        return diff_filtered;
    }

    // we don't implement a destructor, because we want to reuse callback_data. we just want to free the tensors
    void reset() {
        for (auto ptr : v_pos)           free(ptr->data);
        for (auto ptr : v_neg)           free(ptr->data);
        for (auto ptr : v_diff_filtered) free(ptr->data);
        v_pos.clear();
        v_neg.clear();
        v_diff_filtered.clear();
        if (ctx_ggml) {
            ggml_free(ctx_ggml);
        }
        ctx_ggml = nullptr;
    }
};

/**
 * process_ctx is used to store the ggml context for pre/post-processing the diff vectors
 * in short, input => v_diff and output => v_final
 */
struct train_context {
    ggml_context * ctx_ggml;
    int n_embd;
    int n_layers;

    /* pairs of prompts to be used for generating the final vector */
    std::vector<std::string> positive_entries;
    std::vector<std::string> negative_entries;

    // each element of the vector corresponds to one layer
    // NOTE: the last layer is discarded. therefore, we will have (n_layers - 1) elements here
    // NOTE (2): v_diff is transposed from v_diff_tmp when using PCA (see build_v_diff)
    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file

    // to easily re-alloc when concatenating v_diff, we temporarily store v_diff in a vector instead of a tensor
    // v_diff_tmp will get converted into v_diff later on
    std::vector<std::vector<uint8_t>> v_diff_tmp;

    train_context(int n_embd_, int n_layers_) {
        n_embd = n_embd_;
        n_layers = n_layers_;
        struct ggml_init_params params_ggml = {
            /*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc =*/ true,
        };
        ctx_ggml = ggml_init(params_ggml);
        for (int il = 0; il < n_layers - 1; il++) {
            std::vector<uint8_t> empty;
            v_diff_tmp.push_back(empty);
            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of this malloc if possible
            v_final.push_back(t);
        }
    }

    // append new rows to the existing per-layer data in v_diff_tmp
    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
        for (int il = 0; il < n_layers - 1; il++) {
            auto t = diff_filtered[il];
            auto & diff_tmp = v_diff_tmp[il];
            size_t curr_size = diff_tmp.size();
            diff_tmp.resize(curr_size + ggml_nbytes(t));
            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
        }
    }

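    // For each layer, v_diff ends up holding every per-token diff row collected over all
    // prompt pairs, concatenated into a single matrix. The PCA path wants this matrix
    // transposed (build_v_diff() is called with transpose = use_pca in main()), while the
    // mean path keeps the [n_embd, n_rows] layout.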
    // build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
    // TODO @ngxson : maybe add an option NOT to transpose v_diff; will be useful for the "mean" method
    void build_v_diff(bool transpose) {
        printf("build_v_diff\n");
        for (int il = 0; il < n_layers - 1; il++) {
            auto & diff_tmp = v_diff_tmp[il];
            int n_elem = diff_tmp.size() / sizeof(float);
            GGML_ASSERT(n_elem % n_embd == 0);
            int n_rows = n_elem / n_embd;
            struct ggml_tensor * diff = transpose
                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
            if (transpose) {
                // copy data & transpose
                float * arr = (float *) diff_tmp.data();
                for (int ir = 0; ir < n_rows; ++ir) {
                    for (int ic = 0; ic < n_embd; ++ic) {
                        float f = arr[ir*n_embd + ic];
                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
                    }
                }
            } else {
                // only copy
                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
            }
            v_diff.push_back(diff);
            print_debug_tensor(diff);
            // free the memory of diff_tmp
            diff_tmp.resize(0);
        }
    }

    ~train_context() {
        for (auto ptr : v_final) free(ptr->data);
        for (auto ptr : v_diff)  free(ptr->data);
        // no need to free v_diff_tmp, since we didn't use malloc
        ggml_free(ctx_ggml);
    }
};

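// A positive/negative prompt pair is tokenized together: the shorter sequence is padded
// up to max_seq_len, so that both evaluations produce hidden-state matrices of identical
// shape [n_embd, n_tokens], which calc_diff() can then subtract element-wise.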
struct tokenized_prompt {
    std::vector<llama_token> tokens_pos;
    std::vector<llama_token> tokens_neg;
    size_t max_seq_len;

    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const bool add_bos = llama_vocab_get_add_bos(vocab);
        tokens_pos = common_tokenize(ctx, pos, add_bos, true);
        tokens_neg = common_tokenize(ctx, neg, add_bos, true);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
        padding_seq(ctx, tokens_pos, max_seq_len);
        padding_seq(ctx, tokens_neg, max_seq_len);
    }

    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
        // TODO: customize padding token
        std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
        llama_token pad_tok = pad_tokens.back();
        while (tokens.size() < len) {
            tokens.push_back(pad_tok);
        }
    }
};

//////////////////////////////////////////////////

template <typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}

static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
    std::vector<std::string> output;
    std::ifstream file(path);
    if (!file.is_open()) {
        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
        exit(1);
    }
    std::string line;
    while (std::getline(file, line)) {
        bool is_skip = skip_empty_lines && line.empty();
        if (!is_skip) {
            string_process_escapes(line);
            output.push_back(line);
        }
    }
    file.close();
    return output;
}

//////////////////////////////////////////////////

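// Backend scheduler callback: it is first called with ask == true to ask whether we want
// to observe a given tensor, and, if so, called again with ask == false once the tensor
// data is available. We only keep the per-layer output tensors named "l_out" that cover
// the full prompt (ne[1] == n_tokens).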
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;
    static const char * l_out_name = "l_out";
    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;

    if (ask) {
        return is_l_out;
    }

    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
        return true;
    }

    // save the tensor to current context
    cb_data->save_tensor_for_layer(t);
    return true;
}

static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_memory_clear(llama_get_memory(ctx), true);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}

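// Write one control-vector tensor per layer into a GGUF file, tagged with the
// "controlvector" architecture, the source model hint and the layer count. This is the
// metadata the llama.cpp control-vector loader looks for (e.g. when a vector is applied
// with --control-vector).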
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
    struct gguf_context * ctx = gguf_init_empty();

    const std::string arch = "controlvector";
    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());

    for (size_t i = 0; i < v_ctrl.size(); ++i) {
        gguf_add_tensor(ctx, v_ctrl[i]);
        print_debug_tensor(v_ctrl[i]);
        printf("Added tensor: %s\n", v_ctrl[i]->name);
    }

    printf("%s: writing file...\n", __func__);
    gguf_write_to_file(ctx, fname.c_str(), false);
    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
    gguf_free(ctx);
}

/**
 * Load the positive and negative prompt files; each line becomes one training entry.
 */
static int prepare_entries(common_params & params, train_context & ctx_train) {
    // load prompts
    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
    if (positive_prompts.size() != negative_prompts.size()) {
        fprintf(stderr, "number of positive and negative prompts must be equal\n");
        return 1;
    }
    if (positive_prompts.empty()) {
        fprintf(stderr, "must provide at least one prompt pair\n");
        return 1;
    }
    ctx_train.positive_entries = positive_prompts;
    ctx_train.negative_entries = negative_prompts;
    return 0;
}

int main(int argc, char ** argv) {
    common_params params;

    params.out_file = "control_vector.gguf";

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
        return 1;
    }

    if (params.n_pca_iterations % params.n_pca_batch != 0) {
        fprintf(stderr, "PCA iterations must be a multiple of the PCA batch size\n");
        return 1;
    }


    callback_data cb_data;

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = cb_eval;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    print_build_info();
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model to get hparams
    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model.get();
    llama_context * ctx = llama_init.context.get();

    // int n_ctx = llama_n_ctx(ctx);
    int n_layers = llama_model_n_layer(model);
    int n_embd = llama_model_n_embd(model);

    // get the model hint param (a.k.a. the model arch name)
    char model_hint[128];
    llama_model_meta_val_str(model, "general.architecture", model_hint, 128);

    // init train_context
    train_context ctx_train(n_embd, n_layers);

    // load and prepare entries for training
    prepare_entries(params, ctx_train);

    // we have to pretokenize everything because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
    std::vector<tokenized_prompt> tokenized_prompts;
    size_t n_total_tokens = 0;
    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
        n_total_tokens += 2 * t.max_seq_len;
        tokenized_prompts.push_back(std::move(t));
    }

    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;

    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        bool success = false;
        tokenized_prompt t = tokenized_prompts[i];
        cb_data.n_layers = n_layers;
        cb_data.n_tokens = t.max_seq_len;

        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
            (int) i+1, (int) ctx_train.positive_entries.size(),
            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
            (int) t.max_seq_len);

        cb_data.is_eval_pos = true;
        success = get_hidden_layers(ctx, t.tokens_pos);
        if (!success) break;

        cb_data.is_eval_pos = false;
        success = get_hidden_layers(ctx, t.tokens_neg);
        if (!success) break;

        // calculate the diff and remove all-zero rows
        auto v_diff_filtered = cb_data.calc_diff();

        // save & concat the filtered v_diff to ctx_train
        ctx_train.concat_diff_tmp(v_diff_filtered);

        // reset for the next iteration
        cb_data.reset();
    }

    // done with the model, we can now free it to reclaim some memory
    printf("Done evaluating prompts, unloading model...\n");

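    // Reduce each layer's diff matrix to a single n_embd-dimensional direction:
    // the PCA method extracts the dominant principal component of the diff rows
    // (implementation in pca.hpp), while the mean method simply averages them
    // (implementation in mean.hpp).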
    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;

    // prepare ctx_train for PCA
    ctx_train.build_v_diff(use_pca);

    if (use_pca) {
        // run PCA
        PCA::pca_params pca_params;
        pca_params.n_threads    = params.cpuparams.n_threads;
        pca_params.n_batch      = params.n_pca_batch;
        pca_params.n_iterations = params.n_pca_iterations;
        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
    } else {
        // run mean
        mean::run(ctx_train.v_diff, ctx_train.v_final);
    }

    // write output vectors to gguf
    export_gguf(ctx_train.v_final, params.out_file, model_hint);

    llama_backend_free();

    return 0;
}