#include "ggml.h"
#include "gguf.h"

#include "arg.h"
#include "common.h"
#include "llama.h"
#include "pca.hpp"
#include "mean.hpp"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>


//////////////////////////////////////////////////
// utils

template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    std::string ret;
    for (; begin != end; ++begin) {
        ret += common_token_to_piece(ctx, *begin);
    }

    return ret;
}

static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
    printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
    printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
    printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
    printf("\n");
}

//////////////////////////////////////////////////


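// Capture pipeline, briefly: main() sets params.cb_eval to cb_eval (defined further down),
// so during llama_decode() each layer's output tensor ("l_out") is copied into this struct.
// One positive and one negative prompt are evaluated back to back, then calc_diff() turns
// the two sets of activations into per-layer difference matrices that are later fed to the
// PCA or mean step.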
// cb_eval is reused for each pair of positive/negative prompts
struct callback_data {
    ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered

    int n_layers = 0;
    int n_tokens = 0;
    bool is_eval_pos = true;

    // each element of the vector corresponds to one layer
    std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may be different for each layer

    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
    void save_tensor_for_layer(struct ggml_tensor * t) {
        GGML_ASSERT(t->type == GGML_TYPE_F32);

        if (ctx_ggml == nullptr) {
            // alloc a new ctx_ggml if needed
            struct ggml_init_params params_ggml = {
                /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc =*/ true,
            };
            ctx_ggml = ggml_init(params_ggml);
        }

        // copy tensor data
        auto n_bytes = ggml_nbytes(t);
        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
        ggml_set_name(t_layer, ggml_get_name(t));
        //print_debug_tensor(t_layer);

        if (is_eval_pos) {
            v_pos.push_back(t_layer);
        } else {
            v_neg.push_back(t_layer);
        }
    }

    // calculate diff (v_pos - v_neg) and place the result back into v_pos
    // all-zero rows in the diff tensor will also be removed
    // NOTE: the final layer is ignored. we only have (n_layers - 1) layers to process
    std::vector<struct ggml_tensor *> calc_diff() {
        for (size_t il = 0; il < v_pos.size(); il++) {
            float * a = (float *) v_pos[il]->data;
            float * b = (float *) v_neg[il]->data;
            size_t n_elem = ggml_nelements(v_pos[il]);
            for (size_t j = 0; j < n_elem; j++) {
                a[j] -= b[j];
            }
            //print_debug_tensor(v_pos[il]);
            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
            v_diff_filtered.push_back(diff_filtered);
        }
        return v_diff_filtered; // for convenience, we return the resulting std::vector
    }

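    // Note: rows where the positive and negative activations are (near) identical become
    // all-zero after the subtraction above; they carry no directional information, so
    // filter_nonzero_rows() drops them before the rows are handed to the PCA / mean step.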
    // delete all-zero rows from a given 2D tensor
    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
        //printf("filter_nonzero_rows\n");
        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
            // check if the given row contains only (near) zero elements
            int n_cols = t->ne[0]; // hint: should be equal to n_embd
            for (int col = 0; col < n_cols; ++col) {
                if (std::fabs(ggml_get_f32_nd(t, col, row, 0, 0)) > eps) {
                    return false;
                }
            }
            return true;
        };
        std::vector<int> rows_to_copy; // the indices of the non-zero rows (to be copied into rows of diff_filtered)
        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
            if (!is_row_all_zeros(a, i_row, 1e-6)) {
                rows_to_copy.push_back(i_row);
            }
        }

        // get "n_nonzero_rows" for the output "diff_filtered"
        int n_nonzero_rows = rows_to_copy.size();
        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
        int n_embd = a->ne[0];
        GGML_ASSERT(n_nonzero_rows > 0);

        // diff_filtered: [n_embd, n_nonzero_rows]
        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));

        // copy the non-zero rows
        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
            int src_row = rows_to_copy[dest_row];
            for (int i = 0; i < n_embd; i++) {
                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
            }
        }

        //print_debug_tensor(diff_filtered);

        return diff_filtered;
    }

    // we don't implement a destructor, because we want to reuse callback_data. we just want to free the tensors
    void reset() {
        for (auto ptr : v_pos)           free(ptr->data);
        for (auto ptr : v_neg)           free(ptr->data);
        for (auto ptr : v_diff_filtered) free(ptr->data);
        v_pos.clear();
        v_neg.clear();
        v_diff_filtered.clear();
        if (ctx_ggml) {
            ggml_free(ctx_ggml);
        }
        ctx_ggml = nullptr;
    }
};

/**
 * process_ctx is used to store the ggml context for pre/post-processing the diff vectors
 * in short, input => v_diff and output => v_final
 */
struct train_context {
    ggml_context * ctx_ggml;
    int n_embd;
    int n_layers;

    /* pairs of prompts to be used for generating the final vector */
    std::vector<std::string> positive_entries;
    std::vector<std::string> negative_entries;

    // each element of the vector corresponds to one layer
    // NOTE: the last layer is discarded. therefore, we will have (n_layers - 1) elements here
    // NOTE (2): v_diff is transposed from v_diff_tmp when using PCA (see build_v_diff)
    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file

    // to easily re-alloc when concatenating v_diff, we temporarily store v_diff in a vector instead of a tensor
    // v_diff_tmp will get converted into v_diff later on
    std::vector<std::vector<uint8_t>> v_diff_tmp;

    train_context(int n_embd_, int n_layers_) {
        n_embd = n_embd_;
        n_layers = n_layers_;
        struct ggml_init_params params_ggml = {
            /*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc =*/ true,
        };
        ctx_ggml = ggml_init(params_ggml);
        for (int il = 0; il < n_layers - 1; il++) {
            std::vector<uint8_t> empty;
            v_diff_tmp.push_back(empty);
            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of this malloc if possible
            v_final.push_back(t);
        }
    }

    // append new rows to the existing per-layer data in v_diff_tmp
    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
        for (int il = 0; il < n_layers - 1; il++) {
            auto t = diff_filtered[il];
            auto & diff_tmp = v_diff_tmp[il];
            size_t curr_size = diff_tmp.size();
            diff_tmp.resize(curr_size + ggml_nbytes(t));
            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
        }
    }

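    // For each layer, v_diff ends up holding every per-token diff row collected over all
    // prompt pairs, concatenated into a single matrix. The PCA path wants this matrix
    // transposed (build_v_diff() is called with transpose = use_pca in main()), while the
    // mean path keeps the [n_embd, n_rows] layout.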
    // build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
    // TODO @ngxson : maybe add an option NOT to transpose v_diff; will be useful for the "mean" method
    void build_v_diff(bool transpose) {
        printf("build_v_diff\n");
        for (int il = 0; il < n_layers - 1; il++) {
            auto & diff_tmp = v_diff_tmp[il];
            int n_elem = diff_tmp.size() / sizeof(float);
            GGML_ASSERT(n_elem % n_embd == 0);
            int n_rows = n_elem / n_embd;
            struct ggml_tensor * diff = transpose
                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
            if (transpose) {
                // copy data & transpose
                float * arr = (float *) diff_tmp.data();
                for (int ir = 0; ir < n_rows; ++ir) {
                    for (int ic = 0; ic < n_embd; ++ic) {
                        float f = arr[ir*n_embd + ic];
                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
                    }
                }
            } else {
                // only copy
                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
            }
            v_diff.push_back(diff);
            print_debug_tensor(diff);
            // free the memory of diff_tmp
            diff_tmp.resize(0);
        }
    }

    ~train_context() {
        for (auto ptr : v_final) free(ptr->data);
        for (auto ptr : v_diff)  free(ptr->data);
        // no need to free v_diff_tmp, since we didn't use malloc
        ggml_free(ctx_ggml);
    }
};

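// A positive/negative prompt pair is tokenized together: the shorter sequence is padded
// up to max_seq_len, so that both evaluations produce hidden-state matrices of identical
// shape [n_embd, n_tokens], which calc_diff() can then subtract element-wise.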
struct tokenized_prompt {
    std::vector<llama_token> tokens_pos;
    std::vector<llama_token> tokens_neg;
    size_t max_seq_len;

    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const bool add_bos = llama_vocab_get_add_bos(vocab);
        tokens_pos = common_tokenize(ctx, pos, add_bos, true);
        tokens_neg = common_tokenize(ctx, neg, add_bos, true);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
        padding_seq(ctx, tokens_pos, max_seq_len);
        padding_seq(ctx, tokens_neg, max_seq_len);
    }

    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
        // TODO: customize padding token
        std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
        llama_token pad_tok = pad_tokens.back();
        while (tokens.size() < len) {
            tokens.push_back(pad_tok);
        }
    }
};

//////////////////////////////////////////////////

template <typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}

static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
    std::vector<std::string> output;
    std::ifstream file(path);
    if (!file.is_open()) {
        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
        exit(1);
    }
    std::string line;
    while (std::getline(file, line)) {
        bool is_skip = skip_empty_lines && line.empty();
        if (!is_skip) {
            string_process_escapes(line);
            output.push_back(line);
        }
    }
    file.close();
    return output;
}

//////////////////////////////////////////////////

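// Backend scheduler callback: it is first called with ask == true to ask whether we want
// to observe a given tensor, and, if so, called again with ask == false once the tensor
// data is available. We only keep the per-layer output tensors named "l_out" that cover
// the full prompt (ne[1] == n_tokens).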
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;
    static const char * l_out_name = "l_out";
    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;

    if (ask) {
        return is_l_out;
    }

    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
        return true;
    }

    // save the tensor to current context
    cb_data->save_tensor_for_layer(t);
    return true;
}

static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_memory_clear(llama_get_memory(ctx), true);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}

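// Write one control-vector tensor per layer into a GGUF file, tagged with the
// "controlvector" architecture, the source model hint and the layer count. This is the
// metadata the llama.cpp control-vector loader looks for (e.g. when a vector is applied
// with --control-vector).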
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
    struct gguf_context * ctx = gguf_init_empty();

    const std::string arch = "controlvector";
    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());

    for (size_t i = 0; i < v_ctrl.size(); ++i) {
        gguf_add_tensor(ctx, v_ctrl[i]);
        print_debug_tensor(v_ctrl[i]);
        printf("Added tensor: %s\n", v_ctrl[i]->name);
    }

    printf("%s: writing file...\n", __func__);
    gguf_write_to_file(ctx, fname.c_str(), false);
    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
    gguf_free(ctx);
}

/**
 * Load the positive and negative prompt files; each line becomes one training entry.
 */
static int prepare_entries(common_params & params, train_context & ctx_train) {
    // load prompts
    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
    if (positive_prompts.size() != negative_prompts.size()) {
        fprintf(stderr, "number of positive and negative prompts must be equal\n");
        return 1;
    }
    if (positive_prompts.empty()) {
        fprintf(stderr, "must provide at least one prompt pair\n");
        return 1;
    }
    ctx_train.positive_entries = positive_prompts;
    ctx_train.negative_entries = negative_prompts;
    return 0;
}

int main(int argc, char ** argv) {
    common_params params;

    params.out_file = "control_vector.gguf";

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
        return 1;
    }

    if (params.n_pca_iterations % params.n_pca_batch != 0) {
        fprintf(stderr, "PCA iterations must be a multiple of the PCA batch size\n");
        return 1;
    }


    callback_data cb_data;

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = cb_eval;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    print_build_info();
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model to get hparams
    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model.get();
    llama_context * ctx = llama_init.context.get();

    // int n_ctx = llama_n_ctx(ctx);
    int n_layers = llama_model_n_layer(model);
    int n_embd = llama_model_n_embd(model);

    // get the model hint param (a.k.a. the model arch name)
    char model_hint[128];
    llama_model_meta_val_str(model, "general.architecture", model_hint, 128);

    // init train_context
    train_context ctx_train(n_embd, n_layers);

    // load and prepare entries for training
    prepare_entries(params, ctx_train);

    // we have to pretokenize everything because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
    std::vector<tokenized_prompt> tokenized_prompts;
    size_t n_total_tokens = 0;
    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
        n_total_tokens += 2 * t.max_seq_len;
        tokenized_prompts.push_back(std::move(t));
    }

    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;

    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        bool success = false;
        tokenized_prompt t = tokenized_prompts[i];
        cb_data.n_layers = n_layers;
        cb_data.n_tokens = t.max_seq_len;

        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
            (int) i+1, (int) ctx_train.positive_entries.size(),
            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
            (int) t.max_seq_len);

        cb_data.is_eval_pos = true;
        success = get_hidden_layers(ctx, t.tokens_pos);
        if (!success) break;

        cb_data.is_eval_pos = false;
        success = get_hidden_layers(ctx, t.tokens_neg);
        if (!success) break;

        // calculate the diff and remove all-zero rows
        auto v_diff_filtered = cb_data.calc_diff();

        // save & concat the filtered v_diff to ctx_train
        ctx_train.concat_diff_tmp(v_diff_filtered);

        // reset for the next iteration
        cb_data.reset();
    }

    // done with the model, we can now free it to reclaim some memory
    printf("Done evaluating prompts, unloading model...\n");

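    // Reduce each layer's diff matrix to a single n_embd-dimensional direction:
    // the PCA method extracts the dominant principal component of the diff rows
    // (implementation in pca.hpp), while the mean method simply averages them
    // (implementation in mean.hpp).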
    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;

    // prepare ctx_train for PCA
    ctx_train.build_v_diff(use_pca);

    if (use_pca) {
        // run PCA
        PCA::pca_params pca_params;
        pca_params.n_threads    = params.cpuparams.n_threads;
        pca_params.n_batch      = params.n_pca_batch;
        pca_params.n_iterations = params.n_pca_iterations;
        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
    } else {
        // run mean
        mean::run(ctx_train.v_diff, ctx_train.v_final);
    }

    // write output vectors to gguf
    export_gguf(ctx_train.v_final, params.out_file, model_hint);

    llama_backend_free();

    return 0;
}