#include "ggml.h"
#include "gguf.h"

#include "arg.h"
#include "common.h"
#include "llama.h"
#include "pca.hpp"
#include "mean.hpp"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <algorithm>
#include <climits>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>


//////////////////////////////////////////////////
// utils

template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    std::string ret;
    for (; begin != end; ++begin) {
        ret += common_token_to_piece(ctx, *begin);
    }

    return ret;
}

static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
    printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
    printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
    printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
    printf("\n");
}

//////////////////////////////////////////////////

// cb_eval is reused for each positive/negative prompt pair
struct callback_data {
    ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered

    int n_layers = 0;
    int n_tokens = 0;
    bool is_eval_pos = true;

    // each element of the vector corresponds to one layer
    std::vector<struct ggml_tensor *> v_pos;           // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_neg;           // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may be different for each layer

    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
    void save_tensor_for_layer(struct ggml_tensor * t) {
        GGML_ASSERT(t->type == GGML_TYPE_F32);

        if (ctx_ggml == nullptr) {
            // alloc a new ctx_ggml if needed
            struct ggml_init_params params_ggml = {
                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ctx_ggml = ggml_init(params_ggml);
        }

        // copy tensor data
        auto n_bytes = ggml_nbytes(t);
        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
        ggml_set_name(t_layer, ggml_get_name(t));
        //print_debug_tensor(t_layer);

        if (is_eval_pos) {
            v_pos.push_back(t_layer);
        } else {
            v_neg.push_back(t_layer);
        }
    }

    // calculate diff (v_pos - v_neg) and place the result back into v_pos
    // all-zero rows in the diff tensor will also be removed
    // NOTE: the final layer is ignored, so we only have (n_layers - 1) layers to process
    std::vector<struct ggml_tensor *> calc_diff() {
        for (size_t il = 0; il < v_pos.size(); il++) {
            float * a = (float *) v_pos[il]->data;
            float * b = (float *) v_neg[il]->data;
            size_t n_elem = ggml_nelements(v_pos[il]);
            for (size_t j = 0; j < n_elem; j++) {
                a[j] -= b[j];
            }
            //print_debug_tensor(v_pos[il]);
            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
            v_diff_filtered.push_back(diff_filtered);
        }
        return v_diff_filtered; // for convenience, we return the resulting std::vector
    }

    // delete zero rows from a given 2D tensor
    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
        //printf("filter_nonzero_rows\n");
        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
            // check if the given row contains only zero elements
            int n_cols = t->ne[0]; // hint: should be equal to n_embd
            for (int col = 0; col < n_cols; ++col) {
                if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
                    return false;
                }
            }
            return true;
        };
        std::vector<int> rows_to_copy; // the indices of the non-zero rows (to be copied into diff_filtered)
        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
            if (!is_row_all_zeros(a, i_row, 1e-6)) {
                rows_to_copy.push_back(i_row);
            }
        }

        // get "n_nonzero_rows" for the output "diff_filtered"
        int n_nonzero_rows = rows_to_copy.size();
        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
        int n_embd = a->ne[0];
        GGML_ASSERT(n_nonzero_rows > 0);

        // diff_filtered: [n_embd, n_nonzero_rows]
        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));

        // copy non-zero rows
        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
            int src_row = rows_to_copy[dest_row];
            for (int i = 0; i < n_embd; i++) {
                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
            }
        }

        //print_debug_tensor(diff_filtered);

        return diff_filtered;
    }

    // we don't implement a destructor because we want to reuse callback_data; reset() just frees the tensors
    void reset() {
        for (auto ptr : v_pos) free(ptr->data);
        for (auto ptr : v_neg) free(ptr->data);
        for (auto ptr : v_diff_filtered) free(ptr->data);
        v_pos.clear();
        v_neg.clear();
        v_diff_filtered.clear();
        if (ctx_ggml) {
            ggml_free(ctx_ggml);
        }
        ctx_ggml = nullptr;
    }
};

/**
 * process_ctx is used to store the ggml context for pre/post-processing the diff vectors
 * in short, input => v_diff and output => v_final
 */
struct train_context {
    ggml_context * ctx_ggml;
    int n_embd;
    int n_layers;

    /* pairs of prompts used for generating the final vectors */
    std::vector<std::string> positive_entries;
    std::vector<std::string> negative_entries;

    // each element of the vector corresponds to one layer
    // NOTE: the last layer is discarded, therefore we have (n_layers - 1) elements here
    // NOTE (2): v_diff is transposed from v_diff_tmp
    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file

    // to make re-allocation easy when concatenating diffs, we temporarily store v_diff as raw bytes in a vector instead of a tensor
    // v_diff_tmp will get converted into v_diff later on
    std::vector<std::vector<uint8_t>> v_diff_tmp;

    train_context(int n_embd_, int n_layers_) {
        n_embd = n_embd_;
        n_layers = n_layers_;
        struct ggml_init_params params_ggml = {
            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx_ggml = ggml_init(params_ggml);
        for (int il = 0; il < n_layers - 1; il++) {
            std::vector<uint8_t> empty;
            v_diff_tmp.push_back(empty);
            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
            v_final.push_back(t);
        }
    }

    // add new rows into existing tensor in v_diff_tmp
    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
        for (int il = 0; il < n_layers - 1; il++) {
            auto t = diff_filtered[il];
            auto & diff_tmp = v_diff_tmp[il];
            size_t curr_size = diff_tmp.size();
            diff_tmp.resize(curr_size + ggml_nbytes(t));
            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
        }
    }

    // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
    // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
    void build_v_diff(bool transpose) {
        printf("build_v_diff\n");
        for (int il = 0; il < n_layers - 1; il++) {
            auto & diff_tmp = v_diff_tmp[il];
            int n_elem = diff_tmp.size() / sizeof(float);
            GGML_ASSERT(n_elem % n_embd == 0);
            int n_rows = n_elem / n_embd;
            struct ggml_tensor * diff = transpose
                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
            if (transpose) {
                // copy data & transpose
                float * arr = (float *) diff_tmp.data();
                for (int ir = 0; ir < n_rows; ++ir) {
                    for (int ic = 0; ic < n_embd; ++ic) {
                        float f = arr[ir*n_embd + ic];
                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
                    }
                }
            } else {
                // only copy
                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
            }
            v_diff.push_back(diff);
            print_debug_tensor(diff);
            // free memory of diff_tmp
            diff_tmp.resize(0);
        }
    }

    ~train_context() {
        for (auto ptr : v_final) free(ptr->data);
        for (auto ptr : v_diff)  free(ptr->data);
        // no need to free v_diff_tmp, since we didn't use malloc
        ggml_free(ctx_ggml);
    }
};

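// a positive/negative prompt pair, tokenized and padded to the same length
// so that the corresponding hidden-state matrices have identical shapes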
struct tokenized_prompt {
    std::vector<llama_token> tokens_pos;
    std::vector<llama_token> tokens_neg;
    size_t max_seq_len;

    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const bool add_bos = llama_vocab_get_add_bos(vocab);
        tokens_pos = common_tokenize(ctx, pos, add_bos, true);
        tokens_neg = common_tokenize(ctx, neg, add_bos, true);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
        padding_seq(ctx, tokens_pos, max_seq_len);
        padding_seq(ctx, tokens_neg, max_seq_len);
    }

    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
        // TODO: customize padding token
        std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
        llama_token pad_tok = pad_tokens.back();
        while (tokens.size() < len) {
            tokens.push_back(pad_tok);
        }
    }
};

//////////////////////////////////////////////////

template <typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}

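// load prompts from a text file, one prompt per line; escape sequences in each line are processed via string_process_escapes()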
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
    std::vector<std::string> output;
    std::ifstream file(path);
    if (!file.is_open()) {
        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
        exit(1);
    }
    std::string line;
    while (std::getline(file, line)) {
        bool is_skip = skip_empty_lines && line.empty();
        if (!is_skip) {
            string_process_escapes(line);
            output.push_back(line);
        }
    }
    file.close();
    return output;
}

//////////////////////////////////////////////////

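// graph-eval callback: when ask == true we request only the per-layer output tensors ("l_out");
// when called again with the tensor data available, the tensor is copied into cb_data for the current prompt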
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;
    static const char * l_out_name = "l_out";
    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;

    if (ask) {
        return is_l_out;
    }

    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
        return true;
    }

    // save the tensor to current context
    cb_data->save_tensor_for_layer(t);
    return true;
}

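// run a forward pass over the tokenized prompt (clearing the memory/KV state first);
// the per-layer hidden states are captured by the cb_eval callback during decoding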
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_memory_clear(llama_get_memory(ctx), true);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}

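// write the per-layer control vectors to a GGUF file; general.architecture is set to
// "controlvector" and the architecture of the source model is stored as a hint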
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
    struct gguf_context * ctx = gguf_init_empty();

    const std::string arch = "controlvector";
    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());

    for (size_t i = 0; i < v_ctrl.size(); ++i) {
        gguf_add_tensor(ctx, v_ctrl[i]);
        print_debug_tensor(v_ctrl[i]);
        printf("Added tensor: %s\n", v_ctrl[i]->name);
    }

    printf("%s: writing file...\n", __func__);
    gguf_write_to_file(ctx, fname.c_str(), false);
    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
    gguf_free(ctx);
}

/**
 * Load prompt files and completion file.
 * Then format each pair of prompt + completion to make an entry.
 */
static int prepare_entries(common_params & params, train_context & ctx_train) {
    // load prompts
    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
    if (positive_prompts.size() != negative_prompts.size()) {
        fprintf(stderr, "number of positive and negative prompts must be equal\n");
        return 1;
    }
    if (positive_prompts.empty()) {
        fprintf(stderr, "must provide at least one prompt pair\n");
        return 1;
    }
    ctx_train.positive_entries = positive_prompts;
    ctx_train.negative_entries = negative_prompts;
    return 0;
}

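// entry point: evaluate each positive/negative prompt pair, accumulate per-layer hidden-state diffs,
// reduce them to one direction per layer with PCA or the mean method, then export the result as a GGUF control vector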
int main(int argc, char ** argv) {
    common_params params;

    params.out_file = "control_vector.gguf";

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
        return 1;
    }

    if (params.n_pca_iterations % params.n_pca_batch != 0) {
        fprintf(stderr, "PCA iterations must be a multiple of the PCA batch size\n");
        return 1;
    }

    callback_data cb_data;

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = cb_eval;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    print_build_info();
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model to get hparams
    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model.get();
    llama_context * ctx = llama_init.context.get();

    // int n_ctx = llama_n_ctx(ctx);
    int n_layers = llama_model_n_layer(model);
    int n_embd = llama_model_n_embd(model);

    // get model hint param (a.k.a. model arch name)
    char model_hint[128];
    llama_model_meta_val_str(model, "general.architecture", model_hint, 128);

    // init train_context
    train_context ctx_train(n_embd, n_layers);

    // load and prepare entries for training
    prepare_entries(params, ctx_train);

    // we have to pretokenize everything up front, because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
    std::vector<tokenized_prompt> tokenized_prompts;
    size_t n_total_tokens = 0;
    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
        n_total_tokens += 2 * t.max_seq_len;
        tokenized_prompts.push_back(std::move(t));
    }

    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;

    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        bool success = false;
        tokenized_prompt t = tokenized_prompts[i];
        cb_data.n_layers = n_layers;
        cb_data.n_tokens = t.max_seq_len;

        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
            (int) i+1, (int) ctx_train.positive_entries.size(),
            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
            (int) t.max_seq_len);

        cb_data.is_eval_pos = true;
        success = get_hidden_layers(ctx, t.tokens_pos);
        if (!success) break;

        cb_data.is_eval_pos = false;
        success = get_hidden_layers(ctx, t.tokens_neg);
        if (!success) break;

        // calculate diff and remove all zero rows
        auto v_diff_filtered = cb_data.calc_diff();

        // save & concat the filtered v_diff to ctx_train
        ctx_train.concat_diff_tmp(v_diff_filtered);

        // reset for next iteration
        cb_data.reset();
    }

    // done with the model, we can now free it to free up some memory
    printf("Done evaluating prompts, unloading model...\n");

    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;

    // prepare ctx_train for PCA
    ctx_train.build_v_diff(use_pca);

    if (use_pca) {
        // run PCA
        PCA::pca_params pca_params;
        pca_params.n_threads    = params.cpuparams.n_threads;
        pca_params.n_batch      = params.n_pca_batch;
        pca_params.n_iterations = params.n_pca_iterations;
        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
    } else {
        // run mean
        mean::run(ctx_train.v_diff, ctx_train.v_final);
    }

    // write output vectors to gguf
    export_gguf(ctx_train.v_final, params.out_file, model_hint);

    llama_backend_free();

    return 0;
}