1#if defined(_MSC_VER)
2#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
3#endif
4
5#include "ggml.h"
6#include "gguf.h"
7
8#include "common.h"
9#include "log.h"
10#include "llama.h"
11
12#include <algorithm>
13#include <cinttypes>
14#include <climits>
15#include <cmath>
16#include <codecvt>
17#include <chrono>
18#include <cstdarg>
19#include <cstring>
20#include <ctime>
21#include <filesystem>
22#include <fstream>
23#include <iostream>
24#include <iterator>
25#include <regex>
26#include <sstream>
27#include <string>
28#include <thread>
29#include <unordered_map>
30#include <unordered_set>
31#include <vector>
32
33#if defined(__APPLE__) && defined(__MACH__)
34#include <sys/types.h>
35#include <sys/sysctl.h>
36#endif
37
38#if defined(_WIN32)
39#define WIN32_LEAN_AND_MEAN
40#ifndef NOMINMAX
41# define NOMINMAX
42#endif
43#include <locale>
44#include <windows.h>
45#include <string.h>
46#include <fcntl.h>
47#include <io.h>
48#else
49#include <sys/ioctl.h>
50#include <sys/stat.h>
51#include <unistd.h>
52#endif
53
54#if defined(__linux__)
55#include <sys/types.h>
56#include <pwd.h>
57#endif
58
59#if defined(_MSC_VER)
60#pragma warning(disable: 4244 4267) // possible loss of data
61#endif
62
63//
64// CPU utils
65//
66
// Returns the number of physical CPU cores:
// - Linux:   counts distinct thread-sibling sets from sysfs
// - macOS:   sysctl ("hw.perflevel0.physicalcpu" first, "hw.physicalcpu" as fallback)
// - Windows: counts RelationProcessorCore records from GetLogicalProcessorInformationEx
// If platform detection fails, falls back to a heuristic based on
// std::thread::hardware_concurrency() (half the logical CPUs when > 4, else 4).
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
            + std::to_string(val: cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break; // no more cpus
        }
        // hyperthreads of the same core share an identical sibling mask,
        // so the set collapses them into one entry per physical core
        std::string line;
        if (std::getline(is&: thread_siblings, str&: line)) {
            siblings.insert(x: line);
        }
    }
    if (!siblings.empty()) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    // prefer the performance-core count on hybrid (Apple Silicon) systems
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    unsigned int n_threads_win = std::thread::hardware_concurrency();
    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;

    // first call with a null buffer to query the required buffer size
    DWORD buffer_size = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
            return default_threads;
        }
    }

    std::vector<char> buffer(buffer_size);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
        return default_threads;
    }

    // walk the variable-size records; accumulate the group count of each
    // processor-core record
    int32_t num_physical_cores = 0;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
    while (buffer_size > 0) {
        if (info->Relationship == RelationProcessorCore) {
            num_physical_cores += info->Processor.GroupCount;
        }
        buffer_size -= info->Size;
        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
    }

    return num_physical_cores > 0 ? num_physical_cores : default_threads;
#endif
    // generic fallback heuristic
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
128
129#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
130#include <pthread.h>
131
// Executes the CPUID instruction for the given leaf/subleaf and stores the
// four result registers. rbx is saved/restored through rsi because rbx may be
// reserved (e.g. as the PIC base register) and must not be clobbered directly.
static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__("movq\t%%rbx,%%rsi\n\t"
            "cpuid\n\t"
            "xchgq\t%%rbx,%%rsi"
            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
            : "0"(leaf), "2"(subleaf));
}
140
// Pins the calling thread to the given CPU index.
// Returns 0 on success, a pthread error code otherwise.
static int pin_cpu(int cpu) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);
    return pthread_setaffinity_np(th: pthread_self(), cpusetsize: sizeof(mask), cpuset: &mask);
}
147
// Detects hybrid (performance/efficiency core) x86 CPUs:
// CPUID leaf 7, subleaf 0, EDX bit 15 ("hybrid" flag).
static bool is_hybrid_cpu(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(leaf: 7, subleaf: 0, eax: &eax, ebx: &ebx, ecx: &ecx, edx: &edx);
    return !!(edx & (1u << 15));
}
153
// Returns true if the CPU currently running this thread is an efficiency core.
// CPUID leaf 0x1a reports the core type in EAX[31:24]; 0x20 identifies
// an Intel Atom (efficiency) core.
static bool is_running_on_efficiency_core(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(leaf: 0x1a, subleaf: 0, eax: &eax, ebx: &ebx, ecx: &ecx, edx: &edx);
    int intel_atom = 0x20;
    int core_type = (eax & 0xff000000u) >> 24;
    return core_type == intel_atom;
}
161
// Counts CPUs useful for math by pinning this thread to each CPU in turn:
// efficiency cores are skipped entirely, and for every performance core the
// next CPU id is skipped as well (assumed to be its hyperthread sibling).
// Returns -1 if pinning fails.
// NOTE(review): the caller is expected to save/restore the thread's affinity
// around this call (cpu_get_num_math does exactly that).
static int cpu_count_math_cpus(int n_cpu) {
    int result = 0;
    for (int cpu = 0; cpu < n_cpu; ++cpu) {
        if (pin_cpu(cpu)) {
            return -1;
        }
        if (is_running_on_efficiency_core()) {
            continue; // efficiency cores harm lockstep threading
        }
        ++cpu; // hyperthreading isn't useful for linear algebra
        ++result;
    }
    return result;
}
176
177#endif // __x86_64__ && __linux__
178
179/**
180 * Returns number of CPUs on system that are useful for math.
181 */
182int32_t cpu_get_num_math() {
183#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
184 int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
185 if (n_cpu < 1) {
186 return cpu_get_num_physical_cores();
187 }
188 if (is_hybrid_cpu()) {
189 cpu_set_t affinity;
190 if (!pthread_getaffinity_np(th: pthread_self(), cpusetsize: sizeof(affinity), cpuset: &affinity)) {
191 int result = cpu_count_math_cpus(n_cpu);
192 pthread_setaffinity_np(th: pthread_self(), cpusetsize: sizeof(affinity), cpuset: &affinity);
193 if (result > 0) {
194 return result;
195 }
196 }
197 }
198#endif
199 return cpu_get_num_physical_cores();
200}
201
202// Helper for setting process priority
203
204#if defined(_WIN32)
205
// Applies the requested scheduler priority to the current process by mapping
// it onto a Windows priority class. Returns true on success (or when the
// priority is already normal); logs a warning and returns false on failure.
bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        // nothing to change
        return true;
    }

    DWORD p = NORMAL_PRIORITY_CLASS;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
    }

    if (!SetPriorityClass(GetCurrentProcess(), p)) {
        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
        return false;
    }

    return true;
}
227
228#else // MacOS and POSIX
229#include <sys/types.h>
230#include <sys/resource.h>
231
232bool set_process_priority(enum ggml_sched_priority prio) {
233 if (prio == GGML_SCHED_PRIO_NORMAL) {
234 return true;
235 }
236
237 int p = 0;
238 switch (prio) {
239 case GGML_SCHED_PRIO_LOW: p = 5; break;
240 case GGML_SCHED_PRIO_NORMAL: p = 0; break;
241 case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
242 case GGML_SCHED_PRIO_HIGH: p = -10; break;
243 case GGML_SCHED_PRIO_REALTIME: p = -20; break;
244 }
245
246 if (!setpriority(PRIO_PROCESS, who: 0, prio: p)) {
247 LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
248 return false;
249 }
250 return true;
251}
252
253#endif
254
255//
256// CLI argument parsing
257//
258
259
260void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
261 int32_t n_set = 0;
262
263 if (cpuparams.n_threads < 0) {
264 // Assuming everything about cpuparams is invalid
265 if (role_model != nullptr) {
266 cpuparams = *role_model;
267 } else {
268 cpuparams.n_threads = cpu_get_num_math();
269 }
270 }
271
272 for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
273 if (cpuparams.cpumask[i]) {
274 n_set++;
275 }
276 }
277
278 if (n_set && n_set < cpuparams.n_threads) {
279 // Not enough set bits, may experience performance issues.
280 LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
281 }
282}
283
284bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
285 size_t dash_loc = range.find(c: '-');
286 if (dash_loc == std::string::npos) {
287 LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
288 return false;
289 }
290
291 size_t start_i;
292 size_t end_i;
293
294 if (dash_loc == 0) {
295 start_i = 0;
296 } else {
297 start_i = std::stoull(str: range.substr(pos: 0, n: dash_loc));
298 if (start_i >= GGML_MAX_N_THREADS) {
299 LOG_ERR("Start index out of bounds!\n");
300 return false;
301 }
302 }
303
304 if (dash_loc == range.length() - 1) {
305 end_i = GGML_MAX_N_THREADS - 1;
306 } else {
307 end_i = std::stoull(str: range.substr(pos: dash_loc + 1));
308 if (end_i >= GGML_MAX_N_THREADS) {
309 LOG_ERR("End index out of bounds!\n");
310 return false;
311 }
312 }
313
314 for (size_t i = start_i; i <= end_i; i++) {
315 boolmask[i] = true;
316 }
317
318 return true;
319}
320
321bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
322 // Discard potential 0x prefix
323 size_t start_i = 0;
324 if (mask.length() >= 2 && mask.substr(pos: 0, n: 2) == "0x") {
325 start_i = 2;
326 }
327
328 size_t num_digits = mask.length() - start_i;
329 if (num_digits > 128) num_digits = 128;
330
331 size_t end_i = num_digits + start_i;
332
333 for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
334 char c = mask.at(n: i);
335 int8_t id = c;
336
337 if ((c >= '0' && c <= '9')) {
338 id -= '0';
339 } else if (c >= 'a' && c <= 'f') {
340 id -= 'a' - 10;
341 } else if (c >= 'A' && c <= 'F') {
342 id -= 'A' - 10;
343 } else {
344 LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
345 return false;
346 }
347
348 boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
349 boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
350 boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
351 boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
352 }
353
354 return true;
355}
356
// One-time process initialization for the common tools: routes llama.cpp's
// logging through the common log sink (filtered by the current verbosity
// threshold) and prints the build banner.
void common_init() {
    llama_log_set(log_callback: [](ggml_log_level level, const char * text, void * /*user_data*/) {
        if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
            common_log_add(log: common_log_main(), level, fmt: "%s", text);
        }
    }, NULL);

#ifdef NDEBUG
    const char * build_type = "";
#else
    const char * build_type = " (debug)";
#endif

    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
372
// Builds a one-line human-readable summary of the runtime configuration:
// thread counts, number of logical CPUs, and the llama.cpp feature string.
std::string common_params_get_system_info(const common_params & params) {
    std::ostringstream os;

    os << "system_info: n_threads = " << params.cpuparams.n_threads;
    // -1 means the batch thread count follows the main one - omit it then
    if (params.cpuparams_batch.n_threads != -1) {
        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
    }
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    // count logical processors across all processor groups
    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
#else
    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
#endif

    return os.str();
}
390
391//
392// String utils
393//
394
395std::string string_format(const char * fmt, ...) {
396 va_list ap;
397 va_list ap2;
398 va_start(ap, fmt);
399 va_copy(ap2, ap);
400 int size = vsnprintf(NULL, maxlen: 0, format: fmt, arg: ap);
401 GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
402 std::vector<char> buf(size + 1);
403 int size2 = vsnprintf(s: buf.data(), maxlen: size + 1, format: fmt, arg: ap2);
404 GGML_ASSERT(size2 == size);
405 va_end(ap2);
406 va_end(ap);
407 return std::string(buf.data(), size);
408}
409
// Returns a copy of str with leading and trailing whitespace removed
// (as classified by std::isspace in the current locale).
// fix: std::isspace requires a value representable as unsigned char - cast
// to avoid undefined behavior on negative chars (e.g. UTF-8 bytes when
// char is signed); also removed invalid IDE inlay-hint annotations.
std::string string_strip(const std::string & str) {
    size_t start = 0;
    size_t end = str.size();
    while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) {
        start++;
    }
    while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
        end--;
    }
    return str.substr(start, end - start);
}
421
// Returns the current local time as "YYYY_MM_DD-HH_MM_SS.nnnnnnnnn".
// The fixed-width, zero-padded fields make lexicographic order equal to
// chronological order, so the string is safe to use in sortable filenames.
// fix: removed invalid IDE inlay-hint parameter annotations.
std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

    const clock::time_point current_time = clock::now();
    const time_t as_time_t = clock::to_time_t(current_time);
    char timestamp_no_ns[100];
    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));

    // sub-second part: nanoseconds within the current second, zero-padded to 9 digits
    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
        current_time.time_since_epoch() % 1000000000).count();
    char timestamp_ns[11];
    snprintf(timestamp_ns, 11, "%09" PRId64, ns);

    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
437
// Replaces every non-overlapping occurrence of `search` in `s` with
// `replace`, in place. A single result buffer is built and swapped in, so
// the operation is linear even when `replace` is longer than `search`.
// An empty `search` is a no-op (it would otherwise loop forever).
// fix: removed invalid IDE inlay-hint parameter annotations.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
    size_t last_pos = 0;
    while ((pos = s.find(search, last_pos)) != std::string::npos) {
        builder.append(s, last_pos, pos - last_pos); // text before the match
        builder.append(replace);
        last_pos = pos + search.length();
    }
    builder.append(s, last_pos, std::string::npos); // trailing remainder
    s = std::move(builder);
}
454
// Returns true if `str` ends with `suffix` (an empty suffix always matches).
// fix: removed invalid IDE inlay-hint parameter annotations.
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
458
// Removes `suffix` from the end of `str` in place, if present.
// Returns true when the suffix was found and removed.
// fix: removed invalid IDE inlay-hint annotations; uses resize() instead of
// rebuilding the string via substr(), avoiding an extra allocation.
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
    const bool has_suffix = str.size() >= suffix.size() &&
        str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
    if (has_suffix) {
        str.resize(str.size() - suffix.size());
    }
    return has_suffix;
}
466
// Finds the position in `str` where a *prefix* of `stop` begins at the very
// end of `str` (used to detect a stop sequence that may be split across
// streamed chunks). Longer partial matches are preferred. Returns
// std::string::npos when no suffix of `str` is a prefix of `stop`.
// fix: removed invalid IDE inlay-hint parameter annotations.
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
    if (!str.empty() && !stop.empty()) {
        const char text_last_char = str.back();
        // try the longest candidate prefix of `stop` first
        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
            if (stop[char_index] == text_last_char) {
                const std::string_view current_partial = stop.substr(0, char_index + 1);
                const bool ends_with = str.size() >= current_partial.size() &&
                    str.compare(str.size() - current_partial.size(), current_partial.size(), current_partial) == 0;
                if (ends_with) {
                    return str.size() - char_index - 1;
                }
            }
        }
    }

    return std::string::npos;
}
482
// Escapes all ECMAScript regex metacharacters ( . ^ $ | ( ) * + ? [ ] { } \ )
// in `s` so the result matches the input literally when used in a pattern.
// fix: removed invalid IDE inlay-hint parameter annotations.
std::string regex_escape(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    // "$&" re-inserts the matched character after the backslash
    return std::regex_replace(s, special_chars, "\\$&");
}
487
// Concatenates the given strings, inserting `separator` between consecutive
// elements. An empty vector yields an empty string.
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
    std::string result;
    bool first = true;
    for (const auto & value : values) {
        if (!first) {
            result += separator;
        }
        result += value;
        first = false;
    }
    return result;
}
498
// Splits `str` on every occurrence of `delimiter`. Adjacent delimiters and
// delimiters at either end produce empty parts; the result always contains
// at least one element. The delimiter must be non-empty (an empty delimiter
// would loop forever - unchanged from the original contract).
// fix: removed invalid IDE inlay-hint parameter annotations.
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
    std::vector<std::string> parts;
    size_t start = 0;
    size_t end = str.find(delimiter);

    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }

    // trailing remainder (or the whole string when no delimiter was found)
    parts.push_back(str.substr(start));

    return parts;
}
514
// Returns `str` repeated `n` times ("" when n == 0).
// The result is reserved up front so the loop performs no reallocations.
// fix: removed invalid IDE inlay-hint parameter annotations.
std::string string_repeat(const std::string & str, size_t n) {
    if (n == 0) {
        return "";
    }

    std::string result;
    result.reserve(str.length() * n);

    for (size_t i = 0; i < n; ++i) {
        result += str;
    }

    return result;
}
529
// Renders a boolean as its lowercase literal spelling ("true"/"false").
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
533
// Renders an int vector as "[ 1, 2, 3 ]" for debug logging
// (an empty vector yields "[  ]").
// fix: removed invalid IDE inlay-hint parameter annotations.
std::string string_from(const std::vector<int> & values) {
    std::stringstream buf;

    buf << "[ ";
    bool first = true;
    for (auto e : values) {
        if (first) {
            first = false;
        } else {
            buf << ", ";
        }
        buf << std::to_string(e);
    }
    buf << " ]";

    return buf.str();
}
551
// Renders a token list as "[ 'piece':id, ... ]" for debug logging,
// detokenizing each token with the context's vocabulary.
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
    std::stringstream buf;

    buf << "[ ";

    bool first = true;
    for (const auto & token : tokens) {
        if (!first) {
            buf << ", ";
        } else {
            first = false;
        }

        auto detokenized = common_token_to_piece(ctx, token);

        buf << "'" << detokenized << "'"
            << ":" << std::to_string(val: token);
    }

    buf << " ]";

    return buf.str();
}
575
// Renders a llama_batch as a multi-line debug dump: one line per token with
// its index, detokenized piece, position, sequence count, first sequence id,
// and logits flag. Only the first seq_id of each token is shown.
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
    std::stringstream buf;

    buf << "[ ";

    bool first = true;
    for (int i = 0; i < batch.n_tokens; ++i) {
        if (!first) {
            buf << ", ";
        } else {
            first = false;
        }

        auto detokenized = common_token_to_piece(ctx, token: batch.token[i]);

        buf << "\n" << std::to_string(val: i)
            << ", token '" << detokenized << "'"
            << ", pos " << std::to_string(val: batch.pos[i])
            << ", n_seq_id " << std::to_string(val: batch.n_seq_id[i])
            << ", seq_id " << std::to_string(val: batch.seq_id[i][0])
            << ", logits " << std::to_string(val: batch.logits[i]);
    }

    buf << " ]";

    return buf.str();
}
603
// Decodes C-style escape sequences in `input`, in place:
// \n \r \t \' \" \\ and \xHH (two hex digits). Unrecognized escapes (and a
// malformed \x) are kept verbatim as backslash + character. The string is
// rewritten front-to-back (output never outruns input) and truncated at the end.
// fix: removed invalid IDE inlay-hint parameter annotations.
void string_process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;

    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
            switch (input[++input_idx]) {
                case 'n': input[output_idx++] = '\n'; break;
                case 'r': input[output_idx++] = '\r'; break;
                case 't': input[output_idx++] = '\t'; break;
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
                case 'x':
                    // Handle \x12, etc
                    if (input_idx + 2 < input_len) {
                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
                        char *err_p = nullptr;
                        const long val = std::strtol(x, &err_p, 16);
                        // accept only if both characters parsed as hex
                        if (err_p == x + 2) {
                            input_idx += 2;
                            input[output_idx++] = char(val);
                            break;
                        }
                    }
                    // fall through
                default: input[output_idx++] = '\\';
                         input[output_idx++] = input[input_idx]; break;
            }
        } else {
            input[output_idx++] = input[input_idx];
        }
    }

    input.resize(output_idx);
}
640
// Parses a "key=type:value" KV override specification (e.g. "foo.bar=int:3")
// and appends it to `overrides`. Supported value types: int, float, bool, str.
// Returns false (with an error log) on any malformed input.
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
    // the key is everything before '='; it must fit the fixed 128-byte field
    const char * sep = strchr(s: data, c: '=');
    if (sep == nullptr || sep - data >= 128) {
        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
        return false;
    }
    llama_model_kv_override kvo;
    std::strncpy(dest: kvo.key, src: data, n: sep - data);
    kvo.key[sep - data] = 0;
    sep++;
    // the value is prefixed with its type tag
    if (strncmp(s1: sep, s2: "int:", n: 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
        kvo.val_i64 = std::atol(nptr: sep);
    } else if (strncmp(s1: sep, s2: "float:", n: 6) == 0) {
        sep += 6;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
        kvo.val_f64 = std::atof(nptr: sep);
    } else if (strncmp(s1: sep, s2: "bool:", n: 5) == 0) {
        sep += 5;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        // only the exact literals "true"/"false" are accepted
        if (std::strcmp(s1: sep, s2: "true") == 0) {
            kvo.val_bool = true;
        } else if (std::strcmp(s1: sep, s2: "false") == 0) {
            kvo.val_bool = false;
        } else {
            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
            return false;
        }
    } else if (strncmp(s1: sep, s2: "str:", n: 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        // the value field is 128 bytes including the terminator
        if (strlen(s: sep) > 127) {
            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
            return false;
        }
        strncpy(dest: kvo.val_str, src: sep, n: 127);
        kvo.val_str[127] = '\0';
    } else {
        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
        return false;
    }
    overrides.emplace_back(args: std::move(kvo));
    return true;
}
686
687//
688// Filesystem utils
689//
690
// Validate if a filename is safe to use
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
// Rejects: empty or overlong (> 255 byte) names, invalid/overlong UTF-8,
// control characters, path separators and their Unicode look-alikes,
// Windows-illegal characters, leading/trailing spaces, trailing dots,
// ".", and anything containing "..".
// fix: removed invalid IDE inlay-hint parameter annotations.
bool fs_validate_filename(const std::string & filename) {
    if (filename.empty()) {
        // Empty filename invalid
        return false;
    }
    if (filename.length() > 255) {
        // Limit at common largest possible filename on Linux filesystems
        // to avoid unnecessary further validation
        // (On systems with smaller limits it will be caught by the OS)
        return false;
    }

    // decode to UTF-32 so each codepoint can be inspected individually
    std::u32string filename_utf32;
    try {
#if defined(__clang__)
    // disable C++17 deprecation warning for std::codecvt_utf8
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
#    pragma GCC diagnostic push
#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif

        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;

#if defined(__clang__)
#    pragma clang diagnostic pop
#elif defined(__GNUC__)
#    pragma GCC diagnostic pop
#endif

        filename_utf32 = converter.from_bytes(filename);

        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
        // or invalid encodings were encountered. Reject such attempts
        const std::string filename_reencoded = converter.to_bytes(filename_utf32);
        if (filename_reencoded != filename) {
            return false;
        }
    } catch (const std::exception &) {
        // any conversion failure means the name is not valid UTF-8
        return false;
    }

    // Check for forbidden codepoints:
    // - Control characters
    // - Unicode equivalents of illegal characters
    // - UTF-16 surrogate pairs
    // - UTF-8 replacement character
    // - Byte order mark (BOM)
    // - Illegal characters: / \ : * ? " < > |
    for (const char32_t c : filename_utf32) {
        if (c <= 0x1F // Control characters (C0)
            || c == 0x7F // Control characters (DEL)
            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
            || c == 0x2215 // Division Slash (forward slash equivalent)
            || c == 0x2216 // Set Minus (backslash equivalent)
            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
            || c == 0xFFFD // Replacement Character (UTF-8)
            || c == 0xFEFF // Byte Order Mark (BOM)
            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
    // Unicode and other whitespace is not affected, only 0x20 space
    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
        return false;
    }

    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // Reject "."
    if (filename == ".") {
        return false;
    }

    return true;
}
777
778#include <iostream>
779
780
// returns true if successful, false otherwise
// NOTE: directories are only created for prefixes ending at a separator, so
// the path must end with a trailing separator for its last component to be
// created (fs_get_cache_directory always returns such a path)
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    // convert the UTF-8 path to UTF-16 for the wide-char Win32 API
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    std::wstring wpath = converter.from_bytes(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);

        pos_slash += 1;

        // skip the drive letter, in some systems it can return an access denied error
        if (subpath.length() == 2 && subpath[1] == ':') {
            continue;
        }

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);

        if (!success) {
            const DWORD error = GetLastError();

            // if the path already exists, ensure that it's a directory
            if (error == ERROR_ALREADY_EXISTS) {
                const DWORD attributes = GetFileAttributesW(subpath.c_str());
                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                    return false;
                }
            } else {
                return false;
            }
        }
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat info;
    if (stat(file: path.c_str(), buf: &info) == 0) {
        return S_ISDIR(info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find(c: '/', pos: pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(pos: 0, n: pos_slash);
        struct stat info;

        // if the path already exists, ensure that it's a directory
        if (stat(file: subpath.c_str(), buf: &info) == 0) {
            if (!S_ISDIR(info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(path: subpath.c_str(), mode: 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}
857
// Resolves the llama.cpp cache directory, always returned with a trailing
// separator: $LLAMA_CACHE if set, otherwise the platform cache location
// ($XDG_CACHE_HOME or $HOME/.cache on Linux/BSD/AIX, ~/Library/Caches on
// macOS, %LOCALAPPDATA% on Windows) with "llama.cpp" appended.
// Throws std::runtime_error when no home directory can be determined.
std::string fs_get_cache_directory() {
    std::string cache_directory = "";
    auto ensure_trailing_slash = [](std::string p) {
        // Make sure to add trailing slash
        if (p.back() != DIRECTORY_SEPARATOR) {
            p += DIRECTORY_SEPARATOR;
        }
        return p;
    };
    if (getenv(name: "LLAMA_CACHE")) {
        // explicit override - used as-is (no "llama.cpp" suffix appended)
        cache_directory = std::getenv(name: "LLAMA_CACHE");
    } else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
        if (std::getenv(name: "XDG_CACHE_HOME")) {
            cache_directory = std::getenv(name: "XDG_CACHE_HOME");
        } else if (std::getenv(name: "HOME")) {
            cache_directory = std::getenv(name: "HOME") + std::string("/.cache/");
        } else {
#if defined(__linux__)
            /* no $HOME is defined, fallback to getpwuid */
            struct passwd *pw = getpwuid(uid: getuid());
            if ((!pw) || (!pw->pw_dir)) {
                throw std::runtime_error("Failed to find $HOME directory");
            }

            cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
            throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
        }
#elif defined(__APPLE__)
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
#else
#  error Unknown architecture
#endif
        cache_directory = ensure_trailing_slash(cache_directory);
        cache_directory += "llama.cpp";
    }
    return ensure_trailing_slash(cache_directory);
}
900
901std::string fs_get_cache_file(const std::string & filename) {
902 GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
903 std::string cache_directory = fs_get_cache_directory();
904 const bool success = fs_create_directory_with_parents(path: cache_directory);
905 if (!success) {
906 throw std::runtime_error("failed to create cache directory: " + cache_directory);
907 }
908 return cache_directory + filename;
909}
910
911
912//
913// Model utils
914//
915
916struct common_init_result common_init_from_params(common_params & params) {
917 common_init_result iparams;
918 auto mparams = common_model_params_to_llama(params);
919
920 llama_model * model = llama_model_load_from_file(path_model: params.model.path.c_str(), params: mparams);
921 if (model == NULL) {
922 LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
923 __func__, params.model.path.c_str());
924 return iparams;
925 }
926
927 const llama_vocab * vocab = llama_model_get_vocab(model);
928
929 auto cparams = common_context_params_to_llama(params);
930
931 llama_context * lctx = llama_init_from_model(model, params: cparams);
932 if (lctx == NULL) {
933 LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
934 __func__, params.model.path.c_str());
935 llama_model_free(model);
936 return iparams;
937 }
938
939 if (params.ctx_shift && !llama_memory_can_shift(mem: llama_get_memory(ctx: lctx))) {
940 LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
941 params.ctx_shift = false;
942 }
943
944 if (!params.control_vectors.empty()) {
945 if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
946 if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
947
948 const auto cvec = common_control_vector_load(load_infos: params.control_vectors);
949 if (cvec.n_embd == -1) {
950 llama_free(ctx: lctx);
951 llama_model_free(model);
952
953 return iparams;
954 }
955
956 int err = llama_apply_adapter_cvec(
957 ctx: lctx,
958 data: cvec.data.data(),
959 len: cvec.data.size(),
960 n_embd: cvec.n_embd,
961 il_start: params.control_vector_layer_start,
962 il_end: params.control_vector_layer_end);
963 if (err) {
964 llama_free(ctx: lctx);
965 llama_model_free(model);
966
967 return iparams;
968 }
969 }
970
971 if (llama_pooling_type(ctx: lctx) == LLAMA_POOLING_TYPE_RANK) {
972 bool ok = true;
973
974 if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
975 LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
976 ok = false;
977 }
978
979 bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
980 bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
981 bool has_rerank_prompt = llama_model_chat_template(model, name: "rerank") != NULL;
982
983 if (!has_eos && !has_sep && !has_rerank_prompt) {
984 LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
985 ok = false;
986 } else if (!has_eos) {
987 LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
988 }
989
990 if (!ok) {
991 llama_free(ctx: lctx);
992 llama_model_free(model);
993
994 return iparams;
995 }
996 }
997
998 // load and optionally apply lora adapters
999 for (auto & la : params.lora_adapters) {
1000 llama_adapter_lora_ptr lora;
1001 lora.reset(p: llama_adapter_lora_init(model, path_lora: la.path.c_str()));
1002 if (lora == nullptr) {
1003 LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
1004 llama_free(ctx: lctx);
1005 llama_model_free(model);
1006 return iparams;
1007 }
1008
1009 char buf[1024];
1010 la.ptr = lora.get();
1011 llama_adapter_meta_val_str(adapter: la.ptr, key: "adapter.lora.task_name", buf, buf_size: sizeof(buf));
1012 la.task_name = buf;
1013 llama_adapter_meta_val_str(adapter: la.ptr, key: "adapter.lora.prompt_prefix", buf, buf_size: sizeof(buf));
1014 la.prompt_prefix = buf;
1015 iparams.lora.emplace_back(args: std::move(lora)); // copy to list of loaded adapters
1016 }
1017
1018 if (!params.lora_init_without_apply) {
1019 common_set_adapter_lora(ctx: lctx, lora&: params.lora_adapters);
1020 }
1021
1022 if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
1023 LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
1024 params.sampling.ignore_eos = false;
1025 }
1026
1027 // initialize once
1028 for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1029 if (llama_vocab_is_eog(vocab, token: i)) {
1030 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
1031 params.sampling.logit_bias_eog.push_back(x: {.token: i, .bias: -INFINITY});
1032 }
1033 }
1034
1035 if (params.sampling.ignore_eos) {
1036 // add EOG biases to the active set of logit biases
1037 params.sampling.logit_bias.insert(
1038 position: params.sampling.logit_bias.end(),
1039 first: params.sampling.logit_bias_eog.begin(), last: params.sampling.logit_bias_eog.end());
1040 }
1041
1042 if (params.sampling.penalty_last_n == -1) {
1043 LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1044 params.sampling.penalty_last_n = llama_n_ctx(ctx: lctx);
1045 }
1046
1047 if (params.sampling.dry_penalty_last_n == -1) {
1048 LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1049 params.sampling.dry_penalty_last_n = llama_n_ctx(ctx: lctx);
1050 }
1051
1052 if (params.warmup) {
1053 LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
1054
1055 llama_set_warmup(ctx: lctx, warmup: true);
1056
1057 std::vector<llama_token> tmp;
1058 llama_token bos = llama_vocab_bos(vocab);
1059 llama_token eos = llama_vocab_eos(vocab);
1060
1061 // some models (e.g. T5) don't have a BOS token
1062 if (bos != LLAMA_TOKEN_NULL) {
1063 tmp.push_back(x: bos);
1064 }
1065 if (eos != LLAMA_TOKEN_NULL) {
1066 tmp.push_back(x: eos);
1067 }
1068 if (tmp.empty()) {
1069 tmp.push_back(x: 0);
1070 }
1071
1072 if (llama_model_has_encoder(model)) {
1073 llama_encode(ctx: lctx, batch: llama_batch_get_one(tokens: tmp.data(), n_tokens: tmp.size()));
1074 llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
1075 if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
1076 decoder_start_token_id = bos;
1077 }
1078 tmp.clear();
1079 tmp.push_back(x: decoder_start_token_id);
1080 }
1081 if (llama_model_has_decoder(model)) {
1082 llama_decode(ctx: lctx, batch: llama_batch_get_one(tokens: tmp.data(), n_tokens: std::min(a: tmp.size(), b: (size_t) params.n_batch)));
1083 }
1084 llama_memory_clear(mem: llama_get_memory(ctx: lctx), data: true);
1085 llama_synchronize(ctx: lctx);
1086 llama_perf_context_reset(ctx: lctx);
1087 llama_set_warmup(ctx: lctx, warmup: false);
1088 }
1089
1090 iparams.model.reset(p: model);
1091 iparams.context.reset(p: lctx);
1092
1093 return iparams;
1094}
1095
// Resolve the base URL used for downloading models.
// MODEL_ENDPOINT takes precedence; HF_ENDPOINT is still honored for
// backward-compatibility. Returns the default Hugging Face endpoint when
// neither variable is set (or when the variable is set but empty - previously
// an empty value caused UB via std::string::back() on an empty string).
// The returned endpoint always ends with '/'.
std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;

    std::string model_endpoint = "https://huggingface.co/";
    if (endpoint_env && endpoint_env[0] != '\0') { // treat an empty value as unset
        model_endpoint = endpoint_env;
        if (model_endpoint.back() != '/') {
            model_endpoint += '/';
        }
    }
    return model_endpoint;
}
1108
1109void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
1110 llama_clear_adapter_lora(ctx);
1111 for (auto & la : lora) {
1112 if (la.scale != 0.0f) {
1113 llama_set_adapter_lora(ctx, adapter: la.ptr, scale: la.scale);
1114 }
1115 }
1116}
1117
1118struct llama_model_params common_model_params_to_llama(common_params & params) {
1119 auto mparams = llama_model_default_params();
1120
1121 if (!params.devices.empty()) {
1122 mparams.devices = params.devices.data();
1123 }
1124
1125 if (params.n_gpu_layers != -1) {
1126 mparams.n_gpu_layers = params.n_gpu_layers;
1127 }
1128
1129 mparams.main_gpu = params.main_gpu;
1130 mparams.split_mode = params.split_mode;
1131 mparams.tensor_split = params.tensor_split;
1132 mparams.use_mmap = params.use_mmap;
1133 mparams.use_mlock = params.use_mlock;
1134 mparams.check_tensors = params.check_tensors;
1135 mparams.use_extra_bufts = !params.no_extra_bufts;
1136 mparams.no_host = params.no_host;
1137
1138 if (params.kv_overrides.empty()) {
1139 mparams.kv_overrides = NULL;
1140 } else {
1141 GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
1142 mparams.kv_overrides = params.kv_overrides.data();
1143 }
1144
1145 if (params.tensor_buft_overrides.empty()) {
1146 mparams.tensor_buft_overrides = NULL;
1147 } else {
1148 GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
1149 mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
1150 }
1151
1152 mparams.progress_callback = params.load_progress_callback;
1153 mparams.progress_callback_user_data = params.load_progress_callback_user_data;
1154
1155 return mparams;
1156}
1157
1158struct llama_context_params common_context_params_to_llama(const common_params & params) {
1159 auto cparams = llama_context_default_params();
1160
1161 cparams.n_ctx = params.n_ctx;
1162 cparams.n_seq_max = params.n_parallel;
1163 cparams.n_batch = params.n_batch;
1164 cparams.n_ubatch = params.n_ubatch;
1165 cparams.n_threads = params.cpuparams.n_threads;
1166 cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
1167 params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
1168 cparams.embeddings = params.embedding;
1169 cparams.rope_scaling_type = params.rope_scaling_type;
1170 cparams.rope_freq_base = params.rope_freq_base;
1171 cparams.rope_freq_scale = params.rope_freq_scale;
1172 cparams.yarn_ext_factor = params.yarn_ext_factor;
1173 cparams.yarn_attn_factor = params.yarn_attn_factor;
1174 cparams.yarn_beta_fast = params.yarn_beta_fast;
1175 cparams.yarn_beta_slow = params.yarn_beta_slow;
1176 cparams.yarn_orig_ctx = params.yarn_orig_ctx;
1177 cparams.pooling_type = params.pooling_type;
1178 cparams.attention_type = params.attention_type;
1179 cparams.flash_attn_type = params.flash_attn_type;
1180 cparams.cb_eval = params.cb_eval;
1181 cparams.cb_eval_user_data = params.cb_eval_user_data;
1182 cparams.offload_kqv = !params.no_kv_offload;
1183 cparams.no_perf = params.no_perf;
1184 cparams.op_offload = !params.no_op_offload;
1185 cparams.swa_full = params.swa_full;
1186 cparams.kv_unified = params.kv_unified;
1187
1188 cparams.type_k = params.cache_type_k;
1189 cparams.type_v = params.cache_type_v;
1190
1191 return cparams;
1192}
1193
1194struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
1195 struct ggml_threadpool_params tpp;
1196
1197 ggml_threadpool_params_init(p: &tpp, n_threads: params.n_threads); // setup the defaults
1198
1199 if (params.mask_valid) {
1200 std::memcpy(dest: &tpp.cpumask, src: &params.cpumask, GGML_MAX_N_THREADS);
1201 }
1202
1203 tpp.prio = params.priority;
1204 tpp.poll = params.poll;
1205 tpp.strict_cpu = params.strict_cpu;
1206
1207 return tpp;
1208}
1209
1210//
1211// Batch utils
1212//
1213
// Reset the batch to an empty state without freeing its buffers, so it can be
// refilled via common_batch_add().
void common_batch_clear(struct llama_batch & batch) {
    batch.n_tokens = 0;
}
1217
1218void common_batch_add(
1219 struct llama_batch & batch,
1220 llama_token id,
1221 llama_pos pos,
1222 const std::vector<llama_seq_id> & seq_ids,
1223 bool logits) {
1224 GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
1225
1226 batch.token [batch.n_tokens] = id;
1227 batch.pos [batch.n_tokens] = pos;
1228 batch.n_seq_id[batch.n_tokens] = seq_ids.size();
1229 for (size_t i = 0; i < seq_ids.size(); ++i) {
1230 batch.seq_id[batch.n_tokens][i] = seq_ids[i];
1231 }
1232 batch.logits [batch.n_tokens] = logits;
1233
1234 batch.n_tokens++;
1235}
1236
1237//
1238// Token utils
1239//
1240
1241size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
1242 size_t i;
1243 for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
1244
1245 return i;
1246}
1247
1248size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
1249 // check for empty sequences
1250 if (a.empty() || b.empty()) {
1251 return 0;
1252 }
1253
1254 // get the lengths of the input sequences
1255 size_t a_len = a.size();
1256 size_t b_len = b.size();
1257
1258 // initialize the maximum length of the longest common subsequence (LCS)
1259 size_t max_length = 0;
1260
1261 // use two rows instead of a 2D matrix to optimize space
1262 std::vector<size_t> prev_row(b_len + 1, 0);
1263 std::vector<size_t> curr_row(b_len + 1, 0);
1264
1265 // iterate through the elements of a
1266 for (size_t i = 1; i <= a_len; i++) {
1267 // iterate through the elements of b
1268 for (size_t j = 1; j <= b_len; j++) {
1269 // if elements at the current positions match
1270 if (a[i - 1] == b[j - 1]) {
1271 // if it's the first element of either sequences, set LCS length to 1
1272 if (i == 1 || j == 1) {
1273 curr_row[j] = 1;
1274 } else {
1275 // increment LCS length by 1 compared to the previous element
1276 curr_row[j] = prev_row[j - 1] + 1;
1277 }
1278
1279 // update max_length if necessary
1280 if (curr_row[j] > max_length) {
1281 max_length = curr_row[j];
1282 }
1283 } else {
1284 // reset LCS length if elements don't match
1285 curr_row[j] = 0;
1286 }
1287 }
1288
1289 // update the previous row for the next iteration
1290 prev_row = curr_row;
1291 }
1292
1293 // return the maximum length of the LCS
1294 return max_length;
1295}
1296
1297//
1298// Vocab utils
1299//
1300
1301std::vector<llama_token> common_tokenize(
1302 const struct llama_context * ctx,
1303 const std::string & text,
1304 bool add_special,
1305 bool parse_special) {
1306 const llama_model * model = llama_get_model(ctx);
1307 const llama_vocab * vocab = llama_model_get_vocab(model);
1308 return common_tokenize(vocab, text, add_special, parse_special);
1309}
1310
1311std::vector<llama_token> common_tokenize(
1312 const struct llama_vocab * vocab,
1313 const std::string & text,
1314 bool add_special,
1315 bool parse_special) {
1316 // upper limit for the number of tokens
1317 int n_tokens = text.length() + 2 * add_special;
1318 std::vector<llama_token> result(n_tokens);
1319 n_tokens = llama_tokenize(vocab, text: text.data(), text_len: text.length(), tokens: result.data(), n_tokens_max: result.size(), add_special, parse_special);
1320 if (n_tokens == std::numeric_limits<int32_t>::min()) {
1321 throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
1322 }
1323 if (n_tokens < 0) {
1324 result.resize(new_size: -n_tokens);
1325 int check = llama_tokenize(vocab, text: text.data(), text_len: text.length(), tokens: result.data(), n_tokens_max: result.size(), add_special, parse_special);
1326 GGML_ASSERT(check == -n_tokens);
1327 } else {
1328 result.resize(new_size: n_tokens);
1329 }
1330 return result;
1331}
1332
1333std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1334 const llama_model * model = llama_get_model(ctx);
1335 const llama_vocab * vocab = llama_model_get_vocab(model);
1336 return common_token_to_piece(vocab, token, special);
1337}
1338
1339std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
1340 std::string piece;
1341 piece.resize(n: piece.capacity()); // using string internal cache, 15 bytes + '\n'
1342 const int n_chars = llama_token_to_piece(vocab, token, buf: &piece[0], length: piece.size(), lstrip: 0, special);
1343 if (n_chars < 0) {
1344 piece.resize(n: -n_chars);
1345 int check = llama_token_to_piece(vocab, token, buf: &piece[0], length: piece.size(), lstrip: 0, special);
1346 GGML_ASSERT(check == -n_chars);
1347 }
1348 else {
1349 piece.resize(n: n_chars);
1350 }
1351
1352 return piece;
1353}
1354
1355std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
1356 const llama_model * model = llama_get_model(ctx);
1357 const llama_vocab * vocab = llama_model_get_vocab(model);
1358 return common_detokenize(vocab, tokens, special);
1359}
1360
1361std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
1362 std::string text;
1363 text.resize(n: std::max(a: text.capacity(), b: tokens.size()));
1364 int32_t n_chars = llama_detokenize(vocab, tokens: tokens.data(), n_tokens: (int32_t)tokens.size(), text: &text[0], text_len_max: (int32_t)text.size(), remove_special: false, unparse_special: special);
1365 if (n_chars < 0) {
1366 text.resize(n: -n_chars);
1367 n_chars = llama_detokenize(vocab, tokens: tokens.data(), n_tokens: (int32_t)tokens.size(), text: &text[0], text_len_max: (int32_t)text.size(), remove_special: false, unparse_special: special);
1368 GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
1369 }
1370
1371 text.resize(n: n_chars);
1372
1373 // NOTE: the original tokenizer decodes bytes after collecting the pieces.
1374 return text;
1375}
1376
1377//
1378// Embedding utils
1379//
1380
// Normalize the n-element embedding `inp` into `out`.
// embd_norm selects the norm:
//   -1 : no normalisation (copy through)
//    0 : max-absolute, scaled to the int16 range (/ 32760)
//    2 : euclidean (L2)
//  other: p-norm with p = embd_norm
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double acc = 0.0;

    switch (embd_norm) {
        case -1: // no normalisation
            acc = 1.0;
            break;
        case 0: // max absolute
            for (int i = 0; i < n; i++) {
                if (acc < std::abs(inp[i])) {
                    acc = std::abs(inp[i]);
                }
            }
            acc /= 32760.0; // make an int16 range
            break;
        case 2: // euclidean
            for (int i = 0; i < n; i++) {
                acc += inp[i] * inp[i];
            }
            acc = std::sqrt(acc);
            break;
        default: // p-norm (euclidean is p-norm p=2)
            for (int i = 0; i < n; i++) {
                acc += std::pow(std::abs(inp[i]), embd_norm);
            }
            acc = std::pow(acc, 1.0 / embd_norm);
            break;
    }

    // a non-positive norm (e.g. all-zero input) yields an all-zero output
    const float scale = acc > 0.0 ? 1.0 / acc : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * scale;
    }
}
1416
// Cosine similarity between two n-dimensional embeddings.
// Zero vectors are handled explicitly: two zero vectors compare as 1.0,
// a zero vector against a non-zero one as 0.0.
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double dot   = 0.0;
    double norm1 = 0.0;
    double norm2 = 0.0;

    for (int i = 0; i < n; i++) {
        dot   += embd1[i] * embd2[i];
        norm1 += embd1[i] * embd1[i];
        norm2 += embd2[i] * embd2[i];
    }

    if (norm1 == 0.0 && norm2 == 0.0) {
        return 1.0f; // two zero vectors are similar
    }
    if (norm1 == 0.0 || norm2 == 0.0) {
        return 0.0f; // exactly one vector is zero
    }

    return dot / (sqrt(norm1) * sqrt(norm2));
}
1438
1439//
1440// Control vector utils
1441//
1442
// Load one control-vector GGUF file into a flat per-layer direction buffer.
// Tensors must be named "direction.<layer>" with <layer> >= 1; every tensor
// must be a 1D F32 vector and all tensors must have the same length (n_embd).
// Directions are scaled by load_info.strength and summed if the same layer
// appears more than once. On any error, n_embd is set to -1 and data cleared.
static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
    common_control_vector_data result = { .n_embd: -1, .data: {} };

    // parse the GGUF file; tensor data is loaded into this temporary context
    ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false,
        /* .ctx = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(fname: load_info.fname.c_str(), params: meta_gguf_params);
    if (!ctx_gguf) {
        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
        return result;
    }

    int32_t n_tensors = gguf_get_n_tensors(ctx: ctx_gguf);
    if (n_tensors == 0) {
        // not fatal by itself: result.n_embd stays -1 and is reported below
        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
    }

    for (int i = 0; i < n_tensors; i++) {
        std::string name = gguf_get_tensor_name(ctx: ctx_gguf, tensor_id: i);

        // layer index parsed from a tensor name of the form "direction.<idx>"
        int layer_idx = -1;

        // split on '.'
        size_t dotpos = name.find(c: '.');
        if (dotpos != std::string::npos && name.substr(pos: 0, n: dotpos) == "direction") {
            try {
                layer_idx = std::stoi(str: name.substr(pos: dotpos + 1));
            } catch (...) {
                layer_idx = -1; // non-numeric suffix
            }
        }
        if (layer_idx < 0) {
            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        } else if (layer_idx == 0) {
            // layer 0 is rejected: its data would never be used (see below)
            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // validate tensor type and shape: must be a 1D F32 vector
        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name: name.c_str());
        if (tensor->type != GGML_TYPE_F32) {
            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (ggml_n_dims(tensor) != 1) {
            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // the first valid tensor fixes n_embd; later tensors must match it
        if (result.n_embd == -1) {
            result.n_embd = ggml_nelements(tensor);
        } else if (ggml_nelements(tensor) != result.n_embd) {
            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // extend if necessary - do not store data for layer 0 (it's not used)
        result.data.resize(new_size: std::max(a: result.data.size(), b: static_cast<size_t>(result.n_embd * layer_idx)), x: 0.0f);

        // accumulate (scaled) into the slot for this layer; layer 1 at [0]
        const float * src = (const float *) tensor->data;
        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
        for (int j = 0; j < result.n_embd; j++) {
            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
        }

    }

    if (result.n_embd == -1) {
        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
        result.data.clear();
    }

    gguf_free(ctx: ctx_gguf);
    ggml_free(ctx);

    return result;
}
1527
1528common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
1529 common_control_vector_data result = { .n_embd: -1, .data: {} };
1530
1531 for (const auto & info : load_infos) {
1532 auto cur = common_control_vector_load_one(load_info: info);
1533
1534 if (cur.n_embd == -1) {
1535 result.n_embd = -1;
1536 break;
1537 }
1538 if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
1539 LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
1540 result.n_embd = -1;
1541 break;
1542 }
1543
1544 if (result.n_embd == -1) {
1545 result = std::move(cur);
1546 } else {
1547 result.data.resize(new_size: std::max(a: result.data.size(), b: cur.data.size()), x: 0.0f); // extend if necessary
1548 for (size_t i = 0; i < cur.data.size(); i++) {
1549 result.data[i] += cur.data[i];
1550 }
1551 }
1552 }
1553
1554 if (result.n_embd == -1) {
1555 LOG_ERR("%s: no valid control vector files passed\n", __func__);
1556 result.data.clear();
1557 }
1558
1559 return result;
1560}
1561
1562ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
1563 const int64_t ne_datapoint = llama_n_ctx(ctx);
1564 const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
1565 ggml_opt_dataset_t result = ggml_opt_dataset_init(
1566 type_data: GGML_TYPE_I32, type_label: GGML_TYPE_I32, ne_datapoint, ne_label: ne_datapoint, ndata, /*ndata_shard =*/ 1);
1567
1568 llama_token * data = (llama_token *) ggml_opt_dataset_data(dataset: result)->data;
1569 llama_token * labels = (llama_token *) ggml_opt_dataset_labels(dataset: result)->data;
1570
1571 for (int64_t idata = 0; idata < ndata; ++idata) {
1572 memcpy(dest: data + idata*ne_datapoint, src: tokens.data() + idata*stride + 0, n: ne_datapoint*sizeof(llama_token));
1573 memcpy(dest: labels + idata*ne_datapoint, src: tokens.data() + idata*stride + 1, n: ne_datapoint*sizeof(llama_token));
1574 }
1575
1576 return result;
1577}
1578
1579ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
1580 ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata: nullptr);
1581 const lr_opt & d = *(lr_opt *) userdata;
1582 result.adamw.alpha = result.sgd.alpha = d.get_lr(e: d.epoch);
1583 result.sgd.wd = result.adamw.wd = d.wd;
1584 return result;
1585}
1586
// TODO make all command line args case-insensitive
// Compare two C strings ignoring ASCII case; true when they are equal.
static inline bool eq_case_insensitive(char const* a, char const* b) {
#if defined(_MSC_VER)
    return _stricmp(a, b) == 0;
#else
    return strcasecmp(a, b) == 0;
#endif // defined(_MSC_VER)
}
1597
1598enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
1599 if (eq_case_insensitive(a: "adamw", b: n)) {
1600 return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
1601 }
1602 if (eq_case_insensitive(a: "sgd", b: n)) {
1603 return GGML_OPT_OPTIMIZER_TYPE_SGD;
1604 }
1605 return GGML_OPT_OPTIMIZER_TYPE_COUNT;
1606}
1607
// TODO simplify to use just log and exp
// natural log of 2 - used by lr_opt to convert a learning-rate ratio into a
// number of halvings
static float const k_log_2 = std::log(x: 2.f);
1610
1611void lr_opt::init() {
1612 if (lr_min > 0 && lr_min < lr0) {
1613 float nhalf = std::log(x: lr0 / lr_min) / k_log_2;
1614 float e = epochs;
1615 if (decay_epochs > 0 && decay_epochs < e) {
1616 e = decay_epochs;
1617 } else {
1618 decay_epochs = e;
1619 }
1620 scale_epoch = nhalf / e;
1621 }
1622}
1623
1624float lr_opt::get_lr(float epoch) const {
1625 float r = lr_min <= 0 ? lr0 :
1626 epoch >= decay_epochs ? lr_min :
1627 lr0 * std::pow(x: 0.5f, y: epoch * scale_epoch);
1628 LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
1629 return r;
1630}
1631