1#if defined(_MSC_VER)
2#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
3#endif
4
5#include "ggml.h"
6#include "gguf.h"
7
8#include "common.h"
9#include "log.h"
10#include "llama.h"
11
12#include <algorithm>
13#include <cinttypes>
14#include <climits>
15#include <cmath>
16#include <codecvt>
17#include <chrono>
18#include <cstdarg>
19#include <cstring>
20#include <ctime>
21#include <filesystem>
22#include <fstream>
23#include <iostream>
24#include <iterator>
25#include <regex>
26#include <sstream>
27#include <string>
28#include <thread>
29#include <unordered_map>
30#include <unordered_set>
31#include <vector>
32
33#if defined(__APPLE__) && defined(__MACH__)
34#include <sys/types.h>
35#include <sys/sysctl.h>
36#endif
37
38#if defined(_WIN32)
39#define WIN32_LEAN_AND_MEAN
40#ifndef NOMINMAX
41# define NOMINMAX
42#endif
43#include <locale>
44#include <windows.h>
45#include <string.h>
46#include <fcntl.h>
47#include <io.h>
48#else
49#include <sys/ioctl.h>
50#include <sys/stat.h>
51#include <unistd.h>
52#endif
53
54#if defined(__linux__)
55#include <sys/types.h>
56#include <pwd.h>
57#endif
58
59#if defined(_MSC_VER)
60#pragma warning(disable: 4244 4267) // possible loss of data
61#endif
62
63//
64// CPU utils
65//
66
// Returns the number of physical CPU cores:
// - Linux:   counts distinct thread-sibling sets from sysfs
// - macOS:   sysctl ("hw.perflevel0.physicalcpu" first, "hw.physicalcpu" as fallback)
// - Windows: counts RelationProcessorCore records from GetLogicalProcessorInformationEx
// If platform detection fails, falls back to a heuristic based on
// std::thread::hardware_concurrency() (half the logical CPUs when > 4, else 4).
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
            + std::to_string(val: cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break; // no more cpus
        }
        // hyperthreads of the same core share an identical sibling mask,
        // so the set collapses them into one entry per physical core
        std::string line;
        if (std::getline(is&: thread_siblings, str&: line)) {
            siblings.insert(x: line);
        }
    }
    if (!siblings.empty()) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    // prefer the performance-core count on hybrid (Apple Silicon) systems
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    unsigned int n_threads_win = std::thread::hardware_concurrency();
    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;

    // first call with a null buffer to query the required buffer size
    DWORD buffer_size = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
            return default_threads;
        }
    }

    std::vector<char> buffer(buffer_size);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
        return default_threads;
    }

    // walk the variable-size records; accumulate the group count of each
    // processor-core record
    int32_t num_physical_cores = 0;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
    while (buffer_size > 0) {
        if (info->Relationship == RelationProcessorCore) {
            num_physical_cores += info->Processor.GroupCount;
        }
        buffer_size -= info->Size;
        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
    }

    return num_physical_cores > 0 ? num_physical_cores : default_threads;
#endif
    // generic fallback heuristic
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
128
129#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
130#include <pthread.h>
131
// Executes the CPUID instruction for the given leaf/subleaf and stores the
// four result registers. rbx is saved/restored through rsi because rbx may be
// reserved (e.g. as the PIC base register) and must not be clobbered directly.
static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__("movq\t%%rbx,%%rsi\n\t"
            "cpuid\n\t"
            "xchgq\t%%rbx,%%rsi"
            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
            : "0"(leaf), "2"(subleaf));
}
140
// Pins the calling thread to the given CPU index.
// Returns 0 on success, a pthread error code otherwise.
static int pin_cpu(int cpu) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);
    return pthread_setaffinity_np(th: pthread_self(), cpusetsize: sizeof(mask), cpuset: &mask);
}
147
// Detects hybrid (performance/efficiency core) x86 CPUs:
// CPUID leaf 7, subleaf 0, EDX bit 15 ("hybrid" flag).
static bool is_hybrid_cpu(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(leaf: 7, subleaf: 0, eax: &eax, ebx: &ebx, ecx: &ecx, edx: &edx);
    return !!(edx & (1u << 15));
}
153
// Returns true if the CPU currently running this thread is an efficiency core.
// CPUID leaf 0x1a reports the core type in EAX[31:24]; 0x20 identifies
// an Intel Atom (efficiency) core.
static bool is_running_on_efficiency_core(void) {
    unsigned eax, ebx, ecx, edx;
    cpuid(leaf: 0x1a, subleaf: 0, eax: &eax, ebx: &ebx, ecx: &ecx, edx: &edx);
    int intel_atom = 0x20;
    int core_type = (eax & 0xff000000u) >> 24;
    return core_type == intel_atom;
}
161
// Counts CPUs useful for math by pinning this thread to each CPU in turn:
// efficiency cores are skipped entirely, and for every performance core the
// next CPU id is skipped as well (assumed to be its hyperthread sibling).
// Returns -1 if pinning fails.
// NOTE(review): the caller is expected to save/restore the thread's affinity
// around this call (cpu_get_num_math does exactly that).
static int cpu_count_math_cpus(int n_cpu) {
    int result = 0;
    for (int cpu = 0; cpu < n_cpu; ++cpu) {
        if (pin_cpu(cpu)) {
            return -1;
        }
        if (is_running_on_efficiency_core()) {
            continue; // efficiency cores harm lockstep threading
        }
        ++cpu; // hyperthreading isn't useful for linear algebra
        ++result;
    }
    return result;
}
176
177#endif // __x86_64__ && __linux__
178
179/**
180 * Returns number of CPUs on system that are useful for math.
181 */
182int32_t cpu_get_num_math() {
183#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
184 int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
185 if (n_cpu < 1) {
186 return cpu_get_num_physical_cores();
187 }
188 if (is_hybrid_cpu()) {
189 cpu_set_t affinity;
190 if (!pthread_getaffinity_np(th: pthread_self(), cpusetsize: sizeof(affinity), cpuset: &affinity)) {
191 int result = cpu_count_math_cpus(n_cpu);
192 pthread_setaffinity_np(th: pthread_self(), cpusetsize: sizeof(affinity), cpuset: &affinity);
193 if (result > 0) {
194 return result;
195 }
196 }
197 }
198#endif
199 return cpu_get_num_physical_cores();
200}
201
202// Helper for setting process priority
203
204#if defined(_WIN32)
205
// Applies the requested scheduler priority to the current process by mapping
// it onto a Windows priority class. Returns true on success (or when the
// priority is already normal); logs a warning and returns false on failure.
bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        // nothing to change
        return true;
    }

    DWORD p = NORMAL_PRIORITY_CLASS;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
    }

    if (!SetPriorityClass(GetCurrentProcess(), p)) {
        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
        return false;
    }

    return true;
}
227
228#else // MacOS and POSIX
229#include <sys/types.h>
230#include <sys/resource.h>
231
232bool set_process_priority(enum ggml_sched_priority prio) {
233 if (prio == GGML_SCHED_PRIO_NORMAL) {
234 return true;
235 }
236
237 int p = 0;
238 switch (prio) {
239 case GGML_SCHED_PRIO_LOW: p = 5; break;
240 case GGML_SCHED_PRIO_NORMAL: p = 0; break;
241 case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
242 case GGML_SCHED_PRIO_HIGH: p = -10; break;
243 case GGML_SCHED_PRIO_REALTIME: p = -20; break;
244 }
245
246 if (!setpriority(PRIO_PROCESS, who: 0, prio: p)) {
247 LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
248 return false;
249 }
250 return true;
251}
252
253#endif
254
255//
256// CLI argument parsing
257//
258
259
260void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
261 int32_t n_set = 0;
262
263 if (cpuparams.n_threads < 0) {
264 // Assuming everything about cpuparams is invalid
265 if (role_model != nullptr) {
266 cpuparams = *role_model;
267 } else {
268 cpuparams.n_threads = cpu_get_num_math();
269 }
270 }
271
272 for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
273 if (cpuparams.cpumask[i]) {
274 n_set++;
275 }
276 }
277
278 if (n_set && n_set < cpuparams.n_threads) {
279 // Not enough set bits, may experience performance issues.
280 LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
281 }
282}
283
284bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
285 size_t dash_loc = range.find(c: '-');
286 if (dash_loc == std::string::npos) {
287 LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
288 return false;
289 }
290
291 size_t start_i;
292 size_t end_i;
293
294 if (dash_loc == 0) {
295 start_i = 0;
296 } else {
297 start_i = std::stoull(str: range.substr(pos: 0, n: dash_loc));
298 if (start_i >= GGML_MAX_N_THREADS) {
299 LOG_ERR("Start index out of bounds!\n");
300 return false;
301 }
302 }
303
304 if (dash_loc == range.length() - 1) {
305 end_i = GGML_MAX_N_THREADS - 1;
306 } else {
307 end_i = std::stoull(str: range.substr(pos: dash_loc + 1));
308 if (end_i >= GGML_MAX_N_THREADS) {
309 LOG_ERR("End index out of bounds!\n");
310 return false;
311 }
312 }
313
314 for (size_t i = start_i; i <= end_i; i++) {
315 boolmask[i] = true;
316 }
317
318 return true;
319}
320
321bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
322 // Discard potential 0x prefix
323 size_t start_i = 0;
324 if (mask.length() >= 2 && mask.substr(pos: 0, n: 2) == "0x") {
325 start_i = 2;
326 }
327
328 size_t num_digits = mask.length() - start_i;
329 if (num_digits > 128) num_digits = 128;
330
331 size_t end_i = num_digits + start_i;
332
333 for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
334 char c = mask.at(n: i);
335 int8_t id = c;
336
337 if ((c >= '0' && c <= '9')) {
338 id -= '0';
339 } else if (c >= 'a' && c <= 'f') {
340 id -= 'a' - 10;
341 } else if (c >= 'A' && c <= 'F') {
342 id -= 'A' - 10;
343 } else {
344 LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
345 return false;
346 }
347
348 boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
349 boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
350 boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
351 boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
352 }
353
354 return true;
355}
356
// One-time process initialization for the common tools: routes llama.cpp's
// logging through the common log sink (filtered by the current verbosity
// threshold) and prints the build banner.
void common_init() {
    llama_log_set(log_callback: [](ggml_log_level level, const char * text, void * /*user_data*/) {
        if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
            common_log_add(log: common_log_main(), level, fmt: "%s", text);
        }
    }, NULL);

#ifdef NDEBUG
    const char * build_type = "";
#else
    const char * build_type = " (debug)";
#endif

    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
372
// Builds a one-line human-readable summary of the runtime configuration:
// thread counts, number of logical CPUs, and the llama.cpp feature string.
std::string common_params_get_system_info(const common_params & params) {
    std::ostringstream os;

    os << "system_info: n_threads = " << params.cpuparams.n_threads;
    // -1 means the batch thread count follows the main one - omit it then
    if (params.cpuparams_batch.n_threads != -1) {
        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
    }
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    // count logical processors across all processor groups
    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
#else
    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
#endif

    return os.str();
}
390
391//
392// String utils
393//
394
395std::string string_format(const char * fmt, ...) {
396 va_list ap;
397 va_list ap2;
398 va_start(ap, fmt);
399 va_copy(ap2, ap);
400 int size = vsnprintf(NULL, maxlen: 0, format: fmt, arg: ap);
401 GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
402 std::vector<char> buf(size + 1);
403 int size2 = vsnprintf(s: buf.data(), maxlen: size + 1, format: fmt, arg: ap2);
404 GGML_ASSERT(size2 == size);
405 va_end(ap2);
406 va_end(ap);
407 return std::string(buf.data(), size);
408}
409
// Returns a copy of str with leading and trailing whitespace removed
// (as classified by std::isspace in the current locale).
// fix: std::isspace requires a value representable as unsigned char - cast
// to avoid undefined behavior on negative chars (e.g. UTF-8 bytes when
// char is signed); also removed invalid IDE inlay-hint annotations.
std::string string_strip(const std::string & str) {
    size_t start = 0;
    size_t end = str.size();
    while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) {
        start++;
    }
    while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
        end--;
    }
    return str.substr(start, end - start);
}
421
// Returns the current local time as "YYYY_MM_DD-HH_MM_SS.nnnnnnnnn".
// The fixed-width, zero-padded fields make lexicographic order equal to
// chronological order, so the string is safe to use in sortable filenames.
// fix: removed invalid IDE inlay-hint parameter annotations.
std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

    const clock::time_point current_time = clock::now();
    const time_t as_time_t = clock::to_time_t(current_time);
    char timestamp_no_ns[100];
    std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));

    // sub-second part: nanoseconds within the current second, zero-padded to 9 digits
    const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
        current_time.time_since_epoch() % 1000000000).count();
    char timestamp_ns[11];
    snprintf(timestamp_ns, 11, "%09" PRId64, ns);

    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
437
// Replaces every non-overlapping occurrence of `search` in `s` with
// `replace`, in place. A single result buffer is built and swapped in, so
// the operation is linear even when `replace` is longer than `search`.
// An empty `search` is a no-op (it would otherwise loop forever).
// fix: removed invalid IDE inlay-hint parameter annotations.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
    size_t last_pos = 0;
    while ((pos = s.find(search, last_pos)) != std::string::npos) {
        builder.append(s, last_pos, pos - last_pos); // text before the match
        builder.append(replace);
        last_pos = pos + search.length();
    }
    builder.append(s, last_pos, std::string::npos); // trailing remainder
    s = std::move(builder);
}
454
// Returns true if `str` ends with `suffix` (an empty suffix always matches).
// fix: removed invalid IDE inlay-hint parameter annotations.
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
458
// Removes `suffix` from the end of `str` in place, if present.
// Returns true when the suffix was found and removed.
// fix: removed invalid IDE inlay-hint annotations; uses resize() instead of
// rebuilding the string via substr(), avoiding an extra allocation.
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
    const bool has_suffix = str.size() >= suffix.size() &&
        str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
    if (has_suffix) {
        str.resize(str.size() - suffix.size());
    }
    return has_suffix;
}
466
// Finds the position in `str` where a *prefix* of `stop` begins at the very
// end of `str` (used to detect a stop sequence that may be split across
// streamed chunks). Longer partial matches are preferred. Returns
// std::string::npos when no suffix of `str` is a prefix of `stop`.
// fix: removed invalid IDE inlay-hint parameter annotations.
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
    if (!str.empty() && !stop.empty()) {
        const char text_last_char = str.back();
        // try the longest candidate prefix of `stop` first
        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
            if (stop[char_index] == text_last_char) {
                const std::string_view current_partial = stop.substr(0, char_index + 1);
                const bool ends_with = str.size() >= current_partial.size() &&
                    str.compare(str.size() - current_partial.size(), current_partial.size(), current_partial) == 0;
                if (ends_with) {
                    return str.size() - char_index - 1;
                }
            }
        }
    }

    return std::string::npos;
}
482
// Escapes all ECMAScript regex metacharacters ( . ^ $ | ( ) * + ? [ ] { } \ )
// in `s` so the result matches the input literally when used in a pattern.
// fix: removed invalid IDE inlay-hint parameter annotations.
std::string regex_escape(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    // "$&" re-inserts the matched character after the backslash
    return std::regex_replace(s, special_chars, "\\$&");
}
487
// Concatenates the given strings, inserting `separator` between consecutive
// elements. An empty vector yields an empty string.
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
    std::string result;
    bool first = true;
    for (const auto & value : values) {
        if (!first) {
            result += separator;
        }
        result += value;
        first = false;
    }
    return result;
}
498
// Splits `str` on every occurrence of `delimiter`. Adjacent delimiters and
// delimiters at either end produce empty parts; the result always contains
// at least one element. The delimiter must be non-empty (an empty delimiter
// would loop forever - unchanged from the original contract).
// fix: removed invalid IDE inlay-hint parameter annotations.
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
    std::vector<std::string> parts;
    size_t start = 0;
    size_t end = str.find(delimiter);

    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }

    // trailing remainder (or the whole string when no delimiter was found)
    parts.push_back(str.substr(start));

    return parts;
}
514
// Returns `str` repeated `n` times ("" when n == 0).
// The result is reserved up front so the loop performs no reallocations.
// fix: removed invalid IDE inlay-hint parameter annotations.
std::string string_repeat(const std::string & str, size_t n) {
    if (n == 0) {
        return "";
    }

    std::string result;
    result.reserve(str.length() * n);

    for (size_t i = 0; i < n; ++i) {
        result += str;
    }

    return result;
}
529
// Renders a boolean as its lowercase literal spelling ("true"/"false").
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
533
// Renders an int vector as "[ 1, 2, 3 ]" for debug logging
// (an empty vector yields "[  ]").
// fix: removed invalid IDE inlay-hint parameter annotations.
std::string string_from(const std::vector<int> & values) {
    std::stringstream buf;

    buf << "[ ";
    bool first = true;
    for (auto e : values) {
        if (first) {
            first = false;
        } else {
            buf << ", ";
        }
        buf << std::to_string(e);
    }
    buf << " ]";

    return buf.str();
}
551
// Renders a token list as "[ 'piece':id, ... ]" for debug logging,
// detokenizing each token with the context's vocabulary.
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
    std::stringstream buf;

    buf << "[ ";

    bool first = true;
    for (const auto & token : tokens) {
        if (!first) {
            buf << ", ";
        } else {
            first = false;
        }

        auto detokenized = common_token_to_piece(ctx, token);

        buf << "'" << detokenized << "'"
            << ":" << std::to_string(val: token);
    }

    buf << " ]";

    return buf.str();
}
575
// Renders a llama_batch as a multi-line debug dump: one line per token with
// its index, detokenized piece, position, sequence count, first sequence id,
// and logits flag. Only the first seq_id of each token is shown.
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
    std::stringstream buf;

    buf << "[ ";

    bool first = true;
    for (int i = 0; i < batch.n_tokens; ++i) {
        if (!first) {
            buf << ", ";
        } else {
            first = false;
        }

        auto detokenized = common_token_to_piece(ctx, token: batch.token[i]);

        buf << "\n" << std::to_string(val: i)
            << ", token '" << detokenized << "'"
            << ", pos " << std::to_string(val: batch.pos[i])
            << ", n_seq_id " << std::to_string(val: batch.n_seq_id[i])
            << ", seq_id " << std::to_string(val: batch.seq_id[i][0])
            << ", logits " << std::to_string(val: batch.logits[i]);
    }

    buf << " ]";

    return buf.str();
}
603
// Decodes C-style escape sequences in `input`, in place:
// \n \r \t \' \" \\ and \xHH (two hex digits). Unrecognized escapes (and a
// malformed \x) are kept verbatim as backslash + character. The string is
// rewritten front-to-back (output never outruns input) and truncated at the end.
// fix: removed invalid IDE inlay-hint parameter annotations.
void string_process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;

    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
            switch (input[++input_idx]) {
                case 'n': input[output_idx++] = '\n'; break;
                case 'r': input[output_idx++] = '\r'; break;
                case 't': input[output_idx++] = '\t'; break;
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
                case 'x':
                    // Handle \x12, etc
                    if (input_idx + 2 < input_len) {
                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
                        char *err_p = nullptr;
                        const long val = std::strtol(x, &err_p, 16);
                        // accept only if both characters parsed as hex
                        if (err_p == x + 2) {
                            input_idx += 2;
                            input[output_idx++] = char(val);
                            break;
                        }
                    }
                    // fall through
                default: input[output_idx++] = '\\';
                         input[output_idx++] = input[input_idx]; break;
            }
        } else {
            input[output_idx++] = input[input_idx];
        }
    }

    input.resize(output_idx);
}
640
// Parses a "key=type:value" KV override specification (e.g. "foo.bar=int:3")
// and appends it to `overrides`. Supported value types: int, float, bool, str.
// Returns false (with an error log) on any malformed input.
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
    // the key is everything before '='; it must fit the fixed 128-byte field
    const char * sep = strchr(s: data, c: '=');
    if (sep == nullptr || sep - data >= 128) {
        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
        return false;
    }
    llama_model_kv_override kvo;
    std::strncpy(dest: kvo.key, src: data, n: sep - data);
    kvo.key[sep - data] = 0;
    sep++;
    // the value is prefixed with its type tag
    if (strncmp(s1: sep, s2: "int:", n: 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
        kvo.val_i64 = std::atol(nptr: sep);
    } else if (strncmp(s1: sep, s2: "float:", n: 6) == 0) {
        sep += 6;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
        kvo.val_f64 = std::atof(nptr: sep);
    } else if (strncmp(s1: sep, s2: "bool:", n: 5) == 0) {
        sep += 5;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
        // only the exact literals "true"/"false" are accepted
        if (std::strcmp(s1: sep, s2: "true") == 0) {
            kvo.val_bool = true;
        } else if (std::strcmp(s1: sep, s2: "false") == 0) {
            kvo.val_bool = false;
        } else {
            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
            return false;
        }
    } else if (strncmp(s1: sep, s2: "str:", n: 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        // the value field is 128 bytes including the terminator
        if (strlen(s: sep) > 127) {
            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
            return false;
        }
        strncpy(dest: kvo.val_str, src: sep, n: 127);
        kvo.val_str[127] = '\0';
    } else {
        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
        return false;
    }
    overrides.emplace_back(args: std::move(kvo));
    return true;
}
686
687//
688// Filesystem utils
689//
690
// Validate if a filename is safe to use
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
// Rejects: empty or overlong (> 255 byte) names, invalid/overlong UTF-8,
// control characters, path separators and their Unicode look-alikes,
// Windows-illegal characters, leading/trailing spaces, trailing dots,
// ".", and anything containing "..".
// fix: removed invalid IDE inlay-hint parameter annotations.
bool fs_validate_filename(const std::string & filename) {
    if (filename.empty()) {
        // Empty filename invalid
        return false;
    }
    if (filename.length() > 255) {
        // Limit at common largest possible filename on Linux filesystems
        // to avoid unnecessary further validation
        // (On systems with smaller limits it will be caught by the OS)
        return false;
    }

    // decode to UTF-32 so each codepoint can be inspected individually
    std::u32string filename_utf32;
    try {
#if defined(__clang__)
    // disable C++17 deprecation warning for std::codecvt_utf8
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
#    pragma GCC diagnostic push
#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif

        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;

#if defined(__clang__)
#    pragma clang diagnostic pop
#elif defined(__GNUC__)
#    pragma GCC diagnostic pop
#endif

        filename_utf32 = converter.from_bytes(filename);

        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
        // or invalid encodings were encountered. Reject such attempts
        const std::string filename_reencoded = converter.to_bytes(filename_utf32);
        if (filename_reencoded != filename) {
            return false;
        }
    } catch (const std::exception &) {
        // any conversion failure means the name is not valid UTF-8
        return false;
    }

    // Check for forbidden codepoints:
    // - Control characters
    // - Unicode equivalents of illegal characters
    // - UTF-16 surrogate pairs
    // - UTF-8 replacement character
    // - Byte order mark (BOM)
    // - Illegal characters: / \ : * ? " < > |
    for (const char32_t c : filename_utf32) {
        if (c <= 0x1F // Control characters (C0)
            || c == 0x7F // Control characters (DEL)
            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
            || c == 0x2215 // Division Slash (forward slash equivalent)
            || c == 0x2216 // Set Minus (backslash equivalent)
            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
            || c == 0xFFFD // Replacement Character (UTF-8)
            || c == 0xFEFF // Byte Order Mark (BOM)
            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
    // Unicode and other whitespace is not affected, only 0x20 space
    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
        return false;
    }

    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // Reject "."
    if (filename == ".") {
        return false;
    }

    return true;
}
777
778#include <iostream>
779
780
// returns true if successful, false otherwise
// NOTE: directories are only created for prefixes ending at a separator, so
// the path must end with a trailing separator for its last component to be
// created (fs_get_cache_directory always returns such a path)
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    // convert the UTF-8 path to UTF-16 for the wide-char Win32 API
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    std::wstring wpath = converter.from_bytes(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);

        pos_slash += 1;

        // skip the drive letter, in some systems it can return an access denied error
        if (subpath.length() == 2 && subpath[1] == ':') {
            continue;
        }

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);

        if (!success) {
            const DWORD error = GetLastError();

            // if the path already exists, ensure that it's a directory
            if (error == ERROR_ALREADY_EXISTS) {
                const DWORD attributes = GetFileAttributesW(subpath.c_str());
                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                    return false;
                }
            } else {
                return false;
            }
        }
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat info;
    if (stat(file: path.c_str(), buf: &info) == 0) {
        return S_ISDIR(info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find(c: '/', pos: pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(pos: 0, n: pos_slash);
        struct stat info;

        // if the path already exists, ensure that it's a directory
        if (stat(file: subpath.c_str(), buf: &info) == 0) {
            if (!S_ISDIR(info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(path: subpath.c_str(), mode: 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}
857
// Resolves the llama.cpp cache directory, always returned with a trailing
// separator: $LLAMA_CACHE if set, otherwise the platform cache location
// ($XDG_CACHE_HOME or $HOME/.cache on Linux/BSD/AIX, ~/Library/Caches on
// macOS, %LOCALAPPDATA% on Windows) with "llama.cpp" appended.
// Throws std::runtime_error when no home directory can be determined.
std::string fs_get_cache_directory() {
    std::string cache_directory = "";
    auto ensure_trailing_slash = [](std::string p) {
        // Make sure to add trailing slash
        if (p.back() != DIRECTORY_SEPARATOR) {
            p += DIRECTORY_SEPARATOR;
        }
        return p;
    };
    if (getenv(name: "LLAMA_CACHE")) {
        // explicit override - used as-is (no "llama.cpp" suffix appended)
        cache_directory = std::getenv(name: "LLAMA_CACHE");
    } else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
        if (std::getenv(name: "XDG_CACHE_HOME")) {
            cache_directory = std::getenv(name: "XDG_CACHE_HOME");
        } else if (std::getenv(name: "HOME")) {
            cache_directory = std::getenv(name: "HOME") + std::string("/.cache/");
        } else {
#if defined(__linux__)
            /* no $HOME is defined, fallback to getpwuid */
            struct passwd *pw = getpwuid(uid: getuid());
            if ((!pw) || (!pw->pw_dir)) {
                throw std::runtime_error("Failed to find $HOME directory");
            }

            cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
            throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
        }
#elif defined(__APPLE__)
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
#else
#  error Unknown architecture
#endif
        cache_directory = ensure_trailing_slash(cache_directory);
        cache_directory += "llama.cpp";
    }
    return ensure_trailing_slash(cache_directory);
}
900
901std::string fs_get_cache_file(const std::string & filename) {
902 GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
903 std::string cache_directory = fs_get_cache_directory();
904 const bool success = fs_create_directory_with_parents(path: cache_directory);
905 if (!success) {
906 throw std::runtime_error("failed to create cache directory: " + cache_directory);
907 }
908 return cache_directory + filename;
909}
910
911
912//
913// Model utils
914//
915
916struct common_init_result common_init_from_params(common_params & params) {
917 common_init_result iparams;
918 auto mparams = common_model_params_to_llama(params);
919
920 llama_model * model = llama_model_load_from_file(path_model: params.model.path.c_str(), params: mparams);
921 if (model == NULL) {
922 LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
923 __func__, params.model.path.c_str());
924 return iparams;
925 }
926
927 const llama_vocab * vocab = llama_model_get_vocab(model);
928
929 auto cparams = common_context_params_to_llama(params);
930
931 llama_context * lctx = llama_init_from_model(model, params: cparams);
932 if (lctx == NULL) {
933 LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
934 __func__, params.model.path.c_str());
935 llama_model_free(model);
936 return iparams;
937 }
938
939 if (params.ctx_shift && !llama_memory_can_shift(mem: llama_get_memory(ctx: lctx))) {
940 LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
941 params.ctx_shift = false;
942 }
943
944 if (!params.control_vectors.empty()) {
945 if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
946 if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
947
948 const auto cvec = common_control_vector_load(load_infos: params.control_vectors);
949 if (cvec.n_embd == -1) {
950 llama_free(ctx: lctx);
951 llama_model_free(model);
952
953 return iparams;
954 }
955
956 int err = llama_apply_adapter_cvec(
957 ctx: lctx,
958 data: cvec.data.data(),
959 len: cvec.data.size(),
960 n_embd: cvec.n_embd,
961 il_start: params.control_vector_layer_start,
962 il_end: params.control_vector_layer_end);
963 if (err) {
964 llama_free(ctx: lctx);
965 llama_model_free(model);
966
967 return iparams;
968 }
969 }
970
971 if (llama_pooling_type(ctx: lctx) == LLAMA_POOLING_TYPE_RANK) {
972 bool ok = true;
973
974 if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
975 LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
976 ok = false;
977 }
978
979 bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
980 bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
981 bool has_rerank_prompt = llama_model_chat_template(model, name: "rerank") != NULL;
982
983 if (!has_eos && !has_sep && !has_rerank_prompt) {
984 LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
985 ok = false;
986 } else if (!has_eos) {
987 LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
988 }
989
990 if (!ok) {
991 llama_free(ctx: lctx);
992 llama_model_free(model);
993
994 return iparams;
995 }
996 }
997
998 // load and optionally apply lora adapters
999 for (auto & la : params.lora_adapters) {
1000 llama_adapter_lora_ptr lora;
1001 lora.reset(p: llama_adapter_lora_init(model, path_lora: la.path.c_str()));
1002 if (lora == nullptr) {
1003 LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
1004 llama_free(ctx: lctx);
1005 llama_model_free(model);
1006 return iparams;
1007 }
1008
1009 char buf[1024];
1010 la.ptr = lora.get();
1011 llama_adapter_meta_val_str(adapter: la.ptr, key: "adapter.lora.task_name", buf, buf_size: sizeof(buf));
1012 la.task_name = buf;
1013 llama_adapter_meta_val_str(adapter: la.ptr, key: "adapter.lora.prompt_prefix", buf, buf_size: sizeof(buf));
1014 la.prompt_prefix = buf;
1015 iparams.lora.emplace_back(args: std::move(lora)); // copy to list of loaded adapters
1016 }
1017
1018 if (!params.lora_init_without_apply) {
1019 common_set_adapter_lora(ctx: lctx, lora&: params.lora_adapters);
1020 }
1021
1022 if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
1023 LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
1024 params.sampling.ignore_eos = false;
1025 }
1026
1027 // initialize once
1028 for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1029 if (llama_vocab_is_eog(vocab, token: i)) {
1030 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
1031 params.sampling.logit_bias_eog.push_back(x: {.token: i, .bias: -INFINITY});
1032 }
1033 }
1034
1035 if (params.sampling.ignore_eos) {
1036 // add EOG biases to the active set of logit biases
1037 params.sampling.logit_bias.insert(
1038 position: params.sampling.logit_bias.end(),
1039 first: params.sampling.logit_bias_eog.begin(), last: params.sampling.logit_bias_eog.end());
1040 }
1041
1042 if (params.sampling.penalty_last_n == -1) {
1043 LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1044 params.sampling.penalty_last_n = llama_n_ctx(ctx: lctx);
1045 }
1046
1047 if (params.sampling.dry_penalty_last_n == -1) {
1048 LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1049 params.sampling.dry_penalty_last_n = llama_n_ctx(ctx: lctx);
1050 }
1051
1052 if (params.warmup) {
1053 LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
1054
1055 llama_set_warmup(ctx: lctx, warmup: true);
1056
1057 std::vector<llama_token> tmp;
1058 llama_token bos = llama_vocab_bos(vocab);
1059 llama_token eos = llama_vocab_eos(vocab);
1060
1061 // some models (e.g. T5) don't have a BOS token
1062 if (bos != LLAMA_TOKEN_NULL) {
1063 tmp.push_back(x: bos);
1064 }
1065 if (eos != LLAMA_TOKEN_NULL) {
1066 tmp.push_back(x: eos);
1067 }
1068 if (tmp.empty()) {
1069 tmp.push_back(x: 0);
1070 }
1071
1072 if (llama_model_has_encoder(model)) {
1073 llama_encode(ctx: lctx, batch: llama_batch_get_one(tokens: tmp.data(), n_tokens: tmp.size()));
1074 llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
1075 if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
1076 decoder_start_token_id = bos;
1077 }
1078 tmp.clear();
1079 tmp.push_back(x: decoder_start_token_id);
1080 }
1081 if (llama_model_has_decoder(model)) {
1082 llama_decode(ctx: lctx, batch: llama_batch_get_one(tokens: tmp.data(), n_tokens: std::min(a: tmp.size(), b: (size_t) params.n_batch)));
1083 }
1084 llama_memory_clear(mem: llama_get_memory(ctx: lctx), data: true);
1085 llama_synchronize(ctx: lctx);
1086 llama_perf_context_reset(ctx: lctx);
1087 llama_set_warmup(ctx: lctx, warmup: false);
1088 }
1089
1090 iparams.model.reset(p: model);
1091 iparams.context.reset(p: lctx);
1092
1093 return iparams;
1094}
1095
// Resolve the base URL used for downloading models.
// MODEL_ENDPOINT takes precedence; HF_ENDPOINT is still honored for
// backward-compatibility. Returns the default Hugging Face endpoint when
// neither variable is set (or when the variable is set but empty - previously
// an empty value caused UB via std::string::back() on an empty string).
// The returned endpoint always ends with '/'.
std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;

    std::string model_endpoint = "https://huggingface.co/";
    if (endpoint_env && endpoint_env[0] != '\0') { // treat an empty value as unset
        model_endpoint = endpoint_env;
        if (model_endpoint.back() != '/') {
            model_endpoint += '/';
        }
    }
    return model_endpoint;
}
1108
1109void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
1110 llama_clear_adapter_lora(ctx);
1111 for (auto & la : lora) {
1112 if (la.scale != 0.0f) {
1113 llama_set_adapter_lora(ctx, adapter: la.ptr, scale: la.scale);
1114 }
1115 }
1116}
1117
1118struct llama_model_params common_model_params_to_llama(common_params & params) {
1119 auto mparams = llama_model_default_params();
1120
1121 if (!params.devices.empty()) {
1122 mparams.devices = params.devices.data();
1123 }
1124
1125 if (params.n_gpu_layers != -1) {
1126 mparams.n_gpu_layers = params.n_gpu_layers;
1127 }
1128
1129 mparams.main_gpu = params.main_gpu;
1130 mparams.split_mode = params.split_mode;
1131 mparams.tensor_split = params.tensor_split;
1132 mparams.use_mmap = params.use_mmap;
1133 mparams.use_mlock = params.use_mlock;
1134 mparams.check_tensors = params.check_tensors;
1135 mparams.use_extra_bufts = !params.no_extra_bufts;
1136 mparams.no_host = params.no_host;
1137
1138 if (params.kv_overrides.empty()) {
1139 mparams.kv_overrides = NULL;
1140 } else {
1141 GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
1142 mparams.kv_overrides = params.kv_overrides.data();
1143 }
1144
1145 if (params.tensor_buft_overrides.empty()) {
1146 mparams.tensor_buft_overrides = NULL;
1147 } else {
1148 GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
1149 mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
1150 }
1151
1152 mparams.progress_callback = params.load_progress_callback;
1153 mparams.progress_callback_user_data = params.load_progress_callback_user_data;
1154
1155 return mparams;
1156}
1157
1158struct llama_context_params common_context_params_to_llama(const common_params & params) {
1159 auto cparams = llama_context_default_params();
1160
1161 cparams.n_ctx = params.n_ctx;
1162 cparams.n_seq_max = params.n_parallel;
1163 cparams.n_batch = params.n_batch;
1164 cparams.n_ubatch = params.n_ubatch;
1165 cparams.n_threads = params.cpuparams.n_threads;
1166 cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
1167 params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
1168 cparams.embeddings = params.embedding;
1169 cparams.rope_scaling_type = params.rope_scaling_type;
1170 cparams.rope_freq_base = params.rope_freq_base;
1171 cparams.rope_freq_scale = params.rope_freq_scale;
1172 cparams.yarn_ext_factor = params.yarn_ext_factor;
1173 cparams.yarn_attn_factor = params.yarn_attn_factor;
1174 cparams.yarn_beta_fast = params.yarn_beta_fast;
1175 cparams.yarn_beta_slow = params.yarn_beta_slow;
1176 cparams.yarn_orig_ctx = params.yarn_orig_ctx;
1177 cparams.pooling_type = params.pooling_type;
1178 cparams.attention_type = params.attention_type;
1179 cparams.flash_attn_type = params.flash_attn_type;
1180 cparams.cb_eval = params.cb_eval;
1181 cparams.cb_eval_user_data = params.cb_eval_user_data;
1182 cparams.offload_kqv = !params.no_kv_offload;
1183 cparams.no_perf = params.no_perf;
1184 cparams.op_offload = !params.no_op_offload;
1185 cparams.swa_full = params.swa_full;
1186 cparams.kv_unified = params.kv_unified;
1187
1188 cparams.type_k = params.cache_type_k;
1189 cparams.type_v = params.cache_type_v;
1190
1191 return cparams;
1192}
1193
1194struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
1195 struct ggml_threadpool_params tpp;
1196
1197 ggml_threadpool_params_init(p: &tpp, n_threads: params.n_threads); // setup the defaults
1198
1199 if (params.mask_valid) {
1200 std::memcpy(dest: &tpp.cpumask, src: &params.cpumask, GGML_MAX_N_THREADS);
1201 }
1202
1203 tpp.prio = params.priority;
1204 tpp.poll = params.poll;
1205 tpp.strict_cpu = params.strict_cpu;
1206
1207 return tpp;
1208}
1209
1210//
1211// Batch utils
1212//
1213
// Reset the batch to an empty state without freeing its buffers, so it can be
// refilled via common_batch_add().
void common_batch_clear(struct llama_batch & batch) {
    batch.n_tokens = 0;
}
1217
1218void common_batch_add(
1219 struct llama_batch & batch,
1220 llama_token id,
1221 llama_pos pos,
1222 const std::vector<llama_seq_id> & seq_ids,
1223 bool logits) {
1224 GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
1225
1226 batch.token [batch.n_tokens] = id;
1227 batch.pos [batch.n_tokens] = pos;
1228 batch.n_seq_id[batch.n_tokens] = seq_ids.size();
1229 for (size_t i = 0; i < seq_ids.size(); ++i) {
1230 batch.seq_id[batch.n_tokens][i] = seq_ids[i];
1231 }
1232 batch.logits [batch.n_tokens] = logits;
1233
1234 batch.n_tokens++;
1235}
1236
1237//
1238// Token utils
1239//
1240
1241size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
1242 size_t i;
1243 for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
1244
1245 return i;
1246}
1247
1248size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
1249 // check for empty sequences
1250 if (a.empty() || b.empty()) {
1251 return 0;
1252 }
1253
1254 // get the lengths of the input sequences
1255 size_t a_len = a.size();
1256 size_t b_len = b.size();
1257
1258 // initialize the maximum length of the longest common subsequence (LCS)
1259 size_t max_length = 0;
1260
1261 // use two rows instead of a 2D matrix to optimize space
1262 std::vector<size_t> prev_row(b_len + 1, 0);
1263 std::vector<size_t> curr_row(b_len + 1, 0);
1264
1265 // iterate through the elements of a
1266 for (size_t i = 1; i <= a_len; i++) {
1267 // iterate through the elements of b
1268 for (size_t j = 1; j <= b_len; j++) {
1269 // if elements at the current positions match
1270 if (a[i - 1] == b[j - 1]) {
1271 // if it's the first element of either sequences, set LCS length to 1
1272 if (i == 1 || j == 1) {
1273 curr_row[j] = 1;
1274 } else {
1275 // increment LCS length by 1 compared to the previous element
1276 curr_row[j] = prev_row[j - 1] + 1;
1277 }
1278
1279 // update max_length if necessary
1280 if (curr_row[j] > max_length) {
1281 max_length = curr_row[j];
1282 }
1283 } else {
1284 // reset LCS length if elements don't match
1285 curr_row[j] = 0;
1286 }
1287 }
1288
1289 // update the previous row for the next iteration
1290 prev_row = curr_row;
1291 }
1292
1293 // return the maximum length of the LCS
1294 return max_length;
1295}
1296
1297//
1298// Vocab utils
1299//
1300
1301std::vector<llama_token> common_tokenize(
1302 const struct llama_context * ctx,
1303 const std::string & text,
1304 bool add_special,
1305 bool parse_special) {
1306 const llama_model * model = llama_get_model(ctx);
1307 const llama_vocab * vocab = llama_model_get_vocab(model);
1308 return common_tokenize(vocab, text, add_special, parse_special);
1309}
1310
1311std::vector<llama_token> common_tokenize(
1312 const struct llama_vocab * vocab,
1313 const std::string & text,
1314 bool add_special,
1315 bool parse_special) {
1316 // upper limit for the number of tokens
1317 int n_tokens = text.length() + 2 * add_special;
1318 std::vector<llama_token> result(n_tokens);
1319 n_tokens = llama_tokenize(vocab, text: text.data(), text_len: text.length(), tokens: result.data(), n_tokens_max: result.size(), add_special, parse_special);
1320 if (n_tokens == std::numeric_limits<int32_t>::min()) {
1321 throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
1322 }
1323 if (n_tokens < 0) {
1324 result.resize(new_size: -n_tokens);
1325 int check = llama_tokenize(vocab, text: text.data(), text_len: text.length(), tokens: result.data(), n_tokens_max: result.size(), add_special, parse_special);
1326 GGML_ASSERT(check == -n_tokens);
1327 } else {
1328 result.resize(new_size: n_tokens);
1329 }
1330 return result;
1331}
1332
1333std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1334 const llama_model * model = llama_get_model(ctx);
1335 const llama_vocab * vocab = llama_model_get_vocab(model);
1336 return common_token_to_piece(vocab, token, special);
1337}
1338
1339std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
1340 std::string piece;
1341 piece.resize(n: piece.capacity()); // using string internal cache, 15 bytes + '\n'
1342 const int n_chars = llama_token_to_piece(vocab, token, buf: &piece[0], length: piece.size(), lstrip: 0, special);
1343 if (n_chars < 0) {
1344 piece.resize(n: -n_chars);
1345 int check = llama_token_to_piece(vocab, token, buf: &piece[0], length: piece.size(), lstrip: 0, special);
1346 GGML_ASSERT(check == -n_chars);
1347 }
1348 else {
1349 piece.resize(n: n_chars);
1350 }
1351
1352 return piece;
1353}
1354
1355std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
1356 const llama_model * model = llama_get_model(ctx);
1357 const llama_vocab * vocab = llama_model_get_vocab(model);
1358 return common_detokenize(vocab, tokens, special);
1359}
1360
1361std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
1362 std::string text;
1363 text.resize(n: std::max(a: text.capacity(), b: tokens.size()));
1364 int32_t n_chars = llama_detokenize(vocab, tokens: tokens.data(), n_tokens: (int32_t)tokens.size(), text: &text[0], text_len_max: (int32_t)text.size(), remove_special: false, unparse_special: special);
1365 if (n_chars < 0) {
1366 text.resize(n: -n_chars);
1367 n_chars = llama_detokenize(vocab, tokens: tokens.data(), n_tokens: (int32_t)tokens.size(), text: &text[0], text_len_max: (int32_t)text.size(), remove_special: false, unparse_special: special);
1368 GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
1369 }
1370
1371 text.resize(n: n_chars);
1372
1373 // NOTE: the original tokenizer decodes bytes after collecting the pieces.
1374 return text;
1375}
1376
1377//
1378// Embedding utils
1379//
1380
// Normalize the n-element embedding `inp` into `out`.
// embd_norm selects the norm:
//   -1 : no normalisation (copy through)
//    0 : max-absolute, scaled to the int16 range (/ 32760)
//    2 : euclidean (L2)
//  other: p-norm with p = embd_norm
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double acc = 0.0;

    switch (embd_norm) {
        case -1: // no normalisation
            acc = 1.0;
            break;
        case 0: // max absolute
            for (int i = 0; i < n; i++) {
                if (acc < std::abs(inp[i])) {
                    acc = std::abs(inp[i]);
                }
            }
            acc /= 32760.0; // make an int16 range
            break;
        case 2: // euclidean
            for (int i = 0; i < n; i++) {
                acc += inp[i] * inp[i];
            }
            acc = std::sqrt(acc);
            break;
        default: // p-norm (euclidean is p-norm p=2)
            for (int i = 0; i < n; i++) {
                acc += std::pow(std::abs(inp[i]), embd_norm);
            }
            acc = std::pow(acc, 1.0 / embd_norm);
            break;
    }

    // a non-positive norm (e.g. all-zero input) yields an all-zero output
    const float scale = acc > 0.0 ? 1.0 / acc : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * scale;
    }
}
1416
// Cosine similarity between two n-dimensional embeddings.
// Zero vectors are handled explicitly: two zero vectors compare as 1.0,
// a zero vector against a non-zero one as 0.0.
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double dot   = 0.0;
    double norm1 = 0.0;
    double norm2 = 0.0;

    for (int i = 0; i < n; i++) {
        dot   += embd1[i] * embd2[i];
        norm1 += embd1[i] * embd1[i];
        norm2 += embd2[i] * embd2[i];
    }

    if (norm1 == 0.0 && norm2 == 0.0) {
        return 1.0f; // two zero vectors are similar
    }
    if (norm1 == 0.0 || norm2 == 0.0) {
        return 0.0f; // exactly one vector is zero
    }

    return dot / (sqrt(norm1) * sqrt(norm2));
}
1438
1439//
1440// Control vector utils
1441//
1442
// Load one control-vector GGUF file into a flat per-layer direction buffer.
// Tensors must be named "direction.<layer>" with <layer> >= 1; every tensor
// must be a 1D F32 vector and all tensors must have the same length (n_embd).
// Directions are scaled by load_info.strength and summed if the same layer
// appears more than once. On any error, n_embd is set to -1 and data cleared.
static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
    common_control_vector_data result = { .n_embd: -1, .data: {} };

    // parse the GGUF file; tensor data is loaded into this temporary context
    ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false,
        /* .ctx = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(fname: load_info.fname.c_str(), params: meta_gguf_params);
    if (!ctx_gguf) {
        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
        return result;
    }

    int32_t n_tensors = gguf_get_n_tensors(ctx: ctx_gguf);
    if (n_tensors == 0) {
        // not fatal by itself: result.n_embd stays -1 and is reported below
        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
    }

    for (int i = 0; i < n_tensors; i++) {
        std::string name = gguf_get_tensor_name(ctx: ctx_gguf, tensor_id: i);

        // layer index parsed from a tensor name of the form "direction.<idx>"
        int layer_idx = -1;

        // split on '.'
        size_t dotpos = name.find(c: '.');
        if (dotpos != std::string::npos && name.substr(pos: 0, n: dotpos) == "direction") {
            try {
                layer_idx = std::stoi(str: name.substr(pos: dotpos + 1));
            } catch (...) {
                layer_idx = -1; // non-numeric suffix
            }
        }
        if (layer_idx < 0) {
            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        } else if (layer_idx == 0) {
            // layer 0 is rejected: its data would never be used (see below)
            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // validate tensor type and shape: must be a 1D F32 vector
        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name: name.c_str());
        if (tensor->type != GGML_TYPE_F32) {
            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (ggml_n_dims(tensor) != 1) {
            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // the first valid tensor fixes n_embd; later tensors must match it
        if (result.n_embd == -1) {
            result.n_embd = ggml_nelements(tensor);
        } else if (ggml_nelements(tensor) != result.n_embd) {
            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        // extend if necessary - do not store data for layer 0 (it's not used)
        result.data.resize(new_size: std::max(a: result.data.size(), b: static_cast<size_t>(result.n_embd * layer_idx)), x: 0.0f);

        // accumulate (scaled) into the slot for this layer; layer 1 at [0]
        const float * src = (const float *) tensor->data;
        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
        for (int j = 0; j < result.n_embd; j++) {
            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
        }

    }

    if (result.n_embd == -1) {
        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
        result.data.clear();
    }

    gguf_free(ctx: ctx_gguf);
    ggml_free(ctx);

    return result;
}
1527
1528common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
1529 common_control_vector_data result = { .n_embd: -1, .data: {} };
1530
1531 for (const auto & info : load_infos) {
1532 auto cur = common_control_vector_load_one(load_info: info);
1533
1534 if (cur.n_embd == -1) {
1535 result.n_embd = -1;
1536 break;
1537 }
1538 if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
1539 LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
1540 result.n_embd = -1;
1541 break;
1542 }
1543
1544 if (result.n_embd == -1) {
1545 result = std::move(cur);
1546 } else {
1547 result.data.resize(new_size: std::max(a: result.data.size(), b: cur.data.size()), x: 0.0f); // extend if necessary
1548 for (size_t i = 0; i < cur.data.size(); i++) {
1549 result.data[i] += cur.data[i];
1550 }
1551 }
1552 }
1553
1554 if (result.n_embd == -1) {
1555 LOG_ERR("%s: no valid control vector files passed\n", __func__);
1556 result.data.clear();
1557 }
1558
1559 return result;
1560}
1561
1562ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
1563 const int64_t ne_datapoint = llama_n_ctx(ctx);
1564 const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
1565 ggml_opt_dataset_t result = ggml_opt_dataset_init(
1566 type_data: GGML_TYPE_I32, type_label: GGML_TYPE_I32, ne_datapoint, ne_label: ne_datapoint, ndata, /*ndata_shard =*/ 1);
1567
1568 llama_token * data = (llama_token *) ggml_opt_dataset_data(dataset: result)->data;
1569 llama_token * labels = (llama_token *) ggml_opt_dataset_labels(dataset: result)->data;
1570
1571 for (int64_t idata = 0; idata < ndata; ++idata) {
1572 memcpy(dest: data + idata*ne_datapoint, src: tokens.data() + idata*stride + 0, n: ne_datapoint*sizeof(llama_token));
1573 memcpy(dest: labels + idata*ne_datapoint, src: tokens.data() + idata*stride + 1, n: ne_datapoint*sizeof(llama_token));
1574 }
1575
1576 return result;
1577}
1578
1579ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
1580 ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata: nullptr);
1581 const lr_opt & d = *(lr_opt *) userdata;
1582 result.adamw.alpha = result.sgd.alpha = d.get_lr(e: d.epoch);
1583 result.sgd.wd = result.adamw.wd = d.wd;
1584 return result;
1585}
1586
// TODO make all command line args case-insensitive
// Compare two C strings ignoring ASCII case; true when they are equal.
static inline bool eq_case_insensitive(char const* a, char const* b) {
#if defined(_MSC_VER)
    return _stricmp(a, b) == 0;
#else
    return strcasecmp(a, b) == 0;
#endif // defined(_MSC_VER)
}
1597
1598enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
1599 if (eq_case_insensitive(a: "adamw", b: n)) {
1600 return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
1601 }
1602 if (eq_case_insensitive(a: "sgd", b: n)) {
1603 return GGML_OPT_OPTIMIZER_TYPE_SGD;
1604 }
1605 return GGML_OPT_OPTIMIZER_TYPE_COUNT;
1606}
1607
// TODO simplify to use just log and exp
// natural log of 2 - used by lr_opt to convert a learning-rate ratio into a
// number of halvings
static float const k_log_2 = std::log(x: 2.f);
1610
1611void lr_opt::init() {
1612 if (lr_min > 0 && lr_min < lr0) {
1613 float nhalf = std::log(x: lr0 / lr_min) / k_log_2;
1614 float e = epochs;
1615 if (decay_epochs > 0 && decay_epochs < e) {
1616 e = decay_epochs;
1617 } else {
1618 decay_epochs = e;
1619 }
1620 scale_epoch = nhalf / e;
1621 }
1622}
1623
1624float lr_opt::get_lr(float epoch) const {
1625 float r = lr_min <= 0 ? lr0 :
1626 epoch >= decay_epochs ? lr_min :
1627 lr0 * std::pow(x: 0.5f, y: epoch * scale_epoch);
1628 LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
1629 return r;
1630}
1631