llama.cpp source code [llama.cpp/src/llama.cpp]

1	#include "llama-impl.h"
2
3	#include "llama-chat.h"
4	#include "llama-mmap.h"
5	#include "llama-vocab.h"
6	#include "llama-model-loader.h"
7	#include "llama-model-saver.h"
8	#include "llama-model.h"
9
10	#include "ggml.h"
11	#include "ggml-backend.h"
12
13	#include <algorithm>
14	#include <cstddef>
15	#include <cstdint>
16	#include <cstdio>
17	#include <cstring>
18	#include <ctime>
19
20	#if defined(_MSC_VER)
21	#pragma warning(disable: 4244 4267) // possible loss of data
22	#endif
23
24	//
25	// interface implementation
26	//
27
28	const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
29	switch (flash_attn_type) {
30	case LLAMA_FLASH_ATTN_TYPE_AUTO:
31	return "auto";
32	case LLAMA_FLASH_ATTN_TYPE_DISABLED:
33	return "disabled";
34	case LLAMA_FLASH_ATTN_TYPE_ENABLED:
35	return "enabled";
36	}
37	GGML_ABORT("fatal error");
38	}
39
40	struct llama_sampler_chain_params llama_sampler_chain_default_params() {
41	struct llama_sampler_chain_params result = {
42	/.no_perf =/ true,
43	};
44
45	return result;
46	}
47
// Returns the fixed device limit reported by this library (currently 16).
size_t llama_max_devices(void) {
    return 16;
}
51
52	bool llama_supports_mmap(void) {
53	return llama_mmap::SUPPORTED;
54	}
55
56	bool llama_supports_mlock(void) {
57	return llama_mlock::SUPPORTED;
58	}
59
60	bool llama_supports_gpu_offload(void) {
61	return ggml_backend_dev_by_type(type: GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr \|\|
62	ggml_backend_dev_by_type(type: GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr \|\|
63	llama_supports_rpc();
64	}
65
66	bool llama_supports_rpc(void) {
67	return ggml_backend_reg_by_name(name: "RPC") != nullptr;
68	}
69
70	void llama_backend_init(void) {
71	ggml_time_init();
72
73	// needed to initialize f16 tables
74	{
75	struct ggml_init_params params = { .mem_size: `0`, NULL, .no_alloc: false };
76	struct ggml_context * ctx = ggml_init(params);
77	ggml_free(ctx);
78	}
79	}
80
81	void llama_numa_init(enum ggml_numa_strategy numa) {
82	if (numa != GGML_NUMA_STRATEGY_DISABLED) {
83	auto * dev = ggml_backend_dev_by_type(type: GGML_BACKEND_DEVICE_TYPE_CPU);
84	GGML_ASSERT(dev && "CPU backend is not loaded");
85	auto * reg = ggml_backend_dev_backend_reg(device: dev);
86	auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, name: "ggml_backend_cpu_numa_init");
87	if (numa_init_fn) {
88	numa_init_fn(numa);
89	}
90	}
91	}
92
93	void llama_backend_free(void) {
94	ggml_quantize_free();
95	}
96
97	int64_t llama_time_us(void) {
98	return ggml_time_us();
99	}
100
101	// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
102	static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
103	// loading time will be recalculated after the first eval, so
104	// we take page faults deferred by mmap() into consideration
105	model.t_load_us = `0`;
106	time_meas tm(model.t_load_us);
107
108	model.t_start_us = tm.t_start_us;
109
110	try {
111	llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
112
113	ml.print_info();
114
115	model.hparams.vocab_only = params.vocab_only;
116
117	try {
118	model.load_arch(ml);
119	} catch(const std::exception & e) {
120	throw std::runtime_error ("error loading model architecture: " + std::string (e.what()));
121	}
122	try {
123	model.load_hparams(ml);
124	} catch(const std::exception & e) {
125	throw std::runtime_error ("error loading model hyperparameters: " + std::string (e.what()));
126	}
127	if (model.arch == LLM_ARCH_CLIP) {
128	throw std::runtime_error ("CLIP cannot be used as main model, use it with --mmproj instead");
129	}
130	try {
131	model.load_vocab(ml);
132	} catch(const std::exception & e) {
133	throw std::runtime_error ("error loading model vocabulary: " + std::string (e.what()));
134	}
135
136	model.load_stats(ml);
137	model.print_info();
138
139	if (params.vocab_only) {
140	LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
141	return `0`;
142	}
143
144	if (!model.load_tensors(ml)) {
145	return -`2`;
146	}
147	} catch (const std::exception & err) {
148	LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
149	return -`1`;
150	}
151
152	return `0`;
153	}
154
155	static struct llama_model * llama_model_load_from_file_impl(
156	const std::string & path_model,
157	std::vector<std::string> & splits,
158	struct llama_model_params params) {
159	ggml_time_init();
160
161	if (!params.vocab_only && ggml_backend_reg_count() == `0`) {
162	LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
163	return nullptr;
164	}
165
166	unsigned cur_percentage = `0`;
167	if (params.progress_callback == NULL) {
168	params.progress_callback_user_data = &cur_percentage;
169	params.progress_callback = [](float progress, void * ctx) {
170	unsigned * cur_percentage_p = (unsigned *) ctx;
171	unsigned percentage = (unsigned) (`100` * progress);
172	while (percentage > *cur_percentage_p) {
173	*cur_percentage_p = percentage;
174	LLAMA_LOG_CONT(".");
175	if (percentage >= `100`) {
176	LLAMA_LOG_CONT("\n");
177	}
178	}
179	return true;
180	};
181	}
182
183	llama_model * model = new llama_model (params);
184
185	// create list of devices to use with this model
186	if (params.devices) {
187	for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
188	model->devices.push_back(x: *dev);
189	}
190	} else {
191	// default device selection
192
193	// build list of available devices
194	std::vector<ggml_backend_dev_t> gpus;
195	std::vector<ggml_backend_dev_t> igpus;
196	std::vector<ggml_backend_dev_t> rpc_servers;
197
198	for (size_t i = `0`; i < ggml_backend_dev_count(); ++i) {
199	ggml_backend_dev_t dev = ggml_backend_dev_get(index: i);
200	switch (ggml_backend_dev_type(device: dev)) {
201	case GGML_BACKEND_DEVICE_TYPE_CPU:
202	case GGML_BACKEND_DEVICE_TYPE_ACCEL:
203	// skip CPU backends since they are handled separately
204	break;
205
206	case GGML_BACKEND_DEVICE_TYPE_GPU: {
207	ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(device: dev);
208	if (ggml_backend_reg_name(reg) == std::string ("RPC")) {
209	rpc_servers.push_back(x: dev);
210	} else {
211	// check if there is already a GPU with the same device id
212	ggml_backend_dev_props props;
213	ggml_backend_dev_get_props(device: dev, props: &props);
214	auto it = std::find_if(first: gpus.begin(), last: gpus.end(), pred: [&props](ggml_backend_dev_t d) {
215	ggml_backend_dev_props d_props;
216	ggml_backend_dev_get_props(device: d, props: &d_props);
217	if (props.device_id && d_props.device_id) {
218	return strcmp(s1: props.device_id, s2: d_props.device_id) == `0`;
219	}
220	return false;
221	});
222
223	if (it != gpus.end()) {
224	LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
225	__func__,
226	ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
227	props.device_id ? props.device_id : "unknown id",
228	ggml_backend_dev_name(it), ggml_backend_dev_description(it));
229	} else {
230	gpus.push_back(x: dev);
231	}
232	}
233	break;
234	}
235
236	case GGML_BACKEND_DEVICE_TYPE_IGPU:
237	igpus.push_back(x: dev);
238	break;
239	}
240	}
241
242	// add RPC servers at the front of the list to minimize network transfers
243	model->devices.insert(position: model->devices.begin(), first: rpc_servers.begin(), last: rpc_servers.end());
244
245	// add GPUs
246	model->devices.insert(position: model->devices.end(), first: gpus.begin(), last: gpus.end());
247
248	// add integrated GPUs only if no other devices were found
249	if (model->devices.empty()) {
250	model->devices.insert(position: model->devices.end(), first: igpus.begin(), last: igpus.end());
251	}
252	}
253
254	// if using single GPU mode, remove all except the main GPU
255	if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
256	if (params.main_gpu < `0`) {
257	model->devices.clear();
258	} else {
259	if (params.main_gpu >= (int)model->devices.size()) {
260	LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
261	llama_model_free(model);
262	return nullptr;
263	}
264	ggml_backend_dev_t main_gpu = model->devices [params.main_gpu];
265	model->devices.clear();
266	model->devices.push_back(x: main_gpu);
267	}
268	}
269
270	for (auto * dev : model->devices) {
271	ggml_backend_dev_props props;
272	ggml_backend_dev_get_props(device: dev, props: &props);
273	LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
274	ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
275	props.device_id ? props.device_id : "unknown id",
276	props.memory_free/`1024`/`1024`);
277	}
278
279	const int status = llama_model_load(fname: path_model, splits, model&: *model, params);
280	GGML_ASSERT(status <= `0`);
281	if (status < `0`) {
282	if (status == -`1`) {
283	LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
284	} else if (status == -`2`) {
285	LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
286	}
287
288	llama_model_free(model);
289	return nullptr;
290	}
291
292	return model;
293	}
294
295	// deprecated
296	struct llama_model * llama_load_model_from_file(
297	const char * path_model,
298	struct llama_model_params params) {
299	return llama_model_load_from_file(path_model, params);
300	}
301
302	struct llama_model * llama_model_load_from_file(
303	const char * path_model,
304	struct llama_model_params params) {
305	std::vector<std::string> splits = {};
306	return llama_model_load_from_file_impl(path_model, splits, params);
307	}
308
309	struct llama_model * llama_model_load_from_splits(
310	const char ** paths,
311	size_t n_paths,
312	struct llama_model_params params) {
313	std::vector<std::string> splits;
314	if (n_paths == `0`) {
315	LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
316	return nullptr;
317	}
318	splits.reserve(n: n_paths);
319	for (size_t i = `0`; i < n_paths; ++i) {
320	splits.push_back(x: paths[i]);
321	}
322	return llama_model_load_from_file_impl(path_model: splits.front(), splits, params);
323	}
324
325	void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
326	llama_model_saver ms(*model);
327	ms.add_kv_from_model();
328	ms.add_tensors_from_model();
329	ms.save(path_model);
330	}
331
332	//
333	// chat templates
334	//
335
336	int32_t llama_chat_apply_template(
337	const char * tmpl,
338	const struct llama_chat_message * chat,
339	size_t n_msg,
340	bool add_ass,
341	char * buf,
342	int32_t length) {
343	const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);
344
345	// format the chat to string
346	std::vector<const llama_chat_message *> chat_vec;
347	chat_vec.resize(new_size: n_msg);
348	for (size_t i = `0`; i < n_msg; i++) {
349	chat_vec [i] = &chat[i];
350	}
351
352	std::string formatted_chat;
353	llm_chat_template detected_tmpl = llm_chat_detect_template(tmpl: curr_tmpl);
354	if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
355	return -`1`;
356	}
357	int32_t res = llm_chat_apply_template(tmpl: detected_tmpl, chat: chat_vec, dest&: formatted_chat, add_ass);
358	if (res < `0`) {
359	return res;
360	}
361	if (buf && length > `0`) {
362	strncpy(dest: buf, src: formatted_chat.c_str(), n: length);
363	}
364	return res;
365	}
366
367	//
368	// model split
369	//
370
// Writes "<path_prefix>-<split_no + 1>-of-<split_count>.gguf" (both numbers
// zero-padded to 5 digits) into `split_path`, using at most `maxlen` bytes
// including the terminating NUL.
//
// Returns the number of characters actually stored (excluding the NUL), i.e.
// the truncated length when the buffer is too small. Returns 0 when nothing
// could be written: null/zero-sized buffer or a formatting error.
int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";

    // nothing can be stored, and the buffer must not be read back
    if (split_path == nullptr || maxlen == 0) {
        return 0;
    }

    // snprintf returns the length the full string would need, or a negative value on error
    const int n_needed = snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count);
    if (n_needed <= 0) {
        return 0;
    }

    // on truncation only maxlen - 1 characters were stored
    return (int) std::min((size_t) n_needed, maxlen - 1);
}
378
// Extracts the prefix of `split_path` that precedes the split suffix
// "-<split_no + 1>-of-<split_count>.gguf" (both numbers zero-padded to 5 digits).
//
// On a match with a non-empty prefix, copies the prefix into `split_prefix`
// (truncated to `maxlen` bytes including the NUL) and returns the prefix length.
// Returns 0 when `split_path` is null, does not end with the expected suffix,
// or consists of the suffix alone.
int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    if (split_path == nullptr) {
        return 0;
    }
    const std::string str_split_path(split_path);

    char postfix[32];
    snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
    const std::string str_postfix(postfix);

    // check if split_path ends with postfix
    // (the difference is negative when the path is shorter than the postfix)
    const int size_prefix = (int) (str_split_path.size() - str_postfix.size());
    if (size_prefix > 0 && str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
        // size_prefix + 1 leaves room for the NUL so that exactly the prefix is copied
        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
        return size_prefix;
    }

    return 0;
}
394
395	const char * llama_print_system_info(void) {
396	static std::string s;
397	s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
398
399	for (size_t i = `0`; i < ggml_backend_reg_count(); i++) {
400	auto * reg = ggml_backend_reg_get(index: i);
401	auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, name: "ggml_backend_get_features");
402	if (get_features_fn) {
403	ggml_backend_feature * features = get_features_fn(reg);
404	s += ggml_backend_reg_name(reg);
405	s += " : ";
406	for (; features->name; features++) {
407	s += features->name;
408	s += " = ";
409	s += features->value;
410	s += " \| ";
411	}
412	}
413	}
414
415	return s.c_str();
416	}
417
418

Browse the source code of llama.cpp/src/llama.cpp