#include "llama-impl.h"

#include "llama-chat.h"
#include "llama-mmap.h"
#include "llama-vocab.h"
#include "llama-model-loader.h"
#include "llama-model-saver.h"
#include "llama-model.h"

#include "ggml.h"
#include "ggml-backend.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

//
// interface implementation
//

const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
    switch (flash_attn_type) {
        case LLAMA_FLASH_ATTN_TYPE_AUTO:
            return "auto";
        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
            return "disabled";
        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
            return "enabled";
    }
    GGML_ABORT("fatal error");
}

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
    };

    return result;
}

size_t llama_max_devices(void) {
    return 16;
}

bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
}

bool llama_supports_mlock(void) {
    return llama_mlock::SUPPORTED;
}

bool llama_supports_gpu_offload(void) {
    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
           llama_supports_rpc();
}

bool llama_supports_rpc(void) {
    return ggml_backend_reg_by_name("RPC") != nullptr;
}

void llama_backend_init(void) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
}
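
// Illustrative sketch (not part of the library): the typical lifecycle around
// llama_backend_init()/llama_backend_free(). Backends must be loaded before creating a
// model (see the hint logged in llama_model_load_from_file_impl below); the model path
// is a placeholder.
//
//     ggml_backend_load_all();                          // discover and load available backends
//     llama_backend_init();                             // one-time ggml initialization (f16 tables, timers)
//     llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);   // optional NUMA setup on the CPU backend
//
//     llama_model_params mparams = llama_model_default_params();
//     llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
//     // ... use the model ...
//     llama_model_free(model);
//     llama_backend_free();                             // releases quantization scratch state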

void llama_numa_init(enum ggml_numa_strategy numa) {
    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        GGML_ASSERT(dev && "CPU backend is not loaded");
        auto * reg = ggml_backend_dev_backend_reg(dev);
        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
        if (numa_init_fn) {
            numa_init_fn(numa);
        }
    }
}

void llama_backend_free(void) {
    ggml_quantize_free();
}

int64_t llama_time_us(void) {
    return ggml_time_us();
}

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
    // loading time will be recalculated after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = 0;
    time_meas tm(model.t_load_us);

    model.t_start_us = tm.t_start_us;

    try {
        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();

        model.hparams.vocab_only = params.vocab_only;

        try {
            model.load_arch(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
        }
        try {
            model.load_hparams(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
        }
        if (model.arch == LLM_ARCH_CLIP) {
            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
        }
        try {
            model.load_vocab(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
        }

        model.load_stats(ml);
        model.print_info();

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return 0;
        }

        if (!model.load_tensors(ml)) {
            return -2;
        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
        return -1;
    }

    return 0;
}
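
// Illustrative sketch (assumption, not part of the library): how a caller reaches the
// -2 (cancellation) path above. A user-supplied progress_callback that returns false
// aborts loading; the threshold and model path below are placeholders.
//
//     llama_model_params mparams = llama_model_default_params();
//     mparams.progress_callback = [](float progress, void * /*user_data*/) {
//         return progress < 0.5f; // stop once half the tensors have been loaded
//     };
//     llama_model * model = llama_model_load_from_file("model.gguf", mparams);
//     // model == nullptr here, and "cancelled model load" is logged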

static struct llama_model * llama_model_load_from_file_impl(
        const std::string & path_model,
        std::vector<std::string> & splits,
        struct llama_model_params params) {
    ggml_time_init();

    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
        return nullptr;
    }

    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
                LLAMA_LOG_CONT(".");
                if (percentage >= 100) {
                    LLAMA_LOG_CONT("\n");
                }
            }
            return true;
        };
    }

    llama_model * model = new llama_model(params);

    // create list of devices to use with this model
    if (params.devices) {
        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
            model->devices.push_back(*dev);
        }
    } else {
        // default device selection

        // build list of available devices
        std::vector<ggml_backend_dev_t> gpus;
        std::vector<ggml_backend_dev_t> igpus;
        std::vector<ggml_backend_dev_t> rpc_servers;

        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            switch (ggml_backend_dev_type(dev)) {
                case GGML_BACKEND_DEVICE_TYPE_CPU:
                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
                    // skip CPU backends since they are handled separately
                    break;

                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                        rpc_servers.push_back(dev);
                    } else {
                        // check if there is already a GPU with the same device id
                        ggml_backend_dev_props props;
                        ggml_backend_dev_get_props(dev, &props);
                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
                            ggml_backend_dev_props d_props;
                            ggml_backend_dev_get_props(d, &d_props);
                            if (props.device_id && d_props.device_id) {
                                return strcmp(props.device_id, d_props.device_id) == 0;
                            }
                            return false;
                        });

                        if (it != gpus.end()) {
                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
                                    __func__,
                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                                    props.device_id ? props.device_id : "unknown id",
                                    ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
                        } else {
                            gpus.push_back(dev);
                        }
                    }
                    break;
                }

                case GGML_BACKEND_DEVICE_TYPE_IGPU:
                    igpus.push_back(dev);
                    break;
            }
        }

        // add RPC servers at the front of the list to minimize network transfers
        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());

        // add GPUs
        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());

        // add integrated GPUs only if no other devices were found
        if (model->devices.empty()) {
            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
        }
    }

    // if using single GPU mode, remove all except the main GPU
    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
        if (params.main_gpu < 0) {
            model->devices.clear();
        } else {
            if (params.main_gpu >= (int)model->devices.size()) {
                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
                llama_model_free(model);
                return nullptr;
            }
            ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
            model->devices.clear();
            model->devices.push_back(main_gpu);
        }
    }

    for (auto * dev : model->devices) {
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
                ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                props.device_id ? props.device_id : "unknown id",
                props.memory_free/1024/1024);
    }

    const int status = llama_model_load(path_model, splits, *model, params);
    GGML_ASSERT(status <= 0);
    if (status < 0) {
        if (status == -1) {
            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
        } else if (status == -2) {
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
        }

        llama_model_free(model);
        return nullptr;
    }

    return model;
}

// deprecated
struct llama_model * llama_load_model_from_file(
        const char * path_model,
        struct llama_model_params params) {
    return llama_model_load_from_file(path_model, params);
}

struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    std::vector<std::string> splits = {};
    return llama_model_load_from_file_impl(path_model, splits, params);
}

struct llama_model * llama_model_load_from_splits(
        const char ** paths,
        size_t n_paths,
        struct llama_model_params params) {
    std::vector<std::string> splits;
    if (n_paths == 0) {
        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
        return nullptr;
    }
    splits.reserve(n_paths);
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }
    return llama_model_load_from_file_impl(splits.front(), splits, params);
}
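
// Illustrative sketch (assumption, not part of the library): loading a model that was
// written as multiple GGUF splits. The file names are placeholders; they only need to
// belong to the same split set. The first path is used as the main file name.
//
//     const char * paths[] = {
//         "model-00001-of-00003.gguf",
//         "model-00002-of-00003.gguf",
//         "model-00003-of-00003.gguf",
//     };
//     llama_model_params mparams = llama_model_default_params();
//     llama_model * model = llama_model_load_from_splits(paths, 3, mparams);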

void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
    llama_model_saver ms(*model);
    ms.add_kv_from_model();
    ms.add_tensors_from_model();
    ms.save(path_model);
}

//
// chat templates
//

int32_t llama_chat_apply_template(
        const char * tmpl,
        const struct llama_chat_message * chat,
        size_t n_msg,
        bool add_ass,
        char * buf,
        int32_t length) {
    const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);

    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }

    std::string formatted_chat;
    llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
        return -1;
    }
    int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    if (buf && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
}
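
// Illustrative sketch (assumption, not part of the library): the common two-pass usage
// pattern for this function. The return value is the length of the formatted chat, which
// can exceed the buffer passed in, so callers typically resize and retry. Messages and
// buffer size are placeholders.
//
//     llama_chat_message msgs[] = {
//         { "system", "You are a helpful assistant." },
//         { "user",   "Hello!" },
//     };
//     std::vector<char> buf(1024);
//     int32_t n = llama_chat_apply_template("chatml", msgs, 2, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
//     if (n > (int32_t) buf.size()) {
//         buf.resize(n);
//         n = llama_chat_apply_template("chatml", msgs, 2, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
//     }
//     std::string prompt(buf.data(), n);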

//
// model split
//

int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
        return strlen(split_path);
    }
    return 0;
}
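
// Illustrative example (paths are placeholders): split_no is 0-based on input and is
// rendered 1-based with zero padding by the format above, e.g.
//
//     char split_path[1024]; // assumes a large-enough buffer
//     llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 1, 4);
//     // split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf"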

int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    std::string str_split_path(split_path);
    char postfix[32];
    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
    std::string str_postfix(postfix);

    // check if split_path ends with the expected postfix
    int size_prefix = str_split_path.size() - str_postfix.size();
    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
        return size_prefix;
    }

    return 0;
}
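
// Illustrative example (paths are placeholders): the inverse of llama_split_path,
// recovering the prefix from a split file name when split_no and split_count match.
//
//     char split_prefix[1024]; // assumes a large-enough buffer
//     llama_split_prefix(split_prefix, sizeof(split_prefix),
//                        "/models/ggml-model-q4_0-00002-of-00004.gguf", 1, 4);
//     // split_prefix == "/models/ggml-model-q4_0", return value is its length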

const char * llama_print_system_info(void) {
    static std::string s;
    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.

    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        auto * reg = ggml_backend_reg_get(i);
        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
        if (get_features_fn) {
            ggml_backend_feature * features = get_features_fn(reg);
            s += ggml_backend_reg_name(reg);
            s += " : ";
            for (; features->name; features++) {
                s += features->name;
                s += " = ";
                s += features->value;
                s += " | ";
            }
        }
    }

    return s.c_str();
}
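
// Illustrative sketch (assumption, not part of the library): printing the feature string
// after backends are loaded, e.g. to verify which CPU/GPU features are compiled in.
//
//     ggml_backend_load_all();
//     printf("system info: %s\n", llama_print_system_info());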