#include "llama-adapter.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"

#include <map>
#include <cassert>
#include <sstream>
#include <stdexcept>

// vec

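// return the control vector tensor for layer il, or nullptr when the layer is
// outside the active [layer_start, layer_end] range or has no tensor allocated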
ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

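// add the control vector direction for layer il (if any) to the current hidden state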
ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}

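// allocate one F32 tensor of size n_embd per layer (layer 0 never has one),
// grouping the tensors into one ggml context + backend buffer per buffer type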
bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(tensors.empty());
    GGML_ASSERT(ctxs.empty());
    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    tensors.reserve(hparams.n_layer);
    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        bufs.emplace_back(buf);
    }

    return true;
}

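// copy a flat host buffer of (n_layer - 1) * n_embd floats into the per-layer tensors
// and set the active layer range; data == nullptr disables the control vector while
// keeping the buffers allocated for later reuse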
bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
        return true;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return false;
    }

    if (tensors.empty()) {
        if (!init(model)) {
            return false;
        }
    }

    layer_start = il_start;
    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

    return true;
}

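// illustrative caller sketch for the control-vector API above (hypothetical code,
// the variable names are placeholders and not part of this file):
//
//     std::vector<float> cv = load_control_vector(...); // (n_layer - 1) * n_embd floats
//     cvec.apply(model, cv.data(), cv.size(), n_embd, il_start, il_end);
//     ...
//     cvec.apply(model, nullptr, 0, n_embd, -1, -1);    // disable, keep buffers allocated
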
// lora

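// look up the LoRA A/B pair that patches the given base-model tensor, keyed by tensor name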
llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

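// load a LoRA adapter from a GGUF file into `adapter`:
//   1. read and validate the metadata (general.type, architecture, adapter.type, alpha,
//      optional aLoRA invocation tokens)
//   2. pair up the *.lora_a / *.lora_b tensors and validate their shapes against the base model
//   3. allocate backend buffers matching the placement of the corresponding base tensors
//      (falling back to CPU for extra buffer types)
//   4. copy the tensor data from the file into the backend buffers
// throws std::runtime_error on any validation or allocation failure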
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        const gguf_context * gguf_ctx = ctx_gguf.get();

        LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);

        // get metadata as string
        for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
            gguf_type type = gguf_get_kv_type(gguf_ctx, i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
                : gguf_type_name(type);
            const char * name = gguf_get_key(gguf_ctx, i);
            const std::string value = gguf_kv_to_str(gguf_ctx, i);

            if (type != GGUF_TYPE_ARRAY) {
                adapter.gguf_kv.emplace(name, value);
            }

            const size_t MAX_VALUE_LEN = 40;
            std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
            replace_all(print_value, "\n", "\\n");

            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
        }

        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));

        // parse alora invocation sequence vector
        const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
        const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
        if (kid >= 0) {
            if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
                throw std::runtime_error("invalid gguf type for " + key);
            }
            const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
            if (arr_type != GGUF_TYPE_UINT32) {
                throw std::runtime_error("invalid gguf element type for " + key);
            }
            const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
            const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
            adapter.alora_invocation_tokens.resize(seq_len);
            std::copy(
                (const llama_token *)data,
                (const llama_token *)data + seq_len,
                adapter.alora_invocation_tokens.begin());
        }
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        }
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else if (str_endswith(name, "_norm.weight")) {
            // TODO: add support for norm vector
            // for now, we don't really care because most adapters still work fine without it
            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // get extra buffer types of the CPU
    // TODO: a more general solution for non-CPU extra bufts should be implemented in the future
    // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!cpu_dev) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_extra.emplace_back(*extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_adapter_lora_weight & w = it.second;
        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);

        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
        for (auto & ex : buft_extra) {
            if (ex == buft) {
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
            }
        }

        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

        ggml_context * dev_ctx = ctx_for_buft(buft);
        // validate tensor shape
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
        } else {
            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
            if (w.a->ne[1] != w.b->ne[0]) {
                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
            }
        }

        // save tensor to adapter
        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

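// public entry point: returns a heap-allocated adapter on success, nullptr on failure
// (exceptions from the loading code above are caught and logged here)
//
// illustrative call sequence (the adapter path is a placeholder):
//
//     llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
//     if (adapter) {
//         // ... attach it to a context and run inference ...
//         llama_adapter_lora_free(adapter);
//     }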
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
    llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}

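// metadata accessors (see also the *_by_index variants below): write the requested
// GGUF key/value string into buf and return its snprintf-style length, or -1 (and an
// empty string) when the key or index is invalid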
int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
    const auto & it = adapter->gguf_kv.find(key);
    if (it == adapter->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
    return (int)adapter->gguf_kv.size();
}

int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    delete adapter;
}

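// aLoRA adapters carry an invocation token sequence in their metadata; these accessors
// expose its length and data (the count accessor returns 0 for a null adapter, the data
// accessor asserts a non-null adapter)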
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
    if (!adapter) {
        return 0;
    }
    return adapter->alora_invocation_tokens.size();
}

const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
    GGML_ASSERT(adapter);
    return adapter->alora_invocation_tokens.data();
}