#pragma once

#include "llama.h"

#include "ggml-cpp.h"

#include <string>
#include <unordered_map>
#include <vector>

// TODO: pimpl

//
// llama_adapter_cvec
//

struct llama_adapter_cvec {
    // return the control vector tensor for layer il, or nullptr if there is none
    ggml_tensor * tensor_for(int il) const;

    // add the layer's control vector to cur (returns cur unchanged if there is none)
    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const;

    // load per-layer control-vector data for layers [il_start, il_end];
    // returns false if the data is inconsistent with the model (e.g. n_embd mismatch)
    bool apply(
            const llama_model & model,
            const float * data,
            size_t len,
            int32_t n_embd,
            int32_t il_start,
            int32_t il_end);

private:
    bool init(const llama_model & model);

    int32_t layer_start = -1;
    int32_t layer_end   = -1;

    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    std::vector<ggml_tensor *> tensors; // per layer
};
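
// Illustrative usage (a sketch, not part of the API surface; `model`, `data`,
// `len` and `n_layer` are assumed to come from the caller):
//
//   llama_adapter_cvec cvec;
//   if (!cvec.apply(model, data, len, n_embd, /*il_start=*/1, /*il_end=*/n_layer)) {
//       // invalid control-vector data for this model
//   }
//   // later, while building the graph for layer il:
//   cur = cvec.apply_to(ctx, cur, il);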

//
// llama_adapter_lora
//

struct llama_adapter_lora_weight {
    ggml_tensor * a = nullptr;
    ggml_tensor * b = nullptr;

    // get the actual scale based on rank and alpha; if alpha == 0 (not set),
    // fall back to the user-provided adapter_scale unchanged
    float get_scale(float alpha, float adapter_scale) const {
        const float rank = (float) b->ne[0];
        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
        return scale;
    }
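
    // Worked example: for a rank-8 adapter (b->ne[0] == 8) with alpha == 16,
    // a user adapter_scale of 0.5 gives 0.5 * 16 / 8 = 1.0; with alpha == 0
    // the effective scale is just adapter_scale.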

    llama_adapter_lora_weight() = default;
    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
};

struct llama_adapter_lora {
    // map a model tensor name to its LoRA A/B weight pair
    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;

    // lora alpha read from the gguf metadata; 0 means "not set"
    float alpha = 0.0f;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

    // activated lora (aLoRA)
    std::vector<llama_token> alora_invocation_tokens;

    llama_adapter_lora() = default;
    ~llama_adapter_lora() = default;

    // return the LoRA weight pair for the model tensor w,
    // or nullptr if this adapter does not modify it
    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
};
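
// Illustrative sketch of how a graph builder might consume an adapter
// (`adapter`, `w`, and the delta computation are assumptions, not part of
// this header):
//
//   llama_adapter_lora_weight * lw = adapter.get_weight(w);
//   if (lw != nullptr) {
//       const float scale = lw->get_scale(adapter.alpha, /*adapter_scale=*/1.0f);
//       // delta = scale * (B * (A * x)), added on top of the base w * x
//   }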

// set of active adapters mapped to their user-specified scales
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
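
// Example of populating the map (illustrative; in practice the context owns
// and manages this when adapters are attached):
//
//   llama_adapter_loras loras;
//   loras[adapter_ptr] = 0.75f; // apply this adapter at 75% strength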