#pragma once

#include "llama.h"

#include "llama-impl.h"
#include "llama-arch.h"
#include "llama-mmap.h"

#include "ggml-cpp.h"

#include <cstddef>
#include <map>
#include <stdexcept>
#include <unordered_map>

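// maps a file (split) index to the backend buffer holding its tensor data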
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
    GGUF_FILE_VERSION_V3 = 3,
};

const char * llama_file_version_name(llama_fver version);

struct llama_model_loader {
    // Holds information on a model weight
    struct llama_tensor_weight {
        uint16_t  idx; // source file index
        size_t   offs; // tensor data offset in the original file

        ggml_tensor * tensor;

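        // resolve the tensor's data offset from the GGUF metadata and verify that
        // the data lies fully within the file bounds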
        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
            const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
            if (tensor_idx < 0) {
                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
            }

            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
            }
        }
    };

    // custom comparator to sort weights more nicely by layer
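    // e.g. "blk.9.attn_q.weight" sorts before "blk.10.attn_q.weight" (numeric, not
    // lexicographic), and names without a "blk.N." prefix (layer -1) come first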
    struct weight_name_comparer {
        bool operator()(const std::string & a, const std::string & b) const {
            int a_layer = -1;
            int b_layer = -1;
            sscanf(a.c_str(), "blk.%d.", &a_layer);
            sscanf(b.c_str(), "blk.%d.", &b_layer);
            if (a_layer != b_layer) {
                return a_layer < b_layer;
            }
            return a < b;
        }
    };

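    // flags for create_tensor():
    //   TENSOR_NOT_REQUIRED - the tensor is optional and may be missing from the file
    //   TENSOR_DUPLICATED   - the tensor is a duplicate of another tensor (e.g. tied embeddings)
    //   TENSOR_SKIP         - the tensor is skipped when loading data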
    static const int TENSOR_NOT_REQUIRED = 1 << 0;
    static const int TENSOR_DUPLICATED   = 1 << 1;
    static const int TENSOR_SKIP         = 1 << 2;

    int n_kv      = 0;
    int n_tensors = 0;
    int n_created = 0;

    uint64_t n_elements = 0;
    size_t   n_bytes    = 0;

    bool use_mmap = false;
    bool check_tensors;

    llama_files files;
    llama_ftype ftype;
    llama_fver  fver;

    llama_mmaps mappings;

    std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
    const llama_model_tensor_buft_override * tensor_buft_overrides;

    gguf_context_ptr meta;
    std::vector<ggml_context_ptr> contexts;

    std::string arch_name;
    LLM_KV      llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

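    // load-progress bookkeeping: bytes loaded so far vs. total, and the byte
    // ranges of each mmap that are actually used by tensor data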
    size_t size_done = 0;
    size_t size_data = 0;
    std::vector<std::pair<size_t, size_t>> mmaps_used;

    llama_model_loader(
        const std::string & fname,
        std::vector<std::string> & splits, // optional, only needed if the splits do not follow the default naming scheme
        bool use_mmap,
        bool check_tensors,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

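    // typed getters for GGUF metadata key/value pairs: get_key reads a scalar,
    // get_arr reads an array, get_arr_n reads an array's element count, and
    // get_key_or_arr accepts either a single value (broadcast) or an array of length n;
    // each returns whether the key was found and throws when a required key is missing
    //
    // example (ml: a llama_model_loader instance):
    //   uint32_t n_layer = 0;
    //   ml.get_key(LLM_KV_BLOCK_COUNT, n_layer); // throws if the key is missing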
    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(const std::string & key, T & result, bool required = true);

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);

    template<typename T>
    bool get_arr(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_key(const std::string & key, T & result, bool required = true);

    template<typename T>
    bool get_key(enum llm_kv kid, T & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);

    template<typename T>
    bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);

    std::string get_arch_name() const;

    enum llm_arch get_arch() const;

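    // tensor lookup by name: the get_* variants return nullptr when the tensor
    // is missing, while the require_* variants throw instead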
    const llama_tensor_weight * get_weight(const char * name) const;

    const llama_tensor_weight & require_weight(const char * name) const;

    struct ggml_tensor * get_tensor_meta(const char * name) const;

    struct ggml_tensor * require_tensor_meta(const std::string & name) const;

    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;

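    // create a tensor in ctx using the metadata of the named weight, after
    // verifying that its dimensions match ne; flags are the TENSOR_* constants above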
    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);

    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);

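    // throws if the number of created tensors does not match the count in the model file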
    void done_getting_tensors() const;

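    // mmap the model files; optionally prefetch the data and, if mlock_mmaps is
    // given, lock the mappings in memory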
    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);

    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;

    // for backwards compatibility, does not support ggml-backend
    void load_data_for(struct ggml_tensor * cur) const;

    // Returns false if cancelled by progress_callback
    bool load_all_data(
            struct ggml_context * ctx,
            llama_buf_map & bufs,
            llama_mlocks * lmlocks,
            llama_progress_callback progress_callback,
            void * progress_callback_user_data);

    std::string ftype_name() const;

    void print_info() const;
};