#include "llama-model-loader.h"

#include "ggml.h"

#include <array>
#include <cinttypes>
#include <cstring>
#include <future>

static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;

const char * llama_file_version_name(llama_fver version) {
    switch (version) {
        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
        case GGUF_FILE_VERSION_V2: return "GGUF V2";
        case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
    }

    return "unknown";
}

static std::string llama_model_ftype_name(llama_ftype ftype) {
    if (ftype & LLAMA_FTYPE_GUESSED) {
        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
    }

    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32:          return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16:       return "F16";
        case LLAMA_FTYPE_MOSTLY_BF16:      return "BF16";
        case LLAMA_FTYPE_MOSTLY_Q4_0:      return "Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1:      return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q5_0:      return "Q5_0";
        case LLAMA_FTYPE_MOSTLY_Q5_1:      return "Q5_1";
        case LLAMA_FTYPE_MOSTLY_Q8_0:      return "Q8_0";
        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
        case LLAMA_FTYPE_MOSTLY_Q2_K:      return "Q2_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:    return "Q2_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:    return "Q3_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:    return "Q3_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q3_K_L:    return "Q3_K - Large";
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:    return "Q4_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q4_K_M:    return "Q4_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:    return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:    return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K:      return "Q6_K";
        case LLAMA_FTYPE_MOSTLY_TQ1_0:     return "TQ1_0 - 1.69 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_TQ2_0:     return "TQ2_0 - 2.06 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:   return "IQ2_XXS - 2.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:    return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_S:     return "IQ2_S - 2.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_M:     return "IQ2_M - 2.7 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XS:    return "IQ3_XS - 3.3 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:   return "IQ3_XXS - 3.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_S:     return "IQ1_S - 1.5625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_M:     return "IQ1_M - 1.75 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_NL:    return "IQ4_NL - 4.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:    return "IQ4_XS - 4.25 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_S:     return "IQ3_S - 3.4375 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_M:     return "IQ3_S mix - 3.66 bpw";

        default: return "unknown, may not work";
    }
}

// return a list of splits for a given path
// for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
    std::vector<std::string> paths;
    std::string split_prefix;
    std::vector<char> buf(llama_path_max(), 0);

    {
        int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
        if (!ret) {
            throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
        }
        split_prefix = std::string(buf.data(), ret);
    }

    if (split_prefix.empty()) {
        throw std::runtime_error(format("invalid split file: %s", path.c_str()));
    }

    for (int idx = 0; idx < n_split; ++idx) {
        int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
        paths.push_back(std::string(buf.data(), ret));
    }

    return paths;
}

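// GGUFMeta maps each supported C++ type to its GGUF value type and getter function,
// so that llama_model_loader::get_key()/get_arr() can read metadata values generically.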
namespace GGUFMeta {
    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
    struct GKV_Base_Type {
        static constexpr gguf_type gt = gt_;

        static T getter(const gguf_context * ctx, const int kid) {
            return gfun(ctx, kid);
        }
    };

    template<typename T> struct GKV_Base;

    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};

    template<> struct GKV_Base<std::string> {
        static constexpr gguf_type gt = GGUF_TYPE_STRING;

        static std::string getter(const gguf_context * ctx, const int kid) {
            return gguf_get_val_str(ctx, kid);
        }
    };

    struct ArrayInfo {
        const gguf_type gt;
        const size_t length;
        const void * data;
    };

    template<> struct GKV_Base<ArrayInfo> {
        public:
        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
        static ArrayInfo getter(const gguf_context *ctx, const int k) {
            const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
            return ArrayInfo {
                arr_type,
                size_t(gguf_get_arr_n(ctx, k)),
                arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
            };
        }
    };

    template<typename T>
    class GKV : public GKV_Base<T> {
        GKV() = delete;

        public:
        static T get_kv(const gguf_context * ctx, const int k) {
            const enum gguf_type kt = gguf_get_kv_type(ctx, k);

            if (kt != GKV::gt) {
                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
            }
            return GKV::getter(ctx, k);
        }

        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
            switch (ty) {
                case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
                case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
                case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
                case LLAMA_KV_OVERRIDE_TYPE_STR:   return "str";
            }
            return "unknown";
        }

        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
            if (!ovrd) { return false; }
            if (ovrd->tag == expected_type) {
                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
                    __func__, override_type_to_str(ovrd->tag), ovrd->key);
                switch (ovrd->tag) {
                    case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
                        LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
                    } break;
                    case LLAMA_KV_OVERRIDE_TYPE_INT: {
                        LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
                    } break;
                    case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
                        LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
                    } break;
                    case LLAMA_KV_OVERRIDE_TYPE_STR: {
                        LLAMA_LOG_INFO("%s\n", ovrd->val_str);
                    } break;
                    default:
                        // Shouldn't be possible to end up here, but just in case...
                        throw std::runtime_error(
                            format("Unsupported attempt to override %s type for metadata key %s\n",
                                override_type_to_str(ovrd->tag), ovrd->key));
                }
                return true;
            }
            LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
                __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
            return false;
        }

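        // try_override: one overload per override type, selected with enable_if on the
        // target type (bool, integral, floating point, std::string); each returns true
        // only if a matching override was supplied for this key.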
        template<typename OT>
        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
                target = ovrd->val_bool;
                return true;
            }
            return false;
        }

        template<typename OT>
        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
                target = ovrd->val_i64;
                return true;
            }
            return false;
        }

        template<typename OT>
        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
        try_override(T & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
                target = ovrd->val_f64;
                return true;
            }
            return false;
        }

        template<typename OT>
        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
        try_override(T & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
                target = ovrd->val_str;
                return true;
            }
            return false;
        }

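        // set(): a user-supplied KV override takes precedence over the value stored in the
        // GGUF metadata; returns false only if the key is absent and no override applies.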
        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
            if (try_override<T>(target, ovrd)) {
                return true;
            }
            if (k < 0) { return false; }
            target = get_kv(ctx, k);
            return true;
        }

        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
            return set(ctx, gguf_find_key(ctx, key), target, ovrd);
        }

        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
            return set(ctx, key.c_str(), target, ovrd);
        }
    };
}

template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
    const int kid = gguf_find_key(meta.get(), key.c_str());

    if (kid < 0) {
        if (required) {
            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
        }
        return false;
    }

    struct GGUFMeta::ArrayInfo arr_info =
        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

    result = arr_info.length;
    return true;
}

template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) {
    return get_arr_n(llm_kv(kid), result, required);
}

template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required);

template<typename T>
bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
    const gguf_context * ctx = meta.get();
    const int kid = gguf_find_key(ctx, key.c_str());

    if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
        if (required) {
            throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
        }
        return false;
    }

    struct GGUFMeta::ArrayInfo arr_info =
        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);

    switch (arr_info.gt) {
        case GGUF_TYPE_UINT32:
        case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
                                            (std::is_same<T, uint32_t>::value)); break;
        case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
        case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
        default:
            throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
    }

    if constexpr (std::is_same<T, std::string>::value) {
        const size_t n_items = gguf_get_arr_n(ctx, kid);
        result.clear();

        for (size_t i = 0; i < n_items; i++) {
            const T value = gguf_get_arr_str(ctx, kid, i);
            result.emplace_back(value);
        }
    } else {
        result.resize(arr_info.length);
        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
    }

    return true;
}

template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
    const gguf_context * ctx = meta.get();
    const int kid = gguf_find_key(ctx, key.c_str());

    if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
        if (required) {
            throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
        }
        return false;
    }

    struct GGUFMeta::ArrayInfo arr_info =
        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);

    switch (arr_info.gt) {
        case GGUF_TYPE_UINT32:
        case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
                                            (std::is_same<T, uint32_t>::value)); break;
        case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
        case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
        default:
            throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
    }

    if (arr_info.length > N_MAX) {
        throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
    }

    if constexpr (std::is_same<T, std::string>::value) {
        const size_t n_items = gguf_get_arr_n(ctx, kid);

        for (size_t i = 0; i < n_items; i++) {
            const T value = gguf_get_arr_str(ctx, kid, i);
            result[i] = value;
        }
    } else {
        std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
    }

    return true;
}

template<typename T>
bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) {
    return get_arr(llm_kv(kid), result, required);
}

template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);

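// Scalar metadata accessors; illustrative use only (the key enum and target variable
// depend on the calling code), e.g.:
//   uint32_t n_ctx_train = 0;
//   ml.get_key(LLM_KV_CONTEXT_LENGTH, n_ctx_train);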
template<typename T>
bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
    auto it = kv_overrides.find(key);

    const struct llama_model_kv_override * override =
        it != kv_overrides.end() ? &it->second : nullptr;

    const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);

    if (required && !found) {
        throw std::runtime_error(format("key not found in model: %s", key.c_str()));
    }

    return found;
}

template<typename T>
bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) {
    return get_key(llm_kv(kid), result, required);
}

template bool llama_model_loader::get_key<bool>       (enum llm_kv kid, bool & result, bool required);
template bool llama_model_loader::get_key<float>      (enum llm_kv kid, float & result, bool required);
template bool llama_model_loader::get_key<uint32_t>   (enum llm_kv kid, uint32_t & result, bool required);
template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);

template<>
bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) {
    uint32_t tmp;
    const bool found = get_key(kid, tmp, required);
    if (found) {
        result = (enum llama_pooling_type) tmp;
    } else {
        result = LLAMA_POOLING_TYPE_UNSPECIFIED;
    }
    return found;
}

// get array of n <= N_MAX elements, or a single element repeated n times
template<typename T, size_t N_MAX>
bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
    const int kid = gguf_find_key(meta.get(), key.c_str());

    if (kid < 0) {
        if (required) {
            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
        }
        return false;
    }

    if (n > N_MAX) {
        throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
    }

    if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
        struct GGUFMeta::ArrayInfo arr_info =
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

        if (n != arr_info.length) {
            throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
        }

        return get_arr(key, result, required);
    }

    T value;

    bool ok = get_key(key, value, required);
    if (!ok) {
        return false;
    }

    for (uint32_t i = 0; i < n; i++) {
        result[i] = value;
    }

    return true;
}

template<typename T>
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) {
    return get_key_or_arr(llm_kv(kid), result, n, required);
}

// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);

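// Parse the main GGUF file and any additional splits, build a unified tensor index
// across all files, collect KV overrides, and determine the file type (guessed from
// the dominant tensor type when LLM_KV_GENERAL_FILE_TYPE is not present).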
llama_model_loader::llama_model_loader(
        const std::string & fname,
        std::vector<std::string> & splits,
        bool use_mmap,
        bool check_tensors,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
    int trace = 0;
    if (getenv("LLAMA_TRACE")) {
        trace = atoi(getenv("LLAMA_TRACE"));
    }

    if (param_overrides_p != nullptr) {
        for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
            kv_overrides.insert({std::string(p->key), *p});
        }
    }

    tensor_buft_overrides = param_tensor_buft_overrides_p;

    // Load the main GGUF
    struct ggml_context * ctx = NULL;
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ &ctx,
    };

    meta.reset(gguf_init_from_file(fname.c_str(), params));
    if (!meta) {
        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
    }

    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
    llm_kv = LLM_KV(llm_arch_from_string(arch_name));

    files.emplace_back(new llama_file(fname.c_str(), "rb"));
    contexts.emplace_back(ctx);

    // Save the tensor data offsets of the main file.
    // For split files, the tensor data offsets in `meta` must not be used,
    // so we build a unified tensor index for all weights.
    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string tensor_name = std::string(cur->name);
        // make sure there are no duplicated tensor names
        if (weights_map.find(tensor_name) != weights_map.end()) {
            throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
        }
        n_elements += ggml_nelements(cur);
        n_bytes    += ggml_nbytes(cur);
        weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
    }
    uint16_t n_split = 0;
    get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

    // Load additional GGML contexts
    if (n_split > 1) {
        // make sure the main file is loaded first
        uint16_t idx = 0;
        const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
        get_key(kv_split_no, idx);
        if (idx != 0) {
            throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
        }

        // generate list of splits if needed
        if (splits.empty()) {
            splits = llama_get_list_splits(fname, idx, n_split);
        }

        // in case the user gave a custom list of splits, check that it matches the expected number
        if (n_split != (uint16_t)splits.size()) {
            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
        }

        if (trace > 0) {
            LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
        }

        // load other splits
        for (idx = 1; idx < n_split; idx++) {
            const char * fname_split = splits[idx].c_str();

            struct gguf_init_params split_params = {
                /*.no_alloc = */ true,
                /*.ctx      = */ &ctx,
            };
            gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
            if (!ctx_gguf) {
                throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
            }

            // check idx
            {
                const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
                if (kid < 0) {
                    throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
                }
                int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
                if (idx_gguf != idx) {
                    throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
                }
            }

            files.emplace_back(new llama_file(fname_split, "rb"));
            contexts.emplace_back(ctx);

            // Save the tensor data offsets of this shard.
            for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
                std::string tensor_name = std::string(cur->name);
                // make sure there are no duplicated tensor names
                if (weights_map.find(tensor_name) != weights_map.end()) {
                    throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
                }
                n_elements += ggml_nelements(cur);
                n_bytes    += ggml_nbytes(cur);
                weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
            }
        }

        get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);

        // sanity check
        {
            const int n_tensors_loaded = (int) weights_map.size();
            if (n_tensors != n_tensors_loaded) {
                throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
            }
        }

        LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
    }

    n_kv      = gguf_get_n_kv(meta.get());
    n_tensors = weights_map.size();

    fver = (enum llama_fver) gguf_get_version(meta.get());

    LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
            __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));

    // determine file type based on the number of tensors for each quantization and print meta data
    // TODO: make optional
    {
        std::map<enum ggml_type, uint32_t> n_type;

        uint32_t n_type_max = 0;
        enum ggml_type type_max = GGML_TYPE_F32;

        for (const auto & it : weights_map) {
            const llama_tensor_weight & w = it.second;
            const ggml_tensor * tensor = w.tensor;

            enum ggml_type type = tensor->type;

            n_type[type]++;

            if (n_type_max < n_type[type]) {
                n_type_max = n_type[type];
                type_max   = type;
            }

            if (trace > 0) {
                const uint16_t sid = w.idx;
                LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__,
                        sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(),
                        ggml_nbytes(tensor)/1024.0f/1024.0f);
            }
        }

        switch (type_max) {
            case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
            case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
            case GGML_TYPE_BF16:    ftype = LLAMA_FTYPE_MOSTLY_BF16;    break;
            case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
            case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
            case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
            case GGML_TYPE_Q5_1:    ftype = LLAMA_FTYPE_MOSTLY_Q5_1;    break;
            case GGML_TYPE_Q8_0:    ftype = LLAMA_FTYPE_MOSTLY_Q8_0;    break;
            case GGML_TYPE_Q2_K:    ftype = LLAMA_FTYPE_MOSTLY_Q2_K;    break;
            case GGML_TYPE_Q3_K:    ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M;  break;
            case GGML_TYPE_Q4_K:    ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;  break;
            case GGML_TYPE_Q5_K:    ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M;  break;
            case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
            case GGML_TYPE_TQ1_0:   ftype = LLAMA_FTYPE_MOSTLY_TQ1_0;   break;
            case GGML_TYPE_TQ2_0:   ftype = LLAMA_FTYPE_MOSTLY_TQ2_0;   break;
            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
            case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
            case GGML_TYPE_IQ2_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_S;   break;
            case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
            case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
            case GGML_TYPE_IQ1_M:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_M;   break;
            case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
            case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
            case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
            default:
                {
                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
                    ftype = LLAMA_FTYPE_ALL_F32;
                } break;
        }

        // this is a way to mark that we have "guessed" the file type
        ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

        {
            uint32_t ftype_val = 0;
            if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) {
                ftype = (llama_ftype) ftype_val;
            }
        }

        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);

        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(meta.get(), i);
            const enum gguf_type type   = gguf_get_kv_type(meta.get(), i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
                : gguf_type_name(type);

            std::string value          = gguf_kv_to_str(meta.get(), i);
            const size_t MAX_VALUE_LEN = 40;
            if (value.size() > MAX_VALUE_LEN) {
                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
            }
            replace_all(value, "\n", "\\n");

            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }

        // print type counts
        for (auto & kv : n_type) {
            if (kv.second == 0) {
                continue;
            }

            LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
    }

    if (!llama_mmap::SUPPORTED) {
        LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
        use_mmap = false;
    }

    this->use_mmap = use_mmap;
    this->check_tensors = check_tensors;
}

std::string llama_model_loader::get_arch_name() const {
    return arch_name;
}

enum llm_arch llama_model_loader::get_arch() const {
    return llm_kv.arch;
}

const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const {
    auto pos = weights_map.find(name);
    if (pos != weights_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const {
    const llama_tensor_weight * weight = get_weight(name);
    if (!weight) {
        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
    }
    return *weight;
}

struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const {
    const auto * weight = get_weight(name);
    if (!weight) {
        return nullptr;
    }
    return weight->tensor;
}

struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const {
    struct ggml_tensor * tensor = get_tensor_meta(name.c_str());
    if (!tensor) {
        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
    }
    return tensor;
}

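// verify that the tensor exists and that its shape matches `ne`;
// any dimension beyond ne.size() must be 1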
const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
    const struct ggml_tensor * cur = get_tensor_meta(name.c_str());

    if (cur == NULL) {
        if (!required) {
            return NULL;
        }
        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
    }

    {
        bool is_ok = true;
        for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
            if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
                is_ok = false;
                break;
            }
        }
        if (!is_ok) {
            throw std::runtime_error(
                    format("%s: tensor '%s' has wrong shape; expected %s, got %s",
                        __func__, name.c_str(),
                        llama_format_tensor_shape(ne).c_str(),
                        llama_format_tensor_shape(cur).c_str()));
        }
    }

    return cur;
}

struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
    const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

    if (cur == NULL) {
        return NULL;
    }

    bool duplicated = flags & TENSOR_DUPLICATED;

    struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
    ggml_set_name(tensor, ggml_get_name(cur));

    if (duplicated) {
        size_data += ggml_nbytes(cur);
    } else {
        n_created++;
    }

    return tensor;
}

struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
    const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

    if (cur == NULL) {
        return NULL;
    }

    if (cur->type != base->type) {
        throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
    }

    std::array<int64_t, GGML_MAX_DIMS> dims;
    for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
        dims[i] = i < ne.size() ? ne.begin()[i] : 1;
    }

    struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
                                    dims[0], dims[1], dims[2], dims[3],
                                    cur->nb[1], cur->nb[2], cur->nb[3],
                                    offset);

    ggml_set_name(tensor, name.c_str());

    n_created++;

    return tensor;
}

void llama_model_loader::done_getting_tensors() const {
    if (n_created != n_tensors) {
        throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
    }
}

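// map each file into memory (querying the CPU backend for NUMA awareness and
// initializing the optional mlock regions), and accumulate the total tensor size
// used for progress reporting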
void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
    if (use_mmap) {
        mappings.reserve(files.size());
        mmaps_used.reserve(files.size());
        for (const auto & file : files) {
            bool is_numa = false;

            auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
            if (dev) {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
                if (is_numa_fn) {
                    is_numa = is_numa_fn();
                }
            }

            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
            mmaps_used.emplace_back(mapping->size(), 0);
            if (mlock_mmaps) {
                std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
                mlock_mmap->init(mapping->addr());
                mlock_mmaps->emplace_back(std::move(mlock_mmap));
            }
            mappings.emplace_back(std::move(mapping));
        }
    }

    // compute the total size of all tensors for progress reporting
    for (const auto & it : weights_map) {
        size_data += ggml_nbytes(it.second.tensor);
    }
}

void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
    GGML_ASSERT(!mappings.empty());
    const auto & mapping = mappings.at(idx);

    *first = mapping->size();
    *last  = 0;
    *addr = mapping->addr();
    for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
        const auto * weight = get_weight(ggml_get_name(tensor));
        if (!weight || weight->idx != idx) {
            continue;
        }
        *first = std::min(*first, weight->offs);
        *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
    }
}

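// load the data of a single tensor: point it into (or copy it from) the mmap,
// or read it directly from its file, then optionally validate the row data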
void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
    const auto & w = require_weight(ggml_get_name(cur));

    if (use_mmap) {
        const auto & mapping = mappings.at(w.idx);
        if (cur->data == nullptr) {
            cur->data = (uint8_t *)mapping->addr() + w.offs;
        } else {
            memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur));
        }
    } else {
        GGML_ASSERT(cur->data != nullptr);
        GGML_ASSERT(w.idx < files.size());
        const auto & file = files.at(w.idx);
        file->seek(w.offs, SEEK_SET);
        file->read_raw(cur->data, ggml_nbytes(cur));
    }

    if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
    }
}

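// stream all tensor data into the destination buffers: via mmap, direct file reads,
// or chunked async uploads through pinned host buffers when the backend supports them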
bool llama_model_loader::load_all_data(
        struct ggml_context * ctx,
        llama_buf_map & bufs,
        llama_mlocks * lmlocks,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    GGML_ASSERT(size_data != 0 && "call init_mappings() first");

    std::vector<no_init<uint8_t>> read_buf;
    std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

    // 4 staging buffers for async uploads, each sized 1 MiB, seem to be a good default for single NVMe drives.
    // NVMe RAID configurations might require more / larger buffers.
    constexpr size_t n_buffers = 4;
    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB

    std::vector<ggml_backend_buffer_t> host_buffers;
    std::vector<ggml_backend_event_t> events;
    std::vector<void *> host_ptrs;
    size_t buffer_idx = 0; // buffer to use for async loads
    ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
        if (use_mmap || check_tensors) {
            return nullptr;
        }
        // When not using mmapped IO, use async uploads from pinned memory to GPU memory.
        // First determine if the backend supports the necessary features for async uploads.
        auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
        if (!buf) {
            LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func);
            return nullptr;
        }

        auto * buft = ggml_backend_buffer_get_type(buf);
        auto * dev = ggml_backend_buft_get_device(buft);
        if (!dev) {
            LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func,
                ggml_backend_buft_name(buft));
            return nullptr;
        }

        if (buft != ggml_backend_dev_buffer_type(dev)) {
            LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func,
                ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
            return nullptr;
        }

        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
            LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func,
                ggml_backend_dev_name(dev));
            return nullptr;
        }

        auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
        if (!host_buft) {
            LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func,
                ggml_backend_dev_name(dev));
            return nullptr;
        }

        // If the backend is supported, create pinned memory buffers and events for synchronisation.
        for (size_t idx = 0; idx < n_buffers; ++idx) {
            auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
            if (!buf) {
                LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                    ggml_backend_dev_name(dev));
                return nullptr;
            }

            host_buffers.emplace_back(buf);
            host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));

            auto * event = ggml_backend_event_new(dev);
            if (!event) {
                LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func,
                    ggml_backend_dev_name(dev));
                return nullptr;
            }

            events.emplace_back(event);
        }

        ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
        if (!backend) {
            LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func,
                ggml_backend_dev_name(dev));
            return nullptr;
        }

        return backend;
    }(__func__);

    if (upload_backend) {
        LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
            ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
            ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
            ggml_backend_name(upload_backend));
    }

    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
        const auto * weight = get_weight(ggml_get_name(cur));
        if (weight == nullptr) {
            // this can happen with split experts models
            continue;
        }

        if (progress_callback) {
            if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                return false;
            }
        }

        size_t n_size = ggml_nbytes(cur);

        if (use_mmap) {
            const auto & mapping = mappings.at(weight->idx);
            ggml_backend_buffer_t buf_mmap = nullptr;
            if (bufs.count(weight->idx)) {
                buf_mmap = bufs.at(weight->idx);
            }
            uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;

            if (check_tensors) {
                validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
                    return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                }));
            }

            GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
            if (buf_mmap && cur->data == nullptr) {
                ggml_backend_tensor_alloc(buf_mmap, cur, data);
                if (lmlocks) {
                    const auto & lmlock = lmlocks->at(weight->idx);
                    lmlock->grow_to(weight->offs + n_size);
                }

                auto & mmap_used = mmaps_used[weight->idx];
                mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
            } else {
                ggml_backend_tensor_set(cur, data, 0, n_size);
            }
        } else {
            const auto & file = files.at(weight->idx);
            if (ggml_backend_buffer_is_host(cur->buffer)) {
                file->seek(weight->offs, SEEK_SET);
                file->read_raw(cur->data, n_size);
                if (check_tensors) {
                    validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                        return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
                    }));
                }
            } else {
                // If upload_backend is valid, load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                if (upload_backend) {
                    file->seek(weight->offs, SEEK_SET);

                    size_t bytes_read = 0;

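                    // the staging buffers are used round-robin: wait on the event for the previous
                    // upload from this buffer, refill it from the file, queue the next async upload
                    // and record a new event, so disk reads and device uploads overlap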
                    while (bytes_read < n_size) {
                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);

                        ggml_backend_event_synchronize(events[buffer_idx]);
                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
                        ggml_backend_event_record(events[buffer_idx], upload_backend);

                        bytes_read += read_iteration;
                        ++buffer_idx;
                        buffer_idx %= n_buffers;
                    }
                } else {
                    read_buf.resize(n_size);
                    file->seek(weight->offs, SEEK_SET);
                    file->read_raw(read_buf.data(), n_size);
                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                    }
                }
            }
        }

        size_done += n_size;
    }

    // free temporary resources used for async uploads
    for (auto * event : events) {
        ggml_backend_event_synchronize(event);
        ggml_backend_event_free(event);
    }
    for (auto * buf : host_buffers) {
        ggml_backend_buffer_free(buf);
    }
    ggml_backend_free(upload_backend);

    // check validation results
    bool validation_failed = false;
    for (auto & future : validation_result) {
        auto result = future.get();
        if (!result.second) {
            LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
            validation_failed = true;
        }
    }
    if (validation_failed) {
        throw std::runtime_error("found tensors with invalid data");
    }

    // check if this is the last call and do final cleanup
    if (size_done >= size_data) {
        // unmap offloaded tensors and metadata
        if (use_mmap) {
            for (uint32_t idx = 0; idx < mappings.size(); idx++) {
                const auto & mmap_used = mmaps_used.at(idx);
                auto & mapping = mappings.at(idx);
                mapping->unmap_fragment(0, mmap_used.first);
                if (mmap_used.second != 0) {
                    mapping->unmap_fragment(mmap_used.second, mapping->size());
                }
            }
        }
        if (progress_callback) {
            // Even though the model is done loading, we still honor
            // cancellation since we need to free allocations.
            return progress_callback(1.0f, progress_callback_user_data);
        }
    }

    return true;
}

std::string llama_model_loader::ftype_name() const {
    return llama_model_ftype_name(ftype);
}

void llama_model_loader::print_info() const {
    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
    LLAMA_LOG_INFO("%s: file type   = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
    if (n_bytes < GiB) {
        LLAMA_LOG_INFO("%s: file size   = %.2f MiB (%.2f BPW)\n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
    } else {
        LLAMA_LOG_INFO("%s: file size   = %.2f GiB (%.2f BPW)\n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
    }
}
| 1168 | |