#include "ggml.h"
#include "gguf.h"
#include "llama.h"
#include "common.h"

#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <cstring>
#include <fstream>
#include <string>
#include <vector>

#if defined(_WIN32)
    #include <windows.h>
    #ifndef PATH_MAX
        #define PATH_MAX MAX_PATH
    #endif
    #include <io.h>
#endif

enum split_operation : uint8_t {
    OP_NONE,
    OP_SPLIT,
    OP_MERGE,
};

enum split_mode : uint8_t {
    MODE_NONE,
    MODE_TENSOR,
    MODE_SIZE,
};

struct split_params {
    split_operation operation = OP_NONE;
    split_mode mode = MODE_NONE;
    size_t n_bytes_split = 0;
    int n_split_tensors = 128;
    std::string input;
    std::string output;
    bool no_tensor_first_split = false;
    bool dry_run = false;
};
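
// example invocations (illustrative; split file names follow the
// llama_split_path convention "<prefix>-00001-of-00003.gguf", and a merge
// takes the first split as its input):
//   gguf-split --split-max-tensors 256 model.gguf model-out
//   gguf-split --merge model-out-00001-of-00003.gguf merged.gguf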
static void split_print_usage(const char * executable) {
    const split_params default_params;
    printf("\n");
    printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable);
    printf("\n");
    printf("Apply a GGUF operation on IN to OUT.");
    printf("\n");
    printf("options:\n");
    printf("  -h, --help               show this help message and exit\n");
    printf("  --version                show version and build info\n");
    printf("  --split                  split GGUF to multiple GGUF (enabled by default)\n");
    printf("  --merge                  merge multiple GGUF to a single GGUF\n");
    printf("  --split-max-tensors      max tensors in each split (default: %d)\n", default_params.n_split_tensors);
    printf("  --split-max-size N(M|G)  max size per split\n");
    printf("  --no-tensor-first-split  do not add tensors to the first split (disabled by default)\n");
    printf("  --dry-run                only print out a split plan and exit, without writing any new files\n");
    printf("\n");
}

// convert a size string, for example "128M" or "4G", to a number of bytes
static size_t split_str_to_n_bytes(std::string str) {
    size_t n_bytes = 0;
    int n = 0; // initialized so a failed parse is caught by the check below
    if (str.back() == 'M') {
        sscanf(str.c_str(), "%d", &n);
        n_bytes = (size_t)n * 1000 * 1000; // megabytes
    } else if (str.back() == 'G') {
        sscanf(str.c_str(), "%d", &n);
        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
    } else {
        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
    }
    if (n <= 0) {
        throw std::invalid_argument("error: size must be a positive value");
    }
    return n_bytes;
}
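
// note: the units are decimal, so e.g. split_str_to_n_bytes("500M")
// yields 500 * 1000 * 1000 bytes, not 500 * 1024 * 1024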

static void split_params_parse_ex(int argc, const char ** argv, split_params & params) {
    std::string arg;
    const std::string arg_prefix = "--";
    bool invalid_param = false;

    int arg_idx = 1;
    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        arg = argv[arg_idx];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        bool arg_found = false;
        if (arg == "-h" || arg == "--help") {
            split_print_usage(argv[0]);
            exit(0);
        } else if (arg == "--version") {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        } else if (arg == "--dry-run") {
            arg_found = true;
            params.dry_run = true;
        } else if (arg == "--no-tensor-first-split") {
            arg_found = true;
            params.no_tensor_first_split = true;
        } else if (arg == "--merge") {
            arg_found = true;
            if (params.operation != OP_NONE && params.operation != OP_MERGE) {
                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
            }
            params.operation = OP_MERGE;
        } else if (arg == "--split") {
            arg_found = true;
            if (params.operation != OP_NONE && params.operation != OP_SPLIT) {
                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
            }
            params.operation = OP_SPLIT;
        } else if (arg == "--split-max-tensors") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
            if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) {
                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
            }
            params.mode = MODE_TENSOR;
            params.n_split_tensors = atoi(argv[arg_idx]);
        } else if (arg == "--split-max-size") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
            if (params.mode != MODE_NONE && params.mode != MODE_SIZE) {
                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
            }
            params.mode = MODE_SIZE;
            params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
        }

        if (!arg_found) {
            throw std::invalid_argument("error: unknown argument: " + arg);
        }
    }

    // the operation is split if not specified
    if (params.operation == OP_NONE) {
        params.operation = OP_SPLIT;
    }
    // the split mode is by tensor if not specified
    if (params.mode == MODE_NONE) {
        params.mode = MODE_TENSOR;
    }

    if (invalid_param) {
        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }

    if (argc - arg_idx != 2) {
        throw std::invalid_argument("error: bad arguments");
    }

    params.input = argv[arg_idx++];
    params.output = argv[arg_idx++];
}

static bool split_params_parse(int argc, const char ** argv, split_params & params) {
    bool result = true;
    try {
        split_params_parse_ex(argc, argv, params);
    }
    catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        split_print_usage(argv[0]);
        exit(EXIT_FAILURE);
    }
    return result;
}

static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}
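
// GGUF stores tensor data aligned to GGUF_DEFAULT_ALIGNMENT; the writers below
// pad each tensor with zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes)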
struct split_strategy {
    const split_params params;
    std::ifstream & f_input;
    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_meta = NULL;
    const int n_tensors;

    // one ctx_out per output file
    std::vector<struct gguf_context *> ctx_outs;

    // temporary buffer for reading in tensor data
    std::vector<uint8_t> read_buf;

    split_strategy(const split_params & params,
            std::ifstream & f_input,
            struct gguf_context * ctx_gguf,
            struct ggml_context * ctx_meta) :
        params(params),
        f_input(f_input),
        ctx_gguf(ctx_gguf),
        ctx_meta(ctx_meta),
        n_tensors(gguf_get_n_tensors(ctx_gguf)) {

        // because we need to know the list of tensors for each file in advance,
        // we build the ctx_out for every output split up front
        int i_split = -1;
        struct gguf_context * ctx_out = NULL;
        auto new_ctx_out = [&](bool allow_no_tensors) {
            i_split++;
            if (ctx_out != NULL) {
                if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
                    fprintf(stderr, "error: one of the splits has 0 tensors, maybe the size or tensor limit is too small\n");
                    exit(EXIT_FAILURE);
                }
                ctx_outs.push_back(ctx_out);
            }
            ctx_out = gguf_init_empty();
            // save all metadata in the first split only
            if (i_split == 0) {
                gguf_set_kv(ctx_out, ctx_gguf);
            }
            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder
            gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
        };
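
        // each split records its own index, the total split count (a placeholder
        // here, patched once all splits are planned), and the overall tensor
        // count; the LLM_KV_SPLIT_* key names come from llama.h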

        // initialize ctx_out for the first split
        new_ctx_out(false);

        // skip first split if no_tensor_first_split is set
        if (params.no_tensor_first_split) {
            new_ctx_out(true);
        }

        // process tensors one by one
        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
        for (int i = 0; i < n_tensors; ++i) {
            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
            // calculate the "imaginary" size = the current size + next tensor size
            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
            size_t next_tensors_size = curr_tensors_size + n_bytes;
            if (should_split(i, next_tensors_size)) {
                new_ctx_out(false);
                curr_tensors_size = n_bytes;
            } else {
                curr_tensors_size = next_tensors_size;
            }
            gguf_add_tensor(ctx_out, t);
        }

        // push the last ctx_out
        ctx_outs.push_back(ctx_out);

        // set the correct n_split for all ctx_out
        for (auto & ctx : ctx_outs) {
            gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size());
        }
    }

    ~split_strategy() {
        for (auto & ctx_out : ctx_outs) {
            gguf_free(ctx_out);
        }
    }

    bool should_split(int i_tensor, size_t next_size) {
        if (params.mode == MODE_SIZE) {
            // split by max size per file
            return next_size > params.n_bytes_split;
        } else if (params.mode == MODE_TENSOR) {
            // split by number of tensors per file
            return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
        }
        // should never happen
        GGML_ABORT("invalid mode");
    }
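
    // e.g. with --split-max-tensors 128, tensor 128 opens the second split,
    // tensor 256 the third, and so on; in size mode a new split is opened as soon
    // as adding the next (alignment-padded) tensor would exceed the byte limit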

    void print_info() {
        printf("n_split: %zu\n", ctx_outs.size());
        int i_split = 0;
        for (auto & ctx_out : ctx_outs) {
            // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
            size_t total_size = gguf_get_meta_size(ctx_out);
            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
                total_size += ggml_nbytes(t);
            }
            total_size = total_size / 1000 / 1000; // convert to megabytes
            printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
            i_split++;
        }
    }

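    // write each split to disk; names follow the llama_split_path convention,
    // e.g. "<output>-00001-of-00003.gguf" (see llama.h for the exact format)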
    void write() {
        int i_split = 0;
        int n_split = ctx_outs.size();
        for (auto & ctx_out : ctx_outs) {
            // construct file path
            char split_path[PATH_MAX] = {0};
            llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);

            // open the output file
            printf("Writing file %s ... ", split_path);
            fflush(stdout);
            std::ofstream fout = std::ofstream(split_path, std::ios::binary);
            fout.exceptions(std::ofstream::failbit); // fail fast on write errors

            // write metadata
            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
            gguf_get_meta_data(ctx_out, data.data());
            fout.write((const char *)data.data(), data.size());

            // write tensors
            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
                // read tensor meta and prepare buffer
                const char * t_name = gguf_get_tensor_name(ctx_out, i);
                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
                auto n_bytes = ggml_nbytes(t);
                read_buf.resize(n_bytes);

                // calculate offset
                auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file
                auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);

                // copy tensor from input to output file
                copy_file_to_file(f_input, fout, offset, n_bytes);
                zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
            }

            printf("done\n");
            // close the file
            fout.close();
            i_split++;
        }
    }

    void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) {
        // TODO: detect OS and use copy_file_range() here for better performance
        if (read_buf.size() < len) {
            read_buf.resize(len);
        }
        f_in.seekg(in_offset);
        f_in.read((char *)read_buf.data(), len);
        f_out.write((const char *)read_buf.data(), len);
    }
};

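// split: load the input GGUF with no_alloc (metadata only), plan all output
// splits in advance, then stream the tensor data from the input file into each
// split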
static void gguf_split(const split_params & split_params) {
    struct ggml_context * ctx_meta = NULL;

    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx = */ &ctx_meta,
    };

    std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
    if (!f_input.is_open()) {
        fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
        exit(EXIT_FAILURE);
    }

    auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
    if (!ctx_gguf) {
        fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
        exit(EXIT_FAILURE);
    }

    // prepare the strategy
    split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
    int n_split = strategy.ctx_outs.size();
    strategy.print_info();

    if (!split_params.dry_run) {
        // write all output splits
        strategy.write();
    }

    // done, clean up
    gguf_free(ctx_gguf);
    f_input.close();

    fprintf(stderr, "%s: %d GGUF splits written with a total of %d tensors.\n",
            __func__, n_split, strategy.n_tensors);
}

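// merge: two passes over the splits; the first pass collects the KV metadata
// and tensor descriptors from every split, the second streams the tensor data
// into the output file, after which the finalized metadata is written at offset 0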
static void gguf_merge(const split_params & split_params) {
    fprintf(stderr, "%s: %s -> %s\n",
            __func__, split_params.input.c_str(),
            split_params.output.c_str());
    int n_split = 1;
    int total_tensors = 0;

    // avoid overwriting existing output file
    if (std::ifstream(split_params.output.c_str())) {
        fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
        exit(EXIT_FAILURE);
    }

    auto * ctx_out = gguf_init_empty();

    std::vector<uint8_t> read_data;
    std::vector<ggml_context *> ctx_metas;
    std::vector<gguf_context *> ctx_ggufs;

    char split_path[PATH_MAX] = {0};
    strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
    char split_prefix[PATH_MAX] = {0};

    // first pass: gather the KV and tensor metadata from each split
    for (int i_split = 0; i_split < n_split; i_split++) {
        struct ggml_context * ctx_meta = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ true,
            /*.ctx = */ &ctx_meta,
        };

        if (i_split > 0) {
            llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
        }
        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);

        auto * ctx_gguf = gguf_init_from_file(split_path, params);
        if (!ctx_gguf) {
            fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
            exit(EXIT_FAILURE);
        }
        ctx_ggufs.push_back(ctx_gguf);
        ctx_metas.push_back(ctx_meta);

        if (i_split == 0) {
            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
            if (key_n_split < 0) {
                fprintf(stderr,
                        "\n%s: input file does not contain %s metadata\n",
                        __func__,
                        LLM_KV_SPLIT_COUNT);
                gguf_free(ctx_gguf);
                ggml_free(ctx_meta);
                gguf_free(ctx_out);
                exit(EXIT_FAILURE);
            }

            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
            if (n_split < 1) {
                fprintf(stderr,
                        "\n%s: input file does not contain a valid split count %d\n",
                        __func__,
                        n_split);
                gguf_free(ctx_gguf);
                ggml_free(ctx_meta);
                gguf_free(ctx_out);
                exit(EXIT_FAILURE);
            }

            // verify the file naming and extract split_prefix
            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, i_split, n_split)) {
                fprintf(stderr, "\n%s: unexpected input file name: %s"
                        " i_split=%d"
                        " n_split=%d\n", __func__,
                        split_path, i_split, n_split);
                gguf_free(ctx_gguf);
                ggml_free(ctx_meta);
                gguf_free(ctx_out);
                exit(EXIT_FAILURE);
            }

            // zero the split count so that merging the output again does not trigger another merge
            gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);

            // set metadata from the first split
            gguf_set_kv(ctx_out, ctx_gguf);
        }

        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
            gguf_add_tensor(ctx_out, t);
        }
        total_tensors += n_tensors;

        fprintf(stderr, "\033[3Ddone\n");
    }
    std::ofstream fout;
    if (!split_params.dry_run) {
        fout.open(split_params.output.c_str(), std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        // placeholder for the metadata
        auto meta_size = gguf_get_meta_size(ctx_out);
        ::zeros(fout, meta_size);
    }

    // second pass: write the tensor data
    for (int i_split = 0; i_split < n_split; i_split++) {
        llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
        std::ifstream f_input(split_path, std::ios::binary);
        if (!f_input.is_open()) {
            fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
            for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
                gguf_free(ctx_ggufs[i]);
                ggml_free(ctx_metas[i]);
            }
            gguf_free(ctx_out);
            if (!split_params.dry_run) {
                fout.close();
            }
            exit(EXIT_FAILURE);
        }
        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);

        auto * ctx_gguf = ctx_ggufs[i_split];
        auto * ctx_meta = ctx_metas[i_split];

        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);

            auto n_bytes = ggml_nbytes(t);

            if (read_data.size() < n_bytes) {
                read_data.resize(n_bytes);
            }

            auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
            f_input.seekg(offset);
            f_input.read((char *)read_data.data(), n_bytes);
            if (!split_params.dry_run) {
                // write tensor data + padding
                fout.write((const char *)read_data.data(), n_bytes);
                zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
            }
        }

        gguf_free(ctx_gguf);
        ggml_free(ctx_meta);
        f_input.close();
        fprintf(stderr, "\033[3Ddone\n");
    }

    if (!split_params.dry_run) {
        // go back to the beginning of the file and write the updated metadata
        fout.seekp(0);
        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
        gguf_get_meta_data(ctx_out, data.data());
        fout.write((const char *)data.data(), data.size());
        fout.close();
    }
    gguf_free(ctx_out);

    fprintf(stderr, "%s: %s merged from %d splits with %d tensors.\n",
            __func__, split_params.output.c_str(), n_split, total_tensors);
}

int main(int argc, const char ** argv) {
    split_params params;
    split_params_parse(argc, argv, params);

    switch (params.operation) {
        case OP_SPLIT: gguf_split(params);
            break;
        case OP_MERGE: gguf_merge(params);
            break;
        default: split_print_usage(argv[0]);
            exit(EXIT_FAILURE);
    }

    return 0;
}