#include "ggml.h"
#include "gguf.h"
#include "llama.h"
#include "common.h"

#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <cstring>
#include <fstream>
#include <string>
#include <vector>

#if defined(_WIN32)
    #include <windows.h>
    #ifndef PATH_MAX
        #define PATH_MAX MAX_PATH
    #endif
    #include <io.h>
#endif

enum split_operation : uint8_t {
    OP_NONE,
    OP_SPLIT,
    OP_MERGE,
};

enum split_mode : uint8_t {
    MODE_NONE,
    MODE_TENSOR,
    MODE_SIZE,
};

struct split_params {
    split_operation operation = OP_NONE;
    split_mode mode = MODE_NONE;
    size_t n_bytes_split = 0;
    int n_split_tensors = 128;
    std::string input;
    std::string output;
    bool no_tensor_first_split = false;
    bool dry_run = false;
};
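
// example invocations (illustrative; split file names follow the
// llama_split_path convention "<prefix>-00001-of-00003.gguf", and a merge
// takes the first split as its input):
//   gguf-split --split-max-tensors 256 model.gguf model-out
//   gguf-split --merge model-out-00001-of-00003.gguf merged.gguf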
static void split_print_usage(const char * executable) {
    const split_params default_params;
    printf("\n");
    printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable);
    printf("\n");
    printf("Apply a GGUF operation on IN to OUT.");
    printf("\n");
    printf("options:\n");
    printf("  -h, --help               show this help message and exit\n");
    printf("  --version                show version and build info\n");
    printf("  --split                  split GGUF to multiple GGUF (enabled by default)\n");
    printf("  --merge                  merge multiple GGUF to a single GGUF\n");
    printf("  --split-max-tensors      max tensors in each split (default: %d)\n", default_params.n_split_tensors);
    printf("  --split-max-size N(M|G)  max size per split\n");
    printf("  --no-tensor-first-split  do not add tensors to the first split (disabled by default)\n");
    printf("  --dry-run                only print out a split plan and exit, without writing any new files\n");
    printf("\n");
}

// convert a size string, for example "128M" or "4G", to a number of bytes
static size_t split_str_to_n_bytes(std::string str) {
    size_t n_bytes = 0;
    int n = 0; // initialized so a failed parse is caught by the check below
    if (str.back() == 'M') {
        sscanf(str.c_str(), "%d", &n);
        n_bytes = (size_t)n * 1000 * 1000; // megabytes
    } else if (str.back() == 'G') {
        sscanf(str.c_str(), "%d", &n);
        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
    } else {
        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
    }
    if (n <= 0) {
        throw std::invalid_argument("error: size must be a positive value");
    }
    return n_bytes;
}
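
// note: the units are decimal, so e.g. split_str_to_n_bytes("500M")
// yields 500 * 1000 * 1000 bytes, not 500 * 1024 * 1024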

static void split_params_parse_ex(int argc, const char ** argv, split_params & params) {
    std::string arg;
    const std::string arg_prefix = "--";
    bool invalid_param = false;

    int arg_idx = 1;
    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        arg = argv[arg_idx];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        bool arg_found = false;
        if (arg == "-h" || arg == "--help") {
            split_print_usage(argv[0]);
            exit(0);
        } else if (arg == "--version") {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        } else if (arg == "--dry-run") {
            arg_found = true;
            params.dry_run = true;
        } else if (arg == "--no-tensor-first-split") {
            arg_found = true;
            params.no_tensor_first_split = true;
        } else if (arg == "--merge") {
            arg_found = true;
            if (params.operation != OP_NONE && params.operation != OP_MERGE) {
                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
            }
            params.operation = OP_MERGE;
        } else if (arg == "--split") {
            arg_found = true;
            if (params.operation != OP_NONE && params.operation != OP_SPLIT) {
                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
            }
            params.operation = OP_SPLIT;
        } else if (arg == "--split-max-tensors") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
            if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) {
                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
            }
            params.mode = MODE_TENSOR;
            params.n_split_tensors = atoi(argv[arg_idx]);
        } else if (arg == "--split-max-size") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
            if (params.mode != MODE_NONE && params.mode != MODE_SIZE) {
                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
            }
            params.mode = MODE_SIZE;
            params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
        }

        if (!arg_found) {
            throw std::invalid_argument("error: unknown argument: " + arg);
        }
    }

    // the operation is split if not specified
    if (params.operation == OP_NONE) {
        params.operation = OP_SPLIT;
    }
    // the split mode is by tensor if not specified
    if (params.mode == MODE_NONE) {
        params.mode = MODE_TENSOR;
    }

    if (invalid_param) {
        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }

    if (argc - arg_idx != 2) {
        throw std::invalid_argument("error: bad arguments");
    }

    params.input = argv[arg_idx++];
    params.output = argv[arg_idx++];
}

static bool split_params_parse(int argc, const char ** argv, split_params & params) {
    bool result = true;
    try {
        split_params_parse_ex(argc, argv, params);
    }
    catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        split_print_usage(argv[0]);
        exit(EXIT_FAILURE);
    }
    return result;
}

static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}
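
// GGUF stores tensor data aligned to GGUF_DEFAULT_ALIGNMENT; the writers below
// pad each tensor with zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes)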
struct split_strategy {
    const split_params params;
    std::ifstream & f_input;
    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_meta = NULL;
    const int n_tensors;

    // one ctx_out per output file
    std::vector<struct gguf_context *> ctx_outs;

    // temporary buffer for reading in tensor data
    std::vector<uint8_t> read_buf;

    split_strategy(const split_params & params,
            std::ifstream & f_input,
            struct gguf_context * ctx_gguf,
            struct ggml_context * ctx_meta) :
        params(params),
        f_input(f_input),
        ctx_gguf(ctx_gguf),
        ctx_meta(ctx_meta),
        n_tensors(gguf_get_n_tensors(ctx_gguf)) {

        // because we need to know the list of tensors for each file in advance,
        // we build the ctx_out for every output split up front
        int i_split = -1;
        struct gguf_context * ctx_out = NULL;
        auto new_ctx_out = [&](bool allow_no_tensors) {
            i_split++;
            if (ctx_out != NULL) {
                if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
                    fprintf(stderr, "error: one of the splits has 0 tensors, maybe the size or tensor limit is too small\n");
                    exit(EXIT_FAILURE);
                }
                ctx_outs.push_back(ctx_out);
            }
            ctx_out = gguf_init_empty();
            // save all metadata in the first split only
            if (i_split == 0) {
                gguf_set_kv(ctx_out, ctx_gguf);
            }
            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder
            gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
        };
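
        // each split records its own index, the total split count (a placeholder
        // here, patched once all splits are planned), and the overall tensor
        // count; the LLM_KV_SPLIT_* key names come from llama.h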

        // initialize ctx_out for the first split
        new_ctx_out(false);

        // skip first split if no_tensor_first_split is set
        if (params.no_tensor_first_split) {
            new_ctx_out(true);
        }

        // process tensors one by one
        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
        for (int i = 0; i < n_tensors; ++i) {
            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
            // calculate the "imaginary" size = the current size + next tensor size
            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
            size_t next_tensors_size = curr_tensors_size + n_bytes;
            if (should_split(i, next_tensors_size)) {
                new_ctx_out(false);
                curr_tensors_size = n_bytes;
            } else {
                curr_tensors_size = next_tensors_size;
            }
            gguf_add_tensor(ctx_out, t);
        }

        // push the last ctx_out
        ctx_outs.push_back(ctx_out);

        // set the correct n_split for all ctx_out
        for (auto & ctx : ctx_outs) {
            gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size());
        }
    }

    ~split_strategy() {
        for (auto & ctx_out : ctx_outs) {
            gguf_free(ctx_out);
        }
    }

    bool should_split(int i_tensor, size_t next_size) {
        if (params.mode == MODE_SIZE) {
            // split by max size per file
            return next_size > params.n_bytes_split;
        } else if (params.mode == MODE_TENSOR) {
            // split by number of tensors per file
            return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
        }
        // should never happen
        GGML_ABORT("invalid mode");
    }
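
    // e.g. with --split-max-tensors 128, tensor 128 opens the second split,
    // tensor 256 the third, and so on; in size mode a new split is opened as soon
    // as adding the next (alignment-padded) tensor would exceed the byte limit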

    void print_info() {
        printf("n_split: %zu\n", ctx_outs.size());
        int i_split = 0;
        for (auto & ctx_out : ctx_outs) {
            // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
            size_t total_size = gguf_get_meta_size(ctx_out);
            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
                total_size += ggml_nbytes(t);
            }
            total_size = total_size / 1000 / 1000; // convert to megabytes
            printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
            i_split++;
        }
    }

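    // write each split to disk; names follow the llama_split_path convention,
    // e.g. "<output>-00001-of-00003.gguf" (see llama.h for the exact format)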
    void write() {
        int i_split = 0;
        int n_split = ctx_outs.size();
        for (auto & ctx_out : ctx_outs) {
            // construct file path
            char split_path[PATH_MAX] = {0};
            llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);

            // open the output file
            printf("Writing file %s ... ", split_path);
            fflush(stdout);
            std::ofstream fout = std::ofstream(split_path, std::ios::binary);
            fout.exceptions(std::ofstream::failbit); // fail fast on write errors

            // write metadata
            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
            gguf_get_meta_data(ctx_out, data.data());
            fout.write((const char *)data.data(), data.size());

            // write tensors
            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
                // read tensor meta and prepare buffer
                const char * t_name = gguf_get_tensor_name(ctx_out, i);
                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
                auto n_bytes = ggml_nbytes(t);
                read_buf.resize(n_bytes);

                // calculate offset
                auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file
                auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);

                // copy tensor from input to output file
                copy_file_to_file(f_input, fout, offset, n_bytes);
                zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
            }

            printf("done\n");
            // close the file
            fout.close();
            i_split++;
        }
    }

    void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) {
        // TODO: detect OS and use copy_file_range() here for better performance
        if (read_buf.size() < len) {
            read_buf.resize(len);
        }
        f_in.seekg(in_offset);
        f_in.read((char *)read_buf.data(), len);
        f_out.write((const char *)read_buf.data(), len);
    }
};

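// split: load the input GGUF with no_alloc (metadata only), plan all output
// splits in advance, then stream the tensor data from the input file into each
// split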
static void gguf_split(const split_params & split_params) {
    struct ggml_context * ctx_meta = NULL;

    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx = */ &ctx_meta,
    };

    std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
    if (!f_input.is_open()) {
        fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
        exit(EXIT_FAILURE);
    }

    auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
    if (!ctx_gguf) {
        fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
        exit(EXIT_FAILURE);
    }

    // prepare the strategy
    split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
    int n_split = strategy.ctx_outs.size();
    strategy.print_info();

    if (!split_params.dry_run) {
        // write all output splits
        strategy.write();
    }

    // done, clean up
    gguf_free(ctx_gguf);
    f_input.close();

    fprintf(stderr, "%s: %d GGUF splits written with a total of %d tensors.\n",
            __func__, n_split, strategy.n_tensors);
}

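// merge: two passes over the splits; the first pass collects the KV metadata
// and tensor descriptors from every split, the second streams the tensor data
// into the output file, after which the finalized metadata is written at offset 0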
static void gguf_merge(const split_params & split_params) {
    fprintf(stderr, "%s: %s -> %s\n",
            __func__, split_params.input.c_str(),
            split_params.output.c_str());
    int n_split = 1;
    int total_tensors = 0;

    // avoid overwriting existing output file
    if (std::ifstream(split_params.output.c_str())) {
        fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
        exit(EXIT_FAILURE);
    }

    auto * ctx_out = gguf_init_empty();

    std::vector<uint8_t> read_data;
    std::vector<ggml_context *> ctx_metas;
    std::vector<gguf_context *> ctx_ggufs;

    char split_path[PATH_MAX] = {0};
    strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
    char split_prefix[PATH_MAX] = {0};

    // first pass: gather the KV and tensor metadata from each split
    for (int i_split = 0; i_split < n_split; i_split++) {
        struct ggml_context * ctx_meta = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ true,
            /*.ctx = */ &ctx_meta,
        };

        if (i_split > 0) {
            llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
        }
        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);

        auto * ctx_gguf = gguf_init_from_file(split_path, params);
        if (!ctx_gguf) {
            fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
            exit(EXIT_FAILURE);
        }
        ctx_ggufs.push_back(ctx_gguf);
        ctx_metas.push_back(ctx_meta);

        if (i_split == 0) {
            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
            if (key_n_split < 0) {
                fprintf(stderr,
                        "\n%s: input file does not contain %s metadata\n",
                        __func__,
                        LLM_KV_SPLIT_COUNT);
                gguf_free(ctx_gguf);
                ggml_free(ctx_meta);
                gguf_free(ctx_out);
                exit(EXIT_FAILURE);
            }

            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
            if (n_split < 1) {
                fprintf(stderr,
                        "\n%s: input file does not contain a valid split count %d\n",
                        __func__,
                        n_split);
                gguf_free(ctx_gguf);
                ggml_free(ctx_meta);
                gguf_free(ctx_out);
                exit(EXIT_FAILURE);
            }

            // verify the file naming and extract split_prefix
            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, i_split, n_split)) {
                fprintf(stderr, "\n%s: unexpected input file name: %s"
                        " i_split=%d"
                        " n_split=%d\n", __func__,
                        split_path, i_split, n_split);
                gguf_free(ctx_gguf);
                ggml_free(ctx_meta);
                gguf_free(ctx_out);
                exit(EXIT_FAILURE);
            }

            // zero the split count so that merging the output again does not trigger another merge
            gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);

            // set metadata from the first split
            gguf_set_kv(ctx_out, ctx_gguf);
        }

        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
            gguf_add_tensor(ctx_out, t);
        }
        total_tensors += n_tensors;

        fprintf(stderr, "\033[3Ddone\n");
    }
    std::ofstream fout;
    if (!split_params.dry_run) {
        fout.open(split_params.output.c_str(), std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        // placeholder for the metadata
        auto meta_size = gguf_get_meta_size(ctx_out);
        ::zeros(fout, meta_size);
    }

    // second pass: write the tensor data
    for (int i_split = 0; i_split < n_split; i_split++) {
        llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
        std::ifstream f_input(split_path, std::ios::binary);
        if (!f_input.is_open()) {
            fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
            for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
                gguf_free(ctx_ggufs[i]);
                ggml_free(ctx_metas[i]);
            }
            gguf_free(ctx_out);
            if (!split_params.dry_run) {
                fout.close();
            }
            exit(EXIT_FAILURE);
        }
        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);

        auto * ctx_gguf = ctx_ggufs[i_split];
        auto * ctx_meta = ctx_metas[i_split];

        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);

            auto n_bytes = ggml_nbytes(t);

            if (read_data.size() < n_bytes) {
                read_data.resize(n_bytes);
            }

            auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
            f_input.seekg(offset);
            f_input.read((char *)read_data.data(), n_bytes);
            if (!split_params.dry_run) {
                // write tensor data + padding
                fout.write((const char *)read_data.data(), n_bytes);
                zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
            }
        }

        gguf_free(ctx_gguf);
        ggml_free(ctx_meta);
        f_input.close();
        fprintf(stderr, "\033[3Ddone\n");
    }

    if (!split_params.dry_run) {
        // go back to the beginning of the file and write the updated metadata
        fout.seekp(0);
        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
        gguf_get_meta_data(ctx_out, data.data());
        fout.write((const char *)data.data(), data.size());
        fout.close();
    }
    gguf_free(ctx_out);

    fprintf(stderr, "%s: %s merged from %d splits with %d tensors.\n",
            __func__, split_params.output.c_str(), n_split, total_tensors);
}

int main(int argc, const char ** argv) {
    split_params params;
    split_params_parse(argc, argv, params);

    switch (params.operation) {
        case OP_SPLIT: gguf_split(params);
            break;
        case OP_MERGE: gguf_merge(params);
            break;
        default: split_print_usage(argv[0]);
            exit(EXIT_FAILURE);
    }

    return 0;
}