test-alloc.cpp source code [llama.cpp/tests/test-alloc.cpp]

1	#include <ggml-alloc.h>
2	#include <ggml-backend-impl.h>
3	#include <ggml-cpp.h>
4	#include <ggml-impl.h>
5	#include <ggml.h>
6
7	#include <algorithm>
8	#include <exception>
9	#include <memory>
10	#include <vector>
11
12	//
13	// dummy backend with configurable max_buffer_size, tracks allocations
14
15	uint8_t * const alloc_base = (uint8_t *) `16`;
16
17	struct dummy_backend_context {
18	size_t max_buffer_size = `64`;
19	size_t alignment = `8`;
20
21	ggml_backend_buffer_i buffer_interface;
22	std::vector<ggml_backend_buffer_t> buffers;
23
24	size_t allocated_total() const {
25	size_t n = `0`;
26	for (ggml_backend_buffer_t buf : buffers) {
27	n += ggml_backend_buffer_get_size(buffer: buf);
28	}
29	return n;
30	}
31	};
32
33	// ggml_backend_buffer_type interface
34
35	static const char * dummy_backend_buffer_type_get_name(ggml_backend_buffer_type_t) {
36	return "dummy_buffer_type";
37	}
38
39	static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
40	dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
41	ggml_backend_buffer_t & buffer = ctx->buffers.emplace_back();
42	buffer = ggml_backend_buffer_init(buft, iface: ctx->buffer_interface, context: ctx, size);
43	return buffer;
44	}
45
46	static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
47	dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
48	return ctx->alignment;
49	}
50
51	static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
52	dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
53	return ctx->max_buffer_size;
54	}
55
56	static bool dummy_backend_buffer_type_is_host(ggml_backend_buffer_type_t) {
57	return true;
58	}
59
60	// ggml_backend_buffer interface
61
62	static void dummy_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
63	dummy_backend_context * ctx = (dummy_backend_context *) buffer->context;
64
65	auto i = std::find(first: ctx->buffers.begin(), last: ctx->buffers.end(), val: buffer);
66	GGML_ASSERT(i != ctx->buffers.end());
67	ctx->buffers.erase(position: i);
68	}
69
70	static void * dummy_backend_buffer_get_base(ggml_backend_buffer_t) {
71	return alloc_base;
72	}
73
74	static ggml_status dummy_backend_buffer_init_tensor(ggml_backend_buffer_t, ggml_tensor *) {
75	return GGML_STATUS_SUCCESS;
76	}
77
78	static void dummy_backend_buffer_memset_tensor(ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {}
79
80	static void dummy_backend_buffer_set_tensor(ggml_backend_buffer_t, ggml_tensor , const* void *, size_t, size_t) {}
81
82	static void dummy_backend_buffer_get_tensor(ggml_backend_buffer_t, const ggml_tensor , void* *, size_t, size_t) {}
83
84	static void dummy_backend_buffer_clear(ggml_backend_buffer_t, uint8_t) {}
85
86	// dummy_backend (not really a full backend, just provides what gallocr needs)
87
88	struct dummy_backend {
89	std::unique_ptr<dummy_backend_context> context;
90	ggml_backend_buffer_type buffer_type;
91	};
92
93	static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = `8`) {
94	dummy_backend b{};
95	b.context = std::make_unique<dummy_backend_context>();
96	b.context ->alignment = alignment;
97	b.context ->max_buffer_size = max_buffer_size;
98
99	b.context ->buffer_interface.free_buffer = dummy_backend_buffer_free_buffer;
100	b.context ->buffer_interface.get_base = dummy_backend_buffer_get_base;
101	b.context ->buffer_interface.init_tensor = dummy_backend_buffer_init_tensor;
102	b.context ->buffer_interface.memset_tensor = dummy_backend_buffer_memset_tensor;
103	b.context ->buffer_interface.set_tensor = dummy_backend_buffer_set_tensor;
104	b.context ->buffer_interface.get_tensor = dummy_backend_buffer_get_tensor;
105	b.context ->buffer_interface.clear = dummy_backend_buffer_clear;
106
107	b.buffer_type.context = b.context.get();
108	b.buffer_type.iface.get_name = dummy_backend_buffer_type_get_name;
109	b.buffer_type.iface.alloc_buffer = dummy_backend_buffer_type_alloc_buffer;
110	b.buffer_type.iface.get_alignment = dummy_backend_buffer_type_get_alignment;
111	b.buffer_type.iface.get_max_size = dummy_backend_buffer_type_get_max_size;
112	b.buffer_type.iface.is_host = dummy_backend_buffer_type_is_host;
113	return b;
114	}
115
116	//
117	// test utilities
118
119	struct test_context_with_graph {
120	ggml_context * ctx;
121	ggml_cgraph * graph;
122	ggml_context_ptr ctx_ptr;
123	};
124
125	static test_context_with_graph make_context() {
126	ggml_init_params params{};
127	params.mem_size = `48` * ggml_tensor_overhead() + ggml_graph_overhead();
128	params.no_alloc = true;
129
130	ggml_context * ctx = ggml_init(params);
131	ggml_context_ptr ctx_ptr = ggml_context_ptr (ctx);
132	ggml_cgraph * graph = ggml_new_graph(ctx);
133	return { .ctx: ctx, .graph: graph, .ctx_ptr: std::move(ctx_ptr) };
134	}
135
136	static ggml_tensor * make_input_1d(ggml_context * ctx, int64_t n_elements) {
137	ggml_tensor * t = ggml_new_tensor_1d(ctx, type: GGML_TYPE_F32, ne0: n_elements);
138	ggml_set_input(tensor: t);
139	return t;
140	}
141
142	static ggml_tensor * make_input_with_size(ggml_context * ctx, size_t size_bytes) {
143	GGML_ASSERT(size_bytes % `4` == `0`);
144	return make_input_1d(ctx, n_elements: size_bytes / `4`);
145	}
146
147	static void assign_names(ggml_context * ctx, const char * prefix = "x") {
148	int i = `0`;
149	for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, tensor: t)) {
150	ggml_format_name(tensor: t, fmt: "%s%d", prefix, i++);
151	}
152	}
153
154	static int get_leaf_id(ggml_cgraph * graph, const char * tensor_name) {
155	for (int i = `0`; i < graph->n_leafs; ++i) {
156	if (strncmp(s1: graph->leafs[i]->name, s2: tensor_name, GGML_MAX_NAME) == `0`) {
157	return i;
158	}
159	}
160	fprintf(stderr, format: "leaf not found: %s\n", tensor_name);
161	return -`1`;
162	}
163
164	static int get_node_id(ggml_cgraph * graph, const char * tensor_name) {
165	for (int i = `0`; i < graph->n_nodes; ++i) {
166	if (strncmp(s1: graph->nodes[i]->name, s2: tensor_name, GGML_MAX_NAME) == `0`) {
167	return i;
168	}
169	}
170	fprintf(stderr, format: "node not found: %s", tensor_name);
171	return -`1`;
172	}
173
174	static ggml_gallocr_ptr allocate_graph(ggml_cgraph * graph, ggml_tensor * out, ggml_backend_buffer_type_t buft) {
175	ggml_set_output(tensor: out);
176	ggml_build_forward_expand(cgraph: graph, tensor: out);
177
178	ggml_gallocr_ptr galloc = ggml_gallocr_ptr (ggml_gallocr_new(buft));
179	bool result = ggml_gallocr_alloc_graph(galloc: galloc.get(), graph);
180	GGML_ASSERT(result);
181	return galloc;
182	}
183
184	//
185	// correctness checks for result allocations
186
187	static void check_all_allocated(ggml_cgraph * graph) {
188	for (int i = `0`; i < ggml_graph_n_nodes(cgraph: graph); ++i) {
189	ggml_tensor * t = ggml_graph_node(cgraph: graph, i);
190	GGML_ASSERT(t->buffer != nullptr);
191	GGML_ASSERT(t->data != nullptr);
192	}
193	}
194
195	static void check_max_size(ggml_context * ctx) {
196	for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, tensor: t)) {
197	auto buft = ggml_backend_buffer_get_type(buffer: t->buffer);
198	size_t max_size = ggml_backend_buft_get_max_size(buft);
199	size_t offset = (char ) t->data - (char* *) ggml_backend_buffer_get_base(buffer: t->buffer);
200	GGML_ASSERT(t->data >= ggml_backend_buffer_get_base(t->buffer));
201	GGML_ASSERT((size_t) offset + ggml_nbytes(t) <= max_size);
202	}
203	}
204
205	static bool can_reuse_memory(ggml_cgraph * graph, int current_i, ggml_tensor * current, ggml_tensor * other) {
206	if (other->flags & GGML_TENSOR_FLAG_OUTPUT) {
207	return false;
208	}
209	// Check if `other` is still "alive", ie. an input to any node after the `current` op
210	for (int i = current_i; i < ggml_graph_n_nodes(cgraph: graph); ++i) {
211	ggml_tensor * t = ggml_graph_node(cgraph: graph, i);
212	for (int s = `0`; s < GGML_MAX_SRC; s++) {
213	if (t == current && ggml_op_can_inplace(op: t->op)) {
214	continue;
215	}
216	if (t->src[s] == other) {
217	return false;
218	}
219	if (t->src[s] && t->src[s]->view_src == other) {
220	return false;
221	}
222	}
223	}
224	return true;
225	}
226
227	static bool memory_overlap(ggml_tensor * a, ggml_tensor * b) {
228	if (a->buffer != b->buffer) {
229	return false;
230	}
231	int64_t a0 = (int64_t) a->data;
232	int64_t a1 = a0 + ggml_nbytes(tensor: a);
233	int64_t b0 = (int64_t) b->data;
234	int64_t b1 = b0 + ggml_nbytes(tensor: b);
235	return a1 > b0 && b1 > a0;
236	}
237
238	static ggml_tensor * get_view_source(ggml_tensor * t) {
239	while (t->view_src) {
240	t = t->view_src;
241	}
242	return t;
243	}
244
245	static void check_no_overlap(ggml_cgraph * graph) {
246	for (int i = `0`; i < ggml_graph_n_nodes(cgraph: graph); ++i) {
247	for (int j = `0`; j < i; ++j) {
248	ggml_tensor * t = ggml_graph_node(cgraph: graph, i);
249	ggml_tensor * o = ggml_graph_node(cgraph: graph, i: j);
250	GGML_ASSERT(t != o);
251
252	if (get_view_source(t) == get_view_source(t: o)) {
253	continue;
254	}
255	if (memory_overlap(a: t, b: o)) {
256	GGML_ASSERT(can_reuse_memory(graph, i, t, o));
257	}
258	}
259	}
260	}
261
262	//
263	// test cases
264
265	// Scenario where the first backend buffer is completely exhausted and there are further
266	// tensors which require a second buffer
267	static void test_max_size_too_many_tensors() {
268	dummy_backend backend = dummy_backend_init(max_buffer_size: `16`);
269	auto [ctx, graph, ctx_ptr] = make_context();
270
271	ggml_tensor * x[`7`];
272	x[`0`] = make_input_with_size(ctx, size_bytes: `8`);
273	x[`1`] = make_input_with_size(ctx, size_bytes: `8`);
274	x[`2`] = make_input_with_size(ctx, size_bytes: `8`);
275	x[`3`] = ggml_mul(ctx, a: x[`0`], b: x[`1`]);
276	x[`4`] = ggml_add(ctx, a: x[`1`], b: x[`2`]);
277	x[`5`] = ggml_add(ctx, a: x[`3`], b: x[`0`]);
278	x[`6`] = ggml_add(ctx, a: x[`4`], b: x[`5`]);
279	assign_names(ctx);
280
281	ggml_gallocr_ptr galloc = allocate_graph(graph, out: x[`6`], buft: &backend.buffer_type);
282	check_all_allocated(graph);
283	check_no_overlap(graph);
284	check_max_size(ctx);
285	GGML_ASSERT(backend.context ->allocated_total() <= `16` + `16`);
286	}
287
288	// Scenario where there is some space left in the first buffer, but not enough to accomodate
289	// a larger tensor, so a second buffer is required
290	static void test_max_size_tensor_too_large() {
291	dummy_backend backend = dummy_backend_init(max_buffer_size: `32`);
292	auto [ctx, graph, ctx_ptr] = make_context();
293
294	ggml_tensor * x[`3`];
295	x[`0`] = make_input_with_size(ctx, size_bytes: `16`); // chunk 0, [0 , 16)
296	x[`1`] = make_input_with_size(ctx, size_bytes: `8`); // chunk 0, [16, 24)
297	x[`2`] = ggml_concat(ctx, a: x[`0`], b: x[`1`], dim: `0`); // chunk 1, [0 , 24)
298	assign_names(ctx);
299
300	ggml_gallocr_ptr galloc = allocate_graph(graph, out: x[`2`], buft: &backend.buffer_type);
301	check_all_allocated(graph);
302	check_no_overlap(graph);
303	check_max_size(ctx);
304	GGML_ASSERT(backend.context ->allocated_total() <= `32` + `24`);
305	}
306
307	// Scenario where a single tensor exceeds the max buffer size - in this case the allocator
308	// should try to create a bigger buffer anyway, and wait for the backend to throw an error.
309	// Backends may report an artificially lower max size in some cases for compatibility reasons.
310	static void test_tensor_larger_than_max_size() {
311	dummy_backend backend = dummy_backend_init(max_buffer_size: `16`);
312	auto [ctx, graph, ctx_ptr] = make_context();
313
314	ggml_tensor * x[`2`];
315	x[`0`] = make_input_with_size(ctx, size_bytes: `24`);
316	x[`1`] = ggml_scale(ctx, a: x[`0`], s: `2.0f`);
317	assign_names(ctx);
318
319	ggml_gallocr_ptr galloc = allocate_graph(graph, out: x[`1`], buft: &backend.buffer_type);
320	check_all_allocated(graph);
321	check_no_overlap(graph);
322	GGML_ASSERT(backend.context ->allocated_total() == `24`);
323	}
324
325	// This test assumes a max of 16 buffer chunks, and tries to allocate tensors that would
326	// require more. Expectation is that the last buffer should grow to fit everything,
327	// leaving it to the backend to error out if it can't allocate that much.
328	static void test_not_enough_chunks() {
329	const int max_chunks = `16`;
330	const int max_size = `8`;
331
332	dummy_backend backend = dummy_backend_init(max_buffer_size: max_size);
333	auto [ctx, graph, ctx_ptr] = make_context();
334
335	ggml_tensor * x[max_chunks + `1`];
336	for (int i = `0`; i < max_chunks + `1`; ++i) {
337	x[i] = make_input_with_size(ctx, size_bytes: max_size);
338	}
339	ggml_tensor * acc = x[`0`];
340	for (int i = `0`; i < max_chunks; ++i) {
341	acc = ggml_add(ctx, a: acc, b: x[i + `1`]);
342	}
343	assign_names(ctx);
344
345	ggml_gallocr_ptr galloc = allocate_graph(graph, out: acc, buft: &backend.buffer_type);
346	check_all_allocated(graph);
347	check_no_overlap(graph);
348	GGML_ASSERT(backend.context ->allocated_total() > max_chunks * max_size);
349	}
350
351	// Fill up leftover unallocated space of a chunk after allocating a large tensor that
352	// requires a new chunk.
353	static void test_fill_leftover_space() {
354	dummy_backend backend = dummy_backend_init(max_buffer_size: `16`);
355	auto [ctx, graph, ctx_ptr] = make_context();
356
357	ggml_tensor * x[`4`];
358	x[`0`] = make_input_with_size(ctx, size_bytes: `8`);
359	x[`1`] = ggml_pad(ctx, a: x[`0`], p0: `2`, p1: `0`, p2: `0`, p3: `0`);
360	x[`3`] = ggml_mean(ctx, a: x[`1`]);
361	assign_names(ctx);
362
363	ggml_gallocr_ptr galloc = allocate_graph(graph, out: x[`3`], buft: &backend.buffer_type);
364	check_all_allocated(graph);
365	check_no_overlap(graph);
366	check_max_size(ctx);
367	GGML_ASSERT(backend.context ->allocated_total() <= `12` + `16`);
368	}
369
370	// Check that views don't require any extra memory
371	static void test_view_inplace() {
372	dummy_backend backend = dummy_backend_init(max_buffer_size: `32`);
373	auto [ctx, graph, ctx_ptr] = make_context();
374
375	ggml_tensor * x[`6`];
376	x[`0`] = make_input_1d(ctx, n_elements: `4`); // chunk 0, [0, 16)
377	x[`1`] = ggml_reshape_2d(ctx, a: x[`0`], ne0: `2`, ne1: `2`); // view of x0
378	x[`2`] = ggml_permute(ctx, a: x[`1`], axis0: `1`, axis1: `0`, axis2: `2`, axis3: `3`); // view of x0
379	x[`3`] = ggml_view_1d(ctx, a: x[`2`], ne0: `2`, offset: `4`); // view of x0
380	x[`4`] = make_input_1d(ctx, n_elements: `2`); // chunk 0, [16, 24)
381	x[`5`] = ggml_add(ctx, a: x[`3`], b: x[`4`]); // reuse (inplace add)
382	assign_names(ctx);
383
384	ggml_gallocr_ptr galloc = allocate_graph(graph, out: x[`5`], buft: &backend.buffer_type);
385	check_all_allocated(graph);
386	check_no_overlap(graph);
387	check_max_size(ctx);
388	GGML_ASSERT(backend.context ->allocated_total() <= `24`);
389	}
390
391	static void test_reuse_and_free() {
392	dummy_backend backend = dummy_backend_init(max_buffer_size: `40`);
393	auto [ctx, graph, ctx_ptr] = make_context();
394
395	ggml_tensor * x[`9`];
396	x[`0`] = make_input_with_size(ctx, size_bytes: `24`);
397	x[`1`] = make_input_with_size(ctx, size_bytes: `8`);
398	x[`2`] = make_input_with_size(ctx, size_bytes: `8`);
399	x[`3`] = ggml_add(ctx, a: x[`1`], b: x[`2`]); // reuse, free x2
400	x[`4`] = ggml_pad(ctx, a: x[`0`], p0: `2`, p1: `0`, p2: `0`, p3: `0`); // alloc new buffer, free x0
401	x[`5`] = ggml_scale(ctx, a: x[`4`], s: `2.0f`); // alloc from free block
402	x[`6`] = ggml_add(ctx, a: x[`4`], b: x[`5`]); // reuse, free x5
403	x[`7`] = ggml_view_1d(ctx, a: x[`6`], ne0: `2`, offset: `8`); // view
404	x[`8`] = ggml_add(ctx, a: x[`3`], b: x[`7`]); // reuse
405	assign_names(ctx);
406
407	ggml_gallocr_ptr galloc = allocate_graph(graph, out: x[`8`], buft: &backend.buffer_type);
408	check_all_allocated(graph);
409	check_no_overlap(graph);
410	check_max_size(ctx);
411	GGML_ASSERT(backend.context ->allocated_total() <= `40` + `32` + `32`);
412	}
413
414	static void test_merge_free_block(size_t max_buffer_size) {
415	dummy_backend backend = dummy_backend_init(max_buffer_size);
416	auto [ctx, graph, ctx_ptr] = make_context();
417
418	ggml_tensor * x[`9`];
419	x[`0`] = make_input_with_size(ctx, size_bytes: `16`);
420	x[`1`] = make_input_with_size(ctx, size_bytes: `16`);
421	x[`2`] = make_input_with_size(ctx, size_bytes: `16`);
422	x[`3`] = ggml_mean(ctx, a: x[`0`]);
423	x[`4`] = ggml_mean(ctx, a: x[`1`]);
424	x[`5`] = ggml_pad(ctx, a: x[`2`], p0: `2`, p1: `0`, p2: `0`, p3: `0`);
425	x[`6`] = ggml_add(ctx, a: x[`3`], b: x[`4`]);
426	x[`7`] = ggml_pad(ctx, a: x[`6`], p0: `5`, p1: `0`, p2: `0`, p3: `0`);
427	x[`8`] = ggml_add(ctx, a: x[`5`], b: x[`7`]);
428	assign_names(ctx);
429
430	ggml_gallocr_ptr galloc = allocate_graph(graph, out: x[`8`], buft: &backend.buffer_type);
431	check_all_allocated(graph);
432	check_no_overlap(graph);
433	check_max_size(ctx);
434	GGML_ASSERT(backend.context ->allocated_total() <= `32` + `32` + `24`);
435	}
436
437	// Check that previously allocated but freed memory is preferred over allocating
438	// additional memory, even if the remaining space in a chunk would match tensor size better
439	static void test_prefer_already_allocated_memory() {
440	dummy_backend backend = dummy_backend_init(max_buffer_size: `32`, /align/ alignment: `4`);
441	auto [ctx, graph, ctx_ptr] = make_context();
442
443	ggml_tensor * x[`3`];
444	x[`0`] = make_input_with_size(ctx, size_bytes: `24`); // [24b][8b unused]
445	x[`1`] = ggml_mean(ctx, a: x[`0`]); // [24b free][4b][4b unused]
446	x[`2`] = ggml_mean(ctx, a: x[`1`]); // should be allocated in the 24b block
447	assign_names(ctx);
448
449	ggml_gallocr_ptr galloc = allocate_graph(graph, out: x[`2`], buft: &backend.buffer_type);
450	check_all_allocated(graph);
451	check_no_overlap(graph);
452	GGML_ASSERT(backend.context ->allocated_total() <= `28`);
453	}
454
455	// test for allocating on multiple devices with some tensors in the graph
456	// allocated externally (not by gallocr).
457	static void test_multiple_buffer_types() {
458	dummy_backend backend_a = dummy_backend_init(max_buffer_size: `32`);
459	dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
460
461	auto [ctx_a, _a, ctx_a_ptr] = make_context();
462	auto [ctx_b, _b, ctx_b_ptr] = make_context();
463	auto [ctx, graph, ctx_ptr] = make_context();
464
465	ggml_tensor * a[`2`];
466	a[`0`] = make_input_with_size(ctx: ctx_a, size_bytes: `16`);
467	a[`1`] = make_input_with_size(ctx: ctx_a, size_bytes: `16`);
468	assign_names(ctx: ctx_a, prefix: "a");
469
470	ggml_tensor * b[`2`];
471	b[`0`] = make_input_with_size(ctx: ctx_b, size_bytes: `24`);
472	b[`1`] = make_input_with_size(ctx: ctx_b, size_bytes: `4`);
473	assign_names(ctx: ctx_b, prefix: "b");
474
475	ggml_tensor * x[`9`];
476	x[`0`] = make_input_with_size(ctx, size_bytes: `16`);
477	x[`1`] = ggml_mul(ctx, a: x[`0`], b: a[`0`]);
478	x[`2`] = ggml_pad(ctx, a: x[`1`], p0: `2`, p1: `0`, p2: `0`, p3: `0`);
479	x[`3`] = ggml_mul(ctx, a: x[`2`], b: b[`0`]);
480	x[`4`] = ggml_mean(ctx, a: x[`3`]);
481	x[`5`] = ggml_add(ctx, a: x[`4`], b: b[`1`]);
482	x[`6`] = ggml_pad(ctx, a: x[`5`], p0: `3`, p1: `0`, p2: `0`, p3: `0`);
483	x[`7`] = ggml_add(ctx, a: x[`6`], b: a[`1`]);
484	x[`8`] = ggml_scale(ctx, a: x[`7`], s: `2.0f`);
485	assign_names(ctx, prefix: "x");
486
487	ggml_backend_buffer_ptr buf_a(ggml_backend_alloc_ctx_tensors_from_buft(ctx: ctx_a, buft: &backend_a.buffer_type));
488	ggml_backend_buffer_ptr buf_b(ggml_backend_alloc_ctx_tensors_from_buft(ctx: ctx_b, buft: &backend_b.buffer_type));
489	ggml_backend_buffer_type_t bufts[`2`] = { &backend_a.buffer_type, &backend_b.buffer_type };
490
491	// assign buffer types manually to avoid extra complexity from backend scheduler
492	ggml_set_output(tensor: x[`8`]);
493	ggml_build_forward_expand(cgraph: graph, tensor: x[`8`]);
494
495	GGML_ASSERT(graph->n_leafs == `5`);
496	int leaf_buffer_ids[`5`];
497	leaf_buffer_ids[get_leaf_id(graph, tensor_name: "a0")] = `0`;
498	leaf_buffer_ids[get_leaf_id(graph, tensor_name: "a1")] = `0`;
499	leaf_buffer_ids[get_leaf_id(graph, tensor_name: "b0")] = `1`;
500	leaf_buffer_ids[get_leaf_id(graph, tensor_name: "b1")] = `1`;
501	leaf_buffer_ids[get_leaf_id(graph, tensor_name: "x0")] = `0`;
502
503	GGML_ASSERT(graph->n_nodes == `8`);
504	int node_buffer_ids[`8`];
505	node_buffer_ids[get_node_id(graph, tensor_name: "x1")] = `0`;
506	node_buffer_ids[get_node_id(graph, tensor_name: "x2")] = `0`;
507	node_buffer_ids[get_node_id(graph, tensor_name: "x3")] = `1`;
508	node_buffer_ids[get_node_id(graph, tensor_name: "x4")] = `1`;
509	node_buffer_ids[get_node_id(graph, tensor_name: "x5")] = `1`;
510	node_buffer_ids[get_node_id(graph, tensor_name: "x6")] = `1`;
511	node_buffer_ids[get_node_id(graph, tensor_name: "x7")] = `0`;
512	node_buffer_ids[get_node_id(graph, tensor_name: "x8")] = `0`;
513
514	ggml_gallocr_ptr galloc(ggml_gallocr_new_n(bufts, n_bufs: `2`));
515	ggml_gallocr_reserve_n(galloc: galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
516	ggml_gallocr_alloc_graph(galloc: galloc.get(), graph);
517
518	check_all_allocated(graph);
519	check_no_overlap(graph);
520	check_max_size(ctx);
521	GGML_ASSERT(backend_a.context ->allocated_total() <= `32` + `32` + `24`);
522	GGML_ASSERT(backend_b.context ->allocated_total() <= `32` + `24`);
523	}
524
525	static void test_buffer_size_zero() {
526	dummy_backend backend_a = dummy_backend_init(SIZE_MAX);
527	dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
528	auto [ctx, graph, ctx_ptr] = make_context();
529
530	ggml_tensor * x[`2`];
531	x[`0`] = make_input_with_size(ctx, size_bytes: `16`);
532	x[`1`] = ggml_scale(ctx, a: x[`0`], s: `2.0f`);
533
534	ggml_set_output(tensor: x[`1`]);
535	ggml_build_forward_expand(cgraph: graph, tensor: x[`1`]);
536
537	int leaf_buffer_ids[`1`] = { `0` };
538	int node_buffer_ids[`1`] = { `0` };
539
540	ggml_backend_buffer_type_t bufts[`2`] = { &backend_a.buffer_type, &backend_b.buffer_type };
541	ggml_gallocr_ptr galloc = ggml_gallocr_ptr (ggml_gallocr_new_n(bufts, n_bufs: `2`));
542	bool res1 = ggml_gallocr_reserve_n(galloc: galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
543	bool res2 = ggml_gallocr_alloc_graph(galloc: galloc.get(), graph);
544	GGML_ASSERT(res1 && res2);
545
546	check_all_allocated(graph);
547	GGML_ASSERT(backend_a.context ->allocated_total() == `16`);
548	GGML_ASSERT(backend_b.context ->allocated_total() == `0`);
549	}
550
551	// Test re-using gallocr for a different graph. The new graph has the same
552	// total size, but one of the chunks is larger, so reallocation is required.
553	static void test_reallocation() {
554	dummy_backend backend = dummy_backend_init(max_buffer_size: `32`, /align/ alignment: `4`);
555	ggml_gallocr_ptr galloc;
556	{
557	auto [ctx, graph, ctx_ptr] = make_context();
558	ggml_tensor * x[`4`];
559	x[`0`] = make_input_with_size(ctx, size_bytes: `24`);
560	x[`1`] = make_input_with_size(ctx, size_bytes: `16`);
561	x[`2`] = ggml_view_1d(ctx, a: x[`0`], ne0: `4`, offset: `0`);
562	x[`3`] = ggml_add(ctx, a: x[`2`], b: x[`1`]);
563	assign_names(ctx);
564
565	galloc = allocate_graph(graph, out: x[`3`], buft: &backend.buffer_type);
566	check_all_allocated(graph);
567	GGML_ASSERT(backend.context ->allocated_total() == `40`);
568	}
569	{
570	auto [ctx, graph, ctx_ptr] = make_context();
571	ggml_tensor * x[`3`];
572	x[`0`] = make_input_with_size(ctx, size_bytes: `20`);
573	x[`1`] = make_input_with_size(ctx, size_bytes: `20`);
574	x[`2`] = ggml_add(ctx, a: x[`0`], b: x[`1`]);
575	assign_names(ctx);
576	ggml_set_output(tensor: x[`2`]);
577	ggml_build_forward_expand(cgraph: graph, tensor: x[`2`]);
578
579	bool result = ggml_gallocr_alloc_graph(galloc: galloc.get(), graph);
580	GGML_ASSERT(result);
581	check_all_allocated(graph);
582	GGML_ASSERT(backend.context ->allocated_total() == `40`);
583	}
584	}
585
586	static void run(const char * name, void (*f)()) {
587	printf(format: "%s ", name);
588	fflush(stdout);
589	f();
590	printf(format: "PASSED\n");
591	}
592
593	int main() {
594	run(name: "test_max_size_too_many_tensors", f: test_max_size_too_many_tensors);
595	run(name: "test_max_size_tensor_too_large", f: test_max_size_tensor_too_large);
596	run(name: "test_tensor_larger_than_max_size", f: test_tensor_larger_than_max_size);
597	run(name: "test_not_enough_chunks", f: test_not_enough_chunks);
598	run(name: "test_fill_leftover_space", f: test_fill_leftover_space);
599	run(name: "test_view_inplace", f: test_view_inplace);
600	run(name: "test_reuse_and_free", f: test_reuse_and_free);
601	run(name: "test_merge_free_block(32)", f: []() { test_merge_free_block(max_buffer_size: `32`); });
602	run(name: "test_merge_free_block(SIZE_MAX)", f: []() { test_merge_free_block(SIZE_MAX); });
603	run(name: "test_prefer_already_allocated_memory", f: test_prefer_already_allocated_memory);
604	run(name: "test_multiple_buffer_types", f: test_multiple_buffer_types);
605	run(name: "test_buffer_size_zero", f: test_buffer_size_zero);
606	run(name: "test_reallocation", f: test_reallocation);
607	return `0`;
608	}
609

Browse the source code of llama.cpp/tests/test-alloc.cpp