#include <ggml-alloc.h>
#include <ggml-backend-impl.h>
#include <ggml-cpp.h>
#include <ggml-impl.h>
#include <ggml.h>

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <exception>
#include <memory>
#include <vector>

//
// dummy backend with configurable max_buffer_size, tracks allocations
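//
// The dummy backend never touches real memory: get_base returns a fixed fake pointer
// and all tensor data transfers are no-ops. It only records the buffers it hands out,
// so tests can check the total amount of memory that gallocr requested.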

uint8_t * const alloc_base = (uint8_t *) 16;

struct dummy_backend_context {
    size_t max_buffer_size = 64;
    size_t alignment = 8;

    ggml_backend_buffer_i buffer_interface;
    std::vector<ggml_backend_buffer_t> buffers;

    size_t allocated_total() const {
        size_t n = 0;
        for (ggml_backend_buffer_t buf : buffers) {
            n += ggml_backend_buffer_get_size(buf);
        }
        return n;
    }
};

// ggml_backend_buffer_type interface

static const char * dummy_backend_buffer_type_get_name(ggml_backend_buffer_type_t) {
    return "dummy_buffer_type";
}

static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
    ggml_backend_buffer_t & buffer = ctx->buffers.emplace_back();
    buffer = ggml_backend_buffer_init(buft, ctx->buffer_interface, ctx, size);
    return buffer;
}

static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
    return ctx->alignment;
}

static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
    return ctx->max_buffer_size;
}

static bool dummy_backend_buffer_type_is_host(ggml_backend_buffer_type_t) {
    return true;
}

// ggml_backend_buffer interface

static void dummy_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    dummy_backend_context * ctx = (dummy_backend_context *) buffer->context;

    auto i = std::find(ctx->buffers.begin(), ctx->buffers.end(), buffer);
    GGML_ASSERT(i != ctx->buffers.end());
    ctx->buffers.erase(i);
}

static void * dummy_backend_buffer_get_base(ggml_backend_buffer_t) {
    return alloc_base;
}

static ggml_status dummy_backend_buffer_init_tensor(ggml_backend_buffer_t, ggml_tensor *) {
    return GGML_STATUS_SUCCESS;
}

static void dummy_backend_buffer_memset_tensor(ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {}

static void dummy_backend_buffer_set_tensor(ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) {}

static void dummy_backend_buffer_get_tensor(ggml_backend_buffer_t, const ggml_tensor *, void *, size_t, size_t) {}

static void dummy_backend_buffer_clear(ggml_backend_buffer_t, uint8_t) {}

// dummy_backend (not really a full backend, just provides what gallocr needs)
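// Only the buffer-type interface (name, alloc, alignment, max size, host check) and the
// buffer callbacks above are populated; there is no device and no graph compute.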

struct dummy_backend {
    std::unique_ptr<dummy_backend_context> context;
    ggml_backend_buffer_type buffer_type;
};

static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = 8) {
    dummy_backend b{};
    b.context = std::make_unique<dummy_backend_context>();
    b.context->alignment = alignment;
    b.context->max_buffer_size = max_buffer_size;

    b.context->buffer_interface.free_buffer = dummy_backend_buffer_free_buffer;
    b.context->buffer_interface.get_base = dummy_backend_buffer_get_base;
    b.context->buffer_interface.init_tensor = dummy_backend_buffer_init_tensor;
    b.context->buffer_interface.memset_tensor = dummy_backend_buffer_memset_tensor;
    b.context->buffer_interface.set_tensor = dummy_backend_buffer_set_tensor;
    b.context->buffer_interface.get_tensor = dummy_backend_buffer_get_tensor;
    b.context->buffer_interface.clear = dummy_backend_buffer_clear;

    b.buffer_type.context = b.context.get();
    b.buffer_type.iface.get_name = dummy_backend_buffer_type_get_name;
    b.buffer_type.iface.alloc_buffer = dummy_backend_buffer_type_alloc_buffer;
    b.buffer_type.iface.get_alignment = dummy_backend_buffer_type_get_alignment;
    b.buffer_type.iface.get_max_size = dummy_backend_buffer_type_get_max_size;
    b.buffer_type.iface.is_host = dummy_backend_buffer_type_is_host;
    return b;
}

//
// test utilities
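//
// make_context creates a metadata-only (no_alloc) ggml context sized for a few dozen
// tensors plus one graph. Inputs are F32, so make_input_with_size converts a byte size
// to an element count at 4 bytes per element.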

struct test_context_with_graph {
    ggml_context * ctx;
    ggml_cgraph * graph;
    ggml_context_ptr ctx_ptr;
};

static test_context_with_graph make_context() {
    ggml_init_params params{};
    params.mem_size = 48 * ggml_tensor_overhead() + ggml_graph_overhead();
    params.no_alloc = true;

    ggml_context * ctx = ggml_init(params);
    ggml_context_ptr ctx_ptr = ggml_context_ptr(ctx);
    ggml_cgraph * graph = ggml_new_graph(ctx);
    return { ctx, graph, std::move(ctx_ptr) };
}

static ggml_tensor * make_input_1d(ggml_context * ctx, int64_t n_elements) {
    ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
    ggml_set_input(t);
    return t;
}

static ggml_tensor * make_input_with_size(ggml_context * ctx, size_t size_bytes) {
    GGML_ASSERT(size_bytes % 4 == 0);
    return make_input_1d(ctx, size_bytes / 4);
}

static void assign_names(ggml_context * ctx, const char * prefix = "x") {
    int i = 0;
    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
        ggml_format_name(t, "%s%d", prefix, i++);
    }
}

static int get_leaf_id(ggml_cgraph * graph, const char * tensor_name) {
    for (int i = 0; i < graph->n_leafs; ++i) {
        if (strncmp(graph->leafs[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
            return i;
        }
    }
    fprintf(stderr, "leaf not found: %s\n", tensor_name);
    return -1;
}

static int get_node_id(ggml_cgraph * graph, const char * tensor_name) {
    for (int i = 0; i < graph->n_nodes; ++i) {
        if (strncmp(graph->nodes[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
            return i;
        }
    }
    fprintf(stderr, "node not found: %s\n", tensor_name);
    return -1;
}

static ggml_gallocr_ptr allocate_graph(ggml_cgraph * graph, ggml_tensor * out, ggml_backend_buffer_type_t buft) {
    ggml_set_output(out);
    ggml_build_forward_expand(graph, out);

    ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new(buft));
    bool result = ggml_gallocr_alloc_graph(galloc.get(), graph);
    GGML_ASSERT(result);
    return galloc;
}

//
// correctness checks for result allocations
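//
// check_all_allocated: every graph node got a buffer and a data pointer.
// check_max_size:      no tensor extends past the max buffer size of its buffer type.
// check_no_overlap:    two allocations may only share memory if the earlier tensor is
//                      no longer needed at that point (or the op can run in place).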

static void check_all_allocated(ggml_cgraph * graph) {
    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
        ggml_tensor * t = ggml_graph_node(graph, i);
        GGML_ASSERT(t->buffer != nullptr);
        GGML_ASSERT(t->data != nullptr);
    }
}

static void check_max_size(ggml_context * ctx) {
    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
        auto buft = ggml_backend_buffer_get_type(t->buffer);
        size_t max_size = ggml_backend_buft_get_max_size(buft);
        size_t offset = (char *) t->data - (char *) ggml_backend_buffer_get_base(t->buffer);
        GGML_ASSERT(t->data >= ggml_backend_buffer_get_base(t->buffer));
        GGML_ASSERT((size_t) offset + ggml_nbytes(t) <= max_size);
    }
}

static bool can_reuse_memory(ggml_cgraph * graph, int current_i, ggml_tensor * current, ggml_tensor * other) {
    if (other->flags & GGML_TENSOR_FLAG_OUTPUT) {
        return false;
    }
    // Check if `other` is still "alive", i.e. used as an input by `current` (unless the
    // op can run in place) or by any node after it
    for (int i = current_i; i < ggml_graph_n_nodes(graph); ++i) {
        ggml_tensor * t = ggml_graph_node(graph, i);
        for (int s = 0; s < GGML_MAX_SRC; s++) {
            if (t == current && ggml_op_can_inplace(t->op)) {
                continue;
            }
            if (t->src[s] == other) {
                return false;
            }
            if (t->src[s] && t->src[s]->view_src == other) {
                return false;
            }
        }
    }
    return true;
}

static bool memory_overlap(ggml_tensor * a, ggml_tensor * b) {
    if (a->buffer != b->buffer) {
        return false;
    }
    int64_t a0 = (int64_t) a->data;
    int64_t a1 = a0 + ggml_nbytes(a);
    int64_t b0 = (int64_t) b->data;
    int64_t b1 = b0 + ggml_nbytes(b);
    return a1 > b0 && b1 > a0;
}

static ggml_tensor * get_view_source(ggml_tensor * t) {
    while (t->view_src) {
        t = t->view_src;
    }
    return t;
}

static void check_no_overlap(ggml_cgraph * graph) {
    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
        for (int j = 0; j < i; ++j) {
            ggml_tensor * t = ggml_graph_node(graph, i);
            ggml_tensor * o = ggml_graph_node(graph, j);
            GGML_ASSERT(t != o);

            if (get_view_source(t) == get_view_source(o)) {
                continue;
            }
            if (memory_overlap(t, o)) {
                GGML_ASSERT(can_reuse_memory(graph, i, t, o));
            }
        }
    }
}

//
// test cases
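//
// Each test builds a small graph against one or more dummy backends (usually with a
// deliberately small max_buffer_size), allocates it with gallocr, runs a subset of the
// checks above, and asserts a bound on the total memory the backends handed out.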

// Scenario where the first backend buffer is completely exhausted and there are further
// tensors which require a second buffer
static void test_max_size_too_many_tensors() {
    dummy_backend backend = dummy_backend_init(16);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[7];
    x[0] = make_input_with_size(ctx, 8);
    x[1] = make_input_with_size(ctx, 8);
    x[2] = make_input_with_size(ctx, 8);
    x[3] = ggml_mul(ctx, x[0], x[1]);
    x[4] = ggml_add(ctx, x[1], x[2]);
    x[5] = ggml_add(ctx, x[3], x[0]);
    x[6] = ggml_add(ctx, x[4], x[5]);
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[6], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);
}

// Scenario where there is some space left in the first buffer, but not enough to accommodate
// a larger tensor, so a second buffer is required
static void test_max_size_tensor_too_large() {
    dummy_backend backend = dummy_backend_init(32);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[3];
    x[0] = make_input_with_size(ctx, 16);   // chunk 0, [0 , 16)
    x[1] = make_input_with_size(ctx, 8);    // chunk 0, [16, 24)
    x[2] = ggml_concat(ctx, x[0], x[1], 0); // chunk 1, [0 , 24)
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 32 + 24);
}

// Scenario where a single tensor exceeds the max buffer size - in this case the allocator
// should try to create a bigger buffer anyway, and wait for the backend to throw an error.
// Backends may report an artificially lower max size in some cases for compatibility reasons.
static void test_tensor_larger_than_max_size() {
    dummy_backend backend = dummy_backend_init(16);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[2];
    x[0] = make_input_with_size(ctx, 24);
    x[1] = ggml_scale(ctx, x[0], 2.0f);
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[1], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    GGML_ASSERT(backend.context->allocated_total() == 24);
}

// This test assumes a max of 16 buffer chunks, and tries to allocate tensors that would
// require more. The expectation is that the last buffer grows to fit everything,
// leaving it to the backend to error out if it can't allocate that much.
static void test_not_enough_chunks() {
    const int max_chunks = 16;
    const int max_size = 8;

    dummy_backend backend = dummy_backend_init(max_size);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[max_chunks + 1];
    for (int i = 0; i < max_chunks + 1; ++i) {
        x[i] = make_input_with_size(ctx, max_size);
    }
    ggml_tensor * acc = x[0];
    for (int i = 0; i < max_chunks; ++i) {
        acc = ggml_add(ctx, acc, x[i + 1]);
    }
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, acc, &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    GGML_ASSERT(backend.context->allocated_total() > max_chunks * max_size);
}

// Fill up leftover unallocated space of a chunk after allocating a large tensor that
// requires a new chunk.
static void test_fill_leftover_space() {
    dummy_backend backend = dummy_backend_init(16);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[4];
    x[0] = make_input_with_size(ctx, 8);
    x[1] = ggml_pad(ctx, x[0], 2, 0, 0, 0);
    x[3] = ggml_mean(ctx, x[1]);
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[3], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 12 + 16);
}

// Check that views don't require any extra memory
static void test_view_inplace() {
    dummy_backend backend = dummy_backend_init(32);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[6];
    x[0] = make_input_1d(ctx, 4);               // chunk 0, [0, 16)
    x[1] = ggml_reshape_2d(ctx, x[0], 2, 2);    // view of x0
    x[2] = ggml_permute(ctx, x[1], 1, 0, 2, 3); // view of x0
    x[3] = ggml_view_1d(ctx, x[2], 2, 4);       // view of x0
    x[4] = make_input_1d(ctx, 2);               // chunk 0, [16, 24)
    x[5] = ggml_add(ctx, x[3], x[4]);           // reuse (inplace add)
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[5], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 24);
}

static void test_reuse_and_free() {
    dummy_backend backend = dummy_backend_init(40);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[9];
    x[0] = make_input_with_size(ctx, 24);
    x[1] = make_input_with_size(ctx, 8);
    x[2] = make_input_with_size(ctx, 8);
    x[3] = ggml_add(ctx, x[1], x[2]);       // reuse, free x2
    x[4] = ggml_pad(ctx, x[0], 2, 0, 0, 0); // alloc new buffer, free x0
    x[5] = ggml_scale(ctx, x[4], 2.0f);     // alloc from free block
    x[6] = ggml_add(ctx, x[4], x[5]);       // reuse, free x5
    x[7] = ggml_view_1d(ctx, x[6], 2, 8);   // view
    x[8] = ggml_add(ctx, x[3], x[7]);       // reuse
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 40 + 32 + 32);
}

static void test_merge_free_block(size_t max_buffer_size) {
    dummy_backend backend = dummy_backend_init(max_buffer_size);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[9];
    x[0] = make_input_with_size(ctx, 16);
    x[1] = make_input_with_size(ctx, 16);
    x[2] = make_input_with_size(ctx, 16);
    x[3] = ggml_mean(ctx, x[0]);
    x[4] = ggml_mean(ctx, x[1]);
    x[5] = ggml_pad(ctx, x[2], 2, 0, 0, 0);
    x[6] = ggml_add(ctx, x[3], x[4]);
    x[7] = ggml_pad(ctx, x[6], 5, 0, 0, 0);
    x[8] = ggml_add(ctx, x[5], x[7]);
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24);
}

// Check that previously allocated but freed memory is preferred over allocating
// additional memory, even if the remaining space in a chunk would match tensor size better
static void test_prefer_already_allocated_memory() {
    dummy_backend backend = dummy_backend_init(32, /*align*/ 4);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[3];
    x[0] = make_input_with_size(ctx, 24); // [24b][8b unused]
    x[1] = ggml_mean(ctx, x[0]);          // [24b free][4b][4b unused]
    x[2] = ggml_mean(ctx, x[1]);          // should be allocated in the 24b block
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    GGML_ASSERT(backend.context->allocated_total() <= 28);
}

// Test allocating on multiple devices with some tensors in the graph
// allocated externally (not by gallocr).
static void test_multiple_buffer_types() {
    dummy_backend backend_a = dummy_backend_init(32);
    dummy_backend backend_b = dummy_backend_init(SIZE_MAX);

    auto [ctx_a, _a, ctx_a_ptr] = make_context();
    auto [ctx_b, _b, ctx_b_ptr] = make_context();
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * a[2];
    a[0] = make_input_with_size(ctx_a, 16);
    a[1] = make_input_with_size(ctx_a, 16);
    assign_names(ctx_a, "a");

    ggml_tensor * b[2];
    b[0] = make_input_with_size(ctx_b, 24);
    b[1] = make_input_with_size(ctx_b, 4);
    assign_names(ctx_b, "b");

    ggml_tensor * x[9];
    x[0] = make_input_with_size(ctx, 16);
    x[1] = ggml_mul(ctx, x[0], a[0]);
    x[2] = ggml_pad(ctx, x[1], 2, 0, 0, 0);
    x[3] = ggml_mul(ctx, x[2], b[0]);
    x[4] = ggml_mean(ctx, x[3]);
    x[5] = ggml_add(ctx, x[4], b[1]);
    x[6] = ggml_pad(ctx, x[5], 3, 0, 0, 0);
    x[7] = ggml_add(ctx, x[6], a[1]);
    x[8] = ggml_scale(ctx, x[7], 2.0f);
    assign_names(ctx, "x");

    ggml_backend_buffer_ptr buf_a(ggml_backend_alloc_ctx_tensors_from_buft(ctx_a, &backend_a.buffer_type));
    ggml_backend_buffer_ptr buf_b(ggml_backend_alloc_ctx_tensors_from_buft(ctx_b, &backend_b.buffer_type));
    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };

    // assign buffer types manually to avoid extra complexity from the backend scheduler
    ggml_set_output(x[8]);
    ggml_build_forward_expand(graph, x[8]);

    GGML_ASSERT(graph->n_leafs == 5);
    int leaf_buffer_ids[5];
    leaf_buffer_ids[get_leaf_id(graph, "a0")] = 0;
    leaf_buffer_ids[get_leaf_id(graph, "a1")] = 0;
    leaf_buffer_ids[get_leaf_id(graph, "b0")] = 1;
    leaf_buffer_ids[get_leaf_id(graph, "b1")] = 1;
    leaf_buffer_ids[get_leaf_id(graph, "x0")] = 0;

    GGML_ASSERT(graph->n_nodes == 8);
    int node_buffer_ids[8];
    node_buffer_ids[get_node_id(graph, "x1")] = 0;
    node_buffer_ids[get_node_id(graph, "x2")] = 0;
    node_buffer_ids[get_node_id(graph, "x3")] = 1;
    node_buffer_ids[get_node_id(graph, "x4")] = 1;
    node_buffer_ids[get_node_id(graph, "x5")] = 1;
    node_buffer_ids[get_node_id(graph, "x6")] = 1;
    node_buffer_ids[get_node_id(graph, "x7")] = 0;
    node_buffer_ids[get_node_id(graph, "x8")] = 0;

    ggml_gallocr_ptr galloc(ggml_gallocr_new_n(bufts, 2));
    ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
    ggml_gallocr_alloc_graph(galloc.get(), graph);

    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend_a.context->allocated_total() <= 32 + 32 + 24);
    GGML_ASSERT(backend_b.context->allocated_total() <= 32 + 24);
}

static void test_buffer_size_zero() {
    dummy_backend backend_a = dummy_backend_init(SIZE_MAX);
    dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[2];
    x[0] = make_input_with_size(ctx, 16);
    x[1] = ggml_scale(ctx, x[0], 2.0f);

    ggml_set_output(x[1]);
    ggml_build_forward_expand(graph, x[1]);

    int leaf_buffer_ids[1] = { 0 };
    int node_buffer_ids[1] = { 0 };

    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
    ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
    bool res1 = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
    bool res2 = ggml_gallocr_alloc_graph(galloc.get(), graph);
    GGML_ASSERT(res1 && res2);

    check_all_allocated(graph);
    GGML_ASSERT(backend_a.context->allocated_total() == 16);
    GGML_ASSERT(backend_b.context->allocated_total() == 0);
}

// Test re-using gallocr for a different graph. The new graph has the same
// total size, but one of the chunks is larger, so reallocation is required.
static void test_reallocation() {
    dummy_backend backend = dummy_backend_init(32, /*align*/ 4);
    ggml_gallocr_ptr galloc;
    {
        auto [ctx, graph, ctx_ptr] = make_context();
        ggml_tensor * x[4];
        x[0] = make_input_with_size(ctx, 24);
        x[1] = make_input_with_size(ctx, 16);
        x[2] = ggml_view_1d(ctx, x[0], 4, 0);
        x[3] = ggml_add(ctx, x[2], x[1]);
        assign_names(ctx);

        galloc = allocate_graph(graph, x[3], &backend.buffer_type);
        check_all_allocated(graph);
        GGML_ASSERT(backend.context->allocated_total() == 40);
    }
    {
        auto [ctx, graph, ctx_ptr] = make_context();
        ggml_tensor * x[3];
        x[0] = make_input_with_size(ctx, 20);
        x[1] = make_input_with_size(ctx, 20);
        x[2] = ggml_add(ctx, x[0], x[1]);
        assign_names(ctx);
        ggml_set_output(x[2]);
        ggml_build_forward_expand(graph, x[2]);

        bool result = ggml_gallocr_alloc_graph(galloc.get(), graph);
        GGML_ASSERT(result);
        check_all_allocated(graph);
        GGML_ASSERT(backend.context->allocated_total() == 40);
    }
}

static void run(const char * name, void (*f)()) {
    printf("%s ", name);
    fflush(stdout);
    f();
    printf("PASSED\n");
}

int main() {
    run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
    run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
    run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
    run("test_not_enough_chunks", test_not_enough_chunks);
    run("test_fill_leftover_space", test_fill_leftover_space);
    run("test_view_inplace", test_view_inplace);
    run("test_reuse_and_free", test_reuse_and_free);
    run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
    run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); });
    run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory);
    run("test_multiple_buffer_types", test_multiple_buffer_types);
    run("test_buffer_size_zero", test_buffer_size_zero);
    run("test_reallocation", test_reallocation);
    return 0;
}