// Note: porting this file to C++ is a work in progress

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif

#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-alloc.h"
#include "ggml-impl.h"

#include <assert.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <vector>

#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#endif


// backend buffer type

const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(buft);
    return buft->iface.get_name(buft);
}

ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    if (size == 0) {
        // return a dummy buffer for zero-sized allocations
        return ggml_backend_buffer_init(buft, {}, NULL, 0);
    }

    GGML_ASSERT(buft);
    return buft->iface.alloc_buffer(buft, size);
}

size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(buft);
    return buft->iface.get_alignment(buft);
}

size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(buft);
    // get_max_size is optional, defaults to SIZE_MAX
    if (buft->iface.get_max_size) {
        return buft->iface.get_max_size(buft);
    }
    return SIZE_MAX;
}

size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
    GGML_ASSERT(buft);
    // get_alloc_size is optional, defaults to ggml_nbytes
    if (buft->iface.get_alloc_size) {
        size_t size = buft->iface.get_alloc_size(buft, tensor);
        assert(size >= ggml_nbytes(tensor));
        return size;
    }
    return ggml_nbytes(tensor);
}

bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(buft);
    if (buft->iface.is_host) {
        return buft->iface.is_host(buft);
    }
    return false;
}

ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(buft);
    return buft->device;
}

// backend buffer

ggml_backend_buffer_t ggml_backend_buffer_init(
        ggml_backend_buffer_type_t buft,
        struct ggml_backend_buffer_i iface,
        void * context,
        size_t size) {
    ggml_backend_buffer_t buffer = new ggml_backend_buffer {
        /* .interface = */ iface,
        /* .buft      = */ buft,
        /* .context   = */ context,
        /* .size      = */ size,
        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
    };

    return buffer;
}

const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
}

void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (buffer == NULL) {
        return;
    }

    if (buffer->iface.free_buffer != NULL) {
        buffer->iface.free_buffer(buffer);
    }
    delete buffer;
}

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    return buffer->size;
}

void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    // get_base is optional if the buffer is zero-sized
    if (buffer->size == 0) {
        return NULL;
    }

    void * base = buffer->iface.get_base(buffer);

    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");

    return base;
}

enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    GGML_ASSERT(buffer);
    // init_tensor is optional
    if (buffer->iface.init_tensor) {
        return buffer->iface.init_tensor(buffer, tensor);
    }
    return GGML_STATUS_SUCCESS;
}

void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    GGML_ASSERT(buffer);
    // clear is optional if the buffer is zero-sized
    if (buffer->size == 0) {
        return;
    }

    buffer->iface.clear(buffer, value);
}

size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
}

size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
}

size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}

bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
}

void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
    GGML_ASSERT(buffer);
    buffer->usage = usage;

    // FIXME: add a generic callback to the buffer interface
    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
        ggml_backend_multi_buffer_set_usage(buffer, usage);
    }
}

enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    return buffer->usage;
}

ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    return buffer->buft;
}

void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    if (buffer->iface.reset) {
        buffer->iface.reset(buffer);
    }
}

bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
    if (dst_buf->iface.cpy_tensor) {
        return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
    }
    return false;
}

// backend

ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
    if (backend == NULL) {
        return NULL;
    }
    return backend->guid;
}

const char * ggml_backend_name(ggml_backend_t backend) {
    if (backend == NULL) {
        return "NULL";
    }
    return backend->iface.get_name(backend);
}

void ggml_backend_free(ggml_backend_t backend) {
    if (backend == NULL) {
        return;
    }

    backend->iface.free(backend);
}

ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
    GGML_ASSERT(backend);
    return ggml_backend_dev_buffer_type(backend->device);
}

ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
    return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
}

size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
}

size_t ggml_backend_get_max_size(ggml_backend_t backend) {
    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
}

void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(backend);
    GGML_ASSERT(tensor);
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

    if (backend->iface.set_tensor_async == NULL) {
        ggml_backend_tensor_set(tensor, data, offset, size);
    } else {
        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
    }
}

void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(backend);
    GGML_ASSERT(tensor);
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

    if (backend->iface.get_tensor_async == NULL) {
        ggml_backend_tensor_get(tensor, data, offset, size);
    } else {
        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
    }
}

void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    if (size == 0) {
        return;
    }

    GGML_ASSERT(buf != NULL && "tensor buffer not set");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

    buf->iface.set_tensor(buf, tensor, data, offset, size);
}

void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    if (size == 0) {
        return;
    }

    GGML_ASSERT(buf != NULL && "tensor buffer not set");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

    buf->iface.get_tensor(buf, tensor, data, offset, size);
}

void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    if (size == 0) {
        return;
    }

    GGML_ASSERT(buf != NULL && "tensor buffer not set");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");

    buf->iface.memset_tensor(buf, tensor, value, offset, size);
}

void ggml_backend_synchronize(ggml_backend_t backend) {
    GGML_ASSERT(backend);
    if (backend->iface.synchronize == NULL) {
        return;
    }

    backend->iface.synchronize(backend);
}

ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    GGML_ASSERT(backend);
    GGML_ASSERT(backend->iface.graph_plan_create != NULL);

    return backend->iface.graph_plan_create(backend, cgraph);
}

void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_ASSERT(backend);
    GGML_ASSERT(backend->iface.graph_plan_free != NULL);

    backend->iface.graph_plan_free(backend, plan);
}

enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    GGML_ASSERT(backend);
    GGML_ASSERT(backend->iface.graph_plan_compute != NULL);

    return backend->iface.graph_plan_compute(backend, plan);
}

enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
    ggml_backend_synchronize(backend);
    return err;
}

enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    GGML_ASSERT(backend);
    return backend->iface.graph_compute(backend, cgraph);
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    GGML_ASSERT(backend);
    return ggml_backend_dev_supports_op(backend->device, op);
}

bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(backend);
    return ggml_backend_dev_supports_buft(backend->device, buft);
}

bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    GGML_ASSERT(backend);
    return ggml_backend_dev_offload_op(backend->device, op);
}

ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
    GGML_ASSERT(backend);
    return backend->device;
}

// backend copy

void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

    if (src == dst) {
        return;
    }

    if (ggml_backend_buffer_is_host(src->buffer)) {
        ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
    } else if (ggml_backend_buffer_is_host(dst->buffer)) {
        ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif
        size_t nbytes = ggml_nbytes(src);
        void * data = malloc(nbytes);
        ggml_backend_tensor_get(src, data, 0, nbytes);
        ggml_backend_tensor_set(dst, data, 0, nbytes);
        free(data);
    }
}

void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

    if (src == dst) {
        return;
    }

    GGML_ASSERT(backend_dst);
    if (backend_dst->iface.cpy_tensor_async != NULL) {
        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
            return;
        }
    }

    // an async copy would normally happen after all the queued operations on both backends are completed
    // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
    ggml_backend_synchronize(backend_src);
    ggml_backend_synchronize(backend_dst);
    ggml_backend_tensor_copy(src, dst);
}

// events

ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
    // null device is allowed for the transition period to the device interface
    if (device == NULL || device->iface.event_new == NULL) {
        return NULL;
    }
    return device->iface.event_new(device);
}

void ggml_backend_event_free(ggml_backend_event_t event) {
    if (event == NULL) {
        return;
    }
    event->device->iface.event_free(event->device, event);
}

void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
    GGML_ASSERT(backend);
    GGML_ASSERT(backend->iface.event_record != NULL);

    backend->iface.event_record(backend, event);
}

void ggml_backend_event_synchronize(ggml_backend_event_t event) {
    GGML_ASSERT(event);
    GGML_ASSERT(event->device->iface.event_synchronize);

    event->device->iface.event_synchronize(event->device, event);
}

void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
    GGML_ASSERT(backend);
    GGML_ASSERT(backend->iface.event_wait != NULL);

    backend->iface.event_wait(backend, event);
}

static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    GGML_ASSERT(backend);
    if (backend->iface.graph_optimize != NULL) {
        backend->iface.graph_optimize(backend, cgraph);
    }
}

// Backend device

const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
    GGML_ASSERT(device);
    return device->iface.get_name(device);
}

const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
    GGML_ASSERT(device);
    return device->iface.get_description(device);
}

void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
    GGML_ASSERT(device);
    device->iface.get_memory(device, free, total);
}

enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
    GGML_ASSERT(device);
    return device->iface.get_type(device);
}

void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
    memset(props, 0, sizeof(*props));
    device->iface.get_props(device, props);
}

ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
    GGML_ASSERT(device);
    return device->reg;
}

ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
    GGML_ASSERT(device);
    return device->iface.init_backend(device, params);
}

ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
    GGML_ASSERT(device);
    return device->iface.get_buffer_type(device);
}

ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
    GGML_ASSERT(device);
    if (device->iface.get_host_buffer_type == NULL) {
        return NULL;
    }

    return device->iface.get_host_buffer_type(device);
}

ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
    GGML_ASSERT(device);
    return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
}

bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
    GGML_ASSERT(device);
    return device->iface.supports_op(device, op);
}

bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(device);
    return device->iface.supports_buft(device, buft);
}

bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
    GGML_ASSERT(device);
    if (device->iface.offload_op != NULL) {
        return device->iface.offload_op(device, op);
    }

    return false;
}

// Backend (reg)

const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
    GGML_ASSERT(reg);
    return reg->iface.get_name(reg);
}

size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
    GGML_ASSERT(reg);
    return reg->iface.get_device_count(reg);
}

ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(reg);
    return reg->iface.get_device(reg, index);
}

void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_ASSERT(reg);
    if (!reg->iface.get_proc_address) {
        return NULL;
    }
    return reg->iface.get_proc_address(reg, name);
}

// multi-buffer buffer

struct ggml_backend_multi_buffer_context {
    ggml_backend_buffer_t * buffers;
    size_t n_buffers;
};

static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
    for (size_t i = 0; i < ctx->n_buffers; i++) {
        ggml_backend_buffer_free(ctx->buffers[i]);
    }

    free(ctx->buffers);
    free(ctx);
}

static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    GGML_ASSERT(buffer);
    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
    for (size_t i = 0; i < ctx->n_buffers; i++) {
        ggml_backend_buffer_clear(ctx->buffers[i], value);
    }
}

static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
    /* .free_buffer   = */ ggml_backend_multi_buffer_free_buffer,
    /* .get_base      = */ NULL,
    /* .init_tensor   = */ NULL,
    /* .memset_tensor = */ NULL,
    /* .set_tensor    = */ NULL,
    /* .get_tensor    = */ NULL,
    /* .cpy_tensor    = */ NULL,
    /* .clear         = */ ggml_backend_multi_buffer_clear,
    /* .reset         = */ NULL,
};

ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
    ctx->n_buffers = n_buffers;
    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));

    GGML_ASSERT(ctx->buffers != NULL);

    size_t total_size = 0;
    for (size_t i = 0; i < n_buffers; i++) {
        ctx->buffers[i] = buffers[i];
        total_size += ggml_backend_buffer_get_size(buffers[i]);
    }

    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
}

bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
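    // multi-buffers are identified by their free_buffer callback rather than a dedicated flag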
    return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
}

void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
    GGML_ASSERT(buffer);
    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
    ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
    for (size_t i = 0; i < ctx->n_buffers; i++) {
        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
    }
}

// creates a copy of the tensor with the same memory layout
static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        dup->nb[i] = tensor->nb[i];
    }
    return dup;
}

static bool ggml_is_view_op(enum ggml_op op) {
    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
}

// scheduler

#ifndef GGML_SCHED_MAX_BACKENDS
#define GGML_SCHED_MAX_BACKENDS 16
#endif

#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS 30
#endif

#ifndef GGML_SCHED_MAX_COPIES
#define GGML_SCHED_MAX_COPIES 4
#endif

struct ggml_backend_sched_split {
    int backend_id;
    int i_start;
    int i_end;
    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
    int n_inputs;
    // graph view of this split
    struct ggml_cgraph graph;
};

struct ggml_backend_sched {
    bool is_reset; // true if the scheduler has been reset since the last graph split
    bool is_alloc;

    int n_backends;

    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
    ggml_gallocr_t galloc;

    // hash map of the nodes in the graph
    struct ggml_hash_set hash_set;
    int * hv_tensor_backend_ids; // [hash_set.size]
    struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]

    int * node_backend_ids; // [graph_size]
    int * leaf_backend_ids; // [graph_size]

    int * prev_node_backend_ids; // [graph_size]
    int * prev_leaf_backend_ids; // [graph_size]

    // copy of the graph with modified inputs
    struct ggml_cgraph graph;

    // graph splits
    struct ggml_backend_sched_split * splits;
    int n_splits;
    int splits_capacity;

    // pipeline parallelism support
    int n_copies;
    int cur_copy;
    int next_copy;
    ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
    struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
    int n_graph_inputs;

    struct ggml_context * ctx;

    ggml_backend_sched_eval_callback callback_eval;
    void * callback_eval_user_data;

    char * context_buffer;
    size_t context_buffer_size;

    bool op_offload;

    int debug;
};

#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
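
// hv_tensor_copies is a flattened 3D array indexed as [hash id][backend id][copy id];
// the tensor_id_copy() macro above computes that flat index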

// returns the priority of the backend, lower id is higher priority
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
    for (int i = 0; i < sched->n_backends; i++) {
        if (sched->backends[i] == backend) {
            return i;
        }
    }
    return -1;
}

static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
    if (buffer == NULL) {
        return -1;
    }

    // find highest prio backend that supports the buffer type and the op
    for (int i = 0; i < sched->n_backends; i++) {
        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
            ggml_backend_supports_op(sched->backends[i], op)) {
            return i;
        }
    }

#ifndef NDEBUG
    GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
        __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
#endif

    return -1;
}

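// debug-only cause tracking: when the block below is enabled, SET_CAUSE records why each tensor
// was assigned to its backend and GET_CAUSE is shown by ggml_backend_sched_print_assignments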
#if 0
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
#define SET_CAUSE(node, ...)
#define GET_CAUSE(node) ""
#endif

// returns the backend that should be used for the node based on the current locations
static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
    // assign pre-allocated nodes to their backend
    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
    if (cur_backend_id != -1) {
        SET_CAUSE(tensor, "1.dst");
        return cur_backend_id;
    }

    // view_src
    if (tensor->view_src != NULL) {
        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
        if (cur_backend_id != -1) {
            SET_CAUSE(tensor, "1.vsrc");
            return cur_backend_id;
        }
    }

    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
        // since the tensor is pre-allocated, it cannot be moved to another backend
        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
    }

    // graph input
    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
        SET_CAUSE(tensor, "1.inp");
        return cur_backend_id;
    }

    // operations with weights are preferably run on the same backend as the weights
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        const struct ggml_tensor * src = tensor->src[i];
        if (src == NULL) {
            continue;
        }
        // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
        // not an ideal solution
        if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
            // check if a backend with higher prio wants to offload the op
            if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
                for (int b = 0; b < src_backend_id; b++) {
                    if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                        SET_CAUSE(tensor, "1.off");
                        return b;
                    }
                }
            }
            SET_CAUSE(tensor, "1.wgt%d", i);
            return src_backend_id;
        }
    }

    return -1;
}

static char * fmt_size(size_t size) {
    static char buffer[128];
    if (size >= 1024*1024) {
        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
    } else {
        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
    }
    return buffer;
}

static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    int cur_split = 0;
    for (int i = 0; i < graph->n_nodes; i++) {
        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
            ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
                sched->splits[cur_split].n_inputs);
            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
                if (j == 0) {
                    GGML_LOG_DEBUG(": ");
                }
                GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                    fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
            }
            GGML_LOG_DEBUG("\n");
            cur_split++;
        }
        struct ggml_tensor * node = graph->nodes[i];
        if (ggml_is_view_op(node->op)) {
            continue;
        }
        if (sched->debug > 1) {
            ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
                graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * src = node->src[j];
                if (src == NULL) {
                    continue;
                }
                ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
                GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                    fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
            }
            GGML_LOG_DEBUG("\n");
        }
    }
}

static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
    ggml_backend_buffer_type_t buft = NULL;

    if (buf) {
        // the tensor is already allocated
        buft = buf->buft;
    } else {
        // see if the tensor already has a backend assigned, and use the buffer type of that backend
        int tensor_backend_id = tensor_backend_id(t);
        if (tensor_backend_id == -1 && t->view_src) {
            tensor_backend_id = tensor_backend_id(t->view_src);
        }
        if (tensor_backend_id != -1) {
            buft = sched->bufts[tensor_backend_id];
        }
    }

    return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
}

static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
        *node_backend_id = cur_backend_id;
        SET_CAUSE(node, "2.sup");
    }
}

// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    // reset splits
    sched->n_splits = 0;
    sched->n_graph_inputs = 0;
    sched->is_reset = false;

    struct ggml_init_params params = {
        /* .mem_size   = */ sched->context_buffer_size,
        /* .mem_buffer = */ sched->context_buffer,
        /* .no_alloc   = */ true
    };

    ggml_free(sched->ctx);

    sched->ctx = ggml_init(params);
    if (sched->ctx == NULL) {
        GGML_ABORT("%s: failed to initialize context\n", __func__);
    }

    // pass 1: assign backends to ops with pre-allocated inputs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        int * leaf_backend_id = &tensor_backend_id(leaf);
        // do not overwrite user assignments
        if (*leaf_backend_id == -1) {
            *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
        }
    }

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int * node_backend_id = &tensor_backend_id(node);
        // do not overwrite user assignments
        if (*node_backend_id == -1) {
            *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);

#if 0
            // src
            if (node->op == GGML_OP_NONE) {
                continue;
            }

            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * src = node->src[j];
                if (src == NULL) {
                    continue;
                }
                int * src_backend_id = &tensor_backend_id(src);
                if (*src_backend_id == -1) {
                    *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
                }
            }
#endif
        }
    }

    // pass 2: expand current backend assignments
    // assign the same backend to adjacent nodes
    // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
    // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
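    // illustrative example: with backends [GPU, CPU], a run of unassigned ops between two
    // GPU-assigned ops is given to the GPU in these passes, provided the GPU supports them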
    // expand gpu down
    {
        int cur_backend_id = -1;
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int * node_backend_id = &tensor_backend_id(node);
            if (*node_backend_id != -1) {
                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
                    cur_backend_id = *node_backend_id;
                }
            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
            }
        }
    }
    // expand gpu up
    {
        int cur_backend_id = -1;
        for (int i = graph->n_nodes - 1; i >= 0; i--) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int * node_backend_id = &tensor_backend_id(node);
            if (*node_backend_id != -1) {
                if (*node_backend_id == sched->n_backends - 1) {
                    // skip cpu (lowest prio backend)
                    cur_backend_id = -1;
                } else {
                    cur_backend_id = *node_backend_id;
                }
            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
            }
        }
    }
    // expand rest down
    {
        int cur_backend_id = -1;
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int * node_backend_id = &tensor_backend_id(node);
            if (*node_backend_id != -1) {
                cur_backend_id = *node_backend_id;
            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
            }
        }
    }
    // expand rest up
    {
        int cur_backend_id = -1;
        for (int i = graph->n_nodes - 1; i >= 0; i--) {
            struct ggml_tensor * node = graph->nodes[i];
            if (ggml_is_view_op(node->op)) {
                continue;
            }
            int * node_backend_id = &tensor_backend_id(node);
            if (*node_backend_id != -1) {
                cur_backend_id = *node_backend_id;
            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
            }
        }
    }

    // pass 3: upgrade nodes to higher prio backends with compatible buffer types
    // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
    // however, we also need to verify that the sources are in compatible buffer types
    // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
    // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
    // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
    // additionally, set remaining unassigned nodes to the backend with the most supported inputs
    // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        if (ggml_is_view_op(node->op)) {
            continue;
        }
        int * node_backend_id = &tensor_backend_id(node);
        if (*node_backend_id == -1) {
            // unassigned node: find the backend with the most supported inputs
            int n_supported_best = -1;
            for (int b = 0; b < sched->n_backends; b++) {
                if (ggml_backend_supports_op(sched->backends[b], node)) {
                    int n_supported = 0;
                    for (int j = 0; j < GGML_MAX_SRC; j++) {
                        struct ggml_tensor * src = node->src[j];
                        if (src == NULL) {
                            continue;
                        }
                        if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
                            n_supported++;
                        }
                    }
                    if (n_supported > n_supported_best) {
                        n_supported_best = n_supported;
                        *node_backend_id = b;
                        SET_CAUSE(node, "3.best");
                    }
                }
            }
        } else {
            // assigned node: upgrade to higher prio backend if possible
            for (int b = 0; b < *node_backend_id; b++) {
                if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
                    bool supported = true;
                    for (int j = 0; j < GGML_MAX_SRC; j++) {
                        struct ggml_tensor * src = node->src[j];
                        if (src == NULL) {
                            continue;
                        }
                        if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
                            supported = false;
                            break;
                        }
                    }
                    if (supported) {
                        *node_backend_id = b;
                        SET_CAUSE(node, "3.upg");
                        break;
                    }
                }
            }
        }
    }

    // pass 4: assign backends to remaining src from dst and view_src
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int * cur_backend_id = &tensor_backend_id(node);
        if (node->view_src != NULL && *cur_backend_id == -1) {
            *cur_backend_id = tensor_backend_id(node->view_src);
            SET_CAUSE(node, "4.vsrc");
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            int * src_backend_id = &tensor_backend_id(src);
            if (*src_backend_id == -1) {
                if (src->view_src != NULL) {
                    // views are always on the same backend as the source
                    *src_backend_id = tensor_backend_id(src->view_src);
                    SET_CAUSE(src, "4.vsrc");
                } else {
                    *src_backend_id = *cur_backend_id;
                    SET_CAUSE(src, "4.cur");
                }
            }
        }
        // if the node is still unassigned, assign it to the first backend that supports it
        for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
            ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
        }
        GGML_ASSERT(*cur_backend_id != -1);
    }

    // pass 5: split graph, find tensors that need to be copied
    {
        int i_split = 0;
        struct ggml_backend_sched_split * split = &sched->splits[0];
        // find the backend of the first split, skipping view ops
        int i = 0;
        for (; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (!ggml_is_view_op(node->op)) {
                split->backend_id = tensor_backend_id(node);
                break;
            }
        }
        split->i_start = 0;
        split->n_inputs = 0;
        int cur_backend_id = split->backend_id;
        for (; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];

            if (ggml_is_view_op(node->op)) {
                continue;
            }

            const int node_backend_id = tensor_backend_id(node);

            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

            // check if we should start a new split based on the sources of the current node
            bool need_new_split = false;
            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * src = node->src[j];
                    if (src == NULL) {
                        continue;
                    }
                    // check if a weight is on a different and incompatible backend
                    // by starting a new split, the memory of the previously offloaded weights can be reused
                    if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                        int src_backend_id = tensor_backend_id(src);
                        if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                            need_new_split = true;
                            break;
                        }
                    }
                    // check if the split has too many inputs
                    // FIXME: count the number of inputs instead of only checking when full
                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                        const size_t id = hash_id(src);
                        int src_backend_id = sched->hv_tensor_backend_ids[id];
                        bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                        if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
                            need_new_split = true;
                            break;
                        }
                    }
                }
            }

            if (node_backend_id != cur_backend_id || need_new_split) {
                split->i_end = i;
                i_split++;
                if (i_split >= sched->splits_capacity) {
                    sched->splits_capacity *= 2;
                    sched->splits = (ggml_backend_sched_split *)
                        realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
                    GGML_ASSERT(sched->splits != NULL);
                }
                split = &sched->splits[i_split];
                split->backend_id = node_backend_id;
                split->i_start = i;
                split->n_inputs = 0;
                cur_backend_id = node_backend_id;
            }

            // find inputs that are not on the same backend
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * src = node->src[j];
                if (src == NULL) {
                    continue;
                }

                size_t src_id = hash_id(src);
                const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
                GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now

                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                    if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
                        ggml_backend_t backend = sched->backends[src_backend_id];
                        for (int c = 0; c < sched->n_copies; c++) {
                            struct ggml_tensor * tensor_copy;
                            if (c == sched->cur_copy) {
                                tensor_copy = src; // use the original tensor as the current copy
                            } else {
                                tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                                ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                            }
                            if (sched->n_copies > 1) {
                                ggml_set_input(tensor_copy);
                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                            }
                            tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
                            SET_CAUSE(tensor_copy, "4.cpy");
                        }
                        int n_graph_inputs = sched->n_graph_inputs++;
                        GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                        sched->graph_inputs[n_graph_inputs] = src;
                    }
                }

                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                    // create a copy of the input in the split's backend
                    if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
                        ggml_backend_t backend = sched->backends[cur_backend_id];
                        for (int c = 0; c < sched->n_copies; c++) {
                            struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                            ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                            if (sched->n_copies > 1) {
                                ggml_set_input(tensor_copy);
                                ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                            }
                            tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
                            SET_CAUSE(tensor_copy, "4.cpy");
                        }
                        int n_inputs = split->n_inputs++;
                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                        split->inputs[n_inputs] = src;
                    }
                    node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
                }
            }
        }
        split->i_end = graph->n_nodes;
        sched->n_splits = i_split + 1;
    }

    if (sched->debug) {
        ggml_backend_sched_print_assignments(sched, graph);
    }

    // swap node_backend_ids and leaf_backend_ids with prevs
    {
        int * tmp = sched->node_backend_ids;
        sched->node_backend_ids = sched->prev_node_backend_ids;
        sched->prev_node_backend_ids = tmp;

        tmp = sched->leaf_backend_ids;
        sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
        sched->prev_leaf_backend_ids = tmp;
    }

    int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
    if (sched->graph.size < graph_size) {
        sched->graph.size = graph_size;
        sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
        sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
        GGML_ASSERT(sched->graph.nodes != NULL);
        GGML_ASSERT(sched->graph.leafs != NULL);
    }
    sched->graph.n_nodes = 0;
    sched->graph.n_leafs = 0;

    struct ggml_cgraph * graph_copy = &sched->graph;

    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &sched->splits[i];
        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

        // Optimize this split of the graph. This needs to happen before we make graph_copy,
        // so they are in sync.
        ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);

        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
            assert(graph_copy->size > (graph_copy->n_nodes + 1));

            struct ggml_tensor * input = split->inputs[j];
            const size_t input_id = hash_id(input);
            struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);

            // add a dependency to the input source so that it is not freed before the copy is done
            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
            input_dep->src[0] = input;
            sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

            // add a dependency to the input copy so that it is allocated at the start of the split
            sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
        }

        for (int j = split->i_start; j < split->i_end; j++) {
            assert(graph_copy->size > graph_copy->n_nodes);
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
        }
    }

    if (sched->n_copies > 1) {
        // add input copies as leafs so that they are allocated first
        for (int i = 0; i < sched->n_graph_inputs; i++) {
            struct ggml_tensor * input = sched->graph_inputs[i];
            size_t id = hash_id(input);
            int backend_id = tensor_backend_id(input);
            for (int c = 0; c < sched->n_copies; c++) {
                struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                assert(graph_copy->size > graph_copy->n_leafs);
                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
            }
        }

        for (int i = 0; i < sched->n_splits; i++) {
            struct ggml_backend_sched_split * split = &sched->splits[i];
            int backend_id = split->backend_id;
            for (int j = 0; j < split->n_inputs; j++) {
                struct ggml_tensor * input = split->inputs[j];
                size_t id = hash_id(input);
                for (int c = 0; c < sched->n_copies; c++) {
                    struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                    sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                    assert(graph_copy->size > graph_copy->n_leafs);
                    graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
                }
            }
        }
    }

    // add leafs from the original graph
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
        assert(graph_copy->size > graph_copy->n_leafs);
        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
    }
}

static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
    bool backend_ids_changed = false;
    for (int i = 0; i < sched->graph.n_nodes; i++) {
        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
            sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
            backend_ids_changed = true;
            break;
        }
    }
    if (!backend_ids_changed) {
        for (int i = 0; i < sched->graph.n_leafs; i++) {
            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
                sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
                backend_ids_changed = true;
                break;
            }
        }
    }

    // allocate graph
    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
        // the re-allocation may cause the split inputs to be moved to a different address
        // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
        for (int i = 0; i < sched->n_backends; i++) {
            ggml_backend_synchronize(sched->backends[i]);
        }
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
        ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
        if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
            GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
            return false;
        }
    }

    return true;
}

static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    struct ggml_backend_sched_split * splits = sched->splits;

    ggml_tensor * prev_ids_tensor = nullptr;
    std::vector<int32_t> ids;
    std::vector<ggml_bitset_t> used_ids;

    for (int split_id = 0; split_id < sched->n_splits; split_id++) {
        struct ggml_backend_sched_split * split = &splits[split_id];
        int split_backend_id = split->backend_id;
        ggml_backend_t split_backend = sched->backends[split_backend_id];

        // copy the input tensors to the split backend
        for (int input_id = 0; input_id < split->n_inputs; input_id++) {
            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
            struct ggml_tensor * input = split->inputs[input_id];
            struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);

            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                } else {
                    ggml_backend_synchronize(split_backend);
                }
                ggml_backend_tensor_copy(input, input_cpy);
            } else {
                // wait for the split backend to finish using the input before overwriting it
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                } else {
                    ggml_backend_synchronize(split_backend);
                }

                // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
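                // (for GGML_OP_MUL_MAT_ID the ids tensor in src[2] selects which experts are used,
                //  so experts that were not selected do not need to be uploaded at all)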
                ggml_tensor * node = split->graph.nodes[0];
                if (split->graph.n_nodes > 0 &&
                    ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
                    ggml_backend_buffer_is_host(input->buffer) && (
                        (node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID)
                        //|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */
                    )) {

                    const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
                    const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];

                    ggml_backend_synchronize(input_backend);

                    // get the ids
                    ggml_tensor * ids_tensor = node->src[2];
                    ggml_backend_t ids_backend = split_backend;

                    // if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
                    // in that case, we use the original ids tensor
                    for (int i = input_id + 1; i < split->n_inputs; i++) {
                        if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
                            ids_tensor = split->inputs[i];
                            ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
                            break;
                        }
                    }

                    if (ids_tensor != prev_ids_tensor) {
                        ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
                        ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
                        ggml_backend_synchronize(ids_backend);

                        // find the used experts
                        used_ids.clear();
                        used_ids.resize(ggml_bitset_size(n_expert));
                        for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
                            for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
                                int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
                                GGML_ASSERT(id >= 0 && id < n_expert);
                                ggml_bitset_set(used_ids.data(), id);
                            }
                        }

                        prev_ids_tensor = ids_tensor;
                    }

                    // group consecutive experts and copy them together
                    auto copy_experts = [&](int32_t first_id, int32_t last_id) {
                        const size_t expert_offset = first_id * expert_size;
                        const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
                        const size_t padding = std::min<size_t>(expert_size, 512);
                        const size_t padding_end = last_id < n_expert - 1 ? padding : 0;

                        ggml_backend_tensor_set_async(split_backend,
                            input_cpy,
                            (const uint8_t *)input->data + expert_offset, expert_offset,
                            // copy a bit extra at the end to ensure there are no NaNs in the padding of the last expert
1509 // this is necessary for MMQ in the CUDA backend
1510 size: expert_size_copy + padding_end);
1511 };
                    int id = 0;
                    while (!ggml_bitset_get(used_ids.data(), id)) {
                        id++;
                    }
                    int32_t first_id = id;
                    int32_t last_id = first_id;

                    for (++id; id < n_expert; ++id) {
                        if (!ggml_bitset_get(used_ids.data(), id)) {
                            continue;
                        }

                        if (id == last_id + 1) {
                            last_id = id;
                            continue;
                        }

                        copy_experts(first_id, last_id);

                        first_id = id;
                        last_id = id;
                    }
                    copy_experts(first_id, last_id);
                } else {
                    // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend,
                    // since we handle the synchronization here with multiple copies and events
                    // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
                    if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
                        ggml_backend_synchronize(input_backend);
                        if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                            ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                        } else {
                            ggml_backend_synchronize(split_backend);
                        }
                        ggml_backend_tensor_copy(input, input_cpy);
                    }
                }
            }
        }

        if (!sched->callback_eval) {
            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
            if (ec != GGML_STATUS_SUCCESS) {
                return ec;
            }
        } else {
            // similar to ggml_backend_compare_graph_backend
            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
                struct ggml_tensor * t = split->graph.nodes[j0];

                // check if the user needs data from this node
                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);

                int j1 = j0;

                // determine the range [j0, j1] of nodes that can be computed together
                while (!need && j1 < split->graph.n_nodes - 1) {
                    t = split->graph.nodes[++j1];
                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
                }

                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
                if (ec != GGML_STATUS_SUCCESS) {
                    return ec;
                }

                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
                ggml_backend_synchronize(split_backend);

                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                    break;
                }

                j0 = j1;
            }
        }

        // record the event of this copy
        if (split->n_inputs > 0) {
            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
            }
        }
    }

    return GGML_STATUS_SUCCESS;
}

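// Typical scheduler usage (a minimal sketch; assumes the backends were created elsewhere, e.g. with
// ggml_backend_cpu_init(), and that `graph` is a ggml_cgraph built in a ggml_context):
//
//   ggml_backend_t backends[2] = { gpu_backend, cpu_backend }; // the CPU backend must be last
//   ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE, false, true);
//   ggml_backend_sched_graph_compute(sched, graph);            // splits, allocates and runs the graph
//   ggml_backend_sched_free(sched);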
ggml_backend_sched_t ggml_backend_sched_new(
        ggml_backend_t * backends,
        ggml_backend_buffer_type_t * bufts,
        int n_backends,
        size_t graph_size,
        bool parallel,
        bool op_offload) {
    GGML_ASSERT(n_backends > 0);
    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);

    struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));

    const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
    sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
    sched->n_backends = n_backends;
    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

    // initialize hash table
    // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
    sched->hash_set = ggml_hash_set_new(graph_size);
    sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
    sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));

    const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
    const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
    sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
    sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
    sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
    sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

    sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
    sched->context_buffer = (char *) malloc(sched->context_buffer_size);

    const int initial_splits_capacity = 16;
    sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
    sched->splits_capacity = initial_splits_capacity;

    for (int b = 0; b < n_backends; b++) {
        sched->backends[b] = backends[b];
        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));

        if (sched->n_copies > 1) {
            for (int c = 0; c < sched->n_copies; c++) {
                sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
            }
        }
    }

    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
    sched->op_offload = op_offload;

    ggml_backend_sched_reset(sched);

    return sched;
}

void ggml_backend_sched_free(ggml_backend_sched_t sched) {
    if (sched == NULL) {
        return;
    }
    for (int b = 0; b < sched->n_backends; b++) {
        for (int c = 0; c < sched->n_copies; c++) {
            ggml_backend_event_free(sched->events[b][c]);
        }
    }
    ggml_gallocr_free(sched->galloc);
    ggml_free(sched->ctx);
    ggml_hash_set_free(&sched->hash_set);
    free(sched->splits);
    free(sched->hv_tensor_backend_ids);
    free(sched->hv_tensor_copies);
    free(sched->node_backend_ids);
    free(sched->leaf_backend_ids);
    free(sched->prev_node_backend_ids);
    free(sched->prev_leaf_backend_ids);
    free(sched->context_buffer);
    free(sched->graph.nodes);
    free(sched->graph.leafs);
    free(sched);
}

void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    // reset state for the next run
    if (!sched->is_reset) {
        ggml_hash_set_reset(&sched->hash_set);
        memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
        memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
        sched->is_reset = true;
    }
    sched->is_alloc = false;
}

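// Reserving with a worst-case graph up front avoids later buffer reallocations; a hedged sketch
// (assuming `build_worst_case_graph()` is an application-provided function that builds the largest
// graph that will ever be evaluated):
//
//   struct ggml_cgraph * measure = build_worst_case_graph(ctx);
//   if (!ggml_backend_sched_reserve(sched, measure)) {
//       // reservation failed, e.g. not enough device memory
//   }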
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    GGML_ASSERT(sched);
    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

    ggml_backend_sched_reset(sched);

    ggml_backend_sched_synchronize(sched);

    ggml_backend_sched_split_graph(sched, measure_graph);

    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
        return false;
    }

    ggml_backend_sched_reset(sched);

    return true;
}

bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    GGML_ASSERT(sched);
    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
    GGML_ASSERT(!sched->is_alloc);

    sched->cur_copy = sched->next_copy;
    sched->next_copy = (sched->next_copy + 1) % sched->n_copies;

    ggml_backend_sched_split_graph(sched, graph);

    if (!ggml_backend_sched_alloc_splits(sched)) {
        return false;
    }

    sched->is_alloc = true;

    return true;
}

enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
    ggml_backend_sched_synchronize(sched);
    return err;
}

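// Async variant: the splits are dispatched to the backends without waiting for completion; the
// caller must eventually call ggml_backend_sched_synchronize() (or use the blocking
// ggml_backend_sched_graph_compute() wrapper above) before reading any results. Sketch:
//
//   enum ggml_status st = ggml_backend_sched_graph_compute_async(sched, graph);
//   // ... overlap other CPU-side work here ...
//   ggml_backend_sched_synchronize(sched);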
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    GGML_ASSERT(sched);
    if (!sched->is_reset && !sched->is_alloc) {
        ggml_backend_sched_reset(sched);
    }

    if (!sched->is_alloc) {
        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
            return GGML_STATUS_ALLOC_FAILED;
        }
    }

    return ggml_backend_sched_compute_splits(sched);
}

void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    for (int i = 0; i < sched->n_backends; i++) {
        ggml_backend_synchronize(sched->backends[i]);
    }
    if (!sched->is_alloc) {
        // if the graph is not already allocated, always use copy 0 after a synchronization
        // this ensures that during generation the same copy is used every time,
        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
        sched->next_copy = 0;
    }
}

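// The eval callback is invoked twice per node: once with ask == true to query whether the user
// wants to observe that node, and once with ask == false after it has been computed; returning
// false from the second call stops the evaluation. A hedged sketch of a callback that prints
// every node name (signature per ggml_backend_sched_eval_callback):
//
//   static bool print_cb(struct ggml_tensor * t, bool ask, void * user_data) {
//       if (ask) {
//           return true;  // request data for every node
//       }
//       printf("computed %s\n", t->name);
//       return true;      // continue evaluation
//   }
//   ggml_backend_sched_set_eval_callback(sched, print_cb, NULL);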
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
    GGML_ASSERT(sched);
    sched->callback_eval = callback;
    sched->callback_eval_user_data = user_data;
}

int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    return sched->n_splits;
}

int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    return sched->n_copies;
}

int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    return sched->n_backends;
}

ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
    GGML_ASSERT(sched);
    GGML_ASSERT(i >= 0 && i < sched->n_backends);
    return sched->backends[i];
}

ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
    GGML_ASSERT(sched);
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

    return sched->bufts[backend_index];
}

size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
    GGML_ASSERT(sched);
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}

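// Pinning a tensor to a specific backend overrides the automatic assignment; typically called
// before the graph is allocated/split. Sketch (assuming `inp_tokens` is a tensor of the graph and
// `cpu_backend` is one of the scheduler's backends):
//
//   ggml_backend_sched_set_tensor_backend(sched, inp_tokens, cpu_backend);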
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
    GGML_ASSERT(sched);
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
    tensor_backend_id(node) = backend_index;
    SET_CAUSE(node, "usr");
    sched->is_reset = false;
}

ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
    GGML_ASSERT(sched);
    int backend_index = tensor_backend_id(node);
    if (backend_index == -1) {
        return NULL;
    }
    return sched->backends[backend_index];
}

// utils

enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor);
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->view_src != NULL);
    GGML_ASSERT(tensor->view_src->buffer != NULL);
    GGML_ASSERT(tensor->view_src->data != NULL);

    tensor->buffer = tensor->view_src->buffer;
    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
    return ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
}

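// Places a pre-created tensor at a specific address inside an already allocated buffer; mainly
// useful for callers that manage the buffer layout themselves. Sketch (assuming `buffer` was
// obtained from ggml_backend_buft_alloc_buffer and `t` is an unallocated, non-view tensor):
//
//   void * base = ggml_backend_buffer_get_base(buffer);
//   enum ggml_status st = ggml_backend_tensor_alloc(buffer, t, base);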
enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
    GGML_ASSERT(tensor);
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->data == NULL);
    GGML_ASSERT(tensor->view_src == NULL);
    GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));

    tensor->buffer = buffer;
    tensor->data = addr;
    return ggml_backend_buffer_init_tensor(buffer, tensor);
}

static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
    struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {

    GGML_ASSERT(src != NULL);
    GGML_ASSERT(src->data && "graph must be allocated");

    size_t id = ggml_hash_insert(&hash_set, src);
    if (id == GGML_HASHSET_ALREADY_EXISTS) {
        return node_copies[ggml_hash_find(&hash_set, src)];
    }

    struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
    if (src->view_src != NULL) {
        dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
        dst->view_offs = src->view_offs;
    }
    dst->op = src->op;
    memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
    ggml_set_name(dst, src->name);

    // copy src
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        struct ggml_tensor * s = src->src[i];
        if (s == NULL) {
            continue;
        }
        dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
    }

    node_copies[id] = dst;
    return dst;
}

static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
    size_t id = ggml_hash_find(hash_set, src);
    if (node_init[id]) {
        return;
    }
    node_init[id] = true;

    struct ggml_tensor * dst = node_copies[id];
    if (dst->view_src != NULL) {
        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
        enum ggml_status status = ggml_backend_view_init(dst);
        GGML_ASSERT(status == GGML_STATUS_SUCCESS);
    }
    else {
        ggml_backend_tensor_copy(src, dst);
    }

    // init src
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        struct ggml_tensor * s = src->src[i];
        if (s == NULL) {
            continue;
        }
        graph_copy_init_tensor(hash_set, node_copies, node_init, s);
    }
}

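// Copies an allocated graph (tensors and topology) to another backend; used below by
// ggml_backend_compare_graph_backend and useful on its own for debugging. The copy owns its
// buffer and contexts and must be released with ggml_backend_graph_copy_free(). Sketch
// (assuming `cpu_backend` was created elsewhere):
//
//   struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(cpu_backend, graph);
//   if (copy.buffer != NULL) {
//       // ... compute copy.graph on cpu_backend, inspect the results ...
//       ggml_backend_graph_copy_free(copy);
//   }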
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
    GGML_ASSERT(graph);
    struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
    struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
    bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));

    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true
    };

    struct ggml_context * ctx_allocated = ggml_init(params);
    struct ggml_context * ctx_unallocated = ggml_init(params);

    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
        ggml_hash_set_free(&hash_set);
        free(node_copies);
        free(node_init);
        ggml_free(ctx_allocated);
        ggml_free(ctx_unallocated);
        return {
            /* .buffer          = */ NULL,
            /* .ctx_allocated   = */ NULL,
            /* .ctx_unallocated = */ NULL,
            /* .graph           = */ NULL,
        };
    }

    // dup nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
    }

    // allocate nodes
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
    if (buffer == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
        ggml_hash_set_free(&hash_set);
        free(node_copies);
        free(node_init);
        ggml_free(ctx_allocated);
        ggml_free(ctx_unallocated);
        return {
            /* .buffer          = */ NULL,
            /* .ctx_allocated   = */ NULL,
            /* .ctx_unallocated = */ NULL,
            /* .graph           = */ NULL,
        };
    }

    //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);

    // copy data and init views
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
    }

    // build graph copy
    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
        graph_copy->nodes[i] = node_copy;
    }
    graph_copy->n_nodes = graph->n_nodes;

    ggml_hash_set_free(&hash_set);
    free(node_copies);
    free(node_init);

    return {
        /* .buffer          = */ buffer,
        /* .ctx_allocated   = */ ctx_allocated,
        /* .ctx_unallocated = */ ctx_unallocated,
        /* .graph           = */ graph_copy,
    };
}

void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
    ggml_backend_buffer_free(copy.buffer);
    ggml_free(copy.ctx_allocated);
    ggml_free(copy.ctx_unallocated);
}

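// Runs the same graph on two backends and hands each pair of result tensors to the callback,
// which typically computes an error metric and returns false to stop at the first mismatch.
// A hedged sketch of such a callback (signature per ggml_backend_eval_callback):
//
//   static bool cmp_cb(int node_idx, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
//       // compare t1 and t2 here (e.g. max abs diff of their data) and decide whether to continue
//       return true;
//   }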
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
    if (copy.buffer == NULL) {
        return false;
    }

    struct ggml_cgraph * g1 = graph;
    struct ggml_cgraph * g2 = copy.graph;

    assert(g1->n_nodes == g2->n_nodes);

    if (test_node != nullptr) {
        // compute the whole graph and only test the output for a specific tensor
        ggml_backend_graph_compute(backend1, g1);
        ggml_backend_graph_compute(backend2, g2);

        int test_node_idx = -1;
        for (int i = 0; i < g1->n_nodes; i++) {
            struct ggml_tensor * t1 = g1->nodes[i];
            if (t1 == test_node) {
                test_node_idx = i;
                break;
            }
        }
        GGML_ASSERT(test_node_idx != -1);

        callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
    } else {
        for (int i = 0; i < g1->n_nodes; i++) {
            struct ggml_tensor * t1 = g1->nodes[i];
            struct ggml_tensor * t2 = g2->nodes[i];

            assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));

            struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
            struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);

            ggml_backend_graph_compute(backend1, &g1v);
            ggml_backend_graph_compute(backend2, &g2v);

            if (ggml_is_view_op(t1->op)) {
                continue;
            }

            // compare results, calculate rms etc.
            if (!callback(i, t1, t2, user_data)) {
                break;
            }
        }
    }
    ggml_backend_graph_copy_free(copy);

    return true;
}

// CPU backend - buffer

static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    uintptr_t data = (uintptr_t)buffer->context;

    // align the buffer
    if (data % TENSOR_ALIGNMENT != 0) {
        data = GGML_PAD(data, TENSOR_ALIGNMENT);
    }

    return (void *)data;
}

static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    ggml_aligned_free(buffer->context, buffer->size);
}

static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    memset((char *)tensor->data + offset, value, size);

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    memcpy((char *)tensor->data + offset, data, size);

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    memcpy(data, (const char *)tensor->data + offset, size);

    GGML_UNUSED(buffer);
}

static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_ASSERT(src);
    if (ggml_backend_buffer_is_host(src->buffer)) {
        memcpy(dst->data, src->data, ggml_nbytes(src));
        return true;
    }
    return false;

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    GGML_ASSERT(buffer);
    memset(buffer->context, value, buffer->size);
}

static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
    /* .free_buffer   = */ ggml_backend_cpu_buffer_free_buffer,
    /* .get_base      = */ ggml_backend_cpu_buffer_get_base,
    /* .init_tensor   = */ NULL, // no initialization required
    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
    /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
    /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
    /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
    /* .clear         = */ ggml_backend_cpu_buffer_clear,
    /* .reset         = */ NULL,
};

static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
    /* .free_buffer   = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
    /* .get_base      = */ ggml_backend_cpu_buffer_get_base,
    /* .init_tensor   = */ NULL, // no initialization required
    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
    /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
    /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
    /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
    /* .clear         = */ ggml_backend_cpu_buffer_clear,
    /* .reset         = */ NULL,
};

// CPU backend buffer type

// this buffer type is defined here to make it available to all backends

static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "CPU";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = ggml_aligned_malloc(size);

    if (data == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
}

static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}

static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return true;

    GGML_UNUSED(buft);
}

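// Returns the singleton CPU buffer type; a typical use is allocating all tensors of a context in
// host memory (sketch, assuming `ctx` is a ggml_context created with no_alloc = true):
//
//   ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());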
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
        /* .iface    = */ {
            /* .get_name       = */ ggml_backend_cpu_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
            /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .device   = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context  = */ NULL,
    };

    return &ggml_backend_cpu_buffer_type;
}

static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
    return "CPU_Mapped";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
        /* .iface    = */ {
            /* .get_name       = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
            /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .device   = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context  = */ NULL,
    };

    return &ggml_backend_cpu_buffer_type;
}

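// Wraps caller-owned memory (e.g. an mmap-ed model file) as a CPU buffer without copying; the
// pointer must be TENSOR_ALIGNMENT-aligned and must outlive the buffer, which does not free it.
// Sketch (assuming `mapping` points to aligned, mapped memory of `mapping_size` bytes):
//
//   ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(mapping, mapping_size);
//   ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);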
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
}
