1#include "ggml-cuda.h"
2#include "ggml-impl.h"
3#include "ggml-backend-impl.h"
4
5#include "ggml-cuda/common.cuh"
6#include "ggml-cuda/acc.cuh"
7#include "ggml-cuda/add-id.cuh"
8#include "ggml-cuda/arange.cuh"
9#include "ggml-cuda/argmax.cuh"
10#include "ggml-cuda/argsort.cuh"
11#include "ggml-cuda/binbcast.cuh"
12#include "ggml-cuda/clamp.cuh"
13#include "ggml-cuda/concat.cuh"
14#include "ggml-cuda/conv-transpose-1d.cuh"
15#include "ggml-cuda/conv2d.cuh"
16#include "ggml-cuda/conv2d-dw.cuh"
17#include "ggml-cuda/conv2d-transpose.cuh"
18#include "ggml-cuda/convert.cuh"
19#include "ggml-cuda/count-equal.cuh"
20#include "ggml-cuda/cpy.cuh"
21#include "ggml-cuda/cross-entropy-loss.cuh"
22#include "ggml-cuda/diagmask.cuh"
23#include "ggml-cuda/fattn.cuh"
24#include "ggml-cuda/getrows.cuh"
25#include "ggml-cuda/im2col.cuh"
26#include "ggml-cuda/mmf.cuh"
27#include "ggml-cuda/mmq.cuh"
28#include "ggml-cuda/mmvf.cuh"
29#include "ggml-cuda/mmvq.cuh"
30#include "ggml-cuda/norm.cuh"
31#include "ggml-cuda/opt-step-adamw.cuh"
32#include "ggml-cuda/opt-step-sgd.cuh"
33#include "ggml-cuda/out-prod.cuh"
34#include "ggml-cuda/pad.cuh"
35#include "ggml-cuda/pool2d.cuh"
36#include "ggml-cuda/quantize.cuh"
37#include "ggml-cuda/rope.cuh"
38#include "ggml-cuda/roll.cuh"
39#include "ggml-cuda/scale.cuh"
40#include "ggml-cuda/softcap.cuh"
41#include "ggml-cuda/softmax.cuh"
42#include "ggml-cuda/ssm-conv.cuh"
43#include "ggml-cuda/ssm-scan.cuh"
44#include "ggml-cuda/sum.cuh"
45#include "ggml-cuda/sumrows.cuh"
46#include "ggml-cuda/mean.cuh"
47#include "ggml-cuda/tsembd.cuh"
48#include "ggml-cuda/topk-moe.cuh"
49#include "ggml-cuda/unary.cuh"
50#include "ggml-cuda/upscale.cuh"
51#include "ggml-cuda/wkv.cuh"
52#include "ggml-cuda/gla.cuh"
53#include "ggml-cuda/set.cuh"
54#include "ggml-cuda/set-rows.cuh"
55#include "ggml-cuda/pad_reflect_1d.cuh"
56#include "ggml.h"
57
58#include <algorithm>
59#include <array>
60#include <atomic>
61#include <charconv>
62#include <cinttypes>
63#include <condition_variable>
64#include <cstddef>
65#include <cstdint>
66#include <float.h>
67#include <initializer_list>
68#include <limits>
69#include <map>
70#include <memory>
71#include <mutex>
72#include <stdarg.h>
73#include <stdio.h>
74#include <stdlib.h>
75#include <string>
76#include <vector>
77
78static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
79
80[[noreturn]]
81void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
82 int id = -1; // in case cudaGetDevice fails
83 (void)cudaGetDevice(device: &id);
84
85 GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
86 GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
87 GGML_LOG_ERROR(" %s\n", stmt);
88 // abort with GGML_ABORT to get a stack trace
89 GGML_ABORT(GGML_CUDA_NAME " error");
90}
91
92// this is faster on Windows
93// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
94void ggml_cuda_set_device(int device) {
95 int current_device;
96 CUDA_CHECK(cudaGetDevice(device: &current_device));
97
98 if (device == current_device) {
99 return;
100 }
101
102 CUDA_CHECK(cudaSetDevice(device));
103}
104
105int ggml_cuda_get_device() {
106 int id;
107 CUDA_CHECK(cudaGetDevice(device: &id));
108 return id;
109}
110
111static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
112 ggml_cuda_set_device(device);
113 cudaError_t err;
114 if (getenv(name: "GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
115 err = cudaMallocManaged(devPtr: ptr, size);
116#if defined(GGML_USE_HIP)
117 if (err == hipSuccess) {
118 CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
119 }
120
121 // fall back to cudaMalloc if not supported (e.g. on Windows)
122 if (err == hipErrorNotSupported) {
123 static bool warned_unsupported = false;
124 if (!warned_unsupported) {
125 GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
126 warned_unsupported = true;
127 }
128
129 err = cudaMalloc(ptr, size);
130 }
131#endif // defined(GGML_USE_HIP)
132 } else {
133 err = cudaMalloc(devPtr: ptr, size);
134 }
135 return err;
136}
137
138#if defined(GGML_USE_HIP)
139static int ggml_cuda_parse_id(char devName[]) {
140 // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
141 // these values are not stable so this is susceptible to breakage
142 // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
143 int archMajor = 0x0;
144 int archMinor = 0x0;
145 int archNum = GGML_CUDA_CC_OFFSET_AMD;
146 int archLen = strlen(devName);
147 char archName[archLen + 1];
148
149 // strip leading 'gfx' while copying into our buffer
150 if (archLen > 3) {
151 strcpy(archName, &devName[3]);
152 archLen -= 3;
153 }
154
155 // trim trailing :xnack- or :sramecc- statuses
156 archLen = strcspn(archName, ":");
157 archName[archLen] = '\0';
158
159 // tease out the version information
160 if (archLen > 8) {
161 // versions labeled generic use '-' as delimiter
162 // strip the trailing "-generic" then iterate through what remains
163 if ((strstr(archName, "-generic"))) {
164 archName[archLen - 8] = '\0';
165 char * pch;
166 if ((pch = strtok(archName, "-"))) {
167 archMajor = (int)strtoul(pch, 0, 16);
168 if ((pch = strtok(NULL, "-"))) {
169 archMinor = 0x10 * (int)strtoul(pch, 0, 16);
170 }
171 }
172 }
173 } else if (archLen >= 3) {
174 // last two digits should be the minor * 0x10 + stepping
175 archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
176 archName[archLen - 2] = '\0';
177
178 // only the major version remains
179 archMajor = (int)strtoul(archName, 0, 16);
180 }
181 archNum += archMajor * 0x100;
182 archNum += archMinor;
183 return archNum;
184}
185#endif // defined(GGML_USE_HIP)
186
187static ggml_cuda_device_info ggml_cuda_init() {
188 ggml_cuda_device_info info = {};
189
190 cudaError_t err = cudaGetDeviceCount(&info.device_count);
191 if (err != cudaSuccess) {
192 GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
193 return info;
194 }
195
196 GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
197
198 int64_t total_vram = 0;
199#ifdef GGML_CUDA_FORCE_MMQ
200 GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
201#else
202 GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
203#endif // GGML_CUDA_FORCE_MMQ
204#ifdef GGML_CUDA_FORCE_CUBLAS
205 GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
206#else
207 GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
208#endif // GGML_CUDA_FORCE_CUBLAS
209 GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
210
211 std::vector<std::pair<int, std::string>> turing_devices_without_mma;
212 for (int id = 0; id < info.device_count; ++id) {
213 int device_vmm = 0;
214
215#if defined(GGML_USE_VMM)
216 CUdevice device;
217 CU_CHECK(cuDeviceGet(&device, id));
218 CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
219
220 if (device_vmm) {
221 CUmemAllocationProp alloc_prop = {};
222 alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
223 alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
224 alloc_prop.location.id = id;
225 CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
226 }
227#endif // defined(GGML_USE_VMM)
228 info.devices[id].vmm = !!device_vmm;
229
230 cudaDeviceProp prop;
231 CUDA_CHECK(cudaGetDeviceProperties(prop: &prop, device: id));
232
233 info.default_tensor_split[id] = total_vram;
234 total_vram += prop.totalGlobalMem;
235 info.devices[id].integrated = false; // Temporarily disabled due to issues with corrupted output (e.g. #15034)
236 info.devices[id].nsm = prop.multiProcessorCount;
237 info.devices[id].smpb = prop.sharedMemPerBlock;
238 info.devices[id].warp_size = prop.warpSize;
239#if defined(GGML_USE_HIP)
240 info.devices[id].smpbo = prop.sharedMemPerBlock;
241
242 info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
243 if ((info.devices[id].cc & 0xff00) == 0x0) {
244 GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
245 id, prop.name, prop.gcnArchName, prop.major, prop.minor);
246
247 // Fallback to prop.major and prop.minor
248 if (prop.major > 0) {
249 info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
250 info.devices[id].cc += prop.minor * 0x10;
251 }
252 }
253 GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
254 id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
255 device_vmm ? "yes" : "no", prop.warpSize);
256#elif defined(GGML_USE_MUSA)
257 // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
258 info.devices[id].warp_size = 32;
259 info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
260 info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
261 info.devices[id].cc += prop.minor * 0x10;
262 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
263 id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
264#else
265 info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
266 info.devices[id].cc = 100*prop.major + 10*prop.minor;
267 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
268 id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
269 std::string device_name(prop.name);
270 if (device_name == "NVIDIA GeForce MX450") {
271 turing_devices_without_mma.push_back(x: { id, device_name });
272 } else if (device_name == "NVIDIA GeForce MX550") {
273 turing_devices_without_mma.push_back(x: { id, device_name });
274 } else if (device_name.substr(pos: 0, n: 21) == "NVIDIA GeForce GTX 16") {
275 turing_devices_without_mma.push_back(x: { id, device_name });
276 }
277
278 // Temporary performance fix:
279 // Setting device scheduling strategy for iGPUs with cc121 to "spinning" to avoid delays in cuda synchronize calls.
280 // TODO: Check for future drivers the default scheduling strategy and
281 // remove this call again when cudaDeviceScheduleSpin is default.
282 if (prop.major == 12 && prop.minor == 1) {
283 CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
284 }
285
286#endif // defined(GGML_USE_HIP)
287 }
288
289 if (ggml_cuda_highest_compiled_arch(GGML_CUDA_CC_TURING) >= GGML_CUDA_CC_TURING && !turing_devices_without_mma.empty()) {
290 GGML_LOG_INFO("The following devices will have suboptimal performance due to a lack of tensor cores:\n");
291 for (size_t device_pos = 0; device_pos < turing_devices_without_mma.size(); device_pos++) {
292 GGML_LOG_INFO(
293 " Device %d: %s\n", turing_devices_without_mma[device_pos].first, turing_devices_without_mma[device_pos].second.c_str());
294 }
295 GGML_LOG_INFO(
296 "Consider compiling with CMAKE_CUDA_ARCHITECTURES=61-virtual;80-virtual and DGGML_CUDA_FORCE_MMQ to force the use of the Pascal code for Turing.\n");
297 }
298
299 for (int id = 0; id < info.device_count; ++id) {
300 info.default_tensor_split[id] /= total_vram;
301 }
302
303 // configure logging to stdout
304 // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
305
306 return info;
307}
308
309const ggml_cuda_device_info & ggml_cuda_info() {
310 static ggml_cuda_device_info info = ggml_cuda_init();
311 return info;
312}
313
314// #define DEBUG_CUDA_MALLOC
315
316// buffer pool for cuda (legacy)
317struct ggml_cuda_pool_leg : public ggml_cuda_pool {
318 static const int MAX_BUFFERS = 256;
319
320 int device;
321 struct ggml_cuda_buffer {
322 void * ptr = nullptr;
323 size_t size = 0;
324 };
325
326 ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
327 size_t pool_size = 0;
328
329 explicit ggml_cuda_pool_leg(int device) :
330 device(device) {
331 }
332
333 ~ggml_cuda_pool_leg() {
334 ggml_cuda_set_device(device);
335 for (int i = 0; i < MAX_BUFFERS; ++i) {
336 ggml_cuda_buffer & b = buffer_pool[i];
337 if (b.ptr != nullptr) {
338 CUDA_CHECK(cudaFree(devPtr: b.ptr));
339 pool_size -= b.size;
340 }
341 }
342 GGML_ASSERT(pool_size == 0);
343 }
344
345 void * alloc(size_t size, size_t * actual_size) override {
346#ifdef DEBUG_CUDA_MALLOC
347 int nnz = 0;
348 size_t max_size = 0;
349#endif
350 size_t best_diff = 1ull << 36;
351 int ibest = -1;
352 for (int i = 0; i < MAX_BUFFERS; ++i) {
353 ggml_cuda_buffer& b = buffer_pool[i];
354 if (b.ptr != nullptr) {
355#ifdef DEBUG_CUDA_MALLOC
356 ++nnz;
357 if (b.size > max_size) max_size = b.size;
358#endif
359 if (b.size >= size) {
360 size_t diff = b.size - size;
361 if (diff < best_diff) {
362 best_diff = diff;
363 ibest = i;
364 if (!best_diff) {
365 void * ptr = b.ptr;
366 *actual_size = b.size;
367 b.ptr = nullptr;
368 b.size = 0;
369 return ptr;
370 }
371 }
372 }
373 }
374 }
375 if (ibest >= 0) {
376 ggml_cuda_buffer& b = buffer_pool[ibest];
377 void * ptr = b.ptr;
378 *actual_size = b.size;
379 b.ptr = nullptr;
380 b.size = 0;
381 return ptr;
382 }
383 void * ptr;
384 size_t look_ahead_size = (size_t) (1.05 * size);
385 look_ahead_size = 256 * ((look_ahead_size + 255)/256);
386 ggml_cuda_set_device(device);
387 CUDA_CHECK(ggml_cuda_device_malloc(ptr: &ptr, size: look_ahead_size, device));
388 *actual_size = look_ahead_size;
389 pool_size += look_ahead_size;
390#ifdef DEBUG_CUDA_MALLOC
391 GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
392 (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
393#endif
394 return ptr;
395 }
396
397 void free(void * ptr, size_t size) override {
398 for (int i = 0; i < MAX_BUFFERS; ++i) {
399 ggml_cuda_buffer& b = buffer_pool[i];
400 if (b.ptr == nullptr) {
401 b.ptr = ptr;
402 b.size = size;
403 return;
404 }
405 }
406 GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
407 ggml_cuda_set_device(device);
408 CUDA_CHECK(cudaFree(devPtr: ptr));
409 pool_size -= size;
410 }
411};
412
413// pool with virtual memory
414#if defined(GGML_USE_VMM)
415struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
416 static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
417
418 int device;
419 CUdeviceptr pool_addr = 0;
420 size_t pool_used = 0;
421 size_t pool_size = 0;
422 size_t granularity;
423#if defined(GGML_USE_HIP)
424 std::vector<std::pair<CUdeviceptr, size_t>> mappings;
425#endif
426
427 explicit ggml_cuda_pool_vmm(int device) :
428 device(device),
429 granularity(ggml_cuda_info().devices[device].vmm_granularity) {
430 }
431
432 ~ggml_cuda_pool_vmm() {
433 if (pool_addr != 0) {
434#if defined(GGML_USE_HIP)
435 // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
436 for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
437 CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
438 }
439#else
440 CU_CHECK(cuMemUnmap(pool_addr, pool_size));
441#endif
442 CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
443 }
444 }
445
446 void * alloc(size_t size, size_t * actual_size) override {
447 // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
448 const size_t alignment = 128;
449 size = alignment * ((size + alignment - 1) / alignment);
450
451 size_t avail = pool_size - pool_used;
452
453 if (size > avail) {
454 // round up to the next multiple of the granularity
455 size_t reserve_size = size - avail;
456 reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
457
458 GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
459
460 // allocate more physical memory
461 CUmemAllocationProp prop = {};
462 prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
463 prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
464 prop.location.id = device;
465 CUmemGenericAllocationHandle handle;
466 CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
467
468 // reserve virtual address space (if not already reserved)
469 if (pool_addr == 0) {
470 CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
471 }
472
473 // map at the end of the pool
474 CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
475 CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
476#if defined(GGML_USE_HIP)
477 mappings.push_back({start_ptr, reserve_size});
478#endif
479
480 // the memory allocation handle is no longer needed after mapping
481 CU_CHECK(cuMemRelease(handle));
482
483 // set access
484 CUmemAccessDesc access = {};
485 access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
486 access.location.id = device;
487 access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
488 CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
489
490 // add to the pool
491 pool_size += reserve_size;
492
493 //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
494 // device, (unsigned long long) (pool_size/1024/1024),
495 // (unsigned long long) (reserve_size/1024/1024));
496 }
497
498 GGML_ASSERT(pool_addr != 0);
499
500 void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
501 *actual_size = size;
502 pool_used += size;
503
504#ifdef DEBUG_CUDA_MALLOC
505 printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
506#endif
507
508 return ptr;
509 }
510
511 void free(void * ptr, size_t size) override {
512#ifdef DEBUG_CUDA_MALLOC
513 printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
514#endif
515
516 pool_used -= size;
517
518 // all deallocations must be in reverse order of the allocations
519 GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
520 }
521};
522#endif // defined(GGML_USE_VMM)
523
524std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
525#if defined(GGML_USE_VMM)
526 if (ggml_cuda_info().devices[device].vmm) {
527 return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
528 }
529#endif // defined(GGML_USE_VMM)
530 return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
531}
532
533// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
534// this lock is used to ensure that no cuBLAS handle is destroyed while a graph is being captured
535
536static std::mutex ggml_cuda_lock;
537static std::condition_variable ggml_cuda_lock_cv;
538static std::atomic<int> ggml_cuda_lock_counter;
539
540ggml_backend_cuda_context::~ggml_backend_cuda_context() {
541 std::unique_lock<std::mutex> lock(ggml_cuda_lock);
542 ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
543
544 if (copy_event != nullptr) {
545 CUDA_CHECK(cudaEventDestroy(copy_event));
546 }
547 for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
548 for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
549 if (streams[i][j] != nullptr) {
550 CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
551 }
552 }
553 if (cublas_handles[i] != nullptr) {
554 CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
555 }
556 }
557}
558
559
560// cuda buffer
561
562struct ggml_backend_cuda_buffer_context {
563 int device;
564 void * dev_ptr = nullptr;
565 std::string name;
566
567 ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
568 device(device), dev_ptr(dev_ptr),
569 name(GGML_CUDA_NAME + std::to_string(device)) {
570 }
571
572 ~ggml_backend_cuda_buffer_context() {
573 CUDA_CHECK(cudaFree(devPtr: dev_ptr));
574 }
575};
576
577static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
578 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
579 delete ctx;
580}
581
582static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
583 return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
584}
585
586static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
587 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
588 return ctx->dev_ptr;
589}
590
591static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
592 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
593
594 if (tensor->view_src != NULL) {
595 assert(tensor->view_src->buffer->buft == buffer->buft);
596 return GGML_STATUS_SUCCESS;
597 }
598
599 if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
600 // initialize padding to 0 to avoid possible NaN values
601 const size_t original_size = ggml_nbytes(tensor);
602 const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
603
604 if (padded_size > original_size) {
605 ggml_cuda_set_device(device: ctx->device);
606 CUDA_CHECK(cudaMemset(devPtr: (char *)tensor->data + original_size, value: 0, count: padded_size - original_size));
607 }
608 }
609 return GGML_STATUS_SUCCESS;
610}
611
612static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
613 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
614
615 ggml_cuda_set_device(device: ctx->device);
616 CUDA_CHECK(cudaMemsetAsync(devPtr: (char *)tensor->data + offset, value, count: size, cudaStreamPerThread));
617 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
618}
619
620static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
621 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
622
623 ggml_cuda_set_device(device: ctx->device);
624 CUDA_CHECK(cudaMemcpyAsync(dst: (char *)tensor->data + offset, src: data, count: size, kind: cudaMemcpyHostToDevice, cudaStreamPerThread));
625 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
626}
627
628static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
629 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
630
631 ggml_cuda_set_device(device: ctx->device);
632 CUDA_CHECK(cudaMemcpyAsync(dst: data, src: (const char *)tensor->data + offset, count: size, kind: cudaMemcpyDeviceToHost, cudaStreamPerThread));
633 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
634}
635
636static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
637 if (ggml_backend_buffer_is_cuda(src->buffer)) {
638 ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
639 ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
640 if (src_ctx->device == dst_ctx->device) {
641 CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
642 } else {
643#ifdef GGML_CUDA_NO_PEER_COPY
644 return false;
645#else
646 CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
647#endif
648 }
649 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
650 return true;
651 }
652 return false;
653
654 GGML_UNUSED(buffer);
655}
656
657static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
658 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
659
660 ggml_cuda_set_device(device: ctx->device);
661 CUDA_CHECK(cudaMemsetAsync(ctx->dev_ptr, value, buffer->size, cudaStreamPerThread));
662 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
663}
664
665static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
666 /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
667 /* .get_base = */ ggml_backend_cuda_buffer_get_base,
668 /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
669 /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
670 /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
671 /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
672 /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
673 /* .clear = */ ggml_backend_cuda_buffer_clear,
674 /* .reset = */ NULL,
675};
676
677// cuda buffer type
678struct ggml_backend_cuda_buffer_type_context {
679 int device;
680 std::string name;
681};
682
683static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
684 ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
685
686 return ctx->name.c_str();
687}
688
689static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
690 return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name;
691}
692
693static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
694 ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
695
696 ggml_cuda_set_device(device: buft_ctx->device);
697
698 void * dev_ptr;
699 cudaError_t err = ggml_cuda_device_malloc(ptr: &dev_ptr, size, device: buft_ctx->device);
700 if (err != cudaSuccess) {
701 // clear the error
702 (void)cudaGetLastError();
703 GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(error: err));
704 return nullptr;
705 }
706
707 ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
708
709 return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
710}
711
712static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
713 return 128;
714
715 GGML_UNUSED(buft);
716}
717
718static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
719 size_t size = ggml_nbytes(tensor);
720 int64_t ne0 = tensor->ne[0];
721
722 if (ggml_is_quantized(tensor->type)) {
723 if (ne0 % MATRIX_ROW_PADDING != 0) {
724 GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
725 size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
726 }
727 }
728
729 return size;
730
731 GGML_UNUSED(buft);
732}
733
734static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
735 /* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
736 /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
737 /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
738 /* .get_max_size = */ NULL, // defaults to SIZE_MAX
739 /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
740 /* .is_host = */ NULL,
741};
742
743ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
744 static std::mutex mutex;
745 std::lock_guard<std::mutex> lock(mutex);
746
747 if (device >= ggml_backend_cuda_get_device_count()) {
748 return nullptr;
749 }
750
751 static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
752
753 static bool ggml_backend_cuda_buffer_type_initialized = false;
754
755 if (!ggml_backend_cuda_buffer_type_initialized) {
756 for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) {
757 ggml_backend_cuda_buffer_types[i] = {
758 /* .iface = */ ggml_backend_cuda_buffer_type_interface,
759 /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i),
760 /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
761 };
762 }
763 ggml_backend_cuda_buffer_type_initialized = true;
764 }
765
766 return &ggml_backend_cuda_buffer_types[device];
767}
768
769// cuda split buffer
770
771static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
772 int64_t row_rounding = 0;
773 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
774 if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
775 continue;
776 }
777
778 const int cc = ggml_cuda_info().devices[id].cc;
779 row_rounding = std::max(a: row_rounding, b: (int64_t)get_mmq_y_host(cc));
780 }
781 return row_rounding;
782}
783
784static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
785 const int64_t nrows = ggml_nrows(tensor);
786 const int64_t rounding = get_row_rounding(tensor_split);
787
788 *row_low = id == 0 ? 0 : nrows*tensor_split[id];
789 *row_low -= *row_low % rounding;
790
791 if (id == ggml_backend_cuda_get_device_count() - 1) {
792 *row_high = nrows;
793 } else {
794 *row_high = nrows*tensor_split[id + 1];
795 *row_high -= *row_high % rounding;
796 }
797}
798
799static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
800 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
801
802 return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
803}
804
805struct ggml_backend_cuda_split_buffer_type_context {
806 int main_device;
807 std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
808 std::string name;
809};
810
811struct ggml_backend_cuda_split_buffer_context {
812 ~ggml_backend_cuda_split_buffer_context() {
813 for (ggml_tensor_extra_gpu * extra : tensor_extras) {
814 for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
815 for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
816 if (extra->events[id][is] != nullptr) {
817 CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
818 }
819 }
820 if (extra->data_device[id] != nullptr) {
821 CUDA_CHECK(cudaFree(extra->data_device[id]));
822 }
823 }
824 delete extra;
825 }
826 }
827
828 std::vector<ggml_tensor_extra_gpu *> tensor_extras;
829};
830
831
832static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
833 ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
834 delete ctx;
835}
836
837static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
838 // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
839 return (void *)0x1000;
840
841 GGML_UNUSED(buffer);
842}
843
844static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
845 GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
846 GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
847
848 ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
849 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
850
851 const int64_t ne0 = tensor->ne[0];
852
853 ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
854 ctx->tensor_extras.push_back(extra);
855
856 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
857 int64_t row_low, row_high;
858 get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
859
860 int64_t nrows_split = row_high - row_low;
861 if (nrows_split == 0) {
862 continue;
863 }
864
865 size_t size = ggml_nbytes_split(tensor, nrows_split);
866 const size_t original_size = size;
867
868 // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
869 if (ne0 % MATRIX_ROW_PADDING != 0) {
870 size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
871 }
872
873 // FIXME: do not crash if cudaMalloc fails
874 // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
875 ggml_cuda_set_device(device: id);
876 char * buf;
877 CUDA_CHECK(ggml_cuda_device_malloc(ptr: (void**)&buf, size, device: id));
878
879 // set padding to 0 to avoid possible NaN values
880 if (size > original_size) {
881 CUDA_CHECK(cudaMemset(devPtr: buf + original_size, value: 0, count: size - original_size));
882 }
883
884 extra->data_device[id] = buf;
885
886 for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
887 CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
888 }
889 }
890 tensor->extra = extra;
891 return GGML_STATUS_SUCCESS;
892}
893
894static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
895 // split tensors must always be set in their entirety at once
896 GGML_ASSERT(offset == 0);
897 GGML_ASSERT(size == ggml_nbytes(tensor));
898 GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
899
900 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
901
902 const int64_t ne0 = tensor->ne[0];
903 const size_t nb1 = tensor->nb[1];
904 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
905
906 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
907 int64_t row_low, row_high;
908 get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
909
910 int64_t nrows_split = row_high - row_low;
911 if (nrows_split == 0) {
912 continue;
913 }
914
915 const size_t offset_split = row_low*nb1;
916 size_t size = ggml_nbytes_split(tensor, nrows_split);
917 const size_t original_size = size;
918
919 // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
920 if (ne0 % MATRIX_ROW_PADDING != 0) {
921 size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
922 }
923
924 const char * buf_host = (const char *)data + offset_split;
925 CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
926 }
927
928 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
929 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
930 }
931}
932
933static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
934 // split tensors must always be set in their entirety at once
935 GGML_ASSERT(offset == 0);
936 GGML_ASSERT(size == ggml_nbytes(tensor));
937 GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
938
939 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
940
941 const int64_t ne0 = tensor->ne[0];
942 const size_t nb1 = tensor->nb[1];
943 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
944
945 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
946 int64_t row_low, row_high;
947 get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
948
949 int64_t nrows_split = row_high - row_low;
950 if (nrows_split == 0) {
951 continue;
952 }
953
954 const size_t offset_split = row_low*nb1;
955 size_t size = ggml_nbytes_split(tensor, nrows_split);
956 const size_t original_size = size;
957
958 // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
959 if (ne0 % MATRIX_ROW_PADDING != 0) {
960 size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
961 }
962
963 char * buf_host = (char *)data + offset_split;
964 CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
965 }
966
967 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
968 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
969 }
970}
971
972static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
973 GGML_UNUSED(buffer);
974 GGML_UNUSED(value);
975}
976
977static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
978 /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
979 /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
980 /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
981 /* .memset_tensor = */ NULL,
982 /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
983 /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
984 /* .cpy_tensor = */ NULL,
985 /* .clear = */ ggml_backend_cuda_split_buffer_clear,
986 /* .reset = */ NULL,
987};
988
989// cuda split buffer type
990
991static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
992 ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
993
994 return ctx->name.c_str();
995}
996
997static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
998 return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name;
999}
1000
1001static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1002 // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
1003 // instead, we allocate them for each tensor separately in init_tensor
1004 // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
1005 // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
1006 ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
1007
1008 return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
1009}
1010
1011static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1012 return 128;
1013
1014 GGML_UNUSED(buft);
1015}
1016
1017static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
1018 ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
1019 GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
1020
1021 size_t total_size = 0;
1022
1023 const int64_t ne0 = tensor->ne[0];
1024
1025 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1026 int64_t row_low, row_high;
1027 get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
1028
1029 int64_t nrows_split = row_high - row_low;
1030 if (nrows_split == 0) {
1031 continue;
1032 }
1033
1034 total_size += ggml_nbytes_split(tensor, nrows_split);
1035
1036 // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
1037 if (ne0 % MATRIX_ROW_PADDING != 0) {
1038 total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1039 }
1040 }
1041
1042 return total_size;
1043}
1044
1045static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1046 return false;
1047
1048 GGML_UNUSED(buft);
1049}
1050
1051static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
1052 /* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name,
1053 /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
1054 /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
1055 /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1056 /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
1057 /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
1058};
1059
1060ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
1061 static std::mutex mutex;
1062 std::lock_guard<std::mutex> lock(mutex);
1063
1064 static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
1065
1066 std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
1067
1068 bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
1069 if (all_zero) {
1070 tensor_split_arr = ggml_cuda_info().default_tensor_split;
1071 } else {
1072 float split_sum = 0.0f;
1073 for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
1074 tensor_split_arr[i] = split_sum;
1075 split_sum += tensor_split[i];
1076 }
1077 for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
1078 tensor_split_arr[i] /= split_sum;
1079 }
1080 }
1081
1082 auto it = buft_map.find({main_device, tensor_split_arr});
1083 if (it != buft_map.end()) {
1084 return &it->second;
1085 }
1086 auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
1087 main_device,
1088 tensor_split_arr,
1089 GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
1090 };
1091
1092 struct ggml_backend_buffer_type buft {
1093 /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
1094 /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
1095 /* .context = */ ctx,
1096 };
1097
1098 auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
1099 return &result.first->second;
1100}
1101
1102// host buffer type
1103
1104static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
1105 return GGML_CUDA_NAME "_Host";
1106
1107 GGML_UNUSED(buft);
1108}
1109
1110static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
1111 return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1112}
1113
1114static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1115 CUDA_CHECK(cudaFreeHost(buffer->context));
1116}
1117
1118static void * ggml_cuda_host_malloc(size_t size) {
1119 if (getenv(name: "GGML_CUDA_NO_PINNED") != nullptr) {
1120 return nullptr;
1121 }
1122
1123 void * ptr = nullptr;
1124 cudaError_t err = cudaMallocHost(ptr: (void **) &ptr, size);
1125 if (err != cudaSuccess) {
1126 // clear the error
1127 (void)cudaGetLastError();
1128 GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1129 size / 1024.0 / 1024.0, cudaGetErrorString(error: err));
1130 return nullptr;
1131 }
1132
1133 return ptr;
1134}
1135
1136static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1137 void * ptr = ggml_cuda_host_malloc(size);
1138
1139 if (ptr == nullptr) {
1140 // fallback to cpu buffer
1141 return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1142 }
1143
1144 ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
1145 buffer->buft = buft;
1146 buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
1147
1148 return buffer;
1149}
1150
1151ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1152 static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
1153 /* .iface = */ {
1154 /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
1155 /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
1156 /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1157 /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1158 /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1159 /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1160 },
1161 /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
1162 /* .context = */ nullptr,
1163 };
1164
1165 return &ggml_backend_cuda_buffer_type_host;
1166}
1167
1168//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
1169// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1170//}
1171
1172/// kernels
1173
1174typedef void (*ggml_cuda_op_mul_mat_t)(
1175 ggml_backend_cuda_context & ctx,
1176 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1177 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1178 const int64_t src1_padded_row_size, cudaStream_t stream);
1179
1180#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
1181#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
1182#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
1183
1184#define MUL_MAT_SRC1_COL_STRIDE 128
1185
1186static cudaError_t ggml_cuda_cpy_tensor_2d(
1187 void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1188
1189 const char * src_ptr = (const char *) src->data;
1190 char * dst_ptr = (char *) dst;
1191
1192 const int64_t ne0 = src->ne[0];
1193 const int64_t nb0 = src->nb[0];
1194 const int64_t nb1 = src->nb[1];
1195 const int64_t nb2 = src->nb[2];
1196 const int64_t nb3 = src->nb[3];
1197 const enum ggml_type type = src->type;
1198 const int64_t ts = ggml_type_size(type);
1199 const int64_t bs = ggml_blck_size(type);
1200 const int64_t i1_diff = i1_high - i1_low;
1201
1202 const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1203 if (nb0 == ts && nb1 == ts*ne0/bs) {
1204 return cudaMemcpyAsync(dst: dst_ptr, src: x, count: i1_diff*nb1, kind: cudaMemcpyDeviceToDevice, stream);
1205 } else if (nb0 == ts) {
1206 return cudaMemcpy2DAsync(dst: dst_ptr, dpitch: ts*ne0/bs, src: x, spitch: nb1, width: ts*ne0/bs, height: i1_diff, kind: cudaMemcpyDeviceToDevice, stream);
1207 } else {
1208 for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1209 const void * rx = (const void *) ((const char *) x + i1*nb1);
1210 void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1211 // pretend the row is a matrix with cols=1
1212 cudaError_t r = cudaMemcpy2DAsync(dst: rd, dpitch: ts/bs, src: rx, spitch: nb0, width: ts/bs, height: ne0, kind: cudaMemcpyDeviceToDevice, stream);
1213 if (r != cudaSuccess) {
1214 return r;
1215 }
1216 }
1217 return cudaSuccess;
1218 }
1219}
1220
1221static void ggml_cuda_op_mul_mat_cublas(
1222 ggml_backend_cuda_context & ctx,
1223 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1224 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1225 const int64_t src1_padded_row_size, cudaStream_t stream) {
1226
1227 GGML_ASSERT(src0_dd_i != nullptr);
1228 GGML_ASSERT(src1_ddf_i != nullptr);
1229 GGML_ASSERT(dst_dd_i != nullptr);
1230
1231 const int64_t ne00 = src0->ne[0];
1232 const int64_t ne10 = src1->ne[0];
1233
1234 const int64_t ne0 = dst->ne[0];
1235
1236 const int64_t row_diff = row_high - row_low;
1237
1238 int id = ggml_cuda_get_device();
1239
1240 // the main device has a larger memory buffer to hold the results from all GPUs
1241 // ldc == nrows of the matrix that cuBLAS writes into
1242 int64_t ldc = id == ctx.device ? ne0 : row_diff;
1243
1244 const int cc = ggml_cuda_info().devices[id].cc;
1245
1246 const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
1247 (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
1248
1249 const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
1250
1251 if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
1252 ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
1253 if (src1->type != GGML_TYPE_BF16) {
1254 const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
1255 GGML_ASSERT(to_bf16_cuda != nullptr);
1256 size_t ne = src1_ncols*ne10;
1257 src1_as_bf16.alloc(ne);
1258 to_bf16_cuda(src1_ddf_i, src1_as_bf16.get(), ne, stream);
1259 }
1260 const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
1261 const nv_bfloat16 * src0_ptr = (const nv_bfloat16 *)src0_dd_i;
1262 ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
1263
1264 const float alpha_f32 = 1.0f;
1265 const float beta_f32 = 0.0f;
1266
1267 CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1268 CUBLAS_CHECK(
1269 cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1270 row_diff, src1_ncols, ne10,
1271 &alpha_f32, src0_ptr, CUDA_R_16BF, ne00,
1272 src1_ptr, CUDA_R_16BF, ne10,
1273 &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc,
1274 CUBLAS_COMPUTE_32F,
1275 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1276
1277 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
1278 to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1279 } else if (fast_fp16_hardware_available(cc) && use_fp16) {
1280 // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1281 ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1282 if (src0->type != GGML_TYPE_F16) {
1283 const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
1284 GGML_ASSERT(to_fp16_cuda != nullptr);
1285 size_t ne = row_diff*ne00;
1286 src0_as_f16.alloc(ne);
1287 to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
1288 }
1289 const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
1290
1291 ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
1292 if (src1->type != GGML_TYPE_F16) {
1293 const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
1294 GGML_ASSERT(to_fp16_cuda != nullptr);
1295 size_t ne = src1_ncols*ne10;
1296 src1_as_f16.alloc(ne);
1297 to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
1298 }
1299 const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
1300
1301 CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1302
1303 if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
1304 const float alpha = 1.0f;
1305 const float beta = 0.0f;
1306 CUBLAS_CHECK(
1307 cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1308 row_diff, src1_ncols, ne10,
1309 &alpha, src0_ptr, CUDA_R_16F, ne00,
1310 src1_ptr, CUDA_R_16F, ne10,
1311 &beta, dst_dd_i, CUDA_R_32F, ldc,
1312 CUBLAS_COMPUTE_32F,
1313 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1314 } else {
1315 ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
1316
1317 const half alpha_f16 = 1.0f;
1318 const half beta_f16 = 0.0f;
1319
1320 CUBLAS_CHECK(
1321 cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1322 row_diff, src1_ncols, ne10,
1323 &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
1324 src1_ptr, CUDA_R_16F, ne10,
1325 &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
1326 CUBLAS_COMPUTE_16F,
1327 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1328
1329 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1330 to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1331 }
1332 } else {
1333 ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
1334 ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
1335
1336 if (src0->type != GGML_TYPE_F32) {
1337 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
1338 GGML_ASSERT(to_fp32_cuda != nullptr);
1339 src0_ddq_as_f32.alloc(row_diff*ne00);
1340 to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
1341 }
1342 if (src1->type != GGML_TYPE_F32) {
1343 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
1344 GGML_ASSERT(to_fp32_cuda != nullptr);
1345 src1_ddq_as_f32.alloc(src1_ncols*ne10);
1346 to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
1347 }
1348
1349 const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
1350 const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
1351
1352 const float alpha = 1.0f;
1353 const float beta = 0.0f;
1354
1355 CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1356 CUBLAS_CHECK(
1357 cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1358 row_diff, src1_ncols, ne10,
1359 &alpha, src0_ddf_i, ne00,
1360 src1_ddf1_i, ne10,
1361 &beta, dst_dd_i, ldc));
1362 }
1363
1364 GGML_UNUSED_VARS(dst, src1_ddq_i, src1_padded_row_size);
1365}
1366
1367static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
1368 static bool peer_access_enabled = false;
1369
1370 const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
1371
1372 if (peer_access_enabled == enable_peer_access) {
1373 return;
1374 }
1375
1376#ifdef NDEBUG
1377 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1378 ggml_cuda_set_device(device: id);
1379 CUDA_CHECK(cudaDeviceSynchronize());
1380 }
1381
1382 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1383 ggml_cuda_set_device(device: id);
1384
1385 for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
1386 if (id == id_other) {
1387 continue;
1388 }
1389 if (id != main_device && id_other != main_device) {
1390 continue;
1391 }
1392
1393 int can_access_peer;
1394 CUDA_CHECK(cudaDeviceCanAccessPeer(canAccessPeer: &can_access_peer, device: id, peerDevice: id_other));
1395 if (can_access_peer) {
1396 if (enable_peer_access) {
1397 cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice: id_other, flags: 0);
1398 if (err != cudaErrorPeerAccessAlreadyEnabled) {
1399 CUDA_CHECK(err);
1400 } else {
1401 // reset the error
1402 (void)cudaGetLastError();
1403 }
1404 } else {
1405 cudaError_t err = cudaDeviceDisablePeerAccess(peerDevice: id_other);
1406 if (err != cudaErrorPeerAccessNotEnabled) {
1407 CUDA_CHECK(err);
1408 } else {
1409 // reset the error
1410 (void)cudaGetLastError();
1411 }
1412 }
1413 }
1414 }
1415 }
1416
1417 ggml_cuda_set_device(device: main_device);
1418#endif // NDEBUG
1419
1420 peer_access_enabled = enable_peer_access;
1421
1422 GGML_UNUSED(main_device);
1423}
1424
1425static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
1426 void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
1427
1428#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1429 // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
1430 cudaMemcpy3DPeerParms p = {};
1431 p.dstDevice = dstDevice;
1432 p.dstPtr = make_cudaPitchedPtr(d: dst, p: dpitch, xsz: dpitch, ysz: height);
1433 p.srcDevice = srcDevice;
1434 p.srcPtr = make_cudaPitchedPtr(d: src, p: spitch, xsz: spitch, ysz: height);
1435 p.extent = make_cudaExtent(w: width, h: height, d: 1);
1436 return cudaMemcpy3DPeerAsync(p: &p, stream);
1437#else
1438 // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
1439 GGML_UNUSED(dstDevice);
1440 GGML_UNUSED(srcDevice);
1441 return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
1442#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1443}
1444
1445static void ggml_cuda_op_mul_mat(
1446 ggml_backend_cuda_context & ctx,
1447 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
1448 quantize_cuda_t quantize_src1) {
1449
1450 const int64_t ne00 = src0->ne[0];
1451 const int64_t ne01 = src0->ne[1];
1452 const int64_t ne02 = src0->ne[2];
1453 const int64_t ne03 = src0->ne[3];
1454
1455 const int64_t ne10 = src1->ne[0];
1456 const int64_t ne11 = src1->ne[1];
1457 const int64_t ne12 = src1->ne[2];
1458 const int64_t ne13 = src1->ne[3];
1459 const int64_t nrows1 = ggml_nrows(src1);
1460
1461 const int64_t ne0 = dst->ne[0];
1462 const int64_t ne1 = dst->ne[1];
1463
1464 // const int64_t nb10 = src1->nb[0];
1465 const int64_t nb11 = src1->nb[1];
1466 const int64_t nb12 = src1->nb[2];
1467 const int64_t nb13 = src1->nb[3];
1468
1469 const int64_t nb2 = dst->nb[2];
1470 const int64_t nb3 = dst->nb[3];
1471
1472 ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
1473 ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
1474
1475 GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
1476
1477 GGML_ASSERT(ne12 % ne02 == 0);
1478 GGML_ASSERT(ne13 % ne03 == 0);
1479
1480 const int64_t i02_divisor = ne12 / ne02;
1481 const int64_t i03_divisor = ne13 / ne03;
1482
1483 const size_t src0_ts = ggml_type_size(src0->type);
1484 const size_t src0_bs = ggml_blck_size(src0->type);
1485 const size_t q8_1_ts = sizeof(block_q8_1);
1486 const size_t q8_1_bs = QK8_1;
1487
1488 const bool src0_is_contiguous = ggml_is_contiguous(src0);
1489 const bool src1_is_contiguous = ggml_is_contiguous(src1);
1490
1491 const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
1492
1493 const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
1494 GGML_ASSERT(!(split && ne02 > 1));
1495 GGML_ASSERT(!(split && ne03 > 1));
1496 GGML_ASSERT(!(split && ne02 < ne12));
1497 GGML_ASSERT(!(split && ne03 < ne13));
1498
1499 ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
1500
1501
1502 std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
1503 if (split) {
1504 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
1505 tensor_split = buft_ctx->tensor_split;
1506 }
1507
1508 struct dev_data {
1509 int cc;
1510
1511 ggml_cuda_pool_alloc<char> src0_dd_alloc;
1512 ggml_cuda_pool_alloc<float> src1_ddf_alloc;
1513 ggml_cuda_pool_alloc<char> src1_ddq_alloc;
1514 ggml_cuda_pool_alloc<float> dst_dd_alloc;
1515
1516 char * src0_dd = nullptr;
1517 float * src1_ddf = nullptr; // float
1518 char * src1_ddq = nullptr; // q8_1
1519 float * dst_dd = nullptr;
1520
1521 int64_t row_low;
1522 int64_t row_high;
1523 };
1524
1525 dev_data dev[GGML_CUDA_MAX_DEVICES];
1526
1527 int used_devices = 0;
1528
1529 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1530 dev[id].cc = ggml_cuda_info().devices[id].cc;
1531
1532 // by default, use all rows
1533 dev[id].row_low = 0;
1534 dev[id].row_high = ne01;
1535
1536 // for multi GPU, get the row boundaries from tensor split
1537 // and round to mul_mat_q tile sizes
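// illustrative example (not taken from real data): ne01 = 4096, tensor_split = {0.0f, 0.75f}, rounding = 64
//   -> device 0 computes rows [0, 3072), device 1 computes rows [3072, 4096)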
1538 if (split) {
1539 const int64_t rounding = get_row_rounding(tensor_split);
1540
1541 if (id != 0) {
1542 dev[id].row_low = ne01*tensor_split[id];
1543 if (dev[id].row_low < ne01) {
1544 dev[id].row_low -= dev[id].row_low % rounding;
1545 }
1546 }
1547
1548 if (id != ggml_backend_cuda_get_device_count() - 1) {
1549 dev[id].row_high = ne01*tensor_split[id + 1];
1550 if (dev[id].row_high < ne01) {
1551 dev[id].row_high -= dev[id].row_high % rounding;
1552 }
1553 }
1554 }
1555 }
1556
1557 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1558 if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1559 continue;
1560 }
1561
1562 used_devices++;
1563
1564 const bool src1_on_device = id == src1_ctx->device;
1565 const bool dst_on_device = id == dst_ctx->device;
1566
1567 ggml_cuda_set_device(id);
1568 cudaStream_t stream = ctx.stream(id, 0);
1569
1570 if (src0_is_contiguous) {
1571 dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
1572 } else {
1573 // If src0 is not contiguous it will be copied to a temporary buffer.
1574 // This buffer needs to be cleared entirely because multiple regions will function as padding.
1575 const size_t nbytes_data = ggml_nbytes(src0);
1576 const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
1577 dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
1578 CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
1579 }
1580
1581 // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
1582 if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
1583 GGML_ASSERT(ggml_is_contiguously_allocated(src0));
1584 GGML_ASSERT(!src0->view_src);
1585 const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
1586 const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
1587 CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
1588 }
1589
1590 if (src1_on_device && src1_is_contiguous) {
1591 dev[id].src1_ddf = (float *) src1->data;
1592 } else {
1593 dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
1594 }
1595
1596 if (quantize_src1) {
1597 size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
1598 if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1599 src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
1600 }
1601 dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
1602
1603 if (src1_on_device && src1_is_contiguous) {
1604 quantize_src1(
1605 dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10,
1606 nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
1607 src1_padded_col_size, ne11, ne12, ne13, stream);
1608 CUDA_CHECK(cudaGetLastError());
1609 }
1610 }
1611
1612 if (dst_on_device) {
1613 dev[id].dst_dd = (float *) dst->data;
1614 } else {
1615 const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
1616 dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
1617 }
1618 }
1619
1620 // if multiple devices are used they need to wait for the main device
1621 // here an event is recorded that signals that the main device has finished calculating the input data
1622 if (split && used_devices > 1) {
1623 ggml_cuda_set_device(ctx.device);
1624 CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
1625 }
1626
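// when src0 is split across multiple devices, src1 is processed in column chunks of MUL_MAT_SRC1_COL_STRIDE,
// cycling through GGML_CUDA_MAX_STREAMS streams (presumably so that peer copies and compute can overlap);
// otherwise the whole src1 is handled as a single chunk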
1627 const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
1628 for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
1629 const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
1630 const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
1631
1632 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1633 if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1634 continue;
1635 }
1636
1637 const bool src1_on_device = id == src1_ctx->device;
1638 const bool dst_on_device = id == dst_ctx->device;
1639 const int64_t row_diff = dev[id].row_high - dev[id].row_low;
1640
1641 ggml_cuda_set_device(id);
1642 cudaStream_t stream = ctx.stream(id, is);
1643
1644 // wait for main GPU data if necessary
1645 if (split && (id != ctx.device || is != 0)) {
1646 CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
1647 }
1648
1649 for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
1650 const int64_t i03 = i0 / ne12;
1651 const int64_t i02 = i0 % ne12;
1652
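// byte offset into the quantized src1 buffer for this batch / column chunk:
// the plain q8_1 layout advances by src1_padded_col_size*q8_1_ts/q8_1_bs bytes per src1 column,
// while the MMQ layout is interleaved and advances by sizeof(block_q8_1_mmq) per column
// (see the pitched 2D peer copy further below)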
1653 size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1654 if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1655 src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
1656 } else {
1657 src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1658 }
1659
1660 // select the src0 matrix for the current batch indices i02/i03 (adjusted for broadcasting); for split tensors there is only a single matrix per device
1661 const size_t nbytes_src0_matrix = ne01*ne00*src0_ts / src0_bs;
1662 char * src0_dd_i = dev[id].src0_dd + ((i03/i03_divisor)*ne02 + (i02/i02_divisor)) * nbytes_src0_matrix;
1663 float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
1664 char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
1665 float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
1666
1667 // the main device memory buffer can be on VRAM scratch, with space for all partial results
1668 // in that case an offset on dst_ddf_i is needed
1669 if (id == ctx.device) {
1670 dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
1671 }
1672
1673 // copy src0, src1 to device if necessary
1674 if (src1_is_contiguous) {
1675 if (id != ctx.device) {
1676 if (quantize_src1) {
1677 char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
1678 if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1679 const size_t pitch = ne11*sizeof(block_q8_1_mmq);
1680 const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
1681 const size_t height = src1_padded_col_size/(4*QK8_1);
1682 CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
1683 } else {
1684 CUDA_CHECK(cudaMemcpyPeerAsync(
1685 src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
1686 }
1687 } else {
1688 float * src1_ddf_i_source = (float *) src1->data;
1689 src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
1690 CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
1691 src1_ncols*ne10*sizeof(float), stream));
1692 }
1693 }
1694 } else if (src1_on_device && !src1_is_contiguous) {
1695 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1696 src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
1697 } else {
1698 GGML_ABORT("fatal error");
1699 }
1700
1701 if (quantize_src1 && !src1_is_contiguous) {
1702 quantize_src1(
1703 src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
1704 src1_padded_col_size, src1_ncols, 1, 1, stream);
1705 CUDA_CHECK(cudaGetLastError());
1706 }
1707
1708 if (src1_col_0 == 0 && !src0_is_contiguous && i03 % i03_divisor == 0 && i02 % i02_divisor == 0) {
1709 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1710 src0_dd_i, src0, i03/i03_divisor, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
1711 }
1712
1713 // do the computation
1714 op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
1715 dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
1716 CUDA_CHECK(cudaGetLastError());
1717
1718 // copy dst to host or other device if necessary
1719 if (!dst_on_device) {
1720 void * dst_off_device = dst->data;
1721 if (split) {
1722 // src0 (the weight matrix) is stored transposed for a better memory layout.
1723 // dst is NOT transposed.
1724 // The outputs of matrix-matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
1725 // Instead they need to be copied to the correct slice in ne0 = dst row index.
1726 // If dst is a vector with ne0 == 1 this copy is not strictly necessary, but it still produces correct results.
1727 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1728 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1729 dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
1730 CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
1731 dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
1732 } else {
1733 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1734 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1735 dhf_dst_i += src1_col_0*ne0;
1736 CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
1737 }
1738 }
1739
1740 // add event for the main device to wait on until other device is done
1741 if (split && (id != ctx.device || is != 0)) {
1742 CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
1743 }
1744 }
1745 }
1746 }
1747
1748 // main device waits for all other devices to be finished
1749 if (split && ggml_backend_cuda_get_device_count() > 1) {
1750 int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
1751 is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
1752
1753 ggml_cuda_set_device(ctx.device);
1754 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1755 if (dev[id].row_low == dev[id].row_high) {
1756 continue;
1757 }
1758 for (int64_t is = 0; is < is_max; ++is) {
1759 CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
1760 }
1761 }
1762 }
1763}
1764
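// Builds the per-matrix pointer arrays consumed by cublasGemmBatchedEx: each thread handles one (i12, i13)
// batch index and computes the src0/src1/dst addresses, mapping broadcast dims via r2 = ne12/ne02 and r3 = ne13/ne03.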
1765static __global__ void k_compute_batched_ptrs(
1766 const void * src0_as_f16, const void * src1_as_f16, char * dst,
1767 const void ** ptrs_src, void ** ptrs_dst,
1768 int64_t ne12, int64_t ne13,
1769 int64_t ne23,
1770 size_t nb02, size_t nb03,
1771 size_t nb12, size_t nb13,
1772 size_t nbd2, size_t nbd3,
1773 int64_t r2, int64_t r3) {
1774 const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
1775 const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
1776
1777 if (i13 >= ne13 || i12 >= ne12) {
1778 return;
1779 }
1780
1781 const int64_t i03 = i13 / r3;
1782 const int64_t i02 = i12 / r2;
1783
1784 ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
1785 ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
1786 ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
1787}
1788
1789// Type traits for mapping ggml types to CUDA/cuBLAS types
1790template<ggml_type T>
1791struct batched_mul_mat_traits;
1792
1793template<>
1794struct batched_mul_mat_traits<GGML_TYPE_F32> {
1795 using cuda_type = float;
1796 static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
1797 static inline const cudaDataType_t data_type = CUDA_R_32F;
1798 static inline const ggml_type ggml_type_val = GGML_TYPE_F32;
1799 static inline const float alpha = 1.0f;
1800 static inline const float beta = 0.0f;
1801 static inline const void* get_alpha() { static const float val = alpha; return &val; }
1802 static inline const void* get_beta() { static const float val = beta; return &val; }
1803 static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp32_nc_cuda(src_type); }
1804};
1805
1806template<>
1807struct batched_mul_mat_traits<GGML_TYPE_BF16> {
1808 using cuda_type = nv_bfloat16;
1809 static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
1810 static inline const cudaDataType_t data_type = CUDA_R_16BF;
1811 static inline const ggml_type ggml_type_val = GGML_TYPE_BF16;
1812 static inline const float alpha = 1.0f;
1813 static inline const float beta = 0.0f;
1814 static inline const void* get_alpha() { static const float val = alpha; return &val; }
1815 static inline const void* get_beta() { static const float val = beta; return &val; }
1816 static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_bf16_nc_cuda(src_type); }
1817};
1818
1819template<>
1820struct batched_mul_mat_traits<GGML_TYPE_F16> {
1821 using cuda_type = half;
1822 static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
1823 static inline const cudaDataType_t data_type = CUDA_R_16F;
1824 static inline const ggml_type ggml_type_val = GGML_TYPE_F16;
1825 static inline const half alpha = 1.0;
1826 static inline const half beta = 0.0;
1827 static inline const void* get_alpha() { static const half val = alpha; return &val; }
1828 static inline const void* get_beta() { static const half val = beta; return &val; }
1829 static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp16_nc_cuda(src_type); }
1830};
1831
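// Batched matrix multiplication via cuBLAS for F32/F16/BF16 weights: converts src1 to the weight type if
// necessary, then uses cublasGemmStridedBatchedEx when there is no broadcast and both operands are contiguous
// across dims 2/3, and cublasGemmBatchedEx with explicit pointer arrays otherwise; a reduced-precision result
// is converted back to F32 at the end.
// e.g. (illustrative) batched_mul_mat_traits<GGML_TYPE_F16>::get_nc_converter(GGML_TYPE_F32) yields the
// converter used to turn non-contiguous F32 activations into half before the GEMM.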
1832template<ggml_type src0_type>
1833static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1834 using traits = batched_mul_mat_traits<src0_type>;
1835 using cuda_t = typename traits::cuda_type;
1836
1837 GGML_ASSERT(!ggml_is_transposed(src0));
1838 GGML_ASSERT(!ggml_is_transposed(src1));
1839 GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft));
1840 GGML_ASSERT(src0->type == src0_type);
1841 GGML_ASSERT(ggml_is_contiguous(dst));
1842
1843 // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
1844 // As long as dst is contiguous this does not matter though.
1845
1846 GGML_TENSOR_BINARY_OP_LOCALS
1847
1848 const int64_t ne_dst = ggml_nelements(dst);
1849 cudaStream_t main_stream = ctx.stream();
1850 CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
1851
1852 float * dst_ddf = (float *) dst->data;
1853 const size_t ts_src1 = ggml_type_size(src1->type);
1854 GGML_ASSERT(nb10 == ts_src1);
1855 int64_t s11 = nb11 / ts_src1;
1856 int64_t s12 = nb12 / ts_src1;
1857 int64_t s13 = nb13 / ts_src1;
1858
1859 const cuda_t * src0_ptr = nullptr;
1860 const cuda_t * src1_ptr = nullptr;
1861
1862 ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
1863 ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
1864
1865 bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
1866 bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
1867
1868 // Handle src0
1869 src0_ptr = (const cuda_t *) src0->data;
1870
1871 // Handle src1 - convert if necessary
1872 if (src1->type == src0_type) {
1873 src1_ptr = (const cuda_t *) src1->data;
1874 } else {
1875 // Convert src1 to target type using traits conversion functions
1876 const int64_t ne_src1 = ggml_nelements(src1);
1877 src1_alloc.alloc(ne_src1);
1878
1879 const auto convert_func = traits::get_nc_converter(src1->type);
1880 GGML_ASSERT(convert_func != nullptr);
1881 convert_func(src1->data, src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
1882 src1_ptr = src1_alloc.get();
1883 s11 = ne10;
1884 s12 = ne11*s11;
1885 s13 = ne12*s12;
1886
1887 is_src1_cont_2 = true;
1888 }
1889
1890 // Setup destination buffer
1891 ggml_cuda_pool_alloc<cuda_t> dst_temp(ctx.pool());
1892 char * dst_t;
1893 size_t nbd2 = dst->nb[2];
1894 size_t nbd3 = dst->nb[3];
1895
1896 cublasComputeType_t cu_compute_type = traits::compute_type;
1897 cudaDataType_t cu_data_type = traits::data_type;
1898 cudaDataType_t cu_data_type_a = traits::data_type;
1899 cudaDataType_t cu_data_type_b = traits::data_type;
1900 const void * alpha = traits::get_alpha();
1901 const void * beta = traits::get_beta();
1902 const float alpha_f32 = 1.0f;
1903 const float beta_f32 = 0.0f;
1904
1905 if (dst->op_params[0] == GGML_PREC_DEFAULT) {
1906 if constexpr (src0_type == GGML_TYPE_F32) {
1907 dst_t = (char *) dst_ddf; // Direct F32 output
1908 } else {
1909 dst_t = (char *) dst_temp.alloc(ne_dst);
1910 nbd2 /= sizeof(float) / sizeof(cuda_t);
1911 nbd3 /= sizeof(float) / sizeof(cuda_t);
1912 }
1913 } else {
1914 dst_t = (char *) dst_ddf;
1915 cu_compute_type = CUBLAS_COMPUTE_32F;
1916 cu_data_type = CUDA_R_32F;
1917 alpha = &alpha_f32;
1918 beta = &beta_f32;
1919 }
1920
1921 int id = ggml_cuda_get_device();
1922 const int cc = ggml_cuda_info().devices[id].cc;
1923 if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
1924 cu_compute_type = CUBLAS_COMPUTE_32F;
1925 alpha = &alpha_f32;
1926 beta = &beta_f32;
1927 }
1928
1929 GGML_ASSERT(ne12 % ne02 == 0);
1930 GGML_ASSERT(ne13 % ne03 == 0);
1931
1932 // broadcast factors
1933 const int64_t r2 = ne12/ne02;
1934 const int64_t r3 = ne13/ne03;
1935
1936 if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
1937 // with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
1938 const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
1939 const int64_t smb = ne12 == 1 ? s13 : s12;
1940
1941 // there is no broadcast and src0, src1 are contiguous across dims 2, 3
1942 // use cublasGemmStridedBatchedEx
1943 CUBLAS_CHECK(
1944 cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1945 ne01, ne11, ne10,
1946 alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma, // strideA
1947 src1_ptr, cu_data_type_b, s11, smb, // strideB
1948 beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
1949 ne12*ne13,
1950 cu_compute_type,
1951 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1952 } else {
1953 // use cublasGemmBatchedEx
1954 const int64_t ne23 = ne12*ne13;
1955
1956 ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
1957 ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
1958
1959 size_t src1_stride_size = sizeof(cuda_t);
1960
1961 const int threads_x = 16;
1962 const int threads_y = 16;
1963 dim3 block_dims(threads_x, threads_y);
1964
1965 dim3 grid_dims(
1966 (ne13 + threads_x - 1) / threads_x,
1967 (ne12 + threads_y - 1) / threads_y
1968 );
1969 k_compute_batched_ptrs<<<grid_dims, block_dims, 0, main_stream>>>(
1970 src0_ptr, src1_ptr, dst_t,
1971 ptrs_src.get(), ptrs_dst.get(),
1972 ne12, ne13,
1973 ne23,
1974 nb02, nb03,
1975 (src1->type == src0_type) ? nb12 : s12*src1_stride_size,
1976 (src1->type == src0_type) ? nb13 : s13*src1_stride_size,
1977 nbd2, nbd3,
1978 r2, r3);
1979
1980 CUDA_CHECK(cudaGetLastError());
1981
1982 CUBLAS_CHECK(
1983 cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1984 ne01, ne11, ne10,
1985 alpha, (const void **) (ptrs_src.get() + 0*ne23), cu_data_type_a, nb01/nb00,
1986 (const void **) (ptrs_src.get() + 1*ne23), cu_data_type_b, s11,
1987 beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
1988 ne23,
1989 cu_compute_type,
1990 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1991 }
1992
1993 // Convert output back to F32 if needed
1994 if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type != CUDA_R_32F) {
1995 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(traits::ggml_type_val);
1996 to_fp32_cuda(dst_temp.get(), dst_ddf, ne_dst, main_stream);
1997 }
1998}
1999
2000static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2001 GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32);
2002
2003 switch (src0->type) {
2004 case GGML_TYPE_F32:
2005 ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F32>(ctx, src0, src1, dst);
2006 break;
2007 case GGML_TYPE_BF16:
2008 ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_BF16>(ctx, src0, src1, dst);
2009 break;
2010 case GGML_TYPE_F16:
2011 ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F16>(ctx, src0, src1, dst);
2012 break;
2013 default:
2014 GGML_ABORT("Unsupported type");
2015 }
2016}
2017
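// Checks whether an up/gate pair of MUL_MAT (or MUL_MAT_ID) nodes feeding a GLU op, optionally with bias
// ADD / ADD_ID nodes in between, can be handled by the fused mul_mat + GLU path; only SWIGLU/GEGLU/SWIGLU_OAI
// without swapped operands qualify, and split buffers are excluded.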
2018static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up,
2019 const ggml_tensor * ffn_gate,
2020 const ggml_tensor * glu,
2021 const ggml_tensor * ffn_up_bias = nullptr,
2022 const ggml_tensor * ffn_gate_bias = nullptr) {
2023 const bool has_bias = ffn_up_bias != nullptr || ffn_gate_bias != nullptr;
2024
2025 if (has_bias && (!ffn_up_bias || !ffn_gate_bias)) {
2026 return false;
2027 }
2028
2029 const bool is_mul_mat = ffn_up->op == GGML_OP_MUL_MAT && ffn_gate->op == GGML_OP_MUL_MAT && glu->op == GGML_OP_GLU;
2030 const bool is_mul_mat_id = ffn_up->op == GGML_OP_MUL_MAT_ID && ffn_gate->op == GGML_OP_MUL_MAT_ID && glu->op == GGML_OP_GLU;
2031
2032 GGML_ASSERT(ffn_up && ffn_gate && glu);
2033
2034 if (!is_mul_mat && !is_mul_mat_id) {
2035 return false;
2036 }
2037
2038 const ggml_op expected_bias_op = is_mul_mat ? GGML_OP_ADD : GGML_OP_ADD_ID;
2039
2040 if (has_bias) {
2041 if (ffn_up_bias->op != expected_bias_op || ffn_gate_bias->op != expected_bias_op) {
2042 return false;
2043 }
2044
2045 if (glu->src[0] != ffn_gate_bias || glu->src[1] != ffn_up_bias) {
2046 return false;
2047 }
2048
2049 if (expected_bias_op == GGML_OP_ADD) {
2050 const bool up_has_mul = ffn_up_bias->src[0] == ffn_up || ffn_up_bias->src[1] == ffn_up;
2051 const bool gate_has_mul = ffn_gate_bias->src[0] == ffn_gate || ffn_gate_bias->src[1] == ffn_gate;
2052 if (!up_has_mul || !gate_has_mul) {
2053 return false;
2054 }
2055 } else { // GGML_OP_ADD_ID
2056 if (ffn_up_bias->src[0] != ffn_up || ffn_gate_bias->src[0] != ffn_gate) {
2057 return false;
2058 }
2059 if (ffn_up_bias->src[2] != ffn_up->src[2] || ffn_gate_bias->src[2] != ffn_gate->src[2]) {
2060 return false;
2061 }
2062 }
2063 } else {
2064 if (glu->src[0] != ffn_gate && glu->src[1] != ffn_up) {
2065 return false;
2066 }
2067 }
2068
2069 if (ffn_up->src[0]->type != ffn_gate->src[0]->type || !ggml_are_same_shape(ffn_up->src[0], ffn_gate->src[0]) ||
2070 !ggml_are_same_stride(ffn_up->src[0], ffn_gate->src[0])) {
2071 return false;
2072 }
2073
2074 if (ffn_up->src[1] != ffn_gate->src[1]) {
2075 return false;
2076 }
2077
2078 if (ffn_up->src[2] && (ffn_up->src[2] != ffn_gate->src[2])) {
2079 return false;
2080 }
2081
2082 static constexpr std::array<ggml_glu_op, 3> valid_glu_ops = { GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU, GGML_GLU_OP_SWIGLU_OAI };
2083
2084 if (std::find(valid_glu_ops.begin(), valid_glu_ops.end(), ggml_get_glu_op(glu)) == valid_glu_ops.end()) {
2085 return false;
2086 }
2087
2088 if (const bool swapped = ggml_get_op_params_i32(glu, 1); swapped) {
2089 return false;
2090 }
2091
2092 const bool split = ggml_backend_buft_is_cuda_split(ffn_up->src[0]->buffer->buft) ||
2093 ggml_backend_buft_is_cuda_split(ffn_gate->src[0]->buffer->buft);
2094
2095 //TODO: add support for fusion for split buffers
2096 if (split) {
2097 return false;
2098 }
2099
2100 return true;
2101}
2102
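// Fusion eligibility check for the mul_mat_vec_f path: F32/F16/BF16 weights with F32 activations and F32
// output, a single output column (ne[1] == 1 for MUL_MAT, ne[2] == 1 for MUL_MAT_ID), and no split buffers.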
2103static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
2104 ggml_tensor * src0 = tensor->src[0];
2105 ggml_tensor * src1 = tensor->src[1];
2106 const ggml_tensor * dst = tensor;
2107
2108 const bool is_mul_mat_id = tensor->op == GGML_OP_MUL_MAT_ID;
2109
2110 bool use_mul_mat_vec_f =
2111 (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) &&
2112 src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
2113
2114 const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2115 use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
2116
2117 const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
2118 ggml_backend_buft_is_cuda_split(src1->buffer->buft);
2119
2120 //TODO: add support for fusion for split buffers
2121 if (split) {
2122 return false;
2123 }
2124
2125 //we only support fusion for ncols_dst = 1
2126 if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
2127 return false;
2128 }
2129
2130 if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
2131 return false;
2132 }
2133
2134
2135 return use_mul_mat_vec_f;
2136}
2137
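// Fusion eligibility check for the mul_mat_vec_q path: quantized weights without problematic padding, F32
// activations and output, a batch size within MMVQ_MAX_BATCH_SIZE, a single output column, no split buffers;
// architectures up to GGML_CUDA_CC_PASCAL are excluded.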
2138static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
2139 ggml_tensor * src0 = tensor->src[0];
2140 ggml_tensor * src1 = tensor->src[1];
2141 const ggml_tensor * dst = tensor;
2142
2143 const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE &&
2144 ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
2145 src0->view_src;
2146
2147 bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
2148 dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
2149
2150 // fusion is not universally faster on Pascal
2151 const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2152 if (cc <= GGML_CUDA_CC_PASCAL) {
2153 return false;
2154 }
2155 //we only support fusion for ncols_dst = 1
2156 if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
2157 return false;
2158 }
2159
2160 if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
2161 return false;
2162 }
2163
2164
2165 const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
2166 ggml_backend_buft_is_cuda_split(src1->buffer->buft);
2167
2168 //TODO: add support for fusion for split buffers
2169 if (split) {
2170 return false;
2171 }
2172
2173 return use_mul_mat_vec_q;
2174}
2175
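// Top-level mul_mat dispatch: picks between the custom vector/float/quantized kernels, batched cuBLAS and
// the generic multi-device ggml_cuda_op_mul_mat driver, based on tensor types, shapes, hardware capabilities
// and whether src0 lives in a split buffer.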
2176static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2177 const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
2178
2179 // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
2180 // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
2181 // Therefore, in such cases use cuBLAS.
2182 const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
2183 && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
2184
2185 bool use_mul_mat_vec_f = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
2186 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
2187 bool use_mul_mat_f = !ggml_is_quantized(src0->type)
2188 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
2189 bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
2190 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
2191 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
2192 bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
2193 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
2194
2195 bool any_gpus_with_slow_fp16 = false;
2196
2197 if (split) {
2198 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
2199 auto & tensor_split = buft_ctx->tensor_split;
2200 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
2201 // skip devices that are not going to do any work:
2202 if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
2203 continue;
2204 }
2205
2206 const int cc = ggml_cuda_info().devices[id].cc;
2207 const int warp_size = ggml_cuda_info().devices[id].warp_size;
2208 use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
2209 use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
2210 use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
2211 any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
2212 }
2213 } else {
2214 const int cc = ggml_cuda_info().devices[ctx.device].cc;
2215 const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
2216 use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
2217 use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
2218 use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
2219 any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
2220 }
2221
2222 // debug helpers
2223 //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
2224 //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
2225 //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
2226 //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
2227 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
2228 //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
2229
2230 //TODO update for generic tensor parallelism
2231 const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2232 bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
2233 bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
2234 bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
2235
2236 if (!split && use_mul_mat_vec_f) {
2237 // the custom F16 vector kernel can be used over batched cuBLAS GEMM
2238 // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
2239 ggml_cuda_mul_mat_vec_f(ctx, src0, src1, nullptr, dst);
2240 } else if (!split && use_mul_mat_f) {
2241 ggml_cuda_mul_mat_f(ctx, src0, src1, nullptr, dst);
2242 } else if (!split && use_mul_mat_vec_q) {
2243 ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
2244 } else if (!split && use_mul_mat_q) {
2245 ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
2246 } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
2247 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
2248 // general KQ + KQV multi-batch without FlashAttention
2249 ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
2250 } else if (use_mul_mat_vec_f) {
2251 ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_f, nullptr);
2252 } else if (use_mul_mat_vec_q) {
2253 ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
2254 } else if (use_mul_mat_q) {
2255 ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
2256 } else {
2257 ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
2258 }
2259}
2260
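// Mixture-of-experts matrix multiplication (MUL_MAT_ID). If no specialized kernel applies, the generic
// fallback copies the expert ids to the host, groups the tokens by expert, gathers the corresponding src1
// rows with get_rows_cuda, runs one ggml_cuda_mul_mat per used expert slice and finally scatters the results
// back into dst in the original token order.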
2261static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
2262 const ggml_tensor * src0 = dst->src[0];
2263 const ggml_tensor * src1 = dst->src[1];
2264 const ggml_tensor * ids = dst->src[2];
2265
2266 GGML_ASSERT(src1->type == GGML_TYPE_F32);
2267 GGML_ASSERT(dst->type == GGML_TYPE_F32);
2268 GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
2269
2270 GGML_TENSOR_BINARY_OP_LOCALS
2271
2272 const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2273
2274 if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
2275 if (ne2 == 1) {
2276 if (ggml_is_quantized(src0->type)) {
2277 ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
2278 } else {
2279 ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
2280 }
2281 return;
2282 }
2283
2284 if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
2285 ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
2286 return;
2287 }
2288
2289 if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
2290 ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
2291 return;
2292 }
2293 }
2294
2295 cudaStream_t stream = ctx.stream();
2296
2297 GGML_ASSERT(nb12 % nb11 == 0);
2298 GGML_ASSERT(nb2 % nb1 == 0);
2299
2300 const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc))
2301 || ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type;
2302 const ggml_type type_dst_sorted = GGML_TYPE_F32;
2303 const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted);
2304 const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted);
2305
2306 const int64_t n_expert_used = ids->ne[0];
2307 const int64_t ne_get_rows = ne12 * n_expert_used;
2308
2309 std::vector<int32_t> ids_to_sorted_host;
2310 ids_to_sorted_host.reserve(2*ne_get_rows);
2311 std::vector<int32_t> ids_from_sorted_host(ne_get_rows);
2312
2313 ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool(), 2*ne_get_rows);
2314
2315 std::vector<int32_t> tokens_per_expert(ne02);
2316
2317 ggml_cuda_pool_alloc<char> src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted);
2318 ggml_cuda_pool_alloc<char> dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted);
2319
2320 std::vector<char> ids_host(ggml_nbytes(ids));
2321 CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
2322 CUDA_CHECK(cudaStreamSynchronize(stream));
2323
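// build two index lists: ids_to_sorted selects, for each (expert, token) pair in expert order, which src1
// row to gather; ids_from_sorted stores, for each original (token, expert-slot), the position of that row in
// the sorted buffer so the results can later be copied back into dst in the original order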
2324 for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
2325 for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
2326 for (int64_t iex = 0; iex < n_expert_used; ++iex) {
2327 const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
2328 assert(expert_to_use >= 0 && expert_to_use < ne02);
2329 if (expert_to_use == i02) {
2330 ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size();
2331 ids_to_sorted_host.push_back(i12*ne11 + iex % ne11);
2332 tokens_per_expert[i02]++;
2333 break;
2334 }
2335 }
2336 }
2337 }
2338 GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows));
2339
2340 ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end());
2341
2342 CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
2343 CUDA_CHECK(cudaStreamSynchronize(stream));
2344
2345 const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows;
2346 const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows;
2347
2348 get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted,
2349 ne10, nb11, nb12, nb13,
2350 ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
2351 ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream);
2352 CUDA_CHECK(cudaGetLastError());
2353
2354 char * src1_data_cur = (char *) src1_sorted.ptr;
2355 char * dst_data_cur = (char *) dst_sorted.ptr;
2356 for (int64_t i02 = 0; i02 < ne02; ++i02) {
2357 if (tokens_per_expert[i02] == 0) {
2358 continue;
2359 }
2360
2361 ggml_tensor src0_slice = *src0;
2362 src0_slice.ne[2] = 1;
2363 src0_slice.nb[3] = src0_slice.nb[2];
2364 src0_slice.op = GGML_OP_VIEW;
2365 src0_slice.view_src = dst->src[0]; // non-const pointer to src0
2366 src0_slice.data = (char *) src0->data + i02*nb02;
2367
2368 ggml_tensor src1_slice;
2369 memset(&src1_slice, 0, sizeof(src1_slice));
2370 src1_slice.buffer = src1->buffer;
2371 src1_slice.type = type_src1_sorted;
2372 src1_slice.ne[0] = ne10;
2373 src1_slice.ne[1] = tokens_per_expert[i02];
2374 src1_slice.ne[2] = 1;
2375 src1_slice.ne[3] = 1;
2376 src1_slice.nb[0] = ts_src1_sorted;
2377 src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0];
2378 src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1];
2379 src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2];
2380 src1_slice.data = src1_data_cur;
2381
2382 ggml_tensor dst_slice;
2383 memset(&dst_slice, 0, sizeof(dst_slice));
2384 dst_slice.buffer = dst->buffer;
2385 dst_slice.type = type_dst_sorted;
2386 dst_slice.ne[0] = ne0;
2387 dst_slice.ne[1] = tokens_per_expert[i02];
2388 dst_slice.ne[2] = 1;
2389 dst_slice.ne[3] = 1;
2390 dst_slice.nb[0] = ts_dst_sorted;
2391 dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0];
2392 dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1];
2393 dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2];
2394 dst_slice.data = dst_data_cur;
2395
2396 ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice);
2397 CUDA_CHECK(cudaGetLastError());
2398
2399 src1_data_cur += src1_slice.nb[2];
2400 dst_data_cur += dst_slice.nb[2];
2401 }
2402
2403 get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type,
2404 ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted,
2405 ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
2406 nb1, nb2, nb3, stream);
2407}
2408
2409static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
2410 // why is this here instead of mul_mat?
2411 if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
2412 ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
2413 }
2414
2415 switch (dst->op) {
2416 case GGML_OP_ARGMAX:
2417 ggml_cuda_argmax(ctx, dst);
2418 break;
2419 case GGML_OP_COUNT_EQUAL:
2420 ggml_cuda_count_equal(ctx, dst);
2421 break;
2422 case GGML_OP_REPEAT:
2423 ggml_cuda_op_repeat(ctx, dst);
2424 break;
2425 case GGML_OP_REPEAT_BACK:
2426 ggml_cuda_op_repeat_back(ctx, dst);
2427 break;
2428 case GGML_OP_GET_ROWS:
2429 ggml_cuda_op_get_rows(ctx, dst);
2430 break;
2431 case GGML_OP_GET_ROWS_BACK:
2432 ggml_cuda_op_get_rows_back(ctx, dst);
2433 break;
2434 case GGML_OP_SET_ROWS:
2435 ggml_cuda_op_set_rows(ctx, dst);
2436 break;
2437 case GGML_OP_SET:
2438 ggml_cuda_op_set(ctx, dst);
2439 break;
2440 case GGML_OP_DUP:
2441 ggml_cuda_dup(ctx, dst);
2442 break;
2443 case GGML_OP_CPY:
2444 ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]);
2445 break;
2446 case GGML_OP_CONT:
2447 ggml_cuda_dup(ctx, dst);
2448 break;
2449 case GGML_OP_ADD:
2450 case GGML_OP_ADD1: // TODO: more efficient implementation
2451 ggml_cuda_op_add(ctx, dst);
2452 break;
2453 case GGML_OP_ADD_ID:
2454 ggml_cuda_op_add_id(ctx, dst);
2455 break;
2456 case GGML_OP_SUB:
2457 ggml_cuda_op_sub(ctx, dst);
2458 break;
2459 case GGML_OP_ACC:
2460 ggml_cuda_op_acc(ctx, dst);
2461 break;
2462 case GGML_OP_MUL:
2463 ggml_cuda_op_mul(ctx, dst);
2464 break;
2465 case GGML_OP_DIV:
2466 ggml_cuda_op_div(ctx, dst);
2467 break;
2468 case GGML_OP_UNARY:
2469 switch (ggml_get_unary_op(dst)) {
2470 case GGML_UNARY_OP_ABS:
2471 ggml_cuda_op_abs(ctx, dst);
2472 break;
2473 case GGML_UNARY_OP_SGN:
2474 ggml_cuda_op_sgn(ctx, dst);
2475 break;
2476 case GGML_UNARY_OP_NEG:
2477 ggml_cuda_op_neg(ctx, dst);
2478 break;
2479 case GGML_UNARY_OP_STEP:
2480 ggml_cuda_op_step(ctx, dst);
2481 break;
2482 case GGML_UNARY_OP_GELU:
2483 ggml_cuda_op_gelu(ctx, dst);
2484 break;
2485 case GGML_UNARY_OP_SILU:
2486 ggml_cuda_op_silu(ctx, dst);
2487 break;
2488 case GGML_UNARY_OP_GELU_ERF:
2489 ggml_cuda_op_gelu_erf(ctx, dst);
2490 break;
2491 case GGML_UNARY_OP_GELU_QUICK:
2492 ggml_cuda_op_gelu_quick(ctx, dst);
2493 break;
2494 case GGML_UNARY_OP_TANH:
2495 ggml_cuda_op_tanh(ctx, dst);
2496 break;
2497 case GGML_UNARY_OP_RELU:
2498 ggml_cuda_op_relu(ctx, dst);
2499 break;
2500 case GGML_UNARY_OP_SIGMOID:
2501 ggml_cuda_op_sigmoid(ctx, dst);
2502 break;
2503 case GGML_UNARY_OP_HARDSIGMOID:
2504 ggml_cuda_op_hardsigmoid(ctx, dst);
2505 break;
2506 case GGML_UNARY_OP_HARDSWISH:
2507 ggml_cuda_op_hardswish(ctx, dst);
2508 break;
2509 case GGML_UNARY_OP_EXP:
2510 ggml_cuda_op_exp(ctx, dst);
2511 break;
2512 case GGML_UNARY_OP_ELU:
2513 ggml_cuda_op_elu(ctx, dst);
2514 break;
2515 case GGML_UNARY_OP_XIELU:
2516 ggml_cuda_op_xielu(ctx, dst);
2517 break;
2518 case GGML_UNARY_OP_FLOOR:
2519 ggml_cuda_op_floor(ctx, dst);
2520 break;
2521 case GGML_UNARY_OP_CEIL:
2522 ggml_cuda_op_ceil(ctx, dst);
2523 break;
2524 case GGML_UNARY_OP_ROUND:
2525 ggml_cuda_op_round(ctx, dst);
2526 break;
2527 case GGML_UNARY_OP_TRUNC:
2528 ggml_cuda_op_trunc(ctx, dst);
2529 break;
2530 default:
2531 return false;
2532 }
2533 break;
2534 case GGML_OP_GLU:
2535 switch (ggml_get_glu_op(dst)) {
2536 case GGML_GLU_OP_REGLU:
2537 ggml_cuda_op_reglu(ctx, dst);
2538 break;
2539 case GGML_GLU_OP_GEGLU:
2540 ggml_cuda_op_geglu(ctx, dst);
2541 break;
2542 case GGML_GLU_OP_SWIGLU:
2543 ggml_cuda_op_swiglu(ctx, dst);
2544 break;
2545 case GGML_GLU_OP_SWIGLU_OAI:
2546 ggml_cuda_op_swiglu_oai(ctx, dst);
2547 break;
2548 case GGML_GLU_OP_GEGLU_ERF:
2549 ggml_cuda_op_geglu_erf(ctx, dst);
2550 break;
2551 case GGML_GLU_OP_GEGLU_QUICK:
2552 ggml_cuda_op_geglu_quick(ctx, dst);
2553 break;
2554 default:
2555 return false;
2556 }
2557 break;
2558 case GGML_OP_NORM:
2559 ggml_cuda_op_norm(ctx, dst);
2560 break;
2561 case GGML_OP_GROUP_NORM:
2562 ggml_cuda_op_group_norm(ctx, dst);
2563 break;
2564 case GGML_OP_L2_NORM:
2565 ggml_cuda_op_l2_norm(ctx, dst);
2566 break;
2567 case GGML_OP_CONCAT:
2568 ggml_cuda_op_concat(ctx, dst);
2569 break;
2570 case GGML_OP_UPSCALE:
2571 ggml_cuda_op_upscale(ctx, dst);
2572 break;
2573 case GGML_OP_PAD:
2574 ggml_cuda_op_pad(ctx, dst);
2575 break;
2576 case GGML_OP_PAD_REFLECT_1D:
2577 ggml_cuda_op_pad_reflect_1d(ctx, dst);
2578 break;
2579 case GGML_OP_ARANGE:
2580 ggml_cuda_op_arange(ctx, dst);
2581 break;
2582 case GGML_OP_TIMESTEP_EMBEDDING:
2583 ggml_cuda_op_timestep_embedding(ctx, dst);
2584 break;
2585 case GGML_OP_LEAKY_RELU:
2586 ggml_cuda_op_leaky_relu(ctx, dst);
2587 break;
2588 case GGML_OP_SILU_BACK:
2589 ggml_cuda_op_silu_back(ctx, dst);
2590 break;
2591 case GGML_OP_RMS_NORM:
2592 ggml_cuda_op_rms_norm(ctx, dst);
2593 break;
2594 case GGML_OP_RMS_NORM_BACK:
2595 ggml_cuda_op_rms_norm_back(ctx, dst);
2596 break;
2597 case GGML_OP_MUL_MAT:
2598 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
2599 break;
2600 case GGML_OP_MUL_MAT_ID:
2601 ggml_cuda_mul_mat_id(ctx, dst);
2602 break;
2603 case GGML_OP_OUT_PROD:
2604 ggml_cuda_out_prod(ctx, dst);
2605 break;
2606 case GGML_OP_SCALE:
2607 ggml_cuda_op_scale(ctx, dst);
2608 break;
2609 case GGML_OP_SQR:
2610 ggml_cuda_op_sqr(ctx, dst);
2611 break;
2612 case GGML_OP_SQRT:
2613 ggml_cuda_op_sqrt(ctx, dst);
2614 break;
2615 case GGML_OP_SIN:
2616 ggml_cuda_op_sin(ctx, dst);
2617 break;
2618 case GGML_OP_COS:
2619 ggml_cuda_op_cos(ctx, dst);
2620 break;
2621 case GGML_OP_CLAMP:
2622 ggml_cuda_op_clamp(ctx, dst);
2623 break;
2624 case GGML_OP_LOG:
2625 ggml_cuda_op_log(ctx, dst);
2626 break;
2627 case GGML_OP_NONE:
2628 case GGML_OP_RESHAPE:
2629 case GGML_OP_VIEW:
2630 case GGML_OP_PERMUTE:
2631 case GGML_OP_TRANSPOSE:
2632 break;
2633 case GGML_OP_DIAG_MASK_INF:
2634 ggml_cuda_op_diag_mask_inf(ctx, dst);
2635 break;
2636 case GGML_OP_SOFT_MAX:
2637 ggml_cuda_op_soft_max(ctx, dst);
2638 break;
2639 case GGML_OP_SOFT_MAX_BACK:
2640 ggml_cuda_op_soft_max_back(ctx, dst);
2641 break;
2642 case GGML_OP_ROPE:
2643 ggml_cuda_op_rope(ctx, dst);
2644 break;
2645 case GGML_OP_ROPE_BACK:
2646 ggml_cuda_op_rope_back(ctx, dst);
2647 break;
2648 case GGML_OP_ROLL:
2649 ggml_cuda_op_roll(ctx, dst);
2650 break;
2651 case GGML_OP_IM2COL:
2652 ggml_cuda_op_im2col(ctx, dst);
2653 break;
2654 case GGML_OP_IM2COL_3D:
2655 ggml_cuda_op_im2col_3d(ctx, dst);
2656 break;
2657 case GGML_OP_CONV_2D:
2658 ggml_cuda_op_conv2d(ctx, dst);
2659 break;
2660 case GGML_OP_CONV_2D_DW:
2661 ggml_cuda_op_conv2d_dw(ctx, dst);
2662 break;
2663 case GGML_OP_CONV_TRANSPOSE_2D:
2664 ggml_cuda_conv_2d_transpose_p0(ctx, dst);
2665 break;
2666 case GGML_OP_CONV_TRANSPOSE_1D:
2667 ggml_cuda_op_conv_transpose_1d(ctx,dst);
2668 break;
2669 case GGML_OP_POOL_2D:
2670 ggml_cuda_op_pool2d(ctx, dst);
2671 break;
2672 case GGML_OP_SUM:
2673 ggml_cuda_op_sum(ctx, dst);
2674 break;
2675 case GGML_OP_SUM_ROWS:
2676 ggml_cuda_op_sum_rows(ctx, dst);
2677 break;
2678 case GGML_OP_MEAN:
2679 ggml_cuda_op_mean(ctx, dst);
2680 break;
2681 case GGML_OP_SSM_CONV:
2682 ggml_cuda_op_ssm_conv(ctx, dst);
2683 break;
2684 case GGML_OP_SSM_SCAN:
2685 ggml_cuda_op_ssm_scan(ctx, dst);
2686 break;
2687 case GGML_OP_ARGSORT:
2688 ggml_cuda_op_argsort(ctx, dst);
2689 break;
2690 case GGML_OP_FLASH_ATTN_EXT:
2691 ggml_cuda_flash_attn_ext(ctx, dst);
2692 break;
2693 case GGML_OP_CROSS_ENTROPY_LOSS:
2694 ggml_cuda_cross_entropy_loss(ctx, dst);
2695 break;
2696 case GGML_OP_RWKV_WKV6:
2697 ggml_cuda_op_rwkv_wkv6(ctx, dst);
2698 break;
2699 case GGML_OP_GATED_LINEAR_ATTN:
2700 ggml_cuda_op_gated_linear_attn(ctx, dst);
2701 break;
2702 case GGML_OP_RWKV_WKV7:
2703 ggml_cuda_op_rwkv_wkv7(ctx, dst);
2704 break;
2705 case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
2706 ggml_cuda_cross_entropy_loss_back(ctx, dst);
2707 break;
2708 case GGML_OP_OPT_STEP_ADAMW:
2709 ggml_cuda_opt_step_adamw(ctx, dst);
2710 break;
2711 case GGML_OP_OPT_STEP_SGD:
2712 ggml_cuda_opt_step_sgd(ctx, dst);
2713 break;
2714 default:
2715 return false;
2716 }
2717
2718 cudaError_t err = cudaGetLastError();
2719 if (err != cudaSuccess) {
2720 GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
2721 CUDA_CHECK(err);
2722 }
2723
2724 return true;
2725}
2726
2727////////////////////////////////////////////////////////////////////////////////
2728
2729// backend
2730
2731static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
2732 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2733
2734 return cuda_ctx->name.c_str();
2735}
2736
2737static void ggml_backend_cuda_free(ggml_backend_t backend) {
2738 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2739
2740 delete cuda_ctx;
2741 delete backend;
2742}
2743
2744static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2745 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2746 ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2747
2748 GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2749
2750 CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
2751}
2752
2753static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2754 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2755 ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2756
2757 GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2758
2759 CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
2760}
2761
2762static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
2763 ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2764 ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2765
2766 if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
2767 return false;
2768 }
2769
2770 if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
2771 return false;
2772 }
2773
2774 // device -> device copy
2775 ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
2776 ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
2777
2778 ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
2779 ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
2780
2781 if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
2782#ifndef NDEBUG
2783 GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
2784#endif
2785 return false;
2786 }
2787
2788 if (backend_src != backend_dst) {
2789 // copy on src stream
2790 if (cuda_ctx_src->device == cuda_ctx_dst->device) {
2791 CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
2792 } else {
2793#ifdef GGML_CUDA_NO_PEER_COPY
2794 return false;
2795#else
2796 CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
2797#endif
2798 }
2799
2800 // record event on src stream after the copy
2801 if (!cuda_ctx_src->copy_event) {
2802 ggml_cuda_set_device(cuda_ctx_src->device);
2803 CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
2804 }
2805
2806 CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
2807
2808 // wait on dst stream for the copy to complete
2809 CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
2810 } else {
2811 // src and dst are on the same backend
2812 CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
2813 }
2814 return true;
2815}
2816
2817static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
2818 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2819
2820 CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
2821
2822 GGML_UNUSED(backend);
2823}
2824
2825#ifdef USE_CUDA_GRAPH
2826static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
2827 bool use_cuda_graph) {
2828
2829 // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2830
2831 const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
2832 const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
2833 const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
2834 const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
2835 const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
2836 const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
2837 const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
2838
2839 for (int i = 0; i < cgraph->n_nodes; i++) {
2840 ggml_tensor * node = cgraph->nodes[i];
2841
2842 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2843 continue;
2844 }
2845
2846 if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
2847 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2848#ifndef NDEBUG
2849 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
2850#endif
2851 }
2852
2853 if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
2854 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
2855#ifndef NDEBUG
2856 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
2857#endif
2858 }
2859
2860 if (node->op == GGML_OP_ADD &&
2861 node->src[1] && node->src[1]->ne[1] > 1 &&
2862 (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
2863 (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
2864 strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
2865 strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
2866 strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
2867 strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
2868 strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
2869 // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
2870 // by means of matching node names. See
2871 // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
2872 // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
2873 // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
2874 use_cuda_graph = false;
2875#ifndef NDEBUG
2876 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
2877#endif
2878 }
2879
2880 if (!use_cuda_graph) {
2881 break;
2882 }
2883 }
2884
2885 return use_cuda_graph;
2886}
2887
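// Records the address, op, shape, strides, source addresses and op params of a graph node so that the next
// evaluation can cheaply detect whether the captured CUDA graph is still valid or needs to be re-captured.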
2888static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2889 graph_node_properties->node_address = node->data;
2890 graph_node_properties->node_op = node->op;
2891 for (int i = 0; i < GGML_MAX_DIMS; i++) {
2892 graph_node_properties->ne[i] = node->ne[i];
2893 graph_node_properties->nb[i] = node->nb[i];
2894 }
2895 for (int i = 0; i < GGML_MAX_SRC; i++) {
2896 graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
2897 }
2898 memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
2899}
2900
2901static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2902 if (node->data != graph_node_properties->node_address &&
2903 node->op != GGML_OP_VIEW) {
2904 return false;
2905 }
2906
2907 if (node->op != graph_node_properties->node_op) {
2908 return false;
2909 }
2910
2911 for (int i = 0; i < GGML_MAX_DIMS; i++) {
2912 if (node->ne[i] != graph_node_properties->ne[i]) {
2913 return false;
2914 }
2915 if (node->nb[i] != graph_node_properties->nb[i]) {
2916 return false;
2917 }
2918 }
2919
2920 for (int i = 0; i < GGML_MAX_SRC; i++) {
2921 if (node->src[i] &&
2922 node->src[i]->data != graph_node_properties->src_address[i] &&
2923 node->op != GGML_OP_VIEW
2924 ) {
2925 return false;
2926 }
2927 }
2928
2929 if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
2930 memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
2931 return false;
2932 }
2933
2934 return true;
2935}
2936
2937static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
2938
2939 bool cuda_graph_update_required = false;
2940
2941 if (cuda_ctx->cuda_graph->instance == nullptr) {
2942 cuda_graph_update_required = true;
2943 }
2944
2945 // Check if the graph size has changed
2946 if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
2947 cuda_graph_update_required = true;
2948 cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2949 }
2950
2951 // Loop over nodes in GGML graph to determine if CUDA graph update is required
2952 // and store properties to allow this comparison for the next token
2953 for (int i = 0; i < cgraph->n_nodes; i++) {
2954 bool has_matching_properties = true;
2955 if (!cuda_graph_update_required) {
2956 has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2957 }
2958 if (!has_matching_properties) {
2959 cuda_graph_update_required = true;
2960 }
2961 set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2962 }
2963
2964 return cuda_graph_update_required;
2965}
2966
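// Try to update the existing graph executable in place; if cudaGraphExecUpdate reports that the
// instantiation constraints were violated, clear the error and re-instantiate the executable from
// the newly captured graph.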
2967static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
2968
2969#if CUDART_VERSION >= 12000
2970 cudaGraphExecUpdateResultInfo result_info;
2971 cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2972#else
2973 cudaGraphNode_t errorNode;
2974 cudaGraphExecUpdateResult result_info;
2975 cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
2976#endif // CUDART_VERSION >= 12000
2977
2978 if (stat == cudaErrorGraphExecUpdateFailure) {
2979#ifndef NDEBUG
2980 GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
2981#endif
2982
2983 // The pre-existing graph exec cannot be updated due to violated constraints
2984 // so instead clear error and re-instantiate
2985 (void)cudaGetLastError();
2986 CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
2987 cuda_ctx->cuda_graph->instance = nullptr;
2988 CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2989 } else {
2990 GGML_ASSERT(stat == cudaSuccess);
2991 }
2992}
2993#endif
2994
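// Returns true if the op sequence `ops` starting at node_idx forms one of the fusion patterns
// handled by this backend (fused top-k MoE subgraphs, mul_mat(+bias)+GLU, RMS_NORM+MUL(+ADD),
// scale+tanh+scale softcap) and the operand types/layouts are supported by the fused kernels.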
2995static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
2996#ifndef NDEBUG
2997 const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
2998 GGML_ASSERT(unary_ops.size() == num_unary);
2999#endif
3000
3001 //TODO: remove special case once ggml_can_fuse can handle empty nodes
3002 std::initializer_list<enum ggml_op> topk_moe_ops =
3003 ggml_cuda_topk_moe_ops(/*with_norm*/ false, /*delayed_softmax=*/false);
3004 std::initializer_list<enum ggml_op> topk_moe_ops_with_norm =
3005 ggml_cuda_topk_moe_ops(/*with_norm=*/true, /*delayed_softmax=*/false);
3006 std::initializer_list<enum ggml_op> topk_moe_ops_delayed_softmax =
3007 ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
3008
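// The three top-k MoE variants differ in subgraph length and in which intermediate nodes must
// remain live after fusion (the extra node indices passed to ggml_can_fuse_subgraph below).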
3009 if (ops.size() == topk_moe_ops_with_norm.size() &&
3010 ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
3011 ggml_tensor * softmax = cgraph->nodes[node_idx];
3012 ggml_tensor * weights = cgraph->nodes[node_idx + 9];
3013
3014 if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
3015 return true;
3016 }
3017 }
3018
3019 if (ops.size() == topk_moe_ops.size() &&
3020 ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
3021 ggml_tensor * softmax = cgraph->nodes[node_idx];
3022 ggml_tensor * weights = cgraph->nodes[node_idx + 4];
3023 if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
3024 return true;
3025 }
3026 }
3027
3028 if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
3029 ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
3030 ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
3031 ggml_tensor * weights = cgraph->nodes[node_idx + 5];
3032
3033 if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
3034 return true;
3035 }
3036 }
3037
3038 std::initializer_list<enum ggml_op> mul_mat_bias_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_GLU };
3039 std::initializer_list<enum ggml_op> mul_mat_id_bias_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_GLU };
3040
3041 std::initializer_list<enum ggml_op> mul_mat_id_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU };
3042 std::initializer_list<enum ggml_op> mul_mat_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT, GGML_OP_GLU };
3043
3044 if (ops.size() == 5 && (ggml_can_fuse_subgraph(cgraph, node_idx, mul_mat_bias_glu_ops, {node_idx + 4}) ||
3045 ggml_can_fuse_subgraph(cgraph, node_idx, mul_mat_id_bias_glu_ops, {node_idx + 4}))) {
3046
3047 const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
3048 const ggml_tensor * ffn_gate_bias = cgraph->nodes[node_idx + 1];
3049 const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 2];
3050 const ggml_tensor * ffn_up_bias = cgraph->nodes[node_idx + 3];
3051 const ggml_tensor * glu = cgraph->nodes[node_idx + 4];
3052
3053 if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu, ffn_up_bias, ffn_gate_bias)) {
3054 return true;
3055 }
3056 }
3057
3058 if (ops.size() == 3 && (ggml_can_fuse_subgraph(cgraph, node_idx, mul_mat_glu_ops, {node_idx + 2}) ||
3059 ggml_can_fuse_subgraph(cgraph, node_idx, mul_mat_id_glu_ops, {node_idx + 2}))) {
3060
3061 const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
3062 const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 1];
3063 const ggml_tensor * glu = cgraph->nodes[node_idx + 2];
3064
3065 if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
3066 return true;
3067 }
3068 }
3069
3070 if (!ggml_can_fuse(cgraph, node_idx, ops)) {
3071 return false;
3072 }
3073
3074 if ((ops.size() == 2 || ops.size() == 3) && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) {
3075 const ggml_tensor *rms_norm = cgraph->nodes[node_idx];
3076 const ggml_tensor *mul = cgraph->nodes[node_idx+1];
3077 const ggml_tensor *add = nullptr;
3078
3079 if (ops.size() == 3 && ops.begin()[2] == GGML_OP_ADD) {
3080 add = cgraph->nodes[node_idx+2];
3081 }
3082
3083 GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
3084 GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
3085
3086 //rms norm only supports F32
3087 if (mul->src[0]->type != GGML_TYPE_F32 ||
3088 mul->src[1]->type != GGML_TYPE_F32 ||
3089 mul->type != GGML_TYPE_F32) {
3090 return false;
3091 }
3092
3093 if (add && (add->src[0]->type != GGML_TYPE_F32 ||
3094 add->src[1]->type != GGML_TYPE_F32 ||
3095 add->type != GGML_TYPE_F32) ) {
3096 return false;
3097 }
3098
3099 //if rms norm is the B operand, then we don't handle broadcast
3100 if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) {
3101 return false;
3102 }
3103
3104 //rms_norm kernel assumes contiguous rows
3105 if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
3106 return false;
3107 }
3108
3109 if (add && (!ggml_is_contiguous(add->src[0]) || !ggml_is_contiguous_rows(add->src[1]))) {
3110 return false;
3111 }
3112
3113 return true;
3114 }
3115
3116 if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
3117 && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
3118 const ggml_tensor *scale = cgraph->nodes[node_idx];
3119 const ggml_tensor *tanh = cgraph->nodes[node_idx+1];
3120 const ggml_tensor *scale2 = cgraph->nodes[node_idx+2];
3121
3122 GGML_ASSERT(scale->src[0]->type == GGML_TYPE_F32);
3123 GGML_ASSERT(scale->type == GGML_TYPE_F32);
3124
3125 if (ggml_get_unary_op(tanh) != GGML_UNARY_OP_TANH) {
3126 return false;
3127 }
3128
3129 // Check for bias
3130 if (ggml_get_op_params_f32(scale, 1) != 0.0f || ggml_get_op_params_f32(scale2, 1) != 0.0f) {
3131 return false;
3132 }
3133
3134 return true;
3135 }
3136
3137 return false;
3138}
3139
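// Executes the ggml graph node by node, applying the fusions checked above. When CUDA graph
// capture is active, the same pass records the kernels into a CUDA graph, which is then
// instantiated (or updated) and launched.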
3140static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
3141 bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
3142 // flag indicating whether the device is an integrated GPU
3143 const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
3144
3145 while (!graph_evaluated_or_captured) {
3146 // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
3147 // With the use of CUDA graphs, the execution will be performed by the graph launch.
3148 if (!use_cuda_graph || cuda_graph_update_required) {
3149
3150 [[maybe_unused]] int prev_i = 0;
3151
3152 for (int i = 0; i < cgraph->n_nodes; i++) {
3153 ggml_tensor * node = cgraph->nodes[i];
3154#ifdef GGML_CUDA_DEBUG
3155 const int nodes_fused = i - prev_i - 1;
3156 prev_i = i;
3157 if (nodes_fused > 0) {
3158 GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
3159 }
3160#endif
3161
3162 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
3163 continue;
3164 }
3165
3166 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
3167 if (!disable_fusion) {
3168
3169 if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
3170 ggml_tensor * weights = cgraph->nodes[i + 9];
3171 ggml_tensor * selected_experts = cgraph->nodes[i + 3];
3172 ggml_tensor * clamp = cgraph->nodes[i + 7];
3173 ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
3174 /*delayed softmax*/ false, clamp);
3175 i += 9;
3176 continue;
3177 }
3178
3179 if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
3180 ggml_tensor * weights = cgraph->nodes[i + 4];
3181 ggml_tensor * selected_experts = cgraph->nodes[i + 3];
3182 ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
3183 /*delayed softmax*/ false);
3184 i += 4;
3185 continue;
3186 }
3187
3188 if (ggml_cuda_can_fuse(cgraph, i,
3189 ggml_cuda_topk_moe_ops(/*with norm*/ false, /*delayed softmax*/ true), {})) {
3190 ggml_tensor * weights = cgraph->nodes[i + 5];
3191 ggml_tensor * ids = cgraph->nodes[i + 1];
3192
3193 ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, ids, /*with norm*/ false,
3194 /*delayed_softmax*/ true);
3195 i += 5;
3196 continue;
3197 }
3198
3199 if (node->op == GGML_OP_ADD) {
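// try to fuse a chain of consecutive ADD nodes in which each ADD consumes the previous
// result and all second operands share the same layout (up to 8 ADD nodes in one kernel)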
3200 int n_fuse = 0;
3201 ggml_op ops[8];
3202 std::fill(ops, ops + 8, GGML_OP_ADD);
3203
3204 for (; n_fuse <= 6; ++n_fuse){
3205 if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) {
3206 break;
3207 }
3208 if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) {
3209 break;
3210 }
3211 if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) {
3212 break;
3213 }
3214 }
3215
3216 n_fuse++;
3217
3218 if (n_fuse > 1) {
3219 for (int j = 0; j < n_fuse - 1; ++j) {
3220 node->src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
3221 }
3222 cgraph->nodes[i + n_fuse - 1]->data = node->data;
3223 ggml_cuda_op_fused_add(*cuda_ctx, node, n_fuse);
3224 i += n_fuse - 1;
3225
3226 continue;
3227 }
3228 }
3229
3230 bool fused_mul_mat_vec = false;
3231 int fused_node_count = 0;
3232
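// look for the gate/up projection pattern around a GLU node, either with biases
// (MUL_MAT, ADD, MUL_MAT, ADD, GLU) / (MUL_MAT_ID, ADD_ID, MUL_MAT_ID, ADD_ID, GLU) or
// without them (MUL_MAT, MUL_MAT, GLU) / (MUL_MAT_ID, MUL_MAT_ID, GLU); when supported,
// the whole pattern is executed by a single fused mul-mat-vec kernel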
3233 for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
3234 const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
3235
3236 if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
3237 ggml_tensor * glu = cgraph->nodes[i + 4];
3238 ggml_tensor * gate_bias_n = glu->src[0];
3239 ggml_tensor * up_bias_n = glu->src[1];
3240
3241 // we don't assume the order of {gate, up}; instead we infer it from the bias tensor
3242 ggml_tensor * gate_n = nullptr;
3243 ggml_tensor * up_n = nullptr;
3244
3245 if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
3246 gate_n = cgraph->nodes[i];
3247 up_n = cgraph->nodes[i + 2];
3248 } else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
3249 gate_n = cgraph->nodes[i + 2];
3250 up_n = cgraph->nodes[i];
3251 } else {
3252 continue;
3253 }
3254
3255 auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
3256 if (op_bias == GGML_OP_ADD) {
3257 if (bias_node->src[0] == mul_node) {
3258 return bias_node->src[1];
3259 }
3260 if (bias_node->src[1] == mul_node) {
3261 return bias_node->src[0];
3262 }
3263 return (ggml_tensor *) nullptr;
3264 }
3265 GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
3266 GGML_ASSERT(bias_node->src[0] == mul_node);
3267 return bias_node->src[1];
3268 };
3269
3270 ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op);
3271 ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
3272
3273 if (!up_bias_tensor || !gate_bias_tensor) {
3274 continue;
3275 }
3276
3277 // we don't support repeating adds
3278 if (bias_op == GGML_OP_ADD &&
3279 (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
3280 !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
3281 continue;
3282 }
3283
3284 const ggml_tensor * src0 = up_n->src[0];
3285 const ggml_tensor * src1 = up_n->src[1];
3286 const ggml_tensor * ids = up_n->src[2];
3287
3288 if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
3289 ggml_cuda_mm_fusion_args_host fusion_data{};
3290 fusion_data.gate = gate_n->src[0];
3291 fusion_data.x_bias = up_bias_tensor;
3292 fusion_data.gate_bias = gate_bias_tensor;
3293 fusion_data.glu_op = ggml_get_glu_op(glu);
3294
3295 ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
3296 fused_mul_mat_vec = true;
3297 fused_node_count = 5;
3298 break;
3299 }
3300
3301 if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
3302 ggml_cuda_mm_fusion_args_host fusion_data{};
3303 fusion_data.gate = gate_n->src[0];
3304 fusion_data.x_bias = up_bias_tensor;
3305 fusion_data.gate_bias = gate_bias_tensor;
3306 fusion_data.glu_op = ggml_get_glu_op(glu);
3307
3308 ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
3309 fused_mul_mat_vec = true;
3310 fused_node_count = 5;
3311 break;
3312 }
3313 } else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
3314 ggml_tensor * glu = cgraph->nodes[i + 2];
3315 ggml_tensor * gate = glu->src[0];
3316 ggml_tensor * up = glu->src[1];
3317
3318 bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1])
3319 || (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
3320
3321 if (!ok) continue;
3322
3323 const ggml_tensor * src0 = up->src[0];
3324 const ggml_tensor * src1 = up->src[1];
3325 const ggml_tensor * ids = up->src[2];
3326
3327 if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
3328 ggml_cuda_mm_fusion_args_host fusion_data{};
3329 fusion_data.gate = gate->src[0];
3330 fusion_data.glu_op = ggml_get_glu_op(glu);
3331
3332 ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
3333 fused_mul_mat_vec = true;
3334 fused_node_count = 3;
3335 break;
3336 }
3337
3338 if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
3339 ggml_cuda_mm_fusion_args_host fusion_data{};
3340 fusion_data.gate = gate->src[0];
3341 fusion_data.glu_op = ggml_get_glu_op(glu);
3342
3343 ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
3344 fused_mul_mat_vec = true;
3345 fused_node_count = 3;
3346 break;
3347 }
3348 }
3349 }
3350
3351 if (fused_mul_mat_vec) {
3352 i += fused_node_count - 1;
3353 continue;
3354 }
3355
3356 fused_mul_mat_vec = false;
3357 fused_node_count = 0;
3358
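// otherwise, try to fuse a lone MUL_MAT/MUL_MAT_ID with its following bias ADD/ADD_ID
// into the mul-mat-vec kernel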
3359 for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
3360 const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
3361
3362 if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
3363 continue;
3364 }
3365
3366 ggml_tensor * mm_node = cgraph->nodes[i];
3367 ggml_tensor * bias_node = cgraph->nodes[i + 1];
3368
3369 ggml_tensor * bias_tensor = nullptr;
3370 if (bias_op == GGML_OP_ADD) {
3371 if (bias_node->src[0] == mm_node) {
3372 bias_tensor = bias_node->src[1];
3373 } else if (bias_node->src[1] == mm_node) {
3374 bias_tensor = bias_node->src[0];
3375 } else {
3376 continue;
3377 }
3378 } else {
3379 if (bias_node->src[0] != mm_node) {
3380 continue;
3381 }
3382 bias_tensor = bias_node->src[1];
3383 }
3384
3385 const ggml_tensor * src0 = mm_node->src[0];
3386 const ggml_tensor * src1 = mm_node->src[1];
3387 const ggml_tensor * ids = mm_node->src[2];
3388
3389 if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
3390 continue;
3391 }
3392
3393 if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
3394 continue;
3395 }
3396
3397 ggml_cuda_mm_fusion_args_host fusion_data{};
3398 fusion_data.x_bias = bias_tensor;
3399
3400 if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
3401 ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
3402 fused_mul_mat_vec = true;
3403 fused_node_count = 2;
3404 break;
3405 }
3406
3407 if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
3408 ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
3409 fused_mul_mat_vec = true;
3410 fused_node_count = 2;
3411 break;
3412 }
3413 }
3414
3415 if (fused_mul_mat_vec) {
3416 i += fused_node_count - 1;
3417 continue;
3418 }
3419
3420 if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
3421 ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
3422 i += 2;
3423 continue;
3424 }
3425
3426 if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) {
3427 ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
3428 i++;
3429 continue;
3430 }
3431
3432 if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
3433 i += 2;
3434 ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
3435 continue;
3436 }
3437 }
3438#ifndef NDEBUG
3439 assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
3440 for (int j = 0; j < GGML_MAX_SRC; j++) {
3441 if (node->src[j] != nullptr) {
3442 assert(node->src[j]->buffer);
3443 assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
3444 ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
3445 }
3446 }
3447#else
3448 GGML_UNUSED(integrated);
3449#endif // NDEBUG
3450
3451 bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
3452 if (!ok) {
3453 GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
3454 }
3455 GGML_ASSERT(ok);
3456 }
3457 }
3458
3459#ifdef USE_CUDA_GRAPH
3460 if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
3461 if (cuda_ctx->cuda_graph->graph != nullptr) {
3462 CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
3463 cuda_ctx->cuda_graph->graph = nullptr;
3464 }
3465
3466 CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
3467 graph_evaluated_or_captured = true; // CUDA graph has been captured
3468
3469 std::lock_guard<std::mutex> lock(ggml_cuda_lock);
3470 if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
3471 ggml_cuda_lock_cv.notify_all();
3472 }
3473 } else {
3474 graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
3475 }
3476 }
3477
3478 if (use_cuda_graph) {
3479 if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
3480 CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
3481 }
3482 if (cuda_graph_update_required) { // Update graph executable
3483 update_cuda_graph_executable(cuda_ctx);
3484 }
3485 // Launch graph
3486 CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
3487#else
3488 graph_evaluated_or_captured = true;
3489#endif // USE_CUDA_GRAPH
3490 }
3491}
3492
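// Backend entry point for graph execution: decides whether CUDA graphs can be used for this
// graph (env var, GPU architecture, node compatibility, update churn), determines whether a
// re-capture is needed, starts stream capture if so, and delegates the actual evaluation to
// evaluate_and_capture_cuda_graph().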
3493static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
3494 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
3495
3496 ggml_cuda_set_device(cuda_ctx->device);
3497
3498#ifdef USE_CUDA_GRAPH
3499 static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
3500
3501 // Objects required for CUDA Graph
3502 if (cuda_ctx->cuda_graph == nullptr) {
3503 cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
3504 }
3505
3506 bool use_cuda_graph = true;
3507 bool cuda_graph_update_required = false;
3508
3509 if (cuda_ctx->cuda_graph->graph == nullptr) {
3510 if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
3511 cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
3512#ifndef NDEBUG
3513 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
3514#endif
3515 }
3516 }
3517
3518 // Disable CUDA graphs in the presence of the env var, an old GPU, a use case that is changing too rapidly,
3519 // or a previous graph capture failure.
3520 // Also disable for multi-GPU for now. TODO: investigate
3521 if (disable_cuda_graphs_due_to_env
3522 || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
3523 || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
3524 || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
3525 use_cuda_graph = false;
3526 }
3527
3528 if (use_cuda_graph) {
3529 cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
3530
3531 use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
3532
3533 // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
3534 if (use_cuda_graph && cuda_graph_update_required) {
3535 cuda_ctx->cuda_graph->number_consecutive_updates++;
3536 } else {
3537 cuda_ctx->cuda_graph->number_consecutive_updates = 0;
3538 }
3539
3540 if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
3541 cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
3542#ifndef NDEBUG
3543 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
3544#endif
3545 }
3546 }
3547
3548 if (use_cuda_graph && cuda_graph_update_required) {
3549 // Start CUDA graph capture
3550 {
3551 std::lock_guard<std::mutex> lock(ggml_cuda_lock);
3552 ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
3553 }
3554
3555 CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
3556 }
3557
3558#else
3559 bool use_cuda_graph = false;
3560 bool cuda_graph_update_required = false;
3561#endif // USE_CUDA_GRAPH
3562
3563 bool graph_evaluated_or_captured = false;
3564
3565 evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
3566
3567 return GGML_STATUS_SUCCESS;
3568}
3569
3570static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
3571 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
3572
3573 CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
3574}
3575
3576static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
3577 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
3578
3579 if (ggml_backend_is_cuda(backend)) {
3580 CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
3581 } else {
3582#if 0
3583 // untested
3584 auto wait_fn = [](void * user_data) {
3585 ggml_backend_event_t event = (ggml_backend_event_t)user_data;
3586 ggml_backend_event_synchronize(event);
3587 };
3588
3589 CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
3590#endif
3591 GGML_ABORT("fatal error");
3592 }
3593}
3594
3595static const ggml_backend_i ggml_backend_cuda_interface = {
3596 /* .get_name = */ ggml_backend_cuda_get_name,
3597 /* .free = */ ggml_backend_cuda_free,
3598 /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
3599 /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
3600 /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
3601 /* .synchronize = */ ggml_backend_cuda_synchronize,
3602 /* .graph_plan_create = */ NULL,
3603 /* .graph_plan_free = */ NULL,
3604 /* .graph_plan_update = */ NULL,
3605 /* .graph_plan_compute = */ NULL,
3606 /* .graph_compute = */ ggml_backend_cuda_graph_compute,
3607 /* .event_record = */ ggml_backend_cuda_event_record,
3608 /* .event_wait = */ ggml_backend_cuda_event_wait,
3609 /* .graph_optimize = */ NULL,
3610};
3611
3612static ggml_guid_t ggml_backend_cuda_guid() {
3613 static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
3614 return &guid;
3615}
3616
3617bool ggml_backend_is_cuda(ggml_backend_t backend) {
3618 return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
3619}
3620
3621int ggml_backend_cuda_get_device_count() {
3622 return ggml_cuda_info().device_count;
3623}
3624
3625void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
3626 cudaDeviceProp prop;
3627 CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
3628 snprintf(description, description_size, "%s", prop.name);
3629}
3630
3631void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
3632 ggml_cuda_set_device(device);
3633
3634 CUDA_CHECK(cudaMemGetInfo(free, total));
3635}
3636
3637bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
3638 if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3639 return false;
3640 }
3641
3642#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA) || defined(GGML_USE_HIP)
3643 cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
3644 if (err != cudaSuccess) {
3645 // clear the error
3646 (void)cudaGetLastError();
3647
3648 GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
3649 size / 1024.0 / 1024.0, cudaGetErrorString(err));
3650 return false;
3651 }
3652 return true;
3653#else
3654 GGML_UNUSED(buffer);
3655 GGML_UNUSED(size);
3656 return false;
3657#endif // CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA) || defined(GGML_USE_HIP)
3658}
3659
3660void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
3661 if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3662 return;
3663 }
3664
3665 cudaError_t err = cudaHostUnregister(buffer);
3666 if (err != cudaSuccess) {
3667 // clear the error
3668 (void)cudaGetLastError();
3669 }
3670}
3671
3672
3673// backend device
3674
3675struct ggml_backend_cuda_device_context {
3676 int device;
3677 std::string name;
3678 std::string description;
3679 std::string pci_bus_id;
3680};
3681
3682static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
3683 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3684 return ctx->name.c_str();
3685}
3686
3687static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
3688 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3689 return ctx->description.c_str();
3690}
3691
3692static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
3693 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3694 ggml_cuda_set_device(ctx->device);
3695 CUDA_CHECK(cudaMemGetInfo(free, total));
3696}
3697
3698static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
3699 GGML_UNUSED(dev);
3700 return GGML_BACKEND_DEVICE_TYPE_GPU;
3701}
3702
3703static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
3704 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3705
3706 props->name = ggml_backend_cuda_device_get_name(dev);
3707 props->description = ggml_backend_cuda_device_get_description(dev);
3708 props->type = ggml_backend_cuda_device_get_type(dev);
3709 props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
3710 ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
3711
3712 bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
3713#ifdef GGML_CUDA_NO_PEER_COPY
3714 bool events = false;
3715#else
3716 bool events = true;
3717#endif
3718
3719 props->caps = {
3720 /* .async = */ true,
3721 /* .host_buffer = */ host_buffer,
3722 /* .buffer_from_host_ptr = */ false,
3723 /* .events = */ events,
3724 };
3725}
3726
3727static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
3728 GGML_UNUSED(params);
3729 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3730 return ggml_backend_cuda_init(ctx->device);
3731}
3732
3733static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
3734 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3735 return ggml_backend_cuda_buffer_type(ctx->device);
3736}
3737
3738static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) {
3739 GGML_UNUSED(dev);
3740 return ggml_backend_cuda_host_buffer_type();
3741}
3742
3743// TODO: move these functions here
3744static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
3745 ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
3746
3747 // split buffers can only be used with GGML_OP_MUL_MAT
3748 if (op->op != GGML_OP_MUL_MAT) {
3749 for (int i = 0; i < GGML_MAX_SRC; i++) {
3750 if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
3751 return false;
3752 }
3753 }
3754 }
3755
3756 // check if all the sources are allocated on this device
3757 for (int i = 0; i < GGML_MAX_SRC; i++) {
3758 if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
3759 ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
3760 if (buft_ctx->device != dev_ctx->device) {
3761 return false;
3762 }
3763 }
3764 }
3765
3766 switch (op->op) {
3767 case GGML_OP_UNARY:
3768 switch (ggml_get_unary_op(op)) {
3769 case GGML_UNARY_OP_ABS:
3770 case GGML_UNARY_OP_SGN:
3771 case GGML_UNARY_OP_NEG:
3772 case GGML_UNARY_OP_STEP:
3773 case GGML_UNARY_OP_GELU:
3774 case GGML_UNARY_OP_SILU:
3775 case GGML_UNARY_OP_RELU:
3776 case GGML_UNARY_OP_SIGMOID:
3777 case GGML_UNARY_OP_HARDSIGMOID:
3778 case GGML_UNARY_OP_HARDSWISH:
3779 case GGML_UNARY_OP_GELU_ERF:
3780 case GGML_UNARY_OP_GELU_QUICK:
3781 case GGML_UNARY_OP_TANH:
3782 case GGML_UNARY_OP_EXP:
3783 case GGML_UNARY_OP_ELU:
3784 case GGML_UNARY_OP_FLOOR:
3785 case GGML_UNARY_OP_CEIL:
3786 case GGML_UNARY_OP_ROUND:
3787 case GGML_UNARY_OP_TRUNC:
3788 return ggml_is_contiguous(op->src[0]);
3789 default:
3790 return false;
3791 }
3792 break;
3793 case GGML_OP_GLU:
3794 switch (ggml_get_glu_op(op)) {
3795 case GGML_GLU_OP_REGLU:
3796 case GGML_GLU_OP_GEGLU:
3797 case GGML_GLU_OP_SWIGLU:
3798 case GGML_GLU_OP_SWIGLU_OAI:
3799 case GGML_GLU_OP_GEGLU_ERF:
3800 case GGML_GLU_OP_GEGLU_QUICK:
3801 return ggml_is_contiguous_1(op->src[0]);
3802 default:
3803 return false;
3804 }
3805 break;
3806 case GGML_OP_MUL_MAT:
3807 case GGML_OP_MUL_MAT_ID:
3808 {
3809 struct ggml_tensor * a = op->src[0];
3810 struct ggml_tensor * b = op->src[1];
3811 if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
3812 if (a->ne[2] > 1 || a->ne[3] > 1) {
3813 return false;
3814 }
3815 // for small weight matrices the active device can end up without any rows, don't use row split in those cases
3816 // this avoids some edge cases (and the performance would not be good anyways)
3817 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
3818 int64_t row_low;
3819 int64_t row_high;
3820 get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
3821 if (row_low == row_high) {
3822 return false;
3823 }
3824 }
3825 if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
3826 return false;
3827 }
3828#ifdef GGML_USE_MUSA
3829 const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
3830 if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
3831 if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
3832 a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
3833 return false;
3834 }
3835 if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
3836 a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
3837 return false;
3838 }
3839 }
3840#endif // GGML_USE_MUSA
3841 switch (a->type) {
3842 case GGML_TYPE_F32:
3843 case GGML_TYPE_F16:
3844 case GGML_TYPE_Q4_0:
3845 case GGML_TYPE_Q4_1:
3846 case GGML_TYPE_Q5_0:
3847 case GGML_TYPE_Q5_1:
3848 case GGML_TYPE_Q8_0:
3849 case GGML_TYPE_MXFP4:
3850 case GGML_TYPE_Q2_K:
3851 case GGML_TYPE_Q3_K:
3852 case GGML_TYPE_Q4_K:
3853 case GGML_TYPE_Q5_K:
3854 case GGML_TYPE_Q6_K:
3855 case GGML_TYPE_Q8_K:
3856 case GGML_TYPE_IQ1_M:
3857 case GGML_TYPE_IQ1_S:
3858 case GGML_TYPE_IQ2_S:
3859 case GGML_TYPE_IQ2_XS:
3860 case GGML_TYPE_IQ2_XXS:
3861 case GGML_TYPE_IQ3_S:
3862 case GGML_TYPE_IQ3_XXS:
3863 case GGML_TYPE_IQ4_NL:
3864 case GGML_TYPE_IQ4_XS:
3865 case GGML_TYPE_BF16:
3866 return true;
3867 default:
3868 return false;
3869 }
3870 } break;
3871 case GGML_OP_OUT_PROD:
3872 return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
3873 case GGML_OP_GET_ROWS:
3874 {
3875 switch (op->src[0]->type) {
3876 case GGML_TYPE_F16:
3877 case GGML_TYPE_F32:
3878 case GGML_TYPE_BF16:
3879 case GGML_TYPE_I32:
3880 case GGML_TYPE_Q4_0:
3881 case GGML_TYPE_Q4_1:
3882 case GGML_TYPE_Q5_0:
3883 case GGML_TYPE_Q5_1:
3884 case GGML_TYPE_Q8_0:
3885 return true;
3886 default:
3887 return false;
3888 }
3889 } break;
3890 case GGML_OP_GET_ROWS_BACK:
3891 {
3892 return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
3893 } break;
3894 case GGML_OP_SET_ROWS:
3895 {
3896 return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
3897 op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
3898 op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
3899 op->src[0]->type == GGML_TYPE_F32 &&
3900 (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
3901 } break;
3902 case GGML_OP_SET:
3903 {
3904 const ggml_type t = op->type;
3905 return (t == GGML_TYPE_F32 || t == GGML_TYPE_I32) &&
3906 t == op->src[0]->type &&
3907 t == op->src[1]->type;
3908 } break;
3909 case GGML_OP_CPY:
3910 {
3911 ggml_type src0_type = op->src[0]->type;
3912 ggml_type src1_type = op->src[1]->type;
3913 if ((src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_F16) &&
3914 (src1_type == GGML_TYPE_F32 || src1_type == GGML_TYPE_BF16 || src1_type == GGML_TYPE_F16)
3915 ) {
3916 return true;
3917 }
3918 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
3919 return true;
3920 }
3921 if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
3922 return true;
3923 }
3924 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
3925 return true;
3926 }
3927 if (src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_F32) {
3928 return true;
3929 }
3930 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
3931 return true;
3932 }
3933 if (src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_F32) {
3934 return true;
3935 }
3936 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
3937 return true;
3938 }
3939 if (src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_F32) {
3940 return true;
3941 }
3942 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
3943 return true;
3944 }
3945 if (src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_F32) {
3946 return true;
3947 }
3948 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
3949 return true;
3950 }
3951 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) {
3952 return true;
3953 }
3954 if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) {
3955 return true;
3956 }
3957 if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
3958 return true;
3959 }
3960 return false;
3961 } break;
3962 case GGML_OP_DUP:
3963 {
3964 ggml_type src0_type = op->src[0]->type;
3965 return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3966 } break;
3967 case GGML_OP_ARGMAX:
3968 case GGML_OP_COUNT_EQUAL:
3969 {
3970 return true;
3971 } break;
3972 case GGML_OP_REPEAT:
3973 {
3974 ggml_type src0_type = op->src[0]->type;
3975 return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3976 } break;
3977 case GGML_OP_REPEAT_BACK:
3978 return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
3979 case GGML_OP_CONCAT:
3980 {
3981 ggml_type src0_type = op->src[0]->type;
3982 return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3983 } break;
3984 case GGML_OP_CONV_TRANSPOSE_1D:
3985 {
3986 ggml_type src0_type = op->src[0]->type;
3987 ggml_type src1_type = op->src[1]->type;
3988 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
3989 return true;
3990 }
3991 return false;
3992 } break;
3993 case GGML_OP_SILU_BACK:
3994 return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
3995 break;
3996 case GGML_OP_NORM:
3997 case GGML_OP_RMS_NORM:
3998 case GGML_OP_L2_NORM:
3999 return true;
4000 case GGML_OP_RMS_NORM_BACK:
4001 return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
4002 break;
4003 case GGML_OP_NONE:
4004 case GGML_OP_RESHAPE:
4005 case GGML_OP_VIEW:
4006 case GGML_OP_PERMUTE:
4007 case GGML_OP_TRANSPOSE:
4008 case GGML_OP_ADD:
4009 case GGML_OP_ADD_ID:
4010 case GGML_OP_ADD1:
4011 case GGML_OP_SUB:
4012 case GGML_OP_MUL:
4013 case GGML_OP_DIV:
4014 case GGML_OP_SCALE:
4015 case GGML_OP_SQR:
4016 case GGML_OP_SQRT:
4017 case GGML_OP_SIN:
4018 case GGML_OP_COS:
4019 case GGML_OP_CLAMP:
4020 case GGML_OP_LOG:
4021 return true;
4022 case GGML_OP_SSM_SCAN: {
4023 if (op->src[3]->ne[0] == 1) {
4024 // Mamba2
4025 // (kernel only supports (d_state == 128 || d_state == 256) && d_head % 16 == 0)
4026 return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % 16 == 0;
4027 } else {
4028 // Mamba
4029 // (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1)
4030 return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1;
4031 }
4032 }
4033 case GGML_OP_SSM_CONV: {
4034 // assumes d_inner % threads == 0
4035 return op->src[0]->ne[1] % 128 == 0;
4036 }
4037 case GGML_OP_CONT:
4038 return true;
4039 case GGML_OP_DIAG_MASK_INF:
4040 return true;
4041 case GGML_OP_SOFT_MAX:
4042 return true;
4043 case GGML_OP_SOFT_MAX_BACK: {
4044 float max_bias = 0.0f;
4045 memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
4046 return max_bias == 0.0f;
4047 }
4048 case GGML_OP_ROLL:
4049 if(op->src[0]->type == GGML_TYPE_F32) {
4050 return true;
4051 }
4052 return false;
4053 case GGML_OP_ROPE:
4054 case GGML_OP_ROPE_BACK: {
4055 return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
4056 }
4057 case GGML_OP_IM2COL:
4058 case GGML_OP_IM2COL_3D:
4059 case GGML_OP_CONV_2D:
4060 case GGML_OP_CONV_2D_DW:
4061 case GGML_OP_CONV_TRANSPOSE_2D:
4062 case GGML_OP_POOL_2D:
4063 case GGML_OP_ACC:
4064 return true;
4065 case GGML_OP_SUM:
4066 return ggml_is_contiguous_rows(op->src[0]);
4067 case GGML_OP_ARGSORT:
4068#ifndef GGML_CUDA_USE_CUB
4069 return op->src[0]->ne[0] <= 1024;
4070#else
4071 return true;
4072#endif
4073 case GGML_OP_SUM_ROWS:
4074 case GGML_OP_MEAN:
4075 case GGML_OP_GROUP_NORM:
4076 case GGML_OP_PAD:
4077 return ggml_is_contiguous(op->src[0]);
4078 case GGML_OP_UPSCALE:
4079 case GGML_OP_PAD_REFLECT_1D:
4080 case GGML_OP_ARANGE:
4081 case GGML_OP_TIMESTEP_EMBEDDING:
4082 case GGML_OP_LEAKY_RELU:
4083 case GGML_OP_RWKV_WKV6:
4084 case GGML_OP_GATED_LINEAR_ATTN:
4085 case GGML_OP_RWKV_WKV7:
4086 return true;
4087 case GGML_OP_FLASH_ATTN_EXT:
4088 return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
4089 case GGML_OP_CROSS_ENTROPY_LOSS:
4090 case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
4091 case GGML_OP_OPT_STEP_ADAMW:
4092 case GGML_OP_OPT_STEP_SGD:
4093 return true;
4094 default:
4095 return false;
4096 }
4097}
4098
4099static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
4100 ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
4101 const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
4102 return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
4103}
4104
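// Effective batch size of an op, used by the offload heuristic below: ops whose batch size is
// below min_batch_size are typically not worth the host-to-device transfer and stay on the CPU.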
4105static int64_t get_op_batch_size(const ggml_tensor * op) {
4106 switch (op->op) {
4107 case GGML_OP_GET_ROWS:
4108 return 0;
4109 case GGML_OP_MUL_MAT:
4110 return op->ne[1];
4111 case GGML_OP_MUL_MAT_ID:
4112 case GGML_OP_ROPE:
4113 case GGML_OP_ROPE_BACK:
4114 return op->ne[2];
4115 default:
4116 return ggml_nrows(op);
4117 }
4118}
4119
4120static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
4121 const int min_batch_size = 32;
4122
4123 return get_op_batch_size(op) >= min_batch_size;
4124
4125 GGML_UNUSED(dev);
4126}
4127
4128static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
4129#ifdef GGML_CUDA_NO_PEER_COPY
4130 return nullptr;
4131#else
4132 ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
4133
4134 ggml_cuda_set_device(dev_ctx->device);
4135
4136 cudaEvent_t event;
4137 CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
4138
4139 return new ggml_backend_event {
4140 /* .device = */ dev,
4141 /* .context = */ event,
4142 };
4143#endif
4144}
4145
4146static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
4147 GGML_UNUSED(dev);
4148
4149 CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
4150 delete event;
4151}
4152
4153static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
4154 GGML_UNUSED(dev);
4155 CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
4156}
4157
4158static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
4159 /* .get_name = */ ggml_backend_cuda_device_get_name,
4160 /* .get_description = */ ggml_backend_cuda_device_get_description,
4161 /* .get_memory = */ ggml_backend_cuda_device_get_memory,
4162 /* .get_type = */ ggml_backend_cuda_device_get_type,
4163 /* .get_props = */ ggml_backend_cuda_device_get_props,
4164 /* .init_backend = */ ggml_backend_cuda_device_init_backend,
4165 /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
4166 /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
4167 /* .buffer_from_host_ptr = */ NULL,
4168 /* .supports_op = */ ggml_backend_cuda_device_supports_op,
4169 /* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
4170 /* .offload_op = */ ggml_backend_cuda_device_offload_op,
4171 /* .event_new = */ ggml_backend_cuda_device_event_new,
4172 /* .event_free = */ ggml_backend_cuda_device_event_free,
4173 /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
4174};
4175
4176// backend reg
4177
4178struct ggml_backend_cuda_reg_context {
4179 std::vector<ggml_backend_dev_t> devices;
4180};
4181
4182static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) {
4183 GGML_UNUSED(reg);
4184 return GGML_CUDA_NAME;
4185}
4186
4187static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
4188 ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
4189 return ctx->devices.size();
4190}
4191
4192static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
4193 ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
4194 GGML_ASSERT(index < ctx->devices.size());
4195 return ctx->devices[index];
4196}
4197
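// Reports the compile-time configuration of this backend (CUDA architectures, forced MMQ/cuBLAS,
// VMM, peer copy, graphs, ...) as a list of name/value pairs terminated by a null entry.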
4198static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
4199 static std::vector<ggml_backend_feature> features = []() {
4200 std::vector<ggml_backend_feature> features;
4201 #define _STRINGIFY(...) #__VA_ARGS__
4202 #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
4203
4204 #ifdef __CUDA_ARCH_LIST__
4205 features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
4206 #endif
4207
4208 #ifdef GGML_CUDA_FORCE_MMQ
4209 features.push_back({ "FORCE_MMQ", "1" });
4210 #endif
4211
4212 #ifdef GGML_CUDA_FORCE_CUBLAS
4213 features.push_back({ "FORCE_CUBLAS", "1" });
4214 #endif
4215
4216 #ifndef GGML_USE_VMM
4217 features.push_back({ "NO_VMM", "1" });
4218 #endif
4219
4220 #ifdef GGML_CUDA_NO_PEER_COPY
4221 features.push_back({ "NO_PEER_COPY", "1" });
4222 #endif
4223
4224 #ifdef GGML_CUDA_USE_GRAPHS
4225 features.push_back({ "USE_GRAPHS", "1" });
4226 #endif
4227
4228 #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
4229 features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
4230 #endif
4231
4232 #ifdef GGML_CUDA_FA_ALL_QUANTS
4233 features.push_back({ "FA_ALL_QUANTS", "1" });
4234 #endif
4235
4236 #undef _STRINGIFY
4237 #undef STRINGIFY
4238
4239 features.push_back({ nullptr, nullptr });
4240
4241 return features;
4242 }();
4243
4244 return features.data();
4245
4246 GGML_UNUSED(reg);
4247}
4248
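// Exposes optional CUDA-specific entry points by name so that callers loading the backend
// dynamically can resolve them without linking against this library directly.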
4249static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
4250 GGML_UNUSED(reg);
4251 if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
4252 return (void *)ggml_backend_cuda_split_buffer_type;
4253 }
4254 if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
4255 return (void *)ggml_backend_cuda_register_host_buffer;
4256 }
4257 if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
4258 return (void *)ggml_backend_cuda_unregister_host_buffer;
4259 }
4260 if (strcmp(name, "ggml_backend_get_features") == 0) {
4261 return (void *)ggml_backend_cuda_get_features;
4262 }
4263 return nullptr;
4264}
4265
4266static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
4267 /* .get_name = */ ggml_backend_cuda_reg_get_name,
4268 /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
4269 /* .get_device = */ ggml_backend_cuda_reg_get_device,
4270 /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
4271};
4272
4273// backend registry
4274ggml_backend_reg_t ggml_backend_cuda_reg() {
4275 static ggml_backend_reg reg;
4276 static bool initialized = false;
4277
4278 {
4279 static std::mutex mutex;
4280 std::lock_guard<std::mutex> lock(mutex);
4281 if (!initialized) {
4282 ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
4283
4284 for (int i = 0; i < ggml_cuda_info().device_count; i++) {
4285 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
4286 dev_ctx->device = i;
4287 dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
4288
4289 cudaDeviceProp prop;
4290 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
4291 dev_ctx->description = prop.name;
4292
4293 char pci_bus_id[16] = {};
4294 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
4295 dev_ctx->pci_bus_id = pci_bus_id;
4296
4297 ggml_backend_dev_t dev = new ggml_backend_device {
4298 /* .iface = */ ggml_backend_cuda_device_interface,
4299 /* .reg = */ &reg,
4300 /* .context = */ dev_ctx
4301 };
4302 ctx->devices.push_back(dev);
4303 }
4304
4305 reg = ggml_backend_reg {
4306 /* .api_version = */ GGML_BACKEND_API_VERSION,
4307 /* .iface = */ ggml_backend_cuda_reg_interface,
4308 /* .context = */ ctx
4309 };
4310 }
4311
4312 initialized = true;
4313 }
4314
4315 return &reg;
4316}
4317
4318ggml_backend_t ggml_backend_cuda_init(int device) {
4319 if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
4320 GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
4321 return nullptr;
4322 }
4323
4324 ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
4325 if (ctx == nullptr) {
4326 GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
4327 return nullptr;
4328 }
4329
4330 ggml_backend_t cuda_backend = new ggml_backend {
4331 /* .guid = */ ggml_backend_cuda_guid(),
4332 /* .iface = */ ggml_backend_cuda_interface,
4333 /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
4334 /* .context = */ ctx,
4335 };
4336
4337 return cuda_backend;
4338}
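
// Minimal usage sketch (illustrative only, not part of this translation unit; assumes the caller
// has created a ggml context `ctx` and built a graph `gf` via ggml_build_forward_expand):
//
//   ggml_backend_t backend = ggml_backend_cuda_init(0);
//   ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
//   // ... upload input data with ggml_backend_tensor_set ...
//   ggml_backend_graph_compute(backend, gf);
//   ggml_backend_buffer_free(buf);
//   ggml_backend_free(backend);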
4339
4340GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
4341