1#include "ggml-cuda.h"
2#include "ggml-impl.h"
3#include "ggml-backend-impl.h"
4
5#include "ggml-cuda/common.cuh"
6#include "ggml-cuda/acc.cuh"
7#include "ggml-cuda/add-id.cuh"
8#include "ggml-cuda/arange.cuh"
9#include "ggml-cuda/argmax.cuh"
10#include "ggml-cuda/argsort.cuh"
11#include "ggml-cuda/binbcast.cuh"
12#include "ggml-cuda/clamp.cuh"
13#include "ggml-cuda/concat.cuh"
14#include "ggml-cuda/conv-transpose-1d.cuh"
15#include "ggml-cuda/conv2d.cuh"
16#include "ggml-cuda/conv2d-dw.cuh"
17#include "ggml-cuda/conv2d-transpose.cuh"
18#include "ggml-cuda/convert.cuh"
19#include "ggml-cuda/count-equal.cuh"
20#include "ggml-cuda/cpy.cuh"
21#include "ggml-cuda/cross-entropy-loss.cuh"
22#include "ggml-cuda/diagmask.cuh"
23#include "ggml-cuda/fattn.cuh"
24#include "ggml-cuda/getrows.cuh"
25#include "ggml-cuda/im2col.cuh"
26#include "ggml-cuda/mmf.cuh"
27#include "ggml-cuda/mmq.cuh"
28#include "ggml-cuda/mmvf.cuh"
29#include "ggml-cuda/mmvq.cuh"
30#include "ggml-cuda/norm.cuh"
31#include "ggml-cuda/opt-step-adamw.cuh"
32#include "ggml-cuda/opt-step-sgd.cuh"
33#include "ggml-cuda/out-prod.cuh"
34#include "ggml-cuda/pad.cuh"
35#include "ggml-cuda/pool2d.cuh"
36#include "ggml-cuda/quantize.cuh"
37#include "ggml-cuda/rope.cuh"
38#include "ggml-cuda/roll.cuh"
39#include "ggml-cuda/scale.cuh"
40#include "ggml-cuda/softcap.cuh"
41#include "ggml-cuda/softmax.cuh"
42#include "ggml-cuda/ssm-conv.cuh"
43#include "ggml-cuda/ssm-scan.cuh"
44#include "ggml-cuda/sum.cuh"
45#include "ggml-cuda/sumrows.cuh"
46#include "ggml-cuda/mean.cuh"
47#include "ggml-cuda/tsembd.cuh"
48#include "ggml-cuda/topk-moe.cuh"
49#include "ggml-cuda/unary.cuh"
50#include "ggml-cuda/upscale.cuh"
51#include "ggml-cuda/wkv.cuh"
52#include "ggml-cuda/gla.cuh"
53#include "ggml-cuda/set.cuh"
54#include "ggml-cuda/set-rows.cuh"
55#include "ggml-cuda/pad_reflect_1d.cuh"
56#include "ggml.h"
57
58#include <algorithm>
59#include <array>
60#include <atomic>
61#include <charconv>
62#include <cinttypes>
63#include <condition_variable>
64#include <cstddef>
65#include <cstdint>
66#include <float.h>
67#include <initializer_list>
68#include <limits>
69#include <map>
70#include <memory>
71#include <mutex>
72#include <stdarg.h>
73#include <stdio.h>
74#include <stdlib.h>
75#include <string>
76#include <vector>
77
78static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
79
80[[noreturn]]
81void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
82 int id = -1; // in case cudaGetDevice fails
83 (void)cudaGetDevice(device: &id);
84
85 GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
86 GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
87 GGML_LOG_ERROR(" %s\n", stmt);
88 // abort with GGML_ABORT to get a stack trace
89 GGML_ABORT(GGML_CUDA_NAME " error");
90}
91
92// this is faster on Windows
93// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
94void ggml_cuda_set_device(int device) {
95 int current_device;
96 CUDA_CHECK(cudaGetDevice(device: &current_device));
97
98 if (device == current_device) {
99 return;
100 }
101
102 CUDA_CHECK(cudaSetDevice(device));
103}
104
105int ggml_cuda_get_device() {
106 int id;
107 CUDA_CHECK(cudaGetDevice(device: &id));
108 return id;
109}
110
111static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
112 ggml_cuda_set_device(device);
113 cudaError_t err;
114 if (getenv(name: "GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
115 err = cudaMallocManaged(devPtr: ptr, size);
116#if defined(GGML_USE_HIP)
117 if (err == hipSuccess) {
118 CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
119 }
120
121 // fall back to cudaMalloc if not supported (e.g. on Windows)
122 if (err == hipErrorNotSupported) {
123 static bool warned_unsupported = false;
124 if (!warned_unsupported) {
125 GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
126 warned_unsupported = true;
127 }
128
129 err = cudaMalloc(ptr, size);
130 }
131#endif // defined(GGML_USE_HIP)
132 } else {
133 err = cudaMalloc(devPtr: ptr, size);
134 }
135 return err;
136}
137
138#if defined(GGML_USE_HIP)
139static int ggml_cuda_parse_id(char devName[]) {
140 // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
141 // these values are not stable so this is susceptible to breakage
142 // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
143 int archMajor = 0x0;
144 int archMinor = 0x0;
145 int archNum = GGML_CUDA_CC_OFFSET_AMD;
146 int archLen = strlen(devName);
147 char archName[archLen + 1];
148
149 // strip leading 'gfx' while copying into our buffer
150 if (archLen > 3) {
151 strcpy(archName, &devName[3]);
152 archLen -= 3;
153 }
154
155 // trim trailing :xnack- or :sramecc- statuses
156 archLen = strcspn(archName, ":");
157 archName[archLen] = '\0';
158
159 // tease out the version information
160 if (archLen > 8) {
161 // versions labeled generic use '-' as delimiter
162 // strip the trailing "-generic" then iterate through what remains
163 if ((strstr(archName, "-generic"))) {
164 archName[archLen - 8] = '\0';
165 char * pch;
166 if ((pch = strtok(archName, "-"))) {
167 archMajor = (int)strtoul(pch, 0, 16);
168 if ((pch = strtok(NULL, "-"))) {
169 archMinor = 0x10 * (int)strtoul(pch, 0, 16);
170 }
171 }
172 }
173 } else if (archLen >= 3) {
174 // last two digits should be the minor * 0x10 + stepping
175 archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
176 archName[archLen - 2] = '\0';
177
178 // only the major version remains
179 archMajor = (int)strtoul(archName, 0, 16);
180 }
181 archNum += archMajor * 0x100;
182 archNum += archMinor;
183 return archNum;
184}
185#endif // defined(GGML_USE_HIP)
186
187static ggml_cuda_device_info ggml_cuda_init() {
188 ggml_cuda_device_info info = {};
189
190 cudaError_t err = cudaGetDeviceCount(&info.device_count);
191 if (err != cudaSuccess) {
192 GGML_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
193 return info;
194 }
195
196 GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
197
198 int64_t total_vram = 0;
199#ifdef GGML_CUDA_FORCE_MMQ
200 GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
201#else
202 GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
203#endif // GGML_CUDA_FORCE_MMQ
204#ifdef GGML_CUDA_FORCE_CUBLAS
205 GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
206#else
207 GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
208#endif // GGML_CUDA_FORCE_CUBLAS
209 GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
210
211 std::vector<std::pair<int, std::string>> turing_devices_without_mma;
212 for (int id = 0; id < info.device_count; ++id) {
213 int device_vmm = 0;
214
215#if defined(GGML_USE_VMM)
216 CUdevice device;
217 CU_CHECK(cuDeviceGet(&device, id));
218 CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
219
220 if (device_vmm) {
221 CUmemAllocationProp alloc_prop = {};
222 alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
223 alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
224 alloc_prop.location.id = id;
225 CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
226 }
227#endif // defined(GGML_USE_VMM)
228 info.devices[id].vmm = !!device_vmm;
229
230 cudaDeviceProp prop;
231 CUDA_CHECK(cudaGetDeviceProperties(prop: &prop, device: id));
232
233 info.default_tensor_split[id] = total_vram;
234 total_vram += prop.totalGlobalMem;
235 info.devices[id].integrated = false; // Temporarily disabled due to issues with corrupted output (e.g. #15034)
236 info.devices[id].nsm = prop.multiProcessorCount;
237 info.devices[id].smpb = prop.sharedMemPerBlock;
238 info.devices[id].warp_size = prop.warpSize;
239#if defined(GGML_USE_HIP)
240 info.devices[id].smpbo = prop.sharedMemPerBlock;
241
242 info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
243 if ((info.devices[id].cc & 0xff00) == 0x0) {
244 GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
245 id, prop.name, prop.gcnArchName, prop.major, prop.minor);
246
247 // Fallback to prop.major and prop.minor
248 if (prop.major > 0) {
249 info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
250 info.devices[id].cc += prop.minor * 0x10;
251 }
252 }
253 GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
254 id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
255 device_vmm ? "yes" : "no", prop.warpSize);
256#elif defined(GGML_USE_MUSA)
257 // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
258 info.devices[id].warp_size = 32;
259 info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
260 info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
261 info.devices[id].cc += prop.minor * 0x10;
262 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
263 id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
264#else
265 info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
266 info.devices[id].cc = 100*prop.major + 10*prop.minor;
267 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
268 id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
269 std::string device_name(prop.name);
270 if (device_name == "NVIDIA GeForce MX450") {
271 turing_devices_without_mma.push_back(x: { id, device_name });
272 } else if (device_name == "NVIDIA GeForce MX550") {
273 turing_devices_without_mma.push_back(x: { id, device_name });
274 } else if (device_name.substr(pos: 0, n: 21) == "NVIDIA GeForce GTX 16") {
275 turing_devices_without_mma.push_back(x: { id, device_name });
276 }
277
278 // Temporary performance fix:
279 // Setting device scheduling strategy for iGPUs with cc121 to "spinning" to avoid delays in cuda synchronize calls.
280 // TODO: Check for future drivers the default scheduling strategy and
281 // remove this call again when cudaDeviceScheduleSpin is default.
282 if (prop.major == 12 && prop.minor == 1) {
283 CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
284 }
285
286#endif // defined(GGML_USE_HIP)
287 }
288
289 if (ggml_cuda_highest_compiled_arch(GGML_CUDA_CC_TURING) >= GGML_CUDA_CC_TURING && !turing_devices_without_mma.empty()) {
290 GGML_LOG_INFO("The following devices will have suboptimal performance due to a lack of tensor cores:\n");
291 for (size_t device_pos = 0; device_pos < turing_devices_without_mma.size(); device_pos++) {
292 GGML_LOG_INFO(
293 " Device %d: %s\n", turing_devices_without_mma[device_pos].first, turing_devices_without_mma[device_pos].second.c_str());
294 }
295 GGML_LOG_INFO(
296 "Consider compiling with CMAKE_CUDA_ARCHITECTURES=61-virtual;80-virtual and DGGML_CUDA_FORCE_MMQ to force the use of the Pascal code for Turing.\n");
297 }
298
299 for (int id = 0; id < info.device_count; ++id) {
300 info.default_tensor_split[id] /= total_vram;
301 }
302
303 // configure logging to stdout
304 // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
305
306 return info;
307}
308
309const ggml_cuda_device_info & ggml_cuda_info() {
310 static ggml_cuda_device_info info = ggml_cuda_init();
311 return info;
312}
313
314// #define DEBUG_CUDA_MALLOC
315
316// buffer pool for cuda (legacy)
317struct ggml_cuda_pool_leg : public ggml_cuda_pool {
318 static const int MAX_BUFFERS = 256;
319
320 int device;
321 struct ggml_cuda_buffer {
322 void * ptr = nullptr;
323 size_t size = 0;
324 };
325
326 ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
327 size_t pool_size = 0;
328
329 explicit ggml_cuda_pool_leg(int device) :
330 device(device) {
331 }
332
333 ~ggml_cuda_pool_leg() {
334 ggml_cuda_set_device(device);
335 for (int i = 0; i < MAX_BUFFERS; ++i) {
336 ggml_cuda_buffer & b = buffer_pool[i];
337 if (b.ptr != nullptr) {
338 CUDA_CHECK(cudaFree(devPtr: b.ptr));
339 pool_size -= b.size;
340 }
341 }
342 GGML_ASSERT(pool_size == 0);
343 }
344
345 void * alloc(size_t size, size_t * actual_size) override {
346#ifdef DEBUG_CUDA_MALLOC
347 int nnz = 0;
348 size_t max_size = 0;
349#endif
350 size_t best_diff = 1ull << 36;
351 int ibest = -1;
352 for (int i = 0; i < MAX_BUFFERS; ++i) {
353 ggml_cuda_buffer& b = buffer_pool[i];
354 if (b.ptr != nullptr) {
355#ifdef DEBUG_CUDA_MALLOC
356 ++nnz;
357 if (b.size > max_size) max_size = b.size;
358#endif
359 if (b.size >= size) {
360 size_t diff = b.size - size;
361 if (diff < best_diff) {
362 best_diff = diff;
363 ibest = i;
364 if (!best_diff) {
365 void * ptr = b.ptr;
366 *actual_size = b.size;
367 b.ptr = nullptr;
368 b.size = 0;
369 return ptr;
370 }
371 }
372 }
373 }
374 }
375 if (ibest >= 0) {
376 ggml_cuda_buffer& b = buffer_pool[ibest];
377 void * ptr = b.ptr;
378 *actual_size = b.size;
379 b.ptr = nullptr;
380 b.size = 0;
381 return ptr;
382 }
383 void * ptr;
384 size_t look_ahead_size = (size_t) (1.05 * size);
385 look_ahead_size = 256 * ((look_ahead_size + 255)/256);
386 ggml_cuda_set_device(device);
387 CUDA_CHECK(ggml_cuda_device_malloc(ptr: &ptr, size: look_ahead_size, device));
388 *actual_size = look_ahead_size;
389 pool_size += look_ahead_size;
390#ifdef DEBUG_CUDA_MALLOC
391 GGML_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
392 (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
393#endif
394 return ptr;
395 }
396
397 void free(void * ptr, size_t size) override {
398 for (int i = 0; i < MAX_BUFFERS; ++i) {
399 ggml_cuda_buffer& b = buffer_pool[i];
400 if (b.ptr == nullptr) {
401 b.ptr = ptr;
402 b.size = size;
403 return;
404 }
405 }
406 GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
407 ggml_cuda_set_device(device);
408 CUDA_CHECK(cudaFree(devPtr: ptr));
409 pool_size -= size;
410 }
411};
412
413// pool with virtual memory
414#if defined(GGML_USE_VMM)
415struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
416 static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
417
418 int device;
419 CUdeviceptr pool_addr = 0;
420 size_t pool_used = 0;
421 size_t pool_size = 0;
422 size_t granularity;
423#if defined(GGML_USE_HIP)
424 std::vector<std::pair<CUdeviceptr, size_t>> mappings;
425#endif
426
427 explicit ggml_cuda_pool_vmm(int device) :
428 device(device),
429 granularity(ggml_cuda_info().devices[device].vmm_granularity) {
430 }
431
432 ~ggml_cuda_pool_vmm() {
433 if (pool_addr != 0) {
434#if defined(GGML_USE_HIP)
435 // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
436 for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
437 CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
438 }
439#else
440 CU_CHECK(cuMemUnmap(pool_addr, pool_size));
441#endif
442 CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
443 }
444 }
445
446 void * alloc(size_t size, size_t * actual_size) override {
447 // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
448 const size_t alignment = 128;
449 size = alignment * ((size + alignment - 1) / alignment);
450
451 size_t avail = pool_size - pool_used;
452
453 if (size > avail) {
454 // round up to the next multiple of the granularity
455 size_t reserve_size = size - avail;
456 reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
457
458 GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
459
460 // allocate more physical memory
461 CUmemAllocationProp prop = {};
462 prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
463 prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
464 prop.location.id = device;
465 CUmemGenericAllocationHandle handle;
466 CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
467
468 // reserve virtual address space (if not already reserved)
469 if (pool_addr == 0) {
470 CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
471 }
472
473 // map at the end of the pool
474 CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
475 CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
476#if defined(GGML_USE_HIP)
477 mappings.push_back({start_ptr, reserve_size});
478#endif
479
480 // the memory allocation handle is no longer needed after mapping
481 CU_CHECK(cuMemRelease(handle));
482
483 // set access
484 CUmemAccessDesc access = {};
485 access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
486 access.location.id = device;
487 access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
488 CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
489
490 // add to the pool
491 pool_size += reserve_size;
492
493 //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
494 // device, (unsigned long long) (pool_size/1024/1024),
495 // (unsigned long long) (reserve_size/1024/1024));
496 }
497
498 GGML_ASSERT(pool_addr != 0);
499
500 void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
501 *actual_size = size;
502 pool_used += size;
503
504#ifdef DEBUG_CUDA_MALLOC
505 printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
506#endif
507
508 return ptr;
509 }
510
511 void free(void * ptr, size_t size) override {
512#ifdef DEBUG_CUDA_MALLOC
513 printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr);
514#endif
515
516 pool_used -= size;
517
518 // all deallocations must be in reverse order of the allocations
519 GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
520 }
521};
522#endif // defined(GGML_USE_VMM)
523
524std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
525#if defined(GGML_USE_VMM)
526 if (ggml_cuda_info().devices[device].vmm) {
527 return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
528 }
529#endif // defined(GGML_USE_VMM)
530 return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
531}
532
533// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
534// this lock is used to ensure that no cuBLAS handle is destroyed while a graph is being captured
535
536static std::mutex ggml_cuda_lock;
537static std::condition_variable ggml_cuda_lock_cv;
538static std::atomic<int> ggml_cuda_lock_counter;
539
540ggml_backend_cuda_context::~ggml_backend_cuda_context() {
541 std::unique_lock<std::mutex> lock(ggml_cuda_lock);
542 ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
543
544 if (copy_event != nullptr) {
545 CUDA_CHECK(cudaEventDestroy(copy_event));
546 }
547 for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
548 for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
549 if (streams[i][j] != nullptr) {
550 CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
551 }
552 }
553 if (cublas_handles[i] != nullptr) {
554 CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
555 }
556 }
557}
558
559
560// cuda buffer
561
562struct ggml_backend_cuda_buffer_context {
563 int device;
564 void * dev_ptr = nullptr;
565 std::string name;
566
567 ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
568 device(device), dev_ptr(dev_ptr),
569 name(GGML_CUDA_NAME + std::to_string(device)) {
570 }
571
572 ~ggml_backend_cuda_buffer_context() {
573 CUDA_CHECK(cudaFree(devPtr: dev_ptr));
574 }
575};
576
577static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
578 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
579 delete ctx;
580}
581
582static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
583 return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
584}
585
586static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
587 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
588 return ctx->dev_ptr;
589}
590
591static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
592 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
593
594 if (tensor->view_src != NULL) {
595 assert(tensor->view_src->buffer->buft == buffer->buft);
596 return GGML_STATUS_SUCCESS;
597 }
598
599 if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
600 // initialize padding to 0 to avoid possible NaN values
601 const size_t original_size = ggml_nbytes(tensor);
602 const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
603
604 if (padded_size > original_size) {
605 ggml_cuda_set_device(device: ctx->device);
606 CUDA_CHECK(cudaMemset(devPtr: (char *)tensor->data + original_size, value: 0, count: padded_size - original_size));
607 }
608 }
609 return GGML_STATUS_SUCCESS;
610}
611
612static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
613 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
614
615 ggml_cuda_set_device(device: ctx->device);
616 CUDA_CHECK(cudaMemsetAsync(devPtr: (char *)tensor->data + offset, value, count: size, cudaStreamPerThread));
617 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
618}
619
620static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
621 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
622
623 ggml_cuda_set_device(device: ctx->device);
624 CUDA_CHECK(cudaMemcpyAsync(dst: (char *)tensor->data + offset, src: data, count: size, kind: cudaMemcpyHostToDevice, cudaStreamPerThread));
625 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
626}
627
628static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
629 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
630
631 ggml_cuda_set_device(device: ctx->device);
632 CUDA_CHECK(cudaMemcpyAsync(dst: data, src: (const char *)tensor->data + offset, count: size, kind: cudaMemcpyDeviceToHost, cudaStreamPerThread));
633 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
634}
635
636static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
637 if (ggml_backend_buffer_is_cuda(src->buffer)) {
638 ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
639 ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context;
640 if (src_ctx->device == dst_ctx->device) {
641 CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
642 } else {
643#ifdef GGML_CUDA_NO_PEER_COPY
644 return false;
645#else
646 CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
647#endif
648 }
649 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
650 return true;
651 }
652 return false;
653
654 GGML_UNUSED(buffer);
655}
656
657static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
658 ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
659
660 ggml_cuda_set_device(device: ctx->device);
661 CUDA_CHECK(cudaMemsetAsync(ctx->dev_ptr, value, buffer->size, cudaStreamPerThread));
662 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
663}
664
665static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
666 /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
667 /* .get_base = */ ggml_backend_cuda_buffer_get_base,
668 /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
669 /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
670 /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
671 /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
672 /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
673 /* .clear = */ ggml_backend_cuda_buffer_clear,
674 /* .reset = */ NULL,
675};
676
677// cuda buffer type
678struct ggml_backend_cuda_buffer_type_context {
679 int device;
680 std::string name;
681};
682
683static const char * ggml_backend_cuda_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
684 ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
685
686 return ctx->name.c_str();
687}
688
689static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
690 return buft->iface.get_name == ggml_backend_cuda_buffer_type_get_name;
691}
692
693static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
694 ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
695
696 ggml_cuda_set_device(device: buft_ctx->device);
697
698 void * dev_ptr;
699 cudaError_t err = ggml_cuda_device_malloc(ptr: &dev_ptr, size, device: buft_ctx->device);
700 if (err != cudaSuccess) {
701 // clear the error
702 (void)cudaGetLastError();
703 GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(error: err));
704 return nullptr;
705 }
706
707 ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
708
709 return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
710}
711
712static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
713 return 128;
714
715 GGML_UNUSED(buft);
716}
717
718static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
719 size_t size = ggml_nbytes(tensor);
720 int64_t ne0 = tensor->ne[0];
721
722 if (ggml_is_quantized(tensor->type)) {
723 if (ne0 % MATRIX_ROW_PADDING != 0) {
724 GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
725 size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
726 }
727 }
728
729 return size;
730
731 GGML_UNUSED(buft);
732}
733
734static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
735 /* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
736 /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
737 /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
738 /* .get_max_size = */ NULL, // defaults to SIZE_MAX
739 /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
740 /* .is_host = */ NULL,
741};
742
743ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
744 static std::mutex mutex;
745 std::lock_guard<std::mutex> lock(mutex);
746
747 if (device >= ggml_backend_cuda_get_device_count()) {
748 return nullptr;
749 }
750
751 static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
752
753 static bool ggml_backend_cuda_buffer_type_initialized = false;
754
755 if (!ggml_backend_cuda_buffer_type_initialized) {
756 for (int i = 0; i < ggml_backend_cuda_get_device_count(); i++) {
757 ggml_backend_cuda_buffer_types[i] = {
758 /* .iface = */ ggml_backend_cuda_buffer_type_interface,
759 /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), i),
760 /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
761 };
762 }
763 ggml_backend_cuda_buffer_type_initialized = true;
764 }
765
766 return &ggml_backend_cuda_buffer_types[device];
767}
768
769// cuda split buffer
770
771static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
772 int64_t row_rounding = 0;
773 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
774 if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
775 continue;
776 }
777
778 const int cc = ggml_cuda_info().devices[id].cc;
779 row_rounding = std::max(a: row_rounding, b: (int64_t)get_mmq_y_host(cc));
780 }
781 return row_rounding;
782}
783
784static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
785 const int64_t nrows = ggml_nrows(tensor);
786 const int64_t rounding = get_row_rounding(tensor_split);
787
788 *row_low = id == 0 ? 0 : nrows*tensor_split[id];
789 *row_low -= *row_low % rounding;
790
791 if (id == ggml_backend_cuda_get_device_count() - 1) {
792 *row_high = nrows;
793 } else {
794 *row_high = nrows*tensor_split[id + 1];
795 *row_high -= *row_high % rounding;
796 }
797}
798
799static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
800 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
801
802 return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
803}
804
805struct ggml_backend_cuda_split_buffer_type_context {
806 int main_device;
807 std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
808 std::string name;
809};
810
811struct ggml_backend_cuda_split_buffer_context {
812 ~ggml_backend_cuda_split_buffer_context() {
813 for (ggml_tensor_extra_gpu * extra : tensor_extras) {
814 for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
815 for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
816 if (extra->events[id][is] != nullptr) {
817 CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
818 }
819 }
820 if (extra->data_device[id] != nullptr) {
821 CUDA_CHECK(cudaFree(extra->data_device[id]));
822 }
823 }
824 delete extra;
825 }
826 }
827
828 std::vector<ggml_tensor_extra_gpu *> tensor_extras;
829};
830
831
832static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
833 ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
834 delete ctx;
835}
836
837static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
838 // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
839 return (void *)0x1000;
840
841 GGML_UNUSED(buffer);
842}
843
844static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
845 GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
846 GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
847
848 ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
849 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
850
851 const int64_t ne0 = tensor->ne[0];
852
853 ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
854 ctx->tensor_extras.push_back(extra);
855
856 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
857 int64_t row_low, row_high;
858 get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
859
860 int64_t nrows_split = row_high - row_low;
861 if (nrows_split == 0) {
862 continue;
863 }
864
865 size_t size = ggml_nbytes_split(tensor, nrows_split);
866 const size_t original_size = size;
867
868 // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
869 if (ne0 % MATRIX_ROW_PADDING != 0) {
870 size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
871 }
872
873 // FIXME: do not crash if cudaMalloc fails
874 // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
875 ggml_cuda_set_device(device: id);
876 char * buf;
877 CUDA_CHECK(ggml_cuda_device_malloc(ptr: (void**)&buf, size, device: id));
878
879 // set padding to 0 to avoid possible NaN values
880 if (size > original_size) {
881 CUDA_CHECK(cudaMemset(devPtr: buf + original_size, value: 0, count: size - original_size));
882 }
883
884 extra->data_device[id] = buf;
885
886 for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
887 CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
888 }
889 }
890 tensor->extra = extra;
891 return GGML_STATUS_SUCCESS;
892}
893
894static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
895 // split tensors must always be set in their entirety at once
896 GGML_ASSERT(offset == 0);
897 GGML_ASSERT(size == ggml_nbytes(tensor));
898 GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
899
900 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
901
902 const int64_t ne0 = tensor->ne[0];
903 const size_t nb1 = tensor->nb[1];
904 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
905
906 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
907 int64_t row_low, row_high;
908 get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
909
910 int64_t nrows_split = row_high - row_low;
911 if (nrows_split == 0) {
912 continue;
913 }
914
915 const size_t offset_split = row_low*nb1;
916 size_t size = ggml_nbytes_split(tensor, nrows_split);
917 const size_t original_size = size;
918
919 // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
920 if (ne0 % MATRIX_ROW_PADDING != 0) {
921 size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
922 }
923
924 const char * buf_host = (const char *)data + offset_split;
925 CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
926 }
927
928 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
929 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
930 }
931}
932
933static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
934 // split tensors must always be set in their entirety at once
935 GGML_ASSERT(offset == 0);
936 GGML_ASSERT(size == ggml_nbytes(tensor));
937 GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
938
939 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
940
941 const int64_t ne0 = tensor->ne[0];
942 const size_t nb1 = tensor->nb[1];
943 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
944
945 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
946 int64_t row_low, row_high;
947 get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
948
949 int64_t nrows_split = row_high - row_low;
950 if (nrows_split == 0) {
951 continue;
952 }
953
954 const size_t offset_split = row_low*nb1;
955 size_t size = ggml_nbytes_split(tensor, nrows_split);
956 const size_t original_size = size;
957
958 // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
959 if (ne0 % MATRIX_ROW_PADDING != 0) {
960 size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
961 }
962
963 char * buf_host = (char *)data + offset_split;
964 CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
965 }
966
967 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
968 CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
969 }
970}
971
972static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
973 GGML_UNUSED(buffer);
974 GGML_UNUSED(value);
975}
976
977static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
978 /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
979 /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
980 /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
981 /* .memset_tensor = */ NULL,
982 /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
983 /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
984 /* .cpy_tensor = */ NULL,
985 /* .clear = */ ggml_backend_cuda_split_buffer_clear,
986 /* .reset = */ NULL,
987};
988
989// cuda split buffer type
990
991static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
992 ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
993
994 return ctx->name.c_str();
995}
996
997static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
998 return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_get_name;
999}
1000
1001static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1002 // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
1003 // instead, we allocate them for each tensor separately in init_tensor
1004 // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
1005 // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
1006 ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
1007
1008 return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
1009}
1010
1011static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1012 return 128;
1013
1014 GGML_UNUSED(buft);
1015}
1016
1017static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
1018 ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
1019 GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
1020
1021 size_t total_size = 0;
1022
1023 const int64_t ne0 = tensor->ne[0];
1024
1025 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1026 int64_t row_low, row_high;
1027 get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
1028
1029 int64_t nrows_split = row_high - row_low;
1030 if (nrows_split == 0) {
1031 continue;
1032 }
1033
1034 total_size += ggml_nbytes_split(tensor, nrows_split);
1035
1036 // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
1037 if (ne0 % MATRIX_ROW_PADDING != 0) {
1038 total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1039 }
1040 }
1041
1042 return total_size;
1043}
1044
1045static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1046 return false;
1047
1048 GGML_UNUSED(buft);
1049}
1050
1051static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
1052 /* .get_name = */ ggml_backend_cuda_split_buffer_type_get_name,
1053 /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
1054 /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
1055 /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1056 /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
1057 /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
1058};
1059
1060ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
1061 static std::mutex mutex;
1062 std::lock_guard<std::mutex> lock(mutex);
1063
1064 static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
1065
1066 std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
1067
1068 bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
1069 if (all_zero) {
1070 tensor_split_arr = ggml_cuda_info().default_tensor_split;
1071 } else {
1072 float split_sum = 0.0f;
1073 for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
1074 tensor_split_arr[i] = split_sum;
1075 split_sum += tensor_split[i];
1076 }
1077 for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
1078 tensor_split_arr[i] /= split_sum;
1079 }
1080 }
1081
1082 auto it = buft_map.find({main_device, tensor_split_arr});
1083 if (it != buft_map.end()) {
1084 return &it->second;
1085 }
1086 auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
1087 main_device,
1088 tensor_split_arr,
1089 GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
1090 };
1091
1092 struct ggml_backend_buffer_type buft {
1093 /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
1094 /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
1095 /* .context = */ ctx,
1096 };
1097
1098 auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
1099 return &result.first->second;
1100}
1101
1102// host buffer type
1103
1104static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
1105 return GGML_CUDA_NAME "_Host";
1106
1107 GGML_UNUSED(buft);
1108}
1109
1110static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
1111 return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1112}
1113
1114static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1115 CUDA_CHECK(cudaFreeHost(buffer->context));
1116}
1117
1118static void * ggml_cuda_host_malloc(size_t size) {
1119 if (getenv(name: "GGML_CUDA_NO_PINNED") != nullptr) {
1120 return nullptr;
1121 }
1122
1123 void * ptr = nullptr;
1124 cudaError_t err = cudaMallocHost(ptr: (void **) &ptr, size);
1125 if (err != cudaSuccess) {
1126 // clear the error
1127 (void)cudaGetLastError();
1128 GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1129 size / 1024.0 / 1024.0, cudaGetErrorString(error: err));
1130 return nullptr;
1131 }
1132
1133 return ptr;
1134}
1135
1136static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1137 void * ptr = ggml_cuda_host_malloc(size);
1138
1139 if (ptr == nullptr) {
1140 // fallback to cpu buffer
1141 return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
1142 }
1143
1144 ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
1145 buffer->buft = buft;
1146 buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
1147
1148 return buffer;
1149}
1150
1151ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1152 static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
1153 /* .iface = */ {
1154 /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
1155 /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
1156 /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1157 /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1158 /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1159 /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1160 },
1161 /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
1162 /* .context = */ nullptr,
1163 };
1164
1165 return &ggml_backend_cuda_buffer_type_host;
1166}
1167
1168//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) {
1169// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
1170//}
1171
1172/// kernels
1173
1174typedef void (*ggml_cuda_op_mul_mat_t)(
1175 ggml_backend_cuda_context & ctx,
1176 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1177 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1178 const int64_t src1_padded_row_size, cudaStream_t stream);
1179
1180#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
1181#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
1182#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
1183
1184#define MUL_MAT_SRC1_COL_STRIDE 128
1185
1186static cudaError_t ggml_cuda_cpy_tensor_2d(
1187 void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1188
1189 const char * src_ptr = (const char *) src->data;
1190 char * dst_ptr = (char *) dst;
1191
1192 const int64_t ne0 = src->ne[0];
1193 const int64_t nb0 = src->nb[0];
1194 const int64_t nb1 = src->nb[1];
1195 const int64_t nb2 = src->nb[2];
1196 const int64_t nb3 = src->nb[3];
1197 const enum ggml_type type = src->type;
1198 const int64_t ts = ggml_type_size(type);
1199 const int64_t bs = ggml_blck_size(type);
1200 const int64_t i1_diff = i1_high - i1_low;
1201
1202 const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1203 if (nb0 == ts && nb1 == ts*ne0/bs) {
1204 return cudaMemcpyAsync(dst: dst_ptr, src: x, count: i1_diff*nb1, kind: cudaMemcpyDeviceToDevice, stream);
1205 } else if (nb0 == ts) {
1206 return cudaMemcpy2DAsync(dst: dst_ptr, dpitch: ts*ne0/bs, src: x, spitch: nb1, width: ts*ne0/bs, height: i1_diff, kind: cudaMemcpyDeviceToDevice, stream);
1207 } else {
1208 for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1209 const void * rx = (const void *) ((const char *) x + i1*nb1);
1210 void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1211 // pretend the row is a matrix with cols=1
1212 cudaError_t r = cudaMemcpy2DAsync(dst: rd, dpitch: ts/bs, src: rx, spitch: nb0, width: ts/bs, height: ne0, kind: cudaMemcpyDeviceToDevice, stream);
1213 if (r != cudaSuccess) {
1214 return r;
1215 }
1216 }
1217 return cudaSuccess;
1218 }
1219}
1220
1221static void ggml_cuda_op_mul_mat_cublas(
1222 ggml_backend_cuda_context & ctx,
1223 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
1224 const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
1225 const int64_t src1_padded_row_size, cudaStream_t stream) {
1226
1227 GGML_ASSERT(src0_dd_i != nullptr);
1228 GGML_ASSERT(src1_ddf_i != nullptr);
1229 GGML_ASSERT(dst_dd_i != nullptr);
1230
1231 const int64_t ne00 = src0->ne[0];
1232 const int64_t ne10 = src1->ne[0];
1233
1234 const int64_t ne0 = dst->ne[0];
1235
1236 const int64_t row_diff = row_high - row_low;
1237
1238 int id = ggml_cuda_get_device();
1239
1240 // the main device has a larger memory buffer to hold the results from all GPUs
1241 // ldc == nrows of the matrix that cuBLAS writes into
1242 int64_t ldc = id == ctx.device ? ne0 : row_diff;
1243
1244 const int cc = ggml_cuda_info().devices[id].cc;
1245
1246 const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
1247 (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
1248
1249 const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
1250
1251 if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
1252 ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
1253 if (src1->type != GGML_TYPE_BF16) {
1254 const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
1255 GGML_ASSERT(to_bf16_cuda != nullptr);
1256 size_t ne = src1_ncols*ne10;
1257 src1_as_bf16.alloc(ne);
1258 to_bf16_cuda(src1_ddf_i, src1_as_bf16.get(), ne, stream);
1259 }
1260 const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
1261 const nv_bfloat16 * src0_ptr = (const nv_bfloat16 *)src0_dd_i;
1262 ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
1263
1264 const float alpha_f32 = 1.0f;
1265 const float beta_f32 = 0.0f;
1266
1267 CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1268 CUBLAS_CHECK(
1269 cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1270 row_diff, src1_ncols, ne10,
1271 &alpha_f32, src0_ptr, CUDA_R_16BF, ne00,
1272 src1_ptr, CUDA_R_16BF, ne10,
1273 &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc,
1274 CUBLAS_COMPUTE_32F,
1275 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1276
1277 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
1278 to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1279 } else if (fast_fp16_hardware_available(cc) && use_fp16) {
1280 // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1281 ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1282 if (src0->type != GGML_TYPE_F16) {
1283 const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
1284 GGML_ASSERT(to_fp16_cuda != nullptr);
1285 size_t ne = row_diff*ne00;
1286 src0_as_f16.alloc(ne);
1287 to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
1288 }
1289 const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
1290
1291 ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
1292 if (src1->type != GGML_TYPE_F16) {
1293 const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
1294 GGML_ASSERT(to_fp16_cuda != nullptr);
1295 size_t ne = src1_ncols*ne10;
1296 src1_as_f16.alloc(ne);
1297 to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
1298 }
1299 const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
1300
1301 CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1302
1303 if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
1304 const float alpha = 1.0f;
1305 const float beta = 0.0f;
1306 CUBLAS_CHECK(
1307 cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1308 row_diff, src1_ncols, ne10,
1309 &alpha, src0_ptr, CUDA_R_16F, ne00,
1310 src1_ptr, CUDA_R_16F, ne10,
1311 &beta, dst_dd_i, CUDA_R_32F, ldc,
1312 CUBLAS_COMPUTE_32F,
1313 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1314 } else {
1315 ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
1316
1317 const half alpha_f16 = 1.0f;
1318 const half beta_f16 = 0.0f;
1319
1320 CUBLAS_CHECK(
1321 cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1322 row_diff, src1_ncols, ne10,
1323 &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
1324 src1_ptr, CUDA_R_16F, ne10,
1325 &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
1326 CUBLAS_COMPUTE_16F,
1327 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1328
1329 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
1330 to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1331 }
1332 } else {
1333 ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
1334 ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
1335
1336 if (src0->type != GGML_TYPE_F32) {
1337 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
1338 GGML_ASSERT(to_fp32_cuda != nullptr);
1339 src0_ddq_as_f32.alloc(row_diff*ne00);
1340 to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
1341 }
1342 if (src1->type != GGML_TYPE_F32) {
1343 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
1344 GGML_ASSERT(to_fp32_cuda != nullptr);
1345 src1_ddq_as_f32.alloc(src1_ncols*ne10);
1346 to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
1347 }
1348
1349 const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
1350 const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
1351
1352 const float alpha = 1.0f;
1353 const float beta = 0.0f;
1354
1355 CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
1356 CUBLAS_CHECK(
1357 cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
1358 row_diff, src1_ncols, ne10,
1359 &alpha, src0_ddf_i, ne00,
1360 src1_ddf1_i, ne10,
1361 &beta, dst_dd_i, ldc));
1362 }
1363
1364 GGML_UNUSED_VARS(dst, src1_ddq_i, src1_padded_row_size);
1365}
1366
1367static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
1368 static bool peer_access_enabled = false;
1369
1370 const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
1371
1372 if (peer_access_enabled == enable_peer_access) {
1373 return;
1374 }
1375
1376#ifdef NDEBUG
1377 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1378 ggml_cuda_set_device(device: id);
1379 CUDA_CHECK(cudaDeviceSynchronize());
1380 }
1381
1382 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1383 ggml_cuda_set_device(device: id);
1384
1385 for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
1386 if (id == id_other) {
1387 continue;
1388 }
1389 if (id != main_device && id_other != main_device) {
1390 continue;
1391 }
1392
1393 int can_access_peer;
1394 CUDA_CHECK(cudaDeviceCanAccessPeer(canAccessPeer: &can_access_peer, device: id, peerDevice: id_other));
1395 if (can_access_peer) {
1396 if (enable_peer_access) {
1397 cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice: id_other, flags: 0);
1398 if (err != cudaErrorPeerAccessAlreadyEnabled) {
1399 CUDA_CHECK(err);
1400 } else {
1401 // reset the error
1402 (void)cudaGetLastError();
1403 }
1404 } else {
1405 cudaError_t err = cudaDeviceDisablePeerAccess(peerDevice: id_other);
1406 if (err != cudaErrorPeerAccessNotEnabled) {
1407 CUDA_CHECK(err);
1408 } else {
1409 // reset the error
1410 (void)cudaGetLastError();
1411 }
1412 }
1413 }
1414 }
1415 }
1416
1417 ggml_cuda_set_device(device: main_device);
1418#endif // NDEBUG
1419
1420 peer_access_enabled = enable_peer_access;
1421
1422 GGML_UNUSED(main_device);
1423}
1424
1425static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
1426 void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
1427
1428#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1429 // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
1430 cudaMemcpy3DPeerParms p = {};
1431 p.dstDevice = dstDevice;
1432 p.dstPtr = make_cudaPitchedPtr(d: dst, p: dpitch, xsz: dpitch, ysz: height);
1433 p.srcDevice = srcDevice;
1434 p.srcPtr = make_cudaPitchedPtr(d: src, p: spitch, xsz: spitch, ysz: height);
1435 p.extent = make_cudaExtent(w: width, h: height, d: 1);
1436 return cudaMemcpy3DPeerAsync(p: &p, stream);
1437#else
1438 // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
1439 GGML_UNUSED(dstDevice);
1440 GGML_UNUSED(srcDevice);
1441 return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
1442#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1443}
1444
1445static void ggml_cuda_op_mul_mat(
1446 ggml_backend_cuda_context & ctx,
1447 const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
1448 quantize_cuda_t quantize_src1) {
1449
1450 const int64_t ne00 = src0->ne[0];
1451 const int64_t ne01 = src0->ne[1];
1452 const int64_t ne02 = src0->ne[2];
1453 const int64_t ne03 = src0->ne[3];
1454
1455 const int64_t ne10 = src1->ne[0];
1456 const int64_t ne11 = src1->ne[1];
1457 const int64_t ne12 = src1->ne[2];
1458 const int64_t ne13 = src1->ne[3];
1459 const int64_t nrows1 = ggml_nrows(src1);
1460
1461 const int64_t ne0 = dst->ne[0];
1462 const int64_t ne1 = dst->ne[1];
1463
1464 // const int64_t nb10 = src1->nb[0];
1465 const int64_t nb11 = src1->nb[1];
1466 const int64_t nb12 = src1->nb[2];
1467 const int64_t nb13 = src1->nb[3];
1468
1469 const int64_t nb2 = dst->nb[2];
1470 const int64_t nb3 = dst->nb[3];
1471
1472 ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
1473 ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
1474
1475 GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
1476
1477 GGML_ASSERT(ne12 % ne02 == 0);
1478 GGML_ASSERT(ne13 % ne03 == 0);
1479
1480 const int64_t i02_divisor = ne12 / ne02;
1481 const int64_t i03_divisor = ne13 / ne03;
1482
1483 const size_t src0_ts = ggml_type_size(src0->type);
1484 const size_t src0_bs = ggml_blck_size(src0->type);
1485 const size_t q8_1_ts = sizeof(block_q8_1);
1486 const size_t q8_1_bs = QK8_1;
1487
1488 const bool src0_is_contiguous = ggml_is_contiguous(src0);
1489 const bool src1_is_contiguous = ggml_is_contiguous(src1);
1490
1491 const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
1492
1493 const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
1494 GGML_ASSERT(!(split && ne02 > 1));
1495 GGML_ASSERT(!(split && ne03 > 1));
1496 GGML_ASSERT(!(split && ne02 < ne12));
1497 GGML_ASSERT(!(split && ne03 < ne13));
1498
1499 ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
1500
1501
1502 std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
1503 if (split) {
1504 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
1505 tensor_split = buft_ctx->tensor_split;
1506 }
1507
1508 struct dev_data {
1509 int cc;
1510
1511 ggml_cuda_pool_alloc<char> src0_dd_alloc;
1512 ggml_cuda_pool_alloc<float> src1_ddf_alloc;
1513 ggml_cuda_pool_alloc<char> src1_ddq_alloc;
1514 ggml_cuda_pool_alloc<float> dst_dd_alloc;
1515
1516 char * src0_dd = nullptr;
1517 float * src1_ddf = nullptr; // float
1518 char * src1_ddq = nullptr; // q8_1
1519 float * dst_dd = nullptr;
1520
1521 int64_t row_low;
1522 int64_t row_high;
1523 };
1524
1525 dev_data dev[GGML_CUDA_MAX_DEVICES];
1526
1527 int used_devices = 0;
1528
1529 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1530 dev[id].cc = ggml_cuda_info().devices[id].cc;
1531
1532 // by default, use all rows
1533 dev[id].row_low = 0;
1534 dev[id].row_high = ne01;
1535
1536 // for multi GPU, get the row boundaries from tensor split
1537 // and round to mul_mat_q tile sizes
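// illustrative example (not taken from real data): ne01 = 4096, tensor_split = {0.0f, 0.75f}, rounding = 64
//   -> device 0 computes rows [0, 3072), device 1 computes rows [3072, 4096)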
1538 if (split) {
1539 const int64_t rounding = get_row_rounding(tensor_split);
1540
1541 if (id != 0) {
1542 dev[id].row_low = ne01*tensor_split[id];
1543 if (dev[id].row_low < ne01) {
1544 dev[id].row_low -= dev[id].row_low % rounding;
1545 }
1546 }
1547
1548 if (id != ggml_backend_cuda_get_device_count() - 1) {
1549 dev[id].row_high = ne01*tensor_split[id + 1];
1550 if (dev[id].row_high < ne01) {
1551 dev[id].row_high -= dev[id].row_high % rounding;
1552 }
1553 }
1554 }
1555 }
1556
1557 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1558 if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1559 continue;
1560 }
1561
1562 used_devices++;
1563
1564 const bool src1_on_device = id == src1_ctx->device;
1565 const bool dst_on_device = id == dst_ctx->device;
1566
1567 ggml_cuda_set_device(id);
1568 cudaStream_t stream = ctx.stream(id, 0);
1569
1570 if (src0_is_contiguous) {
1571 dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
1572 } else {
1573 // If src0 is not contiguous it will be copied to a temporary buffer.
1574 // This buffer needs to be cleared entirely because multiple regions will function as padding.
1575 const size_t nbytes_data = ggml_nbytes(src0);
1576 const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
1577 dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
1578 CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
1579 }
1580
1581 // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
1582 if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
1583 GGML_ASSERT(ggml_is_contiguously_allocated(src0));
1584 GGML_ASSERT(!src0->view_src);
1585 const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
1586 const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
1587 CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
1588 }
1589
1590 if (src1_on_device && src1_is_contiguous) {
1591 dev[id].src1_ddf = (float *) src1->data;
1592 } else {
1593 dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
1594 }
1595
1596 if (quantize_src1) {
1597 size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
1598 if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1599 src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
1600 }
1601 dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
1602
1603 if (src1_on_device && src1_is_contiguous) {
1604 quantize_src1(
1605 dev[id].src1_ddf, nullptr, dev[id].src1_ddq, src0->type, ne10,
1606 nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
1607 src1_padded_col_size, ne11, ne12, ne13, stream);
1608 CUDA_CHECK(cudaGetLastError());
1609 }
1610 }
1611
1612 if (dst_on_device) {
1613 dev[id].dst_dd = (float *) dst->data;
1614 } else {
1615 const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
1616 dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf);
1617 }
1618 }
1619
1620 // if multiple devices are used they need to wait for the main device
1621 // here an event is recorded that signals that the main device has finished calculating the input data
1622 if (split && used_devices > 1) {
1623 ggml_cuda_set_device(ctx.device);
1624 CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream()));
1625 }
1626
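// when src0 is split across multiple devices, src1 is processed in column chunks of MUL_MAT_SRC1_COL_STRIDE,
// cycling through GGML_CUDA_MAX_STREAMS streams (presumably so that peer copies and compute can overlap);
// otherwise the whole src1 is handled as a single chunk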
1627 const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
1628 for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
1629 const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0;
1630 const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
1631
1632 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1633 if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
1634 continue;
1635 }
1636
1637 const bool src1_on_device = id == src1_ctx->device;
1638 const bool dst_on_device = id == dst_ctx->device;
1639 const int64_t row_diff = dev[id].row_high - dev[id].row_low;
1640
1641 ggml_cuda_set_device(id);
1642 cudaStream_t stream = ctx.stream(id, is);
1643
1644 // wait for main GPU data if necessary
1645 if (split && (id != ctx.device || is != 0)) {
1646 CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0));
1647 }
1648
1649 for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
1650 const int64_t i03 = i0 / ne12;
1651 const int64_t i02 = i0 % ne12;
1652
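// byte offset into the quantized src1 buffer for this batch / column chunk:
// the plain q8_1 layout advances by src1_padded_col_size*q8_1_ts/q8_1_bs bytes per src1 column,
// while the MMQ layout is interleaved and advances by sizeof(block_q8_1_mmq) per column
// (see the pitched 2D peer copy further below)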
1653 size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1654 if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1655 src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
1656 } else {
1657 src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
1658 }
1659
1660 // select the src0 matrix for the current batch indices i02/i03 (adjusted for broadcasting); for split tensors there is only a single matrix per device
1661 const size_t nbytes_src0_matrix = ne01*ne00*src0_ts / src0_bs;
1662 char * src0_dd_i = dev[id].src0_dd + ((i03/i03_divisor)*ne02 + (i02/i02_divisor)) * nbytes_src0_matrix;
1663 float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
1664 char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
1665 float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
1666
1667 // the main device memory buffer can be on VRAM scratch, with space for all partial results
1668 // in that case an offset on dst_ddf_i is needed
1669 if (id == ctx.device) {
1670 dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
1671 }
1672
1673 // copy src0, src1 to device if necessary
1674 if (src1_is_contiguous) {
1675 if (id != ctx.device) {
1676 if (quantize_src1) {
1677 char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
1678 if (quantize_src1 == quantize_mmq_q8_1_cuda) {
1679 const size_t pitch = ne11*sizeof(block_q8_1_mmq);
1680 const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
1681 const size_t height = src1_padded_col_size/(4*QK8_1);
1682 CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
1683 } else {
1684 CUDA_CHECK(cudaMemcpyPeerAsync(
1685 src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
1686 }
1687 } else {
1688 float * src1_ddf_i_source = (float *) src1->data;
1689 src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
1690 CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device,
1691 src1_ncols*ne10*sizeof(float), stream));
1692 }
1693 }
1694 } else if (src1_on_device && !src1_is_contiguous) {
1695 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1696 src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
1697 } else {
1698 GGML_ABORT("fatal error");
1699 }
1700
1701 if (quantize_src1 && !src1_is_contiguous) {
1702 quantize_src1(
1703 src1_ddf_i, nullptr, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
1704 src1_padded_col_size, src1_ncols, 1, 1, stream);
1705 CUDA_CHECK(cudaGetLastError());
1706 }
1707
1708 if (src1_col_0 == 0 && !src0_is_contiguous && i03 % i03_divisor == 0 && i02 % i02_divisor == 0) {
1709 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
1710 src0_dd_i, src0, i03/i03_divisor, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
1711 }
1712
1713 // do the computation
1714 op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
1715 dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
1716 CUDA_CHECK(cudaGetLastError());
1717
1718 // copy dst to host or other device if necessary
1719 if (!dst_on_device) {
1720 void * dst_off_device = dst->data;
1721 if (split) {
1722 // src0 (the weight matrix) is stored transposed for a better memory layout.
1723 // dst is NOT transposed.
1724 // The outputs of matrix-matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
1725 // Instead they need to be copied to the correct slice in ne0 = dst row index.
1726 // If dst is a vector with ne0 == 1 this copy is not strictly necessary, but it still produces correct results.
1727 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1728 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1729 dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
1730 CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
1731 dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
1732 } else {
1733 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
1734 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
1735 dhf_dst_i += src1_col_0*ne0;
1736 CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
1737 }
1738 }
1739
1740 // add event for the main device to wait on until other device is done
1741 if (split && (id != ctx.device || is != 0)) {
1742 CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
1743 }
1744 }
1745 }
1746 }
1747
1748 // main device waits for all other devices to be finished
1749 if (split && ggml_backend_cuda_get_device_count() > 1) {
1750 int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
1751 is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS;
1752
1753 ggml_cuda_set_device(ctx.device);
1754 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
1755 if (dev[id].row_low == dev[id].row_high) {
1756 continue;
1757 }
1758 for (int64_t is = 0; is < is_max; ++is) {
1759 CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0));
1760 }
1761 }
1762 }
1763}
1764
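// Builds the per-matrix pointer arrays consumed by cublasGemmBatchedEx: each thread handles one (i12, i13)
// batch index and computes the src0/src1/dst addresses, mapping broadcast dims via r2 = ne12/ne02 and r3 = ne13/ne03.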
1765static __global__ void k_compute_batched_ptrs(
1766 const void * src0_as_f16, const void * src1_as_f16, char * dst,
1767 const void ** ptrs_src, void ** ptrs_dst,
1768 int64_t ne12, int64_t ne13,
1769 int64_t ne23,
1770 size_t nb02, size_t nb03,
1771 size_t nb12, size_t nb13,
1772 size_t nbd2, size_t nbd3,
1773 int64_t r2, int64_t r3) {
1774 const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
1775 const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
1776
1777 if (i13 >= ne13 || i12 >= ne12) {
1778 return;
1779 }
1780
1781 const int64_t i03 = i13 / r3;
1782 const int64_t i02 = i12 / r2;
1783
1784 ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
1785 ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
1786 ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
1787}
1788
1789// Type traits for mapping ggml types to CUDA/cuBLAS types
1790template<ggml_type T>
1791struct batched_mul_mat_traits;
1792
1793template<>
1794struct batched_mul_mat_traits<GGML_TYPE_F32> {
1795 using cuda_type = float;
1796 static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
1797 static inline const cudaDataType_t data_type = CUDA_R_32F;
1798 static inline const ggml_type ggml_type_val = GGML_TYPE_F32;
1799 static inline const float alpha = 1.0f;
1800 static inline const float beta = 0.0f;
1801 static inline const void* get_alpha() { static const float val = alpha; return &val; }
1802 static inline const void* get_beta() { static const float val = beta; return &val; }
1803 static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp32_nc_cuda(src_type); }
1804};
1805
1806template<>
1807struct batched_mul_mat_traits<GGML_TYPE_BF16> {
1808 using cuda_type = nv_bfloat16;
1809 static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
1810 static inline const cudaDataType_t data_type = CUDA_R_16BF;
1811 static inline const ggml_type ggml_type_val = GGML_TYPE_BF16;
1812 static inline const float alpha = 1.0f;
1813 static inline const float beta = 0.0f;
1814 static inline const void* get_alpha() { static const float val = alpha; return &val; }
1815 static inline const void* get_beta() { static const float val = beta; return &val; }
1816 static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_bf16_nc_cuda(src_type); }
1817};
1818
1819template<>
1820struct batched_mul_mat_traits<GGML_TYPE_F16> {
1821 using cuda_type = half;
1822 static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
1823 static inline const cudaDataType_t data_type = CUDA_R_16F;
1824 static inline const ggml_type ggml_type_val = GGML_TYPE_F16;
1825 static inline const half alpha = 1.0;
1826 static inline const half beta = 0.0;
1827 static inline const void* get_alpha() { static const half val = alpha; return &val; }
1828 static inline const void* get_beta() { static const half val = beta; return &val; }
1829 static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp16_nc_cuda(src_type); }
1830};
1831
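// Batched matrix multiplication via cuBLAS for F32/F16/BF16 weights: converts src1 to the weight type if
// necessary, then uses cublasGemmStridedBatchedEx when there is no broadcast and both operands are contiguous
// across dims 2/3, and cublasGemmBatchedEx with explicit pointer arrays otherwise; a reduced-precision result
// is converted back to F32 at the end.
// e.g. (illustrative) batched_mul_mat_traits<GGML_TYPE_F16>::get_nc_converter(GGML_TYPE_F32) yields the
// converter used to turn non-contiguous F32 activations into half before the GEMM.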
1832template<ggml_type src0_type>
1833static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1834 using traits = batched_mul_mat_traits<src0_type>;
1835 using cuda_t = typename traits::cuda_type;
1836
1837 GGML_ASSERT(!ggml_is_transposed(src0));
1838 GGML_ASSERT(!ggml_is_transposed(src1));
1839 GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft));
1840 GGML_ASSERT(src0->type == src0_type);
1841 GGML_ASSERT(ggml_is_contiguous(dst));
1842
1843 // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
1844 // As long as dst is contiguous this does not matter though.
1845
1846 GGML_TENSOR_BINARY_OP_LOCALS
1847
1848 const int64_t ne_dst = ggml_nelements(dst);
1849 cudaStream_t main_stream = ctx.stream();
1850 CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
1851
1852 float * dst_ddf = (float *) dst->data;
1853 const size_t ts_src1 = ggml_type_size(src1->type);
1854 GGML_ASSERT(nb10 == ts_src1);
1855 int64_t s11 = nb11 / ts_src1;
1856 int64_t s12 = nb12 / ts_src1;
1857 int64_t s13 = nb13 / ts_src1;
1858
1859 const cuda_t * src0_ptr = nullptr;
1860 const cuda_t * src1_ptr = nullptr;
1861
1862 ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
1863 ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
1864
1865 bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
1866 bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
1867
1868 // Handle src0
1869 src0_ptr = (const cuda_t *) src0->data;
1870
1871 // Handle src1 - convert if necessary
1872 if (src1->type == src0_type) {
1873 src1_ptr = (const cuda_t *) src1->data;
1874 } else {
1875 // Convert src1 to target type using traits conversion functions
1876 const int64_t ne_src1 = ggml_nelements(src1);
1877 src1_alloc.alloc(ne_src1);
1878
1879 const auto convert_func = traits::get_nc_converter(src1->type);
1880 GGML_ASSERT(convert_func != nullptr);
1881 convert_func(src1->data, src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
1882 src1_ptr = src1_alloc.get();
1883 s11 = ne10;
1884 s12 = ne11*s11;
1885 s13 = ne12*s12;
1886
1887 is_src1_cont_2 = true;
1888 }
1889
1890 // Setup destination buffer
1891 ggml_cuda_pool_alloc<cuda_t> dst_temp(ctx.pool());
1892 char * dst_t;
1893 size_t nbd2 = dst->nb[2];
1894 size_t nbd3 = dst->nb[3];
1895
1896 cublasComputeType_t cu_compute_type = traits::compute_type;
1897 cudaDataType_t cu_data_type = traits::data_type;
1898 cudaDataType_t cu_data_type_a = traits::data_type;
1899 cudaDataType_t cu_data_type_b = traits::data_type;
1900 const void * alpha = traits::get_alpha();
1901 const void * beta = traits::get_beta();
1902 const float alpha_f32 = 1.0f;
1903 const float beta_f32 = 0.0f;
1904
1905 if (dst->op_params[0] == GGML_PREC_DEFAULT) {
1906 if constexpr (src0_type == GGML_TYPE_F32) {
1907 dst_t = (char *) dst_ddf; // Direct F32 output
1908 } else {
1909 dst_t = (char *) dst_temp.alloc(ne_dst);
1910 nbd2 /= sizeof(float) / sizeof(cuda_t);
1911 nbd3 /= sizeof(float) / sizeof(cuda_t);
1912 }
1913 } else {
1914 dst_t = (char *) dst_ddf;
1915 cu_compute_type = CUBLAS_COMPUTE_32F;
1916 cu_data_type = CUDA_R_32F;
1917 alpha = &alpha_f32;
1918 beta = &beta_f32;
1919 }
1920
1921 int id = ggml_cuda_get_device();
1922 const int cc = ggml_cuda_info().devices[id].cc;
1923 if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
1924 cu_compute_type = CUBLAS_COMPUTE_32F;
1925 alpha = &alpha_f32;
1926 beta = &beta_f32;
1927 }
1928
1929 GGML_ASSERT(ne12 % ne02 == 0);
1930 GGML_ASSERT(ne13 % ne03 == 0);
1931
1932 // broadcast factors
1933 const int64_t r2 = ne12/ne02;
1934 const int64_t r3 = ne13/ne03;
1935
1936 if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
1937 // with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
1938 const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
1939 const int64_t smb = ne12 == 1 ? s13 : s12;
1940
1941 // there is no broadcast and src0, src1 are contiguous across dims 2, 3
1942 // use cublasGemmStridedBatchedEx
1943 CUBLAS_CHECK(
1944 cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1945 ne01, ne11, ne10,
1946 alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma, // strideA
1947 src1_ptr, cu_data_type_b, s11, smb, // strideB
1948 beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
1949 ne12*ne13,
1950 cu_compute_type,
1951 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1952 } else {
1953 // use cublasGemmBatchedEx
1954 const int64_t ne23 = ne12*ne13;
1955
1956 ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
1957 ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
1958
1959 size_t src1_stride_size = sizeof(cuda_t);
1960
1961 const int threads_x = 16;
1962 const int threads_y = 16;
1963 dim3 block_dims(threads_x, threads_y);
1964
1965 dim3 grid_dims(
1966 (ne13 + threads_x - 1) / threads_x,
1967 (ne12 + threads_y - 1) / threads_y
1968 );
1969 k_compute_batched_ptrs<<<grid_dims, block_dims, 0, main_stream>>>(
1970 src0_ptr, src1_ptr, dst_t,
1971 ptrs_src.get(), ptrs_dst.get(),
1972 ne12, ne13,
1973 ne23,
1974 nb02, nb03,
1975 (src1->type == src0_type) ? nb12 : s12*src1_stride_size,
1976 (src1->type == src0_type) ? nb13 : s13*src1_stride_size,
1977 nbd2, nbd3,
1978 r2, r3);
1979
1980 CUDA_CHECK(cudaGetLastError());
1981
1982 CUBLAS_CHECK(
1983 cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
1984 ne01, ne11, ne10,
1985 alpha, (const void **) (ptrs_src.get() + 0*ne23), cu_data_type_a, nb01/nb00,
1986 (const void **) (ptrs_src.get() + 1*ne23), cu_data_type_b, s11,
1987 beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
1988 ne23,
1989 cu_compute_type,
1990 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1991 }
1992
1993 // Convert output back to F32 if needed
1994 if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type != CUDA_R_32F) {
1995 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(traits::ggml_type_val);
1996 to_fp32_cuda(dst_temp.get(), dst_ddf, ne_dst, main_stream);
1997 }
1998}
1999
2000static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2001 GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32);
2002
2003 switch (src0->type) {
2004 case GGML_TYPE_F32:
2005 ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F32>(ctx, src0, src1, dst);
2006 break;
2007 case GGML_TYPE_BF16:
2008 ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_BF16>(ctx, src0, src1, dst);
2009 break;
2010 case GGML_TYPE_F16:
2011 ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F16>(ctx, src0, src1, dst);
2012 break;
2013 default:
2014 GGML_ABORT("Unsupported type");
2015 }
2016}
2017
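// Checks whether an up/gate pair of MUL_MAT (or MUL_MAT_ID) nodes feeding a GLU op, optionally with bias
// ADD / ADD_ID nodes in between, can be handled by the fused mul_mat + GLU path; only SWIGLU/GEGLU/SWIGLU_OAI
// without swapped operands qualify, and split buffers are excluded.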
2018static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up,
2019 const ggml_tensor * ffn_gate,
2020 const ggml_tensor * glu,
2021 const ggml_tensor * ffn_up_bias = nullptr,
2022 const ggml_tensor * ffn_gate_bias = nullptr) {
2023 const bool has_bias = ffn_up_bias != nullptr || ffn_gate_bias != nullptr;
2024
2025 if (has_bias && (!ffn_up_bias || !ffn_gate_bias)) {
2026 return false;
2027 }
2028
2029 const bool is_mul_mat = ffn_up->op == GGML_OP_MUL_MAT && ffn_gate->op == GGML_OP_MUL_MAT && glu->op == GGML_OP_GLU;
2030 const bool is_mul_mat_id = ffn_up->op == GGML_OP_MUL_MAT_ID && ffn_gate->op == GGML_OP_MUL_MAT_ID && glu->op == GGML_OP_GLU;
2031
2032 GGML_ASSERT(ffn_up && ffn_gate && glu);
2033
2034 if (!is_mul_mat && !is_mul_mat_id) {
2035 return false;
2036 }
2037
2038 const ggml_op expected_bias_op = is_mul_mat ? GGML_OP_ADD : GGML_OP_ADD_ID;
2039
2040 if (has_bias) {
2041 if (ffn_up_bias->op != expected_bias_op || ffn_gate_bias->op != expected_bias_op) {
2042 return false;
2043 }
2044
2045 if (glu->src[0] != ffn_gate_bias || glu->src[1] != ffn_up_bias) {
2046 return false;
2047 }
2048
2049 if (expected_bias_op == GGML_OP_ADD) {
2050 const bool up_has_mul = ffn_up_bias->src[0] == ffn_up || ffn_up_bias->src[1] == ffn_up;
2051 const bool gate_has_mul = ffn_gate_bias->src[0] == ffn_gate || ffn_gate_bias->src[1] == ffn_gate;
2052 if (!up_has_mul || !gate_has_mul) {
2053 return false;
2054 }
2055 } else { // GGML_OP_ADD_ID
2056 if (ffn_up_bias->src[0] != ffn_up || ffn_gate_bias->src[0] != ffn_gate) {
2057 return false;
2058 }
2059 if (ffn_up_bias->src[2] != ffn_up->src[2] || ffn_gate_bias->src[2] != ffn_gate->src[2]) {
2060 return false;
2061 }
2062 }
2063 } else {
2064 if (glu->src[0] != ffn_gate && glu->src[1] != ffn_up) {
2065 return false;
2066 }
2067 }
2068
2069 if (ffn_up->src[0]->type != ffn_gate->src[0]->type || !ggml_are_same_shape(ffn_up->src[0], ffn_gate->src[0]) ||
2070 !ggml_are_same_stride(ffn_up->src[0], ffn_gate->src[0])) {
2071 return false;
2072 }
2073
2074 if (ffn_up->src[1] != ffn_gate->src[1]) {
2075 return false;
2076 }
2077
2078 if (ffn_up->src[2] && (ffn_up->src[2] != ffn_gate->src[2])) {
2079 return false;
2080 }
2081
2082 static constexpr std::array<ggml_glu_op, 3> valid_glu_ops = { GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU, GGML_GLU_OP_SWIGLU_OAI };
2083
2084 if (std::find(valid_glu_ops.begin(), valid_glu_ops.end(), ggml_get_glu_op(glu)) == valid_glu_ops.end()) {
2085 return false;
2086 }
2087
2088 if (const bool swapped = ggml_get_op_params_i32(glu, 1); swapped) {
2089 return false;
2090 }
2091
2092 const bool split = ggml_backend_buft_is_cuda_split(ffn_up->src[0]->buffer->buft) ||
2093 ggml_backend_buft_is_cuda_split(ffn_gate->src[0]->buffer->buft);
2094
2095 //TODO: add support for fusion for split buffers
2096 if (split) {
2097 return false;
2098 }
2099
2100 return true;
2101}
2102
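// Fusion eligibility check for the mul_mat_vec_f path: F32/F16/BF16 weights with F32 activations and F32
// output, a single output column (ne[1] == 1 for MUL_MAT, ne[2] == 1 for MUL_MAT_ID), and no split buffers.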
2103static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
2104 ggml_tensor * src0 = tensor->src[0];
2105 ggml_tensor * src1 = tensor->src[1];
2106 const ggml_tensor * dst = tensor;
2107
2108 const bool is_mul_mat_id = tensor->op == GGML_OP_MUL_MAT_ID;
2109
2110 bool use_mul_mat_vec_f =
2111 (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) &&
2112 src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
2113
2114 const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2115 use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
2116
2117 const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
2118 ggml_backend_buft_is_cuda_split(src1->buffer->buft);
2119
2120 //TODO: add support for fusion for split buffers
2121 if (split) {
2122 return false;
2123 }
2124
2125 //we only support fusion for ncols_dst = 1
2126 if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
2127 return false;
2128 }
2129
2130 if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
2131 return false;
2132 }
2133
2134
2135 return use_mul_mat_vec_f;
2136}
2137
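// Fusion eligibility check for the mul_mat_vec_q path: quantized weights without problematic padding, F32
// activations and output, a batch size within MMVQ_MAX_BATCH_SIZE, a single output column, no split buffers;
// architectures up to GGML_CUDA_CC_PASCAL are excluded.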
2138static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
2139 ggml_tensor * src0 = tensor->src[0];
2140 ggml_tensor * src1 = tensor->src[1];
2141 const ggml_tensor * dst = tensor;
2142
2143 const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE &&
2144 ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
2145 src0->view_src;
2146
2147 bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
2148 dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
2149
2150 // fusion is not universally faster on Pascal
2151 const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2152 if (cc <= GGML_CUDA_CC_PASCAL) {
2153 return false;
2154 }
2155 //we only support fusion for ncols_dst = 1
2156 if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
2157 return false;
2158 }
2159
2160 if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
2161 return false;
2162 }
2163
2164
2165 const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
2166 ggml_backend_buft_is_cuda_split(src1->buffer->buft);
2167
2168 //TODO: add support for fusion for split buffers
2169 if (split) {
2170 return false;
2171 }
2172
2173 return use_mul_mat_vec_q;
2174}
2175
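// Top-level mul_mat dispatch: picks between the custom vector/float/quantized kernels, batched cuBLAS and
// the generic multi-device ggml_cuda_op_mul_mat driver, based on tensor types, shapes, hardware capabilities
// and whether src0 lives in a split buffer.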
2176static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2177 const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
2178
2179 // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
2180 // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
2181 // Therefore, in such cases use cuBLAS.
2182 const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
2183 && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
2184
2185 bool use_mul_mat_vec_f = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
2186 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
2187 bool use_mul_mat_f = !ggml_is_quantized(src0->type)
2188 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
2189 bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
2190 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
2191 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
2192 bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
2193 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
2194
2195 bool any_gpus_with_slow_fp16 = false;
2196
2197 if (split) {
2198 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
2199 auto & tensor_split = buft_ctx->tensor_split;
2200 for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
2201 // skip devices that are not going to do any work:
2202 if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
2203 continue;
2204 }
2205
2206 const int cc = ggml_cuda_info().devices[id].cc;
2207 const int warp_size = ggml_cuda_info().devices[id].warp_size;
2208 use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
2209 use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
2210 use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
2211 any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
2212 }
2213 } else {
2214 const int cc = ggml_cuda_info().devices[ctx.device].cc;
2215 const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
2216 use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
2217 use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
2218 use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
2219 any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
2220 }
2221
2222 // debug helpers
2223 //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
2224 //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
2225 //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
2226 //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
2227 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
2228 //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
2229
2230 //TODO update for generic tensor parallelism
2231 const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2232 bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
2233 bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
2234 bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
2235
2236 if (!split && use_mul_mat_vec_f) {
2237 // the custom F16 vector kernel can be used over batched cuBLAS GEMM
2238 // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
2239 ggml_cuda_mul_mat_vec_f(ctx, src0, src1, nullptr, dst);
2240 } else if (!split && use_mul_mat_f) {
2241 ggml_cuda_mul_mat_f(ctx, src0, src1, nullptr, dst);
2242 } else if (!split && use_mul_mat_vec_q) {
2243 ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
2244 } else if (!split && use_mul_mat_q) {
2245 ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
2246 } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
2247 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
2248 // general KQ + KQV multi-batch without FlashAttention
2249 ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
2250 } else if (use_mul_mat_vec_f) {
2251 ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_f, nullptr);
2252 } else if (use_mul_mat_vec_q) {
2253 ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
2254 } else if (use_mul_mat_q) {
2255 ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
2256 } else {
2257 ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
2258 }
2259}
2260
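// Mixture-of-experts matrix multiplication (MUL_MAT_ID). If no specialized kernel applies, the generic
// fallback copies the expert ids to the host, groups the tokens by expert, gathers the corresponding src1
// rows with get_rows_cuda, runs one ggml_cuda_mul_mat per used expert slice and finally scatters the results
// back into dst in the original token order.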
2261static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
2262 const ggml_tensor * src0 = dst->src[0];
2263 const ggml_tensor * src1 = dst->src[1];
2264 const ggml_tensor * ids = dst->src[2];
2265
2266 GGML_ASSERT(src1->type == GGML_TYPE_F32);
2267 GGML_ASSERT(dst->type == GGML_TYPE_F32);
2268 GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
2269
2270 GGML_TENSOR_BINARY_OP_LOCALS
2271
2272 const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
2273
2274 if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
2275 if (ne2 == 1) {
2276 if (ggml_is_quantized(src0->type)) {
2277 ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
2278 } else {
2279 ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
2280 }
2281 return;
2282 }
2283
2284 if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
2285 ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
2286 return;
2287 }
2288
2289 if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
2290 ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
2291 return;
2292 }
2293 }
2294
2295 cudaStream_t stream = ctx.stream();
2296
2297 GGML_ASSERT(nb12 % nb11 == 0);
2298 GGML_ASSERT(nb2 % nb1 == 0);
2299
2300 const ggml_type type_src1_sorted = (src0->type == GGML_TYPE_F16 && !fast_fp16_hardware_available(cc))
2301 || ggml_is_quantized(src0->type) ? GGML_TYPE_F32 : src0->type;
2302 const ggml_type type_dst_sorted = GGML_TYPE_F32;
2303 const size_t ts_src1_sorted = ggml_type_size(type_src1_sorted);
2304 const size_t ts_dst_sorted = ggml_type_size(type_dst_sorted);
2305
2306 const int64_t n_expert_used = ids->ne[0];
2307 const int64_t ne_get_rows = ne12 * n_expert_used;
2308
2309 std::vector<int32_t> ids_to_sorted_host;
2310 ids_to_sorted_host.reserve(2*ne_get_rows);
2311 std::vector<int32_t> ids_from_sorted_host(ne_get_rows);
2312
2313 ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool(), 2*ne_get_rows);
2314
2315 std::vector<int32_t> tokens_per_expert(ne02);
2316
2317 ggml_cuda_pool_alloc<char> src1_sorted(ctx.pool(), ne12*n_expert_used*ne10*ts_src1_sorted);
2318 ggml_cuda_pool_alloc<char> dst_sorted(ctx.pool(), ne2 *n_expert_used* ne0*ts_dst_sorted);
2319
2320 std::vector<char> ids_host(ggml_nbytes(ids));
2321 CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
2322 CUDA_CHECK(cudaStreamSynchronize(stream));
2323
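// build two index lists: ids_to_sorted selects, for each (expert, token) pair in expert order, which src1
// row to gather; ids_from_sorted stores, for each original (token, expert-slot), the position of that row in
// the sorted buffer so the results can later be copied back into dst in the original order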
2324 for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
2325 for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
2326 for (int64_t iex = 0; iex < n_expert_used; ++iex) {
2327 const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
2328 assert(expert_to_use >= 0 && expert_to_use < ne02);
2329 if (expert_to_use == i02) {
2330 ids_from_sorted_host[i12*n_expert_used + iex] = ids_to_sorted_host.size();
2331 ids_to_sorted_host.push_back(i12*ne11 + iex % ne11);
2332 tokens_per_expert[i02]++;
2333 break;
2334 }
2335 }
2336 }
2337 }
2338 GGML_ASSERT(ids_to_sorted_host.size() == size_t(ne_get_rows));
2339
2340 ids_to_sorted_host.insert(ids_to_sorted_host.end(), ids_from_sorted_host.begin(), ids_from_sorted_host.end());
2341
2342 CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_to_sorted_host.data(), 2*ne_get_rows*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
2343 CUDA_CHECK(cudaStreamSynchronize(stream));
2344
2345 const int32_t * ids_to_sorted = ids_buf_dev.ptr + 0*ne_get_rows;
2346 const int32_t * ids_from_sorted = ids_buf_dev.ptr + 1*ne_get_rows;
2347
2348 get_rows_cuda(src1->data, src1->type, ids_to_sorted, src1_sorted.ptr, type_src1_sorted,
2349 ne10, nb11, nb12, nb13,
2350 ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
2351 ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, ne_get_rows*ne10*ts_src1_sorted, stream);
2352 CUDA_CHECK(cudaGetLastError());
2353
2354 char * src1_data_cur = (char *) src1_sorted.ptr;
2355 char * dst_data_cur = (char *) dst_sorted.ptr;
2356 for (int64_t i02 = 0; i02 < ne02; ++i02) {
2357 if (tokens_per_expert[i02] == 0) {
2358 continue;
2359 }
2360
2361 ggml_tensor src0_slice = *src0;
2362 src0_slice.ne[2] = 1;
2363 src0_slice.nb[3] = src0_slice.nb[2];
2364 src0_slice.op = GGML_OP_VIEW;
2365 src0_slice.view_src = dst->src[0]; // non-const pointer to src0
2366 src0_slice.data = (char *) src0->data + i02*nb02;
2367
2368 ggml_tensor src1_slice;
2369 memset(&src1_slice, 0, sizeof(src1_slice));
2370 src1_slice.buffer = src1->buffer;
2371 src1_slice.type = type_src1_sorted;
2372 src1_slice.ne[0] = ne10;
2373 src1_slice.ne[1] = tokens_per_expert[i02];
2374 src1_slice.ne[2] = 1;
2375 src1_slice.ne[3] = 1;
2376 src1_slice.nb[0] = ts_src1_sorted;
2377 src1_slice.nb[1] = src1_slice.ne[0] * src1_slice.nb[0];
2378 src1_slice.nb[2] = src1_slice.ne[1] * src1_slice.nb[1];
2379 src1_slice.nb[3] = src1_slice.ne[2] * src1_slice.nb[2];
2380 src1_slice.data = src1_data_cur;
2381
2382 ggml_tensor dst_slice;
2383 memset(&dst_slice, 0, sizeof(dst_slice));
2384 dst_slice.buffer = dst->buffer;
2385 dst_slice.type = type_dst_sorted;
2386 dst_slice.ne[0] = ne0;
2387 dst_slice.ne[1] = tokens_per_expert[i02];
2388 dst_slice.ne[2] = 1;
2389 dst_slice.ne[3] = 1;
2390 dst_slice.nb[0] = ts_dst_sorted;
2391 dst_slice.nb[1] = dst_slice.ne[0] * dst_slice.nb[0];
2392 dst_slice.nb[2] = dst_slice.ne[1] * dst_slice.nb[1];
2393 dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2];
2394 dst_slice.data = dst_data_cur;
2395
2396 ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice);
2397 CUDA_CHECK(cudaGetLastError());
2398
2399 src1_data_cur += src1_slice.nb[2];
2400 dst_data_cur += dst_slice.nb[2];
2401 }
2402
2403 get_rows_cuda(dst_sorted.ptr, type_dst_sorted, ids_from_sorted, dst->data, dst->type,
2404 ne0, ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted, ne_get_rows*ne0*ts_dst_sorted,
2405 ne_get_rows, 1, 1, sizeof(int32_t), ne_get_rows*sizeof(int32_t), ne_get_rows*sizeof(int32_t),
2406 nb1, nb2, nb3, stream);
2407}
2408
2409static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
2410 // why is this here instead of mul_mat?
2411 if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
2412 ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
2413 }
2414
2415 switch (dst->op) {
2416 case GGML_OP_ARGMAX:
2417 ggml_cuda_argmax(ctx, dst);
2418 break;
2419 case GGML_OP_COUNT_EQUAL:
2420 ggml_cuda_count_equal(ctx, dst);
2421 break;
2422 case GGML_OP_REPEAT:
2423 ggml_cuda_op_repeat(ctx, dst);
2424 break;
2425 case GGML_OP_REPEAT_BACK:
2426 ggml_cuda_op_repeat_back(ctx, dst);
2427 break;
2428 case GGML_OP_GET_ROWS:
2429 ggml_cuda_op_get_rows(ctx, dst);
2430 break;
2431 case GGML_OP_GET_ROWS_BACK:
2432 ggml_cuda_op_get_rows_back(ctx, dst);
2433 break;
2434 case GGML_OP_SET_ROWS:
2435 ggml_cuda_op_set_rows(ctx, dst);
2436 break;
2437 case GGML_OP_SET:
2438 ggml_cuda_op_set(ctx, dst);
2439 break;
2440 case GGML_OP_DUP:
2441 ggml_cuda_dup(ctx, dst);
2442 break;
2443 case GGML_OP_CPY:
2444 ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]);
2445 break;
2446 case GGML_OP_CONT:
2447 ggml_cuda_dup(ctx, dst);
2448 break;
2449 case GGML_OP_ADD:
2450 case GGML_OP_ADD1: // TODO: more efficient implementation
2451 ggml_cuda_op_add(ctx, dst);
2452 break;
2453 case GGML_OP_ADD_ID:
2454 ggml_cuda_op_add_id(ctx, dst);
2455 break;
2456 case GGML_OP_SUB:
2457 ggml_cuda_op_sub(ctx, dst);
2458 break;
2459 case GGML_OP_ACC:
2460 ggml_cuda_op_acc(ctx, dst);
2461 break;
2462 case GGML_OP_MUL:
2463 ggml_cuda_op_mul(ctx, dst);
2464 break;
2465 case GGML_OP_DIV:
2466 ggml_cuda_op_div(ctx, dst);
2467 break;
2468 case GGML_OP_UNARY:
2469 switch (ggml_get_unary_op(dst)) {
2470 case GGML_UNARY_OP_ABS:
2471 ggml_cuda_op_abs(ctx, dst);
2472 break;
2473 case GGML_UNARY_OP_SGN:
2474 ggml_cuda_op_sgn(ctx, dst);
2475 break;
2476 case GGML_UNARY_OP_NEG:
2477 ggml_cuda_op_neg(ctx, dst);
2478 break;
2479 case GGML_UNARY_OP_STEP:
2480 ggml_cuda_op_step(ctx, dst);
2481 break;
2482 case GGML_UNARY_OP_GELU:
2483 ggml_cuda_op_gelu(ctx, dst);
2484 break;
2485 case GGML_UNARY_OP_SILU:
2486 ggml_cuda_op_silu(ctx, dst);
2487 break;
2488 case GGML_UNARY_OP_GELU_ERF:
2489 ggml_cuda_op_gelu_erf(ctx, dst);
2490 break;
2491 case GGML_UNARY_OP_GELU_QUICK:
2492 ggml_cuda_op_gelu_quick(ctx, dst);
2493 break;
2494 case GGML_UNARY_OP_TANH:
2495 ggml_cuda_op_tanh(ctx, dst);
2496 break;
2497 case GGML_UNARY_OP_RELU:
2498 ggml_cuda_op_relu(ctx, dst);
2499 break;
2500 case GGML_UNARY_OP_SIGMOID:
2501 ggml_cuda_op_sigmoid(ctx, dst);
2502 break;
2503 case GGML_UNARY_OP_HARDSIGMOID:
2504 ggml_cuda_op_hardsigmoid(ctx, dst);
2505 break;
2506 case GGML_UNARY_OP_HARDSWISH:
2507 ggml_cuda_op_hardswish(ctx, dst);
2508 break;
2509 case GGML_UNARY_OP_EXP:
2510 ggml_cuda_op_exp(ctx, dst);
2511 break;
2512 case GGML_UNARY_OP_ELU:
2513 ggml_cuda_op_elu(ctx, dst);
2514 break;
2515 case GGML_UNARY_OP_XIELU:
2516 ggml_cuda_op_xielu(ctx, dst);
2517 break;
2518 case GGML_UNARY_OP_FLOOR:
2519 ggml_cuda_op_floor(ctx, dst);
2520 break;
2521 case GGML_UNARY_OP_CEIL:
2522 ggml_cuda_op_ceil(ctx, dst);
2523 break;
2524 case GGML_UNARY_OP_ROUND:
2525 ggml_cuda_op_round(ctx, dst);
2526 break;
2527 case GGML_UNARY_OP_TRUNC:
2528 ggml_cuda_op_trunc(ctx, dst);
2529 break;
2530 default:
2531 return false;
2532 }
2533 break;
2534 case GGML_OP_GLU:
2535 switch (ggml_get_glu_op(dst)) {
2536 case GGML_GLU_OP_REGLU:
2537 ggml_cuda_op_reglu(ctx, dst);
2538 break;
2539 case GGML_GLU_OP_GEGLU:
2540 ggml_cuda_op_geglu(ctx, dst);
2541 break;
2542 case GGML_GLU_OP_SWIGLU:
2543 ggml_cuda_op_swiglu(ctx, dst);
2544 break;
2545 case GGML_GLU_OP_SWIGLU_OAI:
2546 ggml_cuda_op_swiglu_oai(ctx, dst);
2547 break;
2548 case GGML_GLU_OP_GEGLU_ERF:
2549 ggml_cuda_op_geglu_erf(ctx, dst);
2550 break;
2551 case GGML_GLU_OP_GEGLU_QUICK:
2552 ggml_cuda_op_geglu_quick(ctx, dst);
2553 break;
2554 default:
2555 return false;
2556 }
2557 break;
2558 case GGML_OP_NORM:
2559 ggml_cuda_op_norm(ctx, dst);
2560 break;
2561 case GGML_OP_GROUP_NORM:
2562 ggml_cuda_op_group_norm(ctx, dst);
2563 break;
2564 case GGML_OP_L2_NORM:
2565 ggml_cuda_op_l2_norm(ctx, dst);
2566 break;
2567 case GGML_OP_CONCAT:
2568 ggml_cuda_op_concat(ctx, dst);
2569 break;
2570 case GGML_OP_UPSCALE:
2571 ggml_cuda_op_upscale(ctx, dst);
2572 break;
2573 case GGML_OP_PAD:
2574 ggml_cuda_op_pad(ctx, dst);
2575 break;
2576 case GGML_OP_PAD_REFLECT_1D:
2577 ggml_cuda_op_pad_reflect_1d(ctx, dst);
2578 break;
2579 case GGML_OP_ARANGE:
2580 ggml_cuda_op_arange(ctx, dst);
2581 break;
2582 case GGML_OP_TIMESTEP_EMBEDDING:
2583 ggml_cuda_op_timestep_embedding(ctx, dst);
2584 break;
2585 case GGML_OP_LEAKY_RELU:
2586 ggml_cuda_op_leaky_relu(ctx, dst);
2587 break;
2588 case GGML_OP_SILU_BACK:
2589 ggml_cuda_op_silu_back(ctx, dst);
2590 break;
2591 case GGML_OP_RMS_NORM:
2592 ggml_cuda_op_rms_norm(ctx, dst);
2593 break;
2594 case GGML_OP_RMS_NORM_BACK:
2595 ggml_cuda_op_rms_norm_back(ctx, dst);
2596 break;
2597 case GGML_OP_MUL_MAT:
2598 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
2599 break;
2600 case GGML_OP_MUL_MAT_ID:
2601 ggml_cuda_mul_mat_id(ctx, dst);
2602 break;
2603 case GGML_OP_OUT_PROD:
2604 ggml_cuda_out_prod(ctx, dst);
2605 break;
2606 case GGML_OP_SCALE:
2607 ggml_cuda_op_scale(ctx, dst);
2608 break;
2609 case GGML_OP_SQR:
2610 ggml_cuda_op_sqr(ctx, dst);
2611 break;
2612 case GGML_OP_SQRT:
2613 ggml_cuda_op_sqrt(ctx, dst);
2614 break;
2615 case GGML_OP_SIN:
2616 ggml_cuda_op_sin(ctx, dst);
2617 break;
2618 case GGML_OP_COS:
2619 ggml_cuda_op_cos(ctx, dst);
2620 break;
2621 case GGML_OP_CLAMP:
2622 ggml_cuda_op_clamp(ctx, dst);
2623 break;
2624 case GGML_OP_LOG:
2625 ggml_cuda_op_log(ctx, dst);
2626 break;
2627 case GGML_OP_NONE:
2628 case GGML_OP_RESHAPE:
2629 case GGML_OP_VIEW:
2630 case GGML_OP_PERMUTE:
2631 case GGML_OP_TRANSPOSE:
2632 break;
2633 case GGML_OP_DIAG_MASK_INF:
2634 ggml_cuda_op_diag_mask_inf(ctx, dst);
2635 break;
2636 case GGML_OP_SOFT_MAX:
2637 ggml_cuda_op_soft_max(ctx, dst);
2638 break;
2639 case GGML_OP_SOFT_MAX_BACK:
2640 ggml_cuda_op_soft_max_back(ctx, dst);
2641 break;
2642 case GGML_OP_ROPE:
2643 ggml_cuda_op_rope(ctx, dst);
2644 break;
2645 case GGML_OP_ROPE_BACK:
2646 ggml_cuda_op_rope_back(ctx, dst);
2647 break;
2648 case GGML_OP_ROLL:
2649 ggml_cuda_op_roll(ctx, dst);
2650 break;
2651 case GGML_OP_IM2COL:
2652 ggml_cuda_op_im2col(ctx, dst);
2653 break;
2654 case GGML_OP_IM2COL_3D:
2655 ggml_cuda_op_im2col_3d(ctx, dst);
2656 break;
2657 case GGML_OP_CONV_2D:
2658 ggml_cuda_op_conv2d(ctx, dst);
2659 break;
2660 case GGML_OP_CONV_2D_DW:
2661 ggml_cuda_op_conv2d_dw(ctx, dst);
2662 break;
2663 case GGML_OP_CONV_TRANSPOSE_2D:
2664 ggml_cuda_conv_2d_transpose_p0(ctx, dst);
2665 break;
2666 case GGML_OP_CONV_TRANSPOSE_1D:
2667 ggml_cuda_op_conv_transpose_1d(ctx,dst);
2668 break;
2669 case GGML_OP_POOL_2D:
2670 ggml_cuda_op_pool2d(ctx, dst);
2671 break;
2672 case GGML_OP_SUM:
2673 ggml_cuda_op_sum(ctx, dst);
2674 break;
2675 case GGML_OP_SUM_ROWS:
2676 ggml_cuda_op_sum_rows(ctx, dst);
2677 break;
2678 case GGML_OP_MEAN:
2679 ggml_cuda_op_mean(ctx, dst);
2680 break;
2681 case GGML_OP_SSM_CONV:
2682 ggml_cuda_op_ssm_conv(ctx, dst);
2683 break;
2684 case GGML_OP_SSM_SCAN:
2685 ggml_cuda_op_ssm_scan(ctx, dst);
2686 break;
2687 case GGML_OP_ARGSORT:
2688 ggml_cuda_op_argsort(ctx, dst);
2689 break;
2690 case GGML_OP_FLASH_ATTN_EXT:
2691 ggml_cuda_flash_attn_ext(ctx, dst);
2692 break;
2693 case GGML_OP_CROSS_ENTROPY_LOSS:
2694 ggml_cuda_cross_entropy_loss(ctx, dst);
2695 break;
2696 case GGML_OP_RWKV_WKV6:
2697 ggml_cuda_op_rwkv_wkv6(ctx, dst);
2698 break;
2699 case GGML_OP_GATED_LINEAR_ATTN:
2700 ggml_cuda_op_gated_linear_attn(ctx, dst);
2701 break;
2702 case GGML_OP_RWKV_WKV7:
2703 ggml_cuda_op_rwkv_wkv7(ctx, dst);
2704 break;
2705 case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
2706 ggml_cuda_cross_entropy_loss_back(ctx, dst);
2707 break;
2708 case GGML_OP_OPT_STEP_ADAMW:
2709 ggml_cuda_opt_step_adamw(ctx, dst);
2710 break;
2711 case GGML_OP_OPT_STEP_SGD:
2712 ggml_cuda_opt_step_sgd(ctx, dst);
2713 break;
2714 default:
2715 return false;
2716 }
2717
2718 cudaError_t err = cudaGetLastError();
2719 if (err != cudaSuccess) {
2720 GGML_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
2721 CUDA_CHECK(err);
2722 }
2723
2724 return true;
2725}
2726
2727////////////////////////////////////////////////////////////////////////////////
2728
2729// backend
2730
2731static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
2732 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2733
2734 return cuda_ctx->name.c_str();
2735}
2736
2737static void ggml_backend_cuda_free(ggml_backend_t backend) {
2738 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2739
2740 delete cuda_ctx;
2741 delete backend;
2742}
2743
2744static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2745 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2746 ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2747
2748 GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2749
2750 CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
2751}
2752
2753static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2754 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2755 ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
2756
2757 GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
2758
2759 CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
2760}
2761
2762static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
2763 ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2764 ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2765
2766 if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
2767 return false;
2768 }
2769
2770 if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
2771 return false;
2772 }
2773
2774 // device -> device copy
2775 ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
2776 ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
2777
2778 ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
2779 ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
2780
2781 if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
2782#ifndef NDEBUG
2783 GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
2784#endif
2785 return false;
2786 }
2787
2788 if (backend_src != backend_dst) {
2789 // copy on src stream
2790 if (cuda_ctx_src->device == cuda_ctx_dst->device) {
2791 CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
2792 } else {
2793#ifdef GGML_CUDA_NO_PEER_COPY
2794 return false;
2795#else
2796 CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
2797#endif
2798 }
2799
2800 // record event on src stream after the copy
2801 if (!cuda_ctx_src->copy_event) {
2802 ggml_cuda_set_device(cuda_ctx_src->device);
2803 CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
2804 }
2805
2806 CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
2807
2808 // wait on dst stream for the copy to complete
2809 CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
2810 } else {
2811 // src and dst are on the same backend
2812 CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
2813 }
2814 return true;
2815}
2816
2817static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
2818 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2819
2820 CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
2821
2822 GGML_UNUSED(backend);
2823}
2824
2825#ifdef USE_CUDA_GRAPH
2826static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
2827 bool use_cuda_graph) {
2828
2829 // Loop over nodes in GGML graph to obtain info needed for CUDA graph
2830
2831 const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
2832 const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
2833 const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
2834 const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
2835 const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
2836 const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
2837 const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
2838
2839 for (int i = 0; i < cgraph->n_nodes; i++) {
2840 ggml_tensor * node = cgraph->nodes[i];
2841
2842 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2843 continue;
2844 }
2845
2846 if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
2847 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
2848#ifndef NDEBUG
2849 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
2850#endif
2851 }
2852
2853 if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
2854 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
2855#ifndef NDEBUG
2856 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
2857#endif
2858 }
2859
2860 if (node->op == GGML_OP_ADD &&
2861 node->src[1] && node->src[1]->ne[1] > 1 &&
2862 (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
2863 (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
2864 strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
2865 strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
2866 strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
2867 strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
2868 strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
2869 // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
2870 // by means of matching node names. See
2871 // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
2872 // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
2873 // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
2874 use_cuda_graph = false;
2875#ifndef NDEBUG
2876 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
2877#endif
2878 }
2879
2880 if (!use_cuda_graph) {
2881 break;
2882 }
2883 }
2884
2885 return use_cuda_graph;
2886}
2887
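// Records the address, op, shape, strides, source addresses and op params of a graph node so that the next
// evaluation can cheaply detect whether the captured CUDA graph is still valid or needs to be re-captured.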
2888static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2889 graph_node_properties->node_address = node->data;
2890 graph_node_properties->node_op = node->op;
2891 for (int i = 0; i < GGML_MAX_DIMS; i++) {
2892 graph_node_properties->ne[i] = node->ne[i];
2893 graph_node_properties->nb[i] = node->nb[i];
2894 }
2895 for (int i = 0; i < GGML_MAX_SRC; i++) {
2896 graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
2897 }
2898 memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
2899}
2900
2901static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2902 if (node->data != graph_node_properties->node_address &&
2903 node->op != GGML_OP_VIEW) {
2904 return false;
2905 }
2906
2907 if (node->op != graph_node_properties->node_op) {
2908 return false;
2909 }
2910
2911 for (int i = 0; i < GGML_MAX_DIMS; i++) {
2912 if (node->ne[i] != graph_node_properties->ne[i]) {
2913 return false;
2914 }
2915 if (node->nb[i] != graph_node_properties->nb[i]) {
2916 return false;
2917 }
2918 }
2919
2920 for (int i = 0; i < GGML_MAX_SRC; i++) {
2921 if (node->src[i] &&
2922 node->src[i]->data != graph_node_properties->src_address[i] &&
2923 node->op != GGML_OP_VIEW
2924 ) {
2925 return false;
2926 }
2927 }
2928
2929 if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
2930 memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
2931 return false;
2932 }
2933
2934 return true;
2935}
2936
2937static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
2938
2939 bool cuda_graph_update_required = false;
2940
2941 if (cuda_ctx->cuda_graph->instance == nullptr) {
2942 cuda_graph_update_required = true;
2943 }
2944
2945 // Check if the graph size has changed
2946 if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
2947 cuda_graph_update_required = true;
2948 cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2949 }
2950
2951 // Loop over nodes in GGML graph to determine if CUDA graph update is required
2952 // and store properties to allow this comparison for the next token
2953 for (int i = 0; i < cgraph->n_nodes; i++) {
2954 bool has_matching_properties = true;
2955 if (!cuda_graph_update_required) {
2956 has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2957 }
2958 if (!has_matching_properties) {
2959 cuda_graph_update_required = true;
2960 }
2961 set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
2962 }
2963
2964 return cuda_graph_update_required;
2965}
2966
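// Try to update the existing graph executable in place; if cudaGraphExecUpdate reports that the
// instantiation constraints were violated, clear the error and re-instantiate the executable from
// the newly captured graph.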
2967static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
2968
2969#if CUDART_VERSION >= 12000
2970 cudaGraphExecUpdateResultInfo result_info;
2971 cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
2972#else
2973 cudaGraphNode_t errorNode;
2974 cudaGraphExecUpdateResult result_info;
2975 cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
2976#endif // CUDART_VERSION >= 12000
2977
2978 if (stat == cudaErrorGraphExecUpdateFailure) {
2979#ifndef NDEBUG
2980 GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
2981#endif
2982
2983 // The pre-existing graph exec cannot be updated due to violated constraints
2984 // so instead clear error and re-instantiate
2985 (void)cudaGetLastError();
2986 CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
2987 cuda_ctx->cuda_graph->instance = nullptr;
2988 CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
2989 } else {
2990 GGML_ASSERT(stat == cudaSuccess);
2991 }
2992}
2993#endif
2994
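// Returns true if the op sequence `ops` starting at node_idx forms one of the fusion patterns
// handled by this backend (fused top-k MoE subgraphs, mul_mat(+bias)+GLU, RMS_NORM+MUL(+ADD),
// scale+tanh+scale softcap) and the operand types/layouts are supported by the fused kernels.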
2995static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
2996#ifndef NDEBUG
2997 const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
2998 GGML_ASSERT(unary_ops.size() == num_unary);
2999#endif
3000
3001 //TODO: remove special case once ggml_can_fuse can handle empty nodes
3002 std::initializer_list<enum ggml_op> topk_moe_ops =
3003 ggml_cuda_topk_moe_ops(/*with_norm*/ false, /*delayed_softmax=*/false);
3004 std::initializer_list<enum ggml_op> topk_moe_ops_with_norm =
3005 ggml_cuda_topk_moe_ops(/*with_norm=*/true, /*delayed_softmax=*/false);
3006 std::initializer_list<enum ggml_op> topk_moe_ops_delayed_softmax =
3007 ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
3008
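// The three top-k MoE variants differ in subgraph length and in which intermediate nodes must
// remain live after fusion (the extra node indices passed to ggml_can_fuse_subgraph below).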
3009 if (ops.size() == topk_moe_ops_with_norm.size() &&
3010 ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
3011 ggml_tensor * softmax = cgraph->nodes[node_idx];
3012 ggml_tensor * weights = cgraph->nodes[node_idx + 9];
3013
3014 if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
3015 return true;
3016 }
3017 }
3018
3019 if (ops.size() == topk_moe_ops.size() &&
3020 ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
3021 ggml_tensor * softmax = cgraph->nodes[node_idx];
3022 ggml_tensor * weights = cgraph->nodes[node_idx + 4];
3023 if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
3024 return true;
3025 }
3026 }
3027
3028 if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
3029 ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
3030 ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
3031 ggml_tensor * weights = cgraph->nodes[node_idx + 5];
3032
3033 if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
3034 return true;
3035 }
3036 }
3037
3038 std::initializer_list<enum ggml_op> mul_mat_bias_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_GLU };
3039 std::initializer_list<enum ggml_op> mul_mat_id_bias_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_GLU };
3040
3041 std::initializer_list<enum ggml_op> mul_mat_id_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU };
3042 std::initializer_list<enum ggml_op> mul_mat_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT, GGML_OP_GLU };
3043
3044 if (ops.size() == 5 && (ggml_can_fuse_subgraph(cgraph, node_idx, mul_mat_bias_glu_ops, {node_idx + 4}) ||
3045 ggml_can_fuse_subgraph(cgraph, node_idx, mul_mat_id_bias_glu_ops, {node_idx + 4}))) {
3046
3047 const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
3048 const ggml_tensor * ffn_gate_bias = cgraph->nodes[node_idx + 1];
3049 const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 2];
3050 const ggml_tensor * ffn_up_bias = cgraph->nodes[node_idx + 3];
3051 const ggml_tensor * glu = cgraph->nodes[node_idx + 4];
3052
3053 if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu, ffn_up_bias, ffn_gate_bias)) {
3054 return true;
3055 }
3056 }
3057
3058 if (ops.size() == 3 && (ggml_can_fuse_subgraph(cgraph, node_idx, mul_mat_glu_ops, {node_idx + 2}) ||
3059 ggml_can_fuse_subgraph(cgraph, node_idx, mul_mat_id_glu_ops, {node_idx + 2}))) {
3060
3061 const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
3062 const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 1];
3063 const ggml_tensor * glu = cgraph->nodes[node_idx + 2];
3064
3065 if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
3066 return true;
3067 }
3068 }
3069
3070 if (!ggml_can_fuse(cgraph, node_idx, ops)) {
3071 return false;
3072 }
3073
3074 if ((ops.size() == 2 || ops.size() == 3) && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) {
3075 const ggml_tensor *rms_norm = cgraph->nodes[node_idx];
3076 const ggml_tensor *mul = cgraph->nodes[node_idx+1];
3077 const ggml_tensor *add = nullptr;
3078
3079 if (ops.size() == 3 && ops.begin()[2] == GGML_OP_ADD) {
3080 add = cgraph->nodes[node_idx+2];
3081 }
3082
3083 GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
3084 GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
3085
3086 //rms norm only supports F32
3087 if (mul->src[0]->type != GGML_TYPE_F32 ||
3088 mul->src[1]->type != GGML_TYPE_F32 ||
3089 mul->type != GGML_TYPE_F32) {
3090 return false;
3091 }
3092
3093 if (add && (add->src[0]->type != GGML_TYPE_F32 ||
3094 add->src[1]->type != GGML_TYPE_F32 ||
3095 add->type != GGML_TYPE_F32) ) {
3096 return false;
3097 }
3098
3099 //if rms norm is the B operand, then we don't handle broadcast
3100 if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) {
3101 return false;
3102 }
3103
3104 //rms_norm kernel assumes contiguous rows
3105 if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
3106 return false;
3107 }
3108
3109 if (add && (!ggml_is_contiguous(add->src[0]) || !ggml_is_contiguous_rows(add->src[1]))) {
3110 return false;
3111 }
3112
3113 return true;
3114 }
3115
3116 if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
3117 && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
3118 const ggml_tensor *scale = cgraph->nodes[node_idx];
3119 const ggml_tensor *tanh = cgraph->nodes[node_idx+1];
3120 const ggml_tensor *scale2 = cgraph->nodes[node_idx+2];
3121
3122 GGML_ASSERT(scale->src[0]->type == GGML_TYPE_F32);
3123 GGML_ASSERT(scale->type == GGML_TYPE_F32);
3124
3125 if (ggml_get_unary_op(tanh) != GGML_UNARY_OP_TANH) {
3126 return false;
3127 }
3128
3129 // Check for bias
3130 if (ggml_get_op_params_f32(scale, 1) != 0.0f || ggml_get_op_params_f32(scale2, 1) != 0.0f) {
3131 return false;
3132 }
3133
3134 return true;
3135 }
3136
3137 return false;
3138}
3139
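// Executes the ggml graph node by node, applying the fusions checked above. When CUDA graph
// capture is active, the same pass records the kernels into a CUDA graph, which is then
// instantiated (or updated) and launched.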
3140static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
3141 bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
3142 // flag indicating whether the device is an integrated GPU
3143 const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
3144
3145 while (!graph_evaluated_or_captured) {
3146 // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
3147 // With the use of CUDA graphs, the execution will be performed by the graph launch.
3148 if (!use_cuda_graph || cuda_graph_update_required) {
3149
3150 [[maybe_unused]] int prev_i = 0;
3151
3152 for (int i = 0; i < cgraph->n_nodes; i++) {
3153 ggml_tensor * node = cgraph->nodes[i];
3154#ifdef GGML_CUDA_DEBUG
3155 const int nodes_fused = i - prev_i - 1;
3156 prev_i = i;
3157 if (nodes_fused > 0) {
3158 GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
3159 }
3160#endif
3161
3162 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
3163 continue;
3164 }
3165
3166 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
3167 if (!disable_fusion) {
3168
3169 if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
3170 ggml_tensor * weights = cgraph->nodes[i + 9];
3171 ggml_tensor * selected_experts = cgraph->nodes[i + 3];
3172 ggml_tensor * clamp = cgraph->nodes[i + 7];
3173 ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
3174 /*delayed softmax*/ false, clamp);
3175 i += 9;
3176 continue;
3177 }
3178
3179 if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
3180 ggml_tensor * weights = cgraph->nodes[i + 4];
3181 ggml_tensor * selected_experts = cgraph->nodes[i + 3];
3182 ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
3183 /*delayed softmax*/ false);
3184 i += 4;
3185 continue;
3186 }
3187
3188 if (ggml_cuda_can_fuse(cgraph, i,
3189 ggml_cuda_topk_moe_ops(/*with norm*/ false, /*delayed softmax*/ true), {})) {
3190 ggml_tensor * weights = cgraph->nodes[i + 5];
3191 ggml_tensor * ids = cgraph->nodes[i + 1];
3192
3193 ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, ids, /*with norm*/ false,
3194 /*delayed_softmax*/ true);
3195 i += 5;
3196 continue;
3197 }
3198
3199 if (node->op == GGML_OP_ADD) {
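// try to fuse a chain of consecutive ADD nodes in which each ADD consumes the previous
// result and all second operands share the same layout (up to 8 ADD nodes in one kernel)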
3200 int n_fuse = 0;
3201 ggml_op ops[8];
3202 std::fill(ops, ops + 8, GGML_OP_ADD);
3203
3204 for (; n_fuse <= 6; ++n_fuse){
3205 if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) {
3206 break;
3207 }
3208 if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) {
3209 break;
3210 }
3211 if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) {
3212 break;
3213 }
3214 }
3215
3216 n_fuse++;
3217
3218 if (n_fuse > 1) {
3219 for (int j = 0; j < n_fuse - 1; ++j) {
3220 node->src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
3221 }
3222 cgraph->nodes[i + n_fuse - 1]->data = node->data;
3223 ggml_cuda_op_fused_add(*cuda_ctx, node, n_fuse);
3224 i += n_fuse - 1;
3225
3226 continue;
3227 }
3228 }
3229
3230 bool fused_mul_mat_vec = false;
3231 int fused_node_count = 0;
3232
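// look for the gate/up projection pattern around a GLU node, either with biases
// (MUL_MAT, ADD, MUL_MAT, ADD, GLU) / (MUL_MAT_ID, ADD_ID, MUL_MAT_ID, ADD_ID, GLU) or
// without them (MUL_MAT, MUL_MAT, GLU) / (MUL_MAT_ID, MUL_MAT_ID, GLU); when supported,
// the whole pattern is executed by a single fused mul-mat-vec kernel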
3233 for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
3234 const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
3235
3236 if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
3237 ggml_tensor * glu = cgraph->nodes[i + 4];
3238 ggml_tensor * gate_bias_n = glu->src[0];
3239 ggml_tensor * up_bias_n = glu->src[1];
3240
3241 // we don't assume the order of {gate, up}; instead we infer it from the bias tensor
3242 ggml_tensor * gate_n = nullptr;
3243 ggml_tensor * up_n = nullptr;
3244
3245 if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
3246 gate_n = cgraph->nodes[i];
3247 up_n = cgraph->nodes[i + 2];
3248 } else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
3249 gate_n = cgraph->nodes[i + 2];
3250 up_n = cgraph->nodes[i];
3251 } else {
3252 continue;
3253 }
3254
3255 auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
3256 if (op_bias == GGML_OP_ADD) {
3257 if (bias_node->src[0] == mul_node) {
3258 return bias_node->src[1];
3259 }
3260 if (bias_node->src[1] == mul_node) {
3261 return bias_node->src[0];
3262 }
3263 return (ggml_tensor *) nullptr;
3264 }
3265 GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
3266 GGML_ASSERT(bias_node->src[0] == mul_node);
3267 return bias_node->src[1];
3268 };
3269
3270 ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op);
3271 ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
3272
3273 if (!up_bias_tensor || !gate_bias_tensor) {
3274 continue;
3275 }
3276
3277 // we don't support repeating adds
3278 if (bias_op == GGML_OP_ADD &&
3279 (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
3280 !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
3281 continue;
3282 }
3283
3284 const ggml_tensor * src0 = up_n->src[0];
3285 const ggml_tensor * src1 = up_n->src[1];
3286 const ggml_tensor * ids = up_n->src[2];
3287
3288 if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
3289 ggml_cuda_mm_fusion_args_host fusion_data{};
3290 fusion_data.gate = gate_n->src[0];
3291 fusion_data.x_bias = up_bias_tensor;
3292 fusion_data.gate_bias = gate_bias_tensor;
3293 fusion_data.glu_op = ggml_get_glu_op(glu);
3294
3295 ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
3296 fused_mul_mat_vec = true;
3297 fused_node_count = 5;
3298 break;
3299 }
3300
3301 if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
3302 ggml_cuda_mm_fusion_args_host fusion_data{};
3303 fusion_data.gate = gate_n->src[0];
3304 fusion_data.x_bias = up_bias_tensor;
3305 fusion_data.gate_bias = gate_bias_tensor;
3306 fusion_data.glu_op = ggml_get_glu_op(glu);
3307
3308 ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
3309 fused_mul_mat_vec = true;
3310 fused_node_count = 5;
3311 break;
3312 }
3313 } else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
3314 ggml_tensor * glu = cgraph->nodes[i + 2];
3315 ggml_tensor * gate = glu->src[0];
3316 ggml_tensor * up = glu->src[1];
3317
3318 bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1])
3319 || (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
3320
3321 if (!ok) continue;
3322
3323 const ggml_tensor * src0 = up->src[0];
3324 const ggml_tensor * src1 = up->src[1];
3325 const ggml_tensor * ids = up->src[2];
3326
3327 if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
3328 ggml_cuda_mm_fusion_args_host fusion_data{};
3329 fusion_data.gate = gate->src[0];
3330 fusion_data.glu_op = ggml_get_glu_op(glu);
3331
3332 ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
3333 fused_mul_mat_vec = true;
3334 fused_node_count = 3;
3335 break;
3336 }
3337
3338 if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
3339 ggml_cuda_mm_fusion_args_host fusion_data{};
3340 fusion_data.gate = gate->src[0];
3341 fusion_data.glu_op = ggml_get_glu_op(glu);
3342
3343 ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
3344 fused_mul_mat_vec = true;
3345 fused_node_count = 3;
3346 break;
3347 }
3348 }
3349 }
3350
3351 if (fused_mul_mat_vec) {
3352 i += fused_node_count - 1;
3353 continue;
3354 }
3355
3356 fused_mul_mat_vec = false;
3357 fused_node_count = 0;
3358
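// otherwise, try to fuse a lone MUL_MAT/MUL_MAT_ID with its following bias ADD/ADD_ID
// into the mul-mat-vec kernel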
3359 for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
3360 const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
3361
3362 if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
3363 continue;
3364 }
3365
3366 ggml_tensor * mm_node = cgraph->nodes[i];
3367 ggml_tensor * bias_node = cgraph->nodes[i + 1];
3368
3369 ggml_tensor * bias_tensor = nullptr;
3370 if (bias_op == GGML_OP_ADD) {
3371 if (bias_node->src[0] == mm_node) {
3372 bias_tensor = bias_node->src[1];
3373 } else if (bias_node->src[1] == mm_node) {
3374 bias_tensor = bias_node->src[0];
3375 } else {
3376 continue;
3377 }
3378 } else {
3379 if (bias_node->src[0] != mm_node) {
3380 continue;
3381 }
3382 bias_tensor = bias_node->src[1];
3383 }
3384
3385 const ggml_tensor * src0 = mm_node->src[0];
3386 const ggml_tensor * src1 = mm_node->src[1];
3387 const ggml_tensor * ids = mm_node->src[2];
3388
3389 if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
3390 continue;
3391 }
3392
3393 if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
3394 continue;
3395 }
3396
3397 ggml_cuda_mm_fusion_args_host fusion_data{};
3398 fusion_data.x_bias = bias_tensor;
3399
3400 if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
3401 ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
3402 fused_mul_mat_vec = true;
3403 fused_node_count = 2;
3404 break;
3405 }
3406
3407 if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
3408 ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
3409 fused_mul_mat_vec = true;
3410 fused_node_count = 2;
3411 break;
3412 }
3413 }
3414
3415 if (fused_mul_mat_vec) {
3416 i += fused_node_count - 1;
3417 continue;
3418 }
3419
3420 if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
3421 ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
3422 i += 2;
3423 continue;
3424 }
3425
3426 if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) {
3427 ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
3428 i++;
3429 continue;
3430 }
3431
3432 if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
3433 i += 2;
3434 ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
3435 continue;
3436 }
3437 }
3438#ifndef NDEBUG
3439 assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
3440 for (int j = 0; j < GGML_MAX_SRC; j++) {
3441 if (node->src[j] != nullptr) {
3442 assert(node->src[j]->buffer);
3443 assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
3444 ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
3445 }
3446 }
3447#else
3448 GGML_UNUSED(integrated);
3449#endif // NDEBUG
3450
3451 bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
3452 if (!ok) {
3453 GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
3454 }
3455 GGML_ASSERT(ok);
3456 }
3457 }
3458
3459#ifdef USE_CUDA_GRAPH
3460 if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
3461 if (cuda_ctx->cuda_graph->graph != nullptr) {
3462 CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
3463 cuda_ctx->cuda_graph->graph = nullptr;
3464 }
3465
3466 CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
3467 graph_evaluated_or_captured = true; // CUDA graph has been captured
3468
3469 std::lock_guard<std::mutex> lock(ggml_cuda_lock);
3470 if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
3471 ggml_cuda_lock_cv.notify_all();
3472 }
3473 } else {
3474 graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
3475 }
3476 }
3477
3478 if (use_cuda_graph) {
3479 if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
3480 CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
3481 }
3482 if (cuda_graph_update_required) { // Update graph executable
3483 update_cuda_graph_executable(cuda_ctx);
3484 }
3485 // Launch graph
3486 CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
3487#else
3488 graph_evaluated_or_captured = true;
3489#endif // USE_CUDA_GRAPH
3490 }
3491}
3492
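// Backend entry point for graph execution: decides whether CUDA graphs can be used for this
// graph (env var, GPU architecture, node compatibility, update churn), determines whether a
// re-capture is needed, starts stream capture if so, and delegates the actual evaluation to
// evaluate_and_capture_cuda_graph().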
3493static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
3494 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
3495
3496 ggml_cuda_set_device(cuda_ctx->device);
3497
3498#ifdef USE_CUDA_GRAPH
3499 static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
3500
3501 // Objects required for CUDA Graph
3502 if (cuda_ctx->cuda_graph == nullptr) {
3503 cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
3504 }
3505
3506 bool use_cuda_graph = true;
3507 bool cuda_graph_update_required = false;
3508
3509 if (cuda_ctx->cuda_graph->graph == nullptr) {
3510 if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
3511 cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
3512#ifndef NDEBUG
3513 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
3514#endif
3515 }
3516 }
3517
3518 // Disable CUDA graphs in the presence of the env var, an old GPU, a use case that is changing too rapidly,
3519 // or a previous graph capture failure.
3520 // Also disable for multi-GPU for now. TODO: investigate
3521 if (disable_cuda_graphs_due_to_env
3522 || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
3523 || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
3524 || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
3525 use_cuda_graph = false;
3526 }
3527
3528 if (use_cuda_graph) {
3529 cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
3530
3531 use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
3532
3533 // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
3534 if (use_cuda_graph && cuda_graph_update_required) {
3535 cuda_ctx->cuda_graph->number_consecutive_updates++;
3536 } else {
3537 cuda_ctx->cuda_graph->number_consecutive_updates = 0;
3538 }
3539
3540 if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
3541 cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
3542#ifndef NDEBUG
3543 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
3544#endif
3545 }
3546 }
3547
3548 if (use_cuda_graph && cuda_graph_update_required) {
3549 // Start CUDA graph capture
3550 {
3551 std::lock_guard<std::mutex> lock(ggml_cuda_lock);
3552 ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
3553 }
3554
3555 CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
3556 }
3557
3558#else
3559 bool use_cuda_graph = false;
3560 bool cuda_graph_update_required = false;
3561#endif // USE_CUDA_GRAPH
3562
3563 bool graph_evaluated_or_captured = false;
3564
3565 evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
3566
3567 return GGML_STATUS_SUCCESS;
3568}
3569
3570static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
3571 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
3572
3573 CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream()));
3574}
3575
3576static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
3577 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
3578
3579 if (ggml_backend_is_cuda(backend)) {
3580 CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0));
3581 } else {
3582#if 0
3583 // untested
3584 auto wait_fn = [](void * user_data) {
3585 ggml_backend_event_t event = (ggml_backend_event_t)user_data;
3586 ggml_backend_event_synchronize(event);
3587 };
3588
3589 CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
3590#endif
3591 GGML_ABORT("fatal error");
3592 }
3593}
3594
3595static const ggml_backend_i ggml_backend_cuda_interface = {
3596 /* .get_name = */ ggml_backend_cuda_get_name,
3597 /* .free = */ ggml_backend_cuda_free,
3598 /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
3599 /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
3600 /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
3601 /* .synchronize = */ ggml_backend_cuda_synchronize,
3602 /* .graph_plan_create = */ NULL,
3603 /* .graph_plan_free = */ NULL,
3604 /* .graph_plan_update = */ NULL,
3605 /* .graph_plan_compute = */ NULL,
3606 /* .graph_compute = */ ggml_backend_cuda_graph_compute,
3607 /* .event_record = */ ggml_backend_cuda_event_record,
3608 /* .event_wait = */ ggml_backend_cuda_event_wait,
3609 /* .graph_optimize = */ NULL,
3610};
3611
3612static ggml_guid_t ggml_backend_cuda_guid() {
3613 static ggml_guid guid = { 0x2c, 0xdd, 0xe8, 0x1c, 0x65, 0xb3, 0x65, 0x73, 0x6a, 0x12, 0x88, 0x61, 0x1c, 0xc9, 0xdc, 0x25 };
3614 return &guid;
3615}
3616
3617bool ggml_backend_is_cuda(ggml_backend_t backend) {
3618 return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cuda_guid());
3619}
3620
3621int ggml_backend_cuda_get_device_count() {
3622 return ggml_cuda_info().device_count;
3623}
3624
3625void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
3626 cudaDeviceProp prop;
3627 CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
3628 snprintf(description, description_size, "%s", prop.name);
3629}
3630
3631void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
3632 ggml_cuda_set_device(device);
3633
3634 CUDA_CHECK(cudaMemGetInfo(free, total));
3635}
3636
3637bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
3638 if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3639 return false;
3640 }
3641
3642#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA) || defined(GGML_USE_HIP)
3643 cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
3644 if (err != cudaSuccess) {
3645 // clear the error
3646 (void)cudaGetLastError();
3647
3648 GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
3649 size / 1024.0 / 1024.0, cudaGetErrorString(err));
3650 return false;
3651 }
3652 return true;
3653#else
3654 GGML_UNUSED(buffer);
3655 GGML_UNUSED(size);
3656 return false;
3657#endif // CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA) || defined(GGML_USE_HIP)
3658}
3659
3660void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
3661 if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
3662 return;
3663 }
3664
3665 cudaError_t err = cudaHostUnregister(buffer);
3666 if (err != cudaSuccess) {
3667 // clear the error
3668 (void)cudaGetLastError();
3669 }
3670}
3671
3672
3673// backend device
3674
3675struct ggml_backend_cuda_device_context {
3676 int device;
3677 std::string name;
3678 std::string description;
3679 std::string pci_bus_id;
3680};
3681
3682static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
3683 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3684 return ctx->name.c_str();
3685}
3686
3687static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t dev) {
3688 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3689 return ctx->description.c_str();
3690}
3691
3692static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
3693 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3694 ggml_cuda_set_device(ctx->device);
3695 CUDA_CHECK(cudaMemGetInfo(free, total));
3696}
3697
3698static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
3699 GGML_UNUSED(dev);
3700 return GGML_BACKEND_DEVICE_TYPE_GPU;
3701}
3702
3703static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
3704 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3705
3706 props->name = ggml_backend_cuda_device_get_name(dev);
3707 props->description = ggml_backend_cuda_device_get_description(dev);
3708 props->type = ggml_backend_cuda_device_get_type(dev);
3709 props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
3710 ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
3711
3712 bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
3713#ifdef GGML_CUDA_NO_PEER_COPY
3714 bool events = false;
3715#else
3716 bool events = true;
3717#endif
3718
3719 props->caps = {
3720 /* .async = */ true,
3721 /* .host_buffer = */ host_buffer,
3722 /* .buffer_from_host_ptr = */ false,
3723 /* .events = */ events,
3724 };
3725}
3726
3727static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
3728 GGML_UNUSED(params);
3729 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3730 return ggml_backend_cuda_init(ctx->device);
3731}
3732
3733static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
3734 ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
3735 return ggml_backend_cuda_buffer_type(ctx->device);
3736}
3737
3738static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(ggml_backend_dev_t dev) {
3739 GGML_UNUSED(dev);
3740 return ggml_backend_cuda_host_buffer_type();
3741}
3742
3743// TODO: move these functions here
3744static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
3745 ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
3746
3747 // split buffers can only be used with GGML_OP_MUL_MAT
3748 if (op->op != GGML_OP_MUL_MAT) {
3749 for (int i = 0; i < GGML_MAX_SRC; i++) {
3750 if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
3751 return false;
3752 }
3753 }
3754 }
3755
3756 // check if all the sources are allocated on this device
3757 for (int i = 0; i < GGML_MAX_SRC; i++) {
3758 if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
3759 ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
3760 if (buft_ctx->device != dev_ctx->device) {
3761 return false;
3762 }
3763 }
3764 }
3765
3766 switch (op->op) {
3767 case GGML_OP_UNARY:
3768 switch (ggml_get_unary_op(op)) {
3769 case GGML_UNARY_OP_ABS:
3770 case GGML_UNARY_OP_SGN:
3771 case GGML_UNARY_OP_NEG:
3772 case GGML_UNARY_OP_STEP:
3773 case GGML_UNARY_OP_GELU:
3774 case GGML_UNARY_OP_SILU:
3775 case GGML_UNARY_OP_RELU:
3776 case GGML_UNARY_OP_SIGMOID:
3777 case GGML_UNARY_OP_HARDSIGMOID:
3778 case GGML_UNARY_OP_HARDSWISH:
3779 case GGML_UNARY_OP_GELU_ERF:
3780 case GGML_UNARY_OP_GELU_QUICK:
3781 case GGML_UNARY_OP_TANH:
3782 case GGML_UNARY_OP_EXP:
3783 case GGML_UNARY_OP_ELU:
3784 case GGML_UNARY_OP_FLOOR:
3785 case GGML_UNARY_OP_CEIL:
3786 case GGML_UNARY_OP_ROUND:
3787 case GGML_UNARY_OP_TRUNC:
3788 return ggml_is_contiguous(op->src[0]);
3789 default:
3790 return false;
3791 }
3792 break;
3793 case GGML_OP_GLU:
3794 switch (ggml_get_glu_op(op)) {
3795 case GGML_GLU_OP_REGLU:
3796 case GGML_GLU_OP_GEGLU:
3797 case GGML_GLU_OP_SWIGLU:
3798 case GGML_GLU_OP_SWIGLU_OAI:
3799 case GGML_GLU_OP_GEGLU_ERF:
3800 case GGML_GLU_OP_GEGLU_QUICK:
3801 return ggml_is_contiguous_1(op->src[0]);
3802 default:
3803 return false;
3804 }
3805 break;
3806 case GGML_OP_MUL_MAT:
3807 case GGML_OP_MUL_MAT_ID:
3808 {
3809 struct ggml_tensor * a = op->src[0];
3810 struct ggml_tensor * b = op->src[1];
3811 if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
3812 if (a->ne[2] > 1 || a->ne[3] > 1) {
3813 return false;
3814 }
3815 // for small weight matrices the active device can end up without any rows, don't use row split in those cases
3816 // this avoids some edge cases (and the performance would not be good anyways)
3817 ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
3818 int64_t row_low;
3819 int64_t row_high;
3820 get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
3821 if (row_low == row_high) {
3822 return false;
3823 }
3824 }
3825 if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
3826 return false;
3827 }
3828#ifdef GGML_USE_MUSA
3829 const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
3830 if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
3831 if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
3832 a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
3833 return false;
3834 }
3835 if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
3836 a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
3837 return false;
3838 }
3839 }
3840#endif // GGML_USE_MUSA
3841 switch (a->type) {
3842 case GGML_TYPE_F32:
3843 case GGML_TYPE_F16:
3844 case GGML_TYPE_Q4_0:
3845 case GGML_TYPE_Q4_1:
3846 case GGML_TYPE_Q5_0:
3847 case GGML_TYPE_Q5_1:
3848 case GGML_TYPE_Q8_0:
3849 case GGML_TYPE_MXFP4:
3850 case GGML_TYPE_Q2_K:
3851 case GGML_TYPE_Q3_K:
3852 case GGML_TYPE_Q4_K:
3853 case GGML_TYPE_Q5_K:
3854 case GGML_TYPE_Q6_K:
3855 case GGML_TYPE_Q8_K:
3856 case GGML_TYPE_IQ1_M:
3857 case GGML_TYPE_IQ1_S:
3858 case GGML_TYPE_IQ2_S:
3859 case GGML_TYPE_IQ2_XS:
3860 case GGML_TYPE_IQ2_XXS:
3861 case GGML_TYPE_IQ3_S:
3862 case GGML_TYPE_IQ3_XXS:
3863 case GGML_TYPE_IQ4_NL:
3864 case GGML_TYPE_IQ4_XS:
3865 case GGML_TYPE_BF16:
3866 return true;
3867 default:
3868 return false;
3869 }
3870 } break;
3871 case GGML_OP_OUT_PROD:
3872 return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
3873 case GGML_OP_GET_ROWS:
3874 {
3875 switch (op->src[0]->type) {
3876 case GGML_TYPE_F16:
3877 case GGML_TYPE_F32:
3878 case GGML_TYPE_BF16:
3879 case GGML_TYPE_I32:
3880 case GGML_TYPE_Q4_0:
3881 case GGML_TYPE_Q4_1:
3882 case GGML_TYPE_Q5_0:
3883 case GGML_TYPE_Q5_1:
3884 case GGML_TYPE_Q8_0:
3885 return true;
3886 default:
3887 return false;
3888 }
3889 } break;
3890 case GGML_OP_GET_ROWS_BACK:
3891 {
3892 return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
3893 } break;
3894 case GGML_OP_SET_ROWS:
3895 {
3896 return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
3897 op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
3898 op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
3899 op->src[0]->type == GGML_TYPE_F32 &&
3900 (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
3901 } break;
3902 case GGML_OP_SET:
3903 {
3904 const ggml_type t = op->type;
3905 return (t == GGML_TYPE_F32 || t == GGML_TYPE_I32) &&
3906 t == op->src[0]->type &&
3907 t == op->src[1]->type;
3908 } break;
3909 case GGML_OP_CPY:
3910 {
3911 ggml_type src0_type = op->src[0]->type;
3912 ggml_type src1_type = op->src[1]->type;
3913 if ((src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_F16) &&
3914 (src1_type == GGML_TYPE_F32 || src1_type == GGML_TYPE_BF16 || src1_type == GGML_TYPE_F16)
3915 ) {
3916 return true;
3917 }
3918 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
3919 return true;
3920 }
3921 if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
3922 return true;
3923 }
3924 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
3925 return true;
3926 }
3927 if (src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_F32) {
3928 return true;
3929 }
3930 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
3931 return true;
3932 }
3933 if (src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_F32) {
3934 return true;
3935 }
3936 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_0) {
3937 return true;
3938 }
3939 if (src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_F32) {
3940 return true;
3941 }
3942 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q5_1) {
3943 return true;
3944 }
3945 if (src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_F32) {
3946 return true;
3947 }
3948 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
3949 return true;
3950 }
3951 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) {
3952 return true;
3953 }
3954 if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) {
3955 return true;
3956 }
3957 if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
3958 return true;
3959 }
3960 return false;
3961 } break;
3962 case GGML_OP_DUP:
3963 {
3964 ggml_type src0_type = op->src[0]->type;
3965 return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3966 } break;
3967 case GGML_OP_ARGMAX:
3968 case GGML_OP_COUNT_EQUAL:
3969 {
3970 return true;
3971 } break;
3972 case GGML_OP_REPEAT:
3973 {
3974 ggml_type src0_type = op->src[0]->type;
3975 return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3976 } break;
3977 case GGML_OP_REPEAT_BACK:
3978 return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
3979 case GGML_OP_CONCAT:
3980 {
3981 ggml_type src0_type = op->src[0]->type;
3982 return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3983 } break;
3984 case GGML_OP_CONV_TRANSPOSE_1D:
3985 {
3986 ggml_type src0_type = op->src[0]->type;
3987 ggml_type src1_type = op->src[1]->type;
3988 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
3989 return true;
3990 }
3991 return false;
3992 } break;
3993 case GGML_OP_SILU_BACK:
3994 return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
3995 break;
3996 case GGML_OP_NORM:
3997 case GGML_OP_RMS_NORM:
3998 case GGML_OP_L2_NORM:
3999 return true;
4000 case GGML_OP_RMS_NORM_BACK:
4001 return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
4002 break;
4003 case GGML_OP_NONE:
4004 case GGML_OP_RESHAPE:
4005 case GGML_OP_VIEW:
4006 case GGML_OP_PERMUTE:
4007 case GGML_OP_TRANSPOSE:
4008 case GGML_OP_ADD:
4009 case GGML_OP_ADD_ID:
4010 case GGML_OP_ADD1:
4011 case GGML_OP_SUB:
4012 case GGML_OP_MUL:
4013 case GGML_OP_DIV:
4014 case GGML_OP_SCALE:
4015 case GGML_OP_SQR:
4016 case GGML_OP_SQRT:
4017 case GGML_OP_SIN:
4018 case GGML_OP_COS:
4019 case GGML_OP_CLAMP:
4020 case GGML_OP_LOG:
4021 return true;
4022 case GGML_OP_SSM_SCAN: {
4023 if (op->src[3]->ne[0] == 1) {
4024 // Mamba2
4025 // (kernel only supports (d_state == 128 || d_state == 256) && d_head % 16 == 0)
4026 return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % 16 == 0;
4027 } else {
4028 // Mamba
4029 // (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1)
4030 return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1;
4031 }
4032 }
4033 case GGML_OP_SSM_CONV: {
4034 // assumes d_inner % threads == 0
4035 return op->src[0]->ne[1] % 128 == 0;
4036 }
4037 case GGML_OP_CONT:
4038 return true;
4039 case GGML_OP_DIAG_MASK_INF:
4040 return true;
4041 case GGML_OP_SOFT_MAX:
4042 return true;
4043 case GGML_OP_SOFT_MAX_BACK: {
4044 float max_bias = 0.0f;
4045 memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
4046 return max_bias == 0.0f;
4047 }
4048 case GGML_OP_ROLL:
4049 if(op->src[0]->type == GGML_TYPE_F32) {
4050 return true;
4051 }
4052 return false;
4053 case GGML_OP_ROPE:
4054 case GGML_OP_ROPE_BACK: {
4055 return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
4056 }
4057 case GGML_OP_IM2COL:
4058 case GGML_OP_IM2COL_3D:
4059 case GGML_OP_CONV_2D:
4060 case GGML_OP_CONV_2D_DW:
4061 case GGML_OP_CONV_TRANSPOSE_2D:
4062 case GGML_OP_POOL_2D:
4063 case GGML_OP_ACC:
4064 return true;
4065 case GGML_OP_SUM:
4066 return ggml_is_contiguous_rows(op->src[0]);
4067 case GGML_OP_ARGSORT:
4068#ifndef GGML_CUDA_USE_CUB
4069 return op->src[0]->ne[0] <= 1024;
4070#else
4071 return true;
4072#endif
4073 case GGML_OP_SUM_ROWS:
4074 case GGML_OP_MEAN:
4075 case GGML_OP_GROUP_NORM:
4076 case GGML_OP_PAD:
4077 return ggml_is_contiguous(op->src[0]);
4078 case GGML_OP_UPSCALE:
4079 case GGML_OP_PAD_REFLECT_1D:
4080 case GGML_OP_ARANGE:
4081 case GGML_OP_TIMESTEP_EMBEDDING:
4082 case GGML_OP_LEAKY_RELU:
4083 case GGML_OP_RWKV_WKV6:
4084 case GGML_OP_GATED_LINEAR_ATTN:
4085 case GGML_OP_RWKV_WKV7:
4086 return true;
4087 case GGML_OP_FLASH_ATTN_EXT:
4088 return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
4089 case GGML_OP_CROSS_ENTROPY_LOSS:
4090 case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
4091 case GGML_OP_OPT_STEP_ADAMW:
4092 case GGML_OP_OPT_STEP_SGD:
4093 return true;
4094 default:
4095 return false;
4096 }
4097}
4098
4099static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
4100 ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
4101 const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
4102 return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
4103}
4104
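// Effective batch size of an op, used by the offload heuristic below: ops whose batch size is
// below min_batch_size are typically not worth the host-to-device transfer and stay on the CPU.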
4105static int64_t get_op_batch_size(const ggml_tensor * op) {
4106 switch (op->op) {
4107 case GGML_OP_GET_ROWS:
4108 return 0;
4109 case GGML_OP_MUL_MAT:
4110 return op->ne[1];
4111 case GGML_OP_MUL_MAT_ID:
4112 case GGML_OP_ROPE:
4113 case GGML_OP_ROPE_BACK:
4114 return op->ne[2];
4115 default:
4116 return ggml_nrows(op);
4117 }
4118}
4119
4120static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
4121 const int min_batch_size = 32;
4122
4123 return get_op_batch_size(op) >= min_batch_size;
4124
4125 GGML_UNUSED(dev);
4126}
4127
4128static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
4129#ifdef GGML_CUDA_NO_PEER_COPY
4130 return nullptr;
4131#else
4132 ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
4133
4134 ggml_cuda_set_device(dev_ctx->device);
4135
4136 cudaEvent_t event;
4137 CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
4138
4139 return new ggml_backend_event {
4140 /* .device = */ dev,
4141 /* .context = */ event,
4142 };
4143#endif
4144}
4145
4146static void ggml_backend_cuda_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
4147 GGML_UNUSED(dev);
4148
4149 CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
4150 delete event;
4151}
4152
4153static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
4154 GGML_UNUSED(dev);
4155 CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
4156}
4157
4158static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
4159 /* .get_name = */ ggml_backend_cuda_device_get_name,
4160 /* .get_description = */ ggml_backend_cuda_device_get_description,
4161 /* .get_memory = */ ggml_backend_cuda_device_get_memory,
4162 /* .get_type = */ ggml_backend_cuda_device_get_type,
4163 /* .get_props = */ ggml_backend_cuda_device_get_props,
4164 /* .init_backend = */ ggml_backend_cuda_device_init_backend,
4165 /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
4166 /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
4167 /* .buffer_from_host_ptr = */ NULL,
4168 /* .supports_op = */ ggml_backend_cuda_device_supports_op,
4169 /* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
4170 /* .offload_op = */ ggml_backend_cuda_device_offload_op,
4171 /* .event_new = */ ggml_backend_cuda_device_event_new,
4172 /* .event_free = */ ggml_backend_cuda_device_event_free,
4173 /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
4174};
4175
4176// backend reg
4177
4178struct ggml_backend_cuda_reg_context {
4179 std::vector<ggml_backend_dev_t> devices;
4180};
4181
4182static const char * ggml_backend_cuda_reg_get_name(ggml_backend_reg_t reg) {
4183 GGML_UNUSED(reg);
4184 return GGML_CUDA_NAME;
4185}
4186
4187static size_t ggml_backend_cuda_reg_get_device_count(ggml_backend_reg_t reg) {
4188 ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
4189 return ctx->devices.size();
4190}
4191
4192static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t reg, size_t index) {
4193 ggml_backend_cuda_reg_context * ctx = (ggml_backend_cuda_reg_context *)reg->context;
4194 GGML_ASSERT(index < ctx->devices.size());
4195 return ctx->devices[index];
4196}
4197
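// Reports the compile-time configuration of this backend (CUDA architectures, forced MMQ/cuBLAS,
// VMM, peer copy, graphs, ...) as a list of name/value pairs terminated by a null entry.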
4198static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
4199 static std::vector<ggml_backend_feature> features = []() {
4200 std::vector<ggml_backend_feature> features;
4201 #define _STRINGIFY(...) #__VA_ARGS__
4202 #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
4203
4204 #ifdef __CUDA_ARCH_LIST__
4205 features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
4206 #endif
4207
4208 #ifdef GGML_CUDA_FORCE_MMQ
4209 features.push_back({ "FORCE_MMQ", "1" });
4210 #endif
4211
4212 #ifdef GGML_CUDA_FORCE_CUBLAS
4213 features.push_back({ "FORCE_CUBLAS", "1" });
4214 #endif
4215
4216 #ifndef GGML_USE_VMM
4217 features.push_back({ "NO_VMM", "1" });
4218 #endif
4219
4220 #ifdef GGML_CUDA_NO_PEER_COPY
4221 features.push_back({ "NO_PEER_COPY", "1" });
4222 #endif
4223
4224 #ifdef GGML_CUDA_USE_GRAPHS
4225 features.push_back({ "USE_GRAPHS", "1" });
4226 #endif
4227
4228 #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
4229 features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
4230 #endif
4231
4232 #ifdef GGML_CUDA_FA_ALL_QUANTS
4233 features.push_back({ "FA_ALL_QUANTS", "1" });
4234 #endif
4235
4236 #undef _STRINGIFY
4237 #undef STRINGIFY
4238
4239 features.push_back({ nullptr, nullptr });
4240
4241 return features;
4242 }();
4243
4244 return features.data();
4245
4246 GGML_UNUSED(reg);
4247}
4248
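// Exposes optional CUDA-specific entry points by name so that callers loading the backend
// dynamically can resolve them without linking against this library directly.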
4249static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
4250 GGML_UNUSED(reg);
4251 if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
4252 return (void *)ggml_backend_cuda_split_buffer_type;
4253 }
4254 if (strcmp(name, "ggml_backend_register_host_buffer") == 0) {
4255 return (void *)ggml_backend_cuda_register_host_buffer;
4256 }
4257 if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
4258 return (void *)ggml_backend_cuda_unregister_host_buffer;
4259 }
4260 if (strcmp(name, "ggml_backend_get_features") == 0) {
4261 return (void *)ggml_backend_cuda_get_features;
4262 }
4263 return nullptr;
4264}
4265
4266static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
4267 /* .get_name = */ ggml_backend_cuda_reg_get_name,
4268 /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
4269 /* .get_device = */ ggml_backend_cuda_reg_get_device,
4270 /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
4271};
4272
4273// backend registry
4274ggml_backend_reg_t ggml_backend_cuda_reg() {
4275 static ggml_backend_reg reg;
4276 static bool initialized = false;
4277
4278 {
4279 static std::mutex mutex;
4280 std::lock_guard<std::mutex> lock(mutex);
4281 if (!initialized) {
4282 ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
4283
4284 for (int i = 0; i < ggml_cuda_info().device_count; i++) {
4285 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
4286 dev_ctx->device = i;
4287 dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
4288
4289 cudaDeviceProp prop;
4290 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
4291 dev_ctx->description = prop.name;
4292
4293 char pci_bus_id[16] = {};
4294 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
4295 dev_ctx->pci_bus_id = pci_bus_id;
4296
4297 ggml_backend_dev_t dev = new ggml_backend_device {
4298 /* .iface = */ ggml_backend_cuda_device_interface,
4299 /* .reg = */ &reg,
4300 /* .context = */ dev_ctx
4301 };
4302 ctx->devices.push_back(dev);
4303 }
4304
4305 reg = ggml_backend_reg {
4306 /* .api_version = */ GGML_BACKEND_API_VERSION,
4307 /* .iface = */ ggml_backend_cuda_reg_interface,
4308 /* .context = */ ctx
4309 };
4310 }
4311
4312 initialized = true;
4313 }
4314
4315 return &reg;
4316}
4317
4318ggml_backend_t ggml_backend_cuda_init(int device) {
4319 if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
4320 GGML_LOG_ERROR("%s: invalid device %d\n", __func__, device);
4321 return nullptr;
4322 }
4323
4324 ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
4325 if (ctx == nullptr) {
4326 GGML_LOG_ERROR("%s: failed to allocate context\n", __func__);
4327 return nullptr;
4328 }
4329
4330 ggml_backend_t cuda_backend = new ggml_backend {
4331 /* .guid = */ ggml_backend_cuda_guid(),
4332 /* .iface = */ ggml_backend_cuda_interface,
4333 /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
4334 /* .context = */ ctx,
4335 };
4336
4337 return cuda_backend;
4338}
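
// Minimal usage sketch (illustrative only, not part of this translation unit; assumes the caller
// has created a ggml context `ctx` and built a graph `gf` via ggml_build_forward_expand):
//
//   ggml_backend_t backend = ggml_backend_cuda_init(0);
//   ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
//   // ... upload input data with ggml_backend_tensor_set ...
//   ggml_backend_graph_compute(backend, gf);
//   ggml_backend_buffer_free(buf);
//   ggml_backend_free(backend);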
4339
4340GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
4341