#pragma once

#include "ggml.h"
#include "ggml-impl.h"
#include "ggml-cuda.h"

#include <cstdint>
#include <memory>

#if defined(GGML_USE_HIP)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#if defined(GGML_USE_MUSA)
#define GGML_COMMON_DECL_MUSA
#define GGML_COMMON_IMPL_MUSA
#endif
#endif
#include "ggml-common.h"

#include <array>
#include <cassert>
#include <cfloat>
#include <cstdio>
#include <string>
#include <vector>

#if defined(GGML_USE_HIP)
#include "vendors/hip.h"
#elif defined(GGML_USE_MUSA)
#include "vendors/musa.h"
#else
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIP)

#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#define WARP_SIZE 32
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

#define GGML_CUDA_CC_PASCAL 600
#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define GGML_CUDA_CC_VOLTA 700
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_ADA_LOVELACE 890
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)

// AMD
// GCN/CDNA, wave size is 64
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum for acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

// Moore Threads
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons

#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)

#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
# define GGML_CUDA_USE_CUB
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070

#ifdef __CUDA_ARCH_LIST__
constexpr bool ggml_cuda_has_arch_impl(int) {
    return false;
}

template<class ... Archs>
constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
}

constexpr bool ggml_cuda_has_arch(const int arch) {
    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
}

constexpr int ggml_cuda_highest_compiled_arch_impl(const int /*arch*/, const int cur) {
    if (cur == 0) {
        return -1;
    }
    return cur;
}

template<class ... Archs>
constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
    if (first <= arch && first > cur) {
        return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
    } else {
        return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
    }
}

constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
    return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
}
#else
static int ggml_cuda_highest_compiled_arch(const int arch) {
    return arch;
}
#endif // __CUDA_ARCH_LIST__
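
// Illustrative example (not part of the upstream header): if the build compiles device code for
// compute capabilities 6.0, 7.0, 7.5 and 8.0, __CUDA_ARCH_LIST__ expands to 600,700,750,800, so
//   ggml_cuda_has_arch(750)              == true
//   ggml_cuda_highest_compiled_arch(890) == 800 // highest compiled arch not exceeding the device arch
// The concrete arch list here is an assumption for the example; the real list depends on the build flags.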

// ---------------------------------------------------------------------------------------------------------

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn) \
    do { \
        auto err_ = (err); \
        if (err_ != (success)) { \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
        } \
    } while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
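
// Usage sketch (illustrative, not part of the upstream header): wrap any CUDA runtime call that
// returns a cudaError_t so failures abort with file/line information, e.g.
//   CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, stream));
// dst, src, nbytes and stream are placeholder names for this example.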

#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        return cublasGetStatusString(err);
    }
#else
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        switch (err) {
            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
            default: return "unknown error";
        }
    }
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
    do { \
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
        const int id = ggml_cuda_get_device(); \
        if (!shared_memory_limit_raised[id]) { \
            CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
            shared_memory_limit_raised[id] = true; \
        } \
    } while (0)
#else
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
    do { \
        GGML_UNUSED(nbytes); \
    } while (0)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
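
// Usage sketch (illustrative, not part of the upstream header): raise the opt-in limit for dynamic
// shared memory once per device before launching a kernel that needs more than the default 48 KiB, e.g.
//   CUDA_SET_SHARED_MEMORY_LIMIT(my_kernel<block_size>, nbytes_shared);
//   my_kernel<block_size><<<grid, block, nbytes_shared, stream>>>(args);
// my_kernel, block_size, nbytes_shared, grid, block, stream and args are placeholder names.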

#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11010

#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
#define GGML_USE_VMM
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))

#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

// The Volta instructions are in principle available on Turing or newer but they are effectively unusable:
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#define VOLTA_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define TURING_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define AMPERE_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define CP_ASYNC_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)

static bool fp16_available(const int cc) {
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
}

static bool fast_fp16_available(const int cc) {
    return GGML_CUDA_CC_IS_AMD(cc) ||
        (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && ggml_cuda_highest_compiled_arch(cc) != 610);
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fast_fp16_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fp16_mma_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
}

static bool bf16_mma_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
}

static bool fp32_mma_hardware_available(const int cc) {
    return GGML_CUDA_CC_IS_CDNA(cc);
}

static bool amd_mfma_available(const int cc) {
#if !defined(GGML_HIP_NO_MMQ_MFMA)
    return GGML_CUDA_CC_IS_CDNA(cc);
#else
    return false;
#endif // !defined(GGML_HIP_NO_MMQ_MFMA)
}

static bool volta_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
}

static bool turing_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}

static bool ampere_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
}

static bool cp_async_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
}

static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
    return 64;
#else
    return 32;
#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
}

// Maximum number of bytes that can be copied in a single instruction.
static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() {
#ifdef GGML_USE_HIP
    return 16;
#else
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    return 16;
#else
    return 8;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // GGML_USE_HIP
}


[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if defined(GGML_USE_HIP)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIP)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning

#if defined(GGML_USE_MUSA)
    __builtin_unreachable();
#endif // defined(GGML_USE_MUSA)
}

#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

// The compiler is not always able to unroll loops if they contain continue expressions.
// In such cases loop unrolling can still be achieved via recursion:
template <int n>
struct ggml_cuda_unroll {
    template <typename Func, typename... Args>
    __device__ void operator()(const Func & f, Args... args) const {
        f(n - 1, args...);
        ggml_cuda_unroll<n - 1>{}(f, args...);
    }
};

template <>
struct ggml_cuda_unroll<1> {
    template <typename Func, typename... Args>
    __device__ void operator()(const Func & f, Args... args) const {
        f(0, args...);
    }
};
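
// Usage sketch (illustrative, not part of the upstream header): the functor is invoked with the
// indices n-1 down to 0, all expanded at compile time, e.g.
//   ggml_cuda_unroll<4>{}([&](const int i) {
//       if (mask & (1 << i)) { return; } // a continue-like early exit that #pragma unroll may reject
//       acc += data[i];
//   });
// mask, acc and data are placeholder names for this example.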

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
    }
    return a;
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
    }
    return a;

#else
    NO_DEVICE_CODE;
    return a;
#endif // FP16_AVAILABLE
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_all(int x) {
    if (width == ggml_cuda_get_physical_warp_size()) {
        return __all_sync(0xffffffff, x);
    } else {
#pragma unroll
        for (int offset = width/2; offset > 0; offset >>= 1) {
            x = __shfl_xor_sync(0xffffffff, x, offset, width) && x;
        }
        return x;
    }
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_any(int x) {
    if (width == ggml_cuda_get_physical_warp_size()) {
        return __any_sync(0xffffffff, x);
    } else {
#pragma unroll
        for (int offset = width/2; offset > 0; offset >>= 1) {
            x = __shfl_xor_sync(0xffffffff, x, offset, width) || x;
        }
        return x;
    }
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
}
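
// Usage sketch (illustrative, not part of the upstream header): the XOR-shuffle butterfly leaves
// the reduced value in every lane of the warp, so no broadcast is needed afterwards, e.g.
//   float partial = 0.0f;
//   for (int i = threadIdx.x; i < n; i += WARP_SIZE) {
//       partial += data[i];
//   }
//   const float total = warp_reduce_sum(partial); // identical in all 32 lanes
// n and data are placeholder names for this example.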

static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE

#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
    return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
    return __hmax(a, b);
#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX

#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#endif // FP16_AVAILABLE
}

static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if defined(GGML_USE_HIP)
    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
#elif CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
#endif
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
}

#if (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || \
    (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK)
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
    return mask_low | mask_high;
}
#endif // (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK)

static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP)
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) || defined(RDNA4)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(RDNA1) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;

#else // defined(GGML_USE_HIP)

#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
    return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)

#endif // defined(GGML_USE_HIP)
}
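
// Worked example (illustrative, not part of the upstream header): each int is treated as four
// packed signed bytes whose pairwise products are accumulated into c, e.g.
//   ggml_cuda_dp4a(0x01020304, 0x01010101, 0) == 1 + 2 + 3 + 4 == 10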

static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float v, const float u) {
    acc += v*u;
}

static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v, const float2 u) {
    acc += v.x*u.x;
    acc += v.y*u.y;
}

static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
    asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
#else
#ifdef FAST_FP16_AVAILABLE
    const float2 tmp = __half22float2(v*u);
    acc += tmp.x + tmp.y;
#else
    const float2 tmpv = __half22float2(v);
    const float2 tmpu = __half22float2(u);
    acc += tmpv.x * tmpu.x;
    acc += tmpv.y * tmpu.y;
#endif // FAST_FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
}

static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) {
#ifdef FAST_FP16_AVAILABLE
    acc += v*u;
#else
    const float2 tmpv = __half22float2(v);
    const float2 tmpu = __half22float2(u);
    float2 tmpacc = __half22float2(acc);
    tmpacc.x += tmpv.x * tmpu.x;
    tmpacc.y += tmpv.y * tmpu.y;
    acc = make_half2(tmpacc.x, tmpacc.y);
#endif // FAST_FP16_AVAILABLE
}

// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
// Important: do not use this function if dst and src both point at registers.
// Due to the strict aliasing rule the compiler can do incorrect optimizations if src and dst have different types.
// The function is intended for copies between registers and SRAM/VRAM to make the compiler emit the right instructions.
// If dst and src point at different address spaces then they are guaranteed to not be aliased.
template <int nbytes, int alignment = 0>
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
    if constexpr (alignment != 0) {
        static_assert(nbytes % alignment == 0, "bad alignment");
    }
    constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment;

#pragma unroll
    for (int i = 0; i < nbytes/nb_per_cpy; ++i) {
        if constexpr (nb_per_cpy == 1) {
            ((char *) dst)[i] = ((const char *) src)[i];
        } else if constexpr (nb_per_cpy == 2) {
            ((short *) dst)[i] = ((const short *) src)[i];
        } else if constexpr (nb_per_cpy == 4) {
            ((int *) dst)[i] = ((const int *) src)[i];
        } else if constexpr (nb_per_cpy == 8) {
            ((int2 *) dst)[i] = ((const int2 *) src)[i];
        } else if constexpr (nb_per_cpy == 16) {
            ((int4 *) dst)[i] = ((const int4 *) src)[i];
        } else {
            static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
        }
    }
}
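
// Usage sketch (illustrative, not part of the upstream header): load a 16-byte tile from global
// memory into registers with a single 128-bit transfer, e.g.
//   half2 tile[4];                               // 4 * 4 bytes = 16 bytes held in registers
//   ggml_cuda_memcpy_1<sizeof(tile)>(tile, src); // src points at 16-byte-aligned global memory
// src is a placeholder name; with the second template parameter the copy can be split into smaller
// aligned chunks, e.g. ggml_cuda_memcpy_1<16, 8>(tile, src) issues two 8-byte copies instead.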

static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
#if CUDART_VERSION >= 12080
    const nv_bfloat16 e = __nv_cvt_e8m0_to_bf16raw(x);
    return (float) e;
#else
    uint32_t bits;
    if (x == 0) {
        bits = 0x00400000;
    } else {
        bits = (uint32_t) x << 23;
    }

    float result;
    memcpy(&result, &bits, sizeof(float));
    return result;
#endif // CUDART_VERSION >= 12080
}

// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
static const uint3 init_fastdiv_values(uint64_t d_64) {
    GGML_ASSERT(d_64 != 0);
    GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());

    uint32_t d = (uint32_t)d_64;

    // compute L = ceil(log2(d));
    uint32_t L = 0;
    while (L < 32 && (uint32_t{ 1 } << L) < d) {
        L++;
    }

    uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
    // pack divisor as well to reduce error surface
    return make_uint3(mp, L, d);
}

static __device__ __forceinline__ uint32_t fastdiv(uint32_t n, const uint3 fastdiv_values) {
    // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z>
    // fastdiv_values.z is unused and optimized away by the compiler.
    // Compute high 32 bits of n * mp
    const uint32_t hi = __umulhi(n, fastdiv_values.x);
    // add n, apply bit shift
    return (hi + n) >> fastdiv_values.y;
}

static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fastdiv_values) {
    // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
    return n - fastdiv(n, fastdiv_values) * fastdiv_values.z;
}

// Calculate both division and modulo at once, returns <n/divisor, n%divisor>
static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) {
    // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
    const uint32_t div_val = fastdiv(n, fastdiv_values);
    const uint32_t mod_val = n - div_val * fastdiv_values.z;
    return make_uint2(div_val, mod_val);
}
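
// Worked example (illustrative, not part of the upstream header): for d = 10,
// L = ceil(log2(10)) = 4 and mp = floor(2^32 * (16 - 10) / 10) + 1 = 2576980378, so for n = 137:
//   hi = __umulhi(137, 2576980378) = 82,  (82 + 137) >> 4 = 13 = 137 / 10
//   fastmodulo: 137 - 13 * 10 = 7 = 137 % 10
// The constants are precomputed once on the host via init_fastdiv_values(10) and passed to kernels.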

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);

static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return powf(base, exph);
}
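
// Background sketch (illustrative, not part of the upstream header): following the ALiBi scheme, the
// callers are expected to pass m0 = 2^(-max_bias / n_head_log2) and m1 = 2^(-max_bias / (2*n_head_log2)),
// with n_head_log2 the largest power of two not exceeding the head count; head h then receives the
// slope base^exph computed above, e.g. with max_bias = 8 and n_head_log2 = 8, head 0 gets 2^-1 = 0.5.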

template <ggml_type type>
struct ggml_cuda_type_traits;

template<>
struct ggml_cuda_type_traits<GGML_TYPE_F16> {
    static constexpr int qk = 1;
    static constexpr int qr = 1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
    static constexpr int qk = QK4_0;
    static constexpr int qr = QR4_0;
    static constexpr int qi = QI4_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
    static constexpr int qk = QK4_1;
    static constexpr int qr = QR4_1;
    static constexpr int qi = QI4_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
    static constexpr int qk = QK5_0;
    static constexpr int qr = QR5_0;
    static constexpr int qi = QI5_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
    static constexpr int qk = QK5_1;
    static constexpr int qr = QR5_1;
    static constexpr int qi = QI5_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
    static constexpr int qk = QK8_0;
    static constexpr int qr = QR8_0;
    static constexpr int qi = QI8_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
    static constexpr int qk = QK_MXFP4;
    static constexpr int qr = QR_MXFP4;
    static constexpr int qi = QI_MXFP4;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_K;
    static constexpr int qi = QI2_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_K;
    static constexpr int qi = QI3_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_K;
    static constexpr int qi = QI4_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR5_K;
    static constexpr int qi = QI5_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR6_K;
    static constexpr int qi = QI6_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XXS;
    static constexpr int qi = QI2_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XS;
    static constexpr int qi = QI2_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_S;
    static constexpr int qi = QI2_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_XXS;
    static constexpr int qi = QI3_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_S;
    static constexpr int qi = QI1_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_M;
    static constexpr int qi = QI1_M;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_XS;
    static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_S;
    static constexpr int qi = QI3_S;
};

//////////////////////

struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
        int cc; // compute capability
        int nsm; // number of streaming multiprocessors
        size_t smpb; // max. shared memory per block
        size_t smpbo; // max. shared memory per block (with opt-in)
        bool integrated; // Device is integrated as opposed to discrete
        bool vmm; // virtual memory support
        size_t vmm_granularity; // granularity of virtual memory
        size_t total_vram;
        int warp_size; // Number of threads in a dispatch
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

const ggml_cuda_device_info & ggml_cuda_info();

void ggml_cuda_set_device(int device);
int ggml_cuda_get_device();

struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    ggml_cuda_pool_alloc() = default;

    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
};
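
// Usage sketch (illustrative, not part of the upstream header): RAII allocation of scratch memory
// from a backend pool, returned to the pool when the object goes out of scope, e.g.
//   ggml_cuda_pool_alloc<float> tmp(ctx.pool(), ne); // ne floats from the context's device pool
//   float * dst_d = tmp.get();
// ctx, ne and dst_d are placeholder names for this example.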


// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};


#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
#define USE_CUDA_GRAPH
#endif

struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t nb[GGML_MAX_DIMS];
    void * src_address[GGML_MAX_SRC];
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};

struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
    ~ggml_cuda_graph() {
        if (instance != nullptr) {
            CUDA_CHECK(cudaGraphExecDestroy(instance));
        }
        if (graph != nullptr) {
            CUDA_CHECK(cudaGraphDestroy(graph));
        }
    }
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
#endif
};

struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    std::unique_ptr<ggml_cuda_graph> cuda_graph;

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context();

    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    cudaStream_t stream() {
        return stream(device, 0);
    }

    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    ggml_cuda_pool & pool() {
        return pool(device);
    }
};

struct ggml_cuda_mm_fusion_args_host {
    const ggml_tensor * x_bias = nullptr;
    const ggml_tensor * gate = nullptr;
    const ggml_tensor * gate_bias = nullptr;
    ggml_glu_op glu_op;
};
struct ggml_cuda_mm_fusion_args_device {
    const void * x_bias = nullptr;
    const void * gate = nullptr;
    const void * gate_bias = nullptr;
    ggml_glu_op glu_op;
};