#pragma once

#include "ggml.h"
#include "ggml-impl.h"
#include "ggml-cuda.h"

#include <cstdint>
#include <memory>

#if defined(GGML_USE_HIP)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#if defined(GGML_USE_MUSA)
#define GGML_COMMON_DECL_MUSA
#define GGML_COMMON_IMPL_MUSA
#endif
#endif
#include "ggml-common.h"

#include <array>
#include <cassert>
#include <cfloat>
#include <cstdio>
#include <string>
#include <vector>

#if defined(GGML_USE_HIP)
#include "vendors/hip.h"
#elif defined(GGML_USE_MUSA)
#include "vendors/musa.h"
#else
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIP)

#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#define WARP_SIZE 32
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

#define GGML_CUDA_CC_PASCAL 600
#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define GGML_CUDA_CC_VOLTA 700
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_ADA_LOVELACE 890
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)

// AMD
// GCN/CDNA, wave size is 64
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum for acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

// Moore Threads
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons

#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)

#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
# define GGML_CUDA_USE_CUB
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070

#ifdef __CUDA_ARCH_LIST__
constexpr bool ggml_cuda_has_arch_impl(int) {
    return false;
}

template<class ... Archs>
constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
}

constexpr bool ggml_cuda_has_arch(const int arch) {
    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
}

constexpr int ggml_cuda_highest_compiled_arch_impl(const int /*arch*/, const int cur) {
    if (cur == 0) {
        return -1;
    }
    return cur;
}

template<class ... Archs>
constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
    if (first <= arch && first > cur) {
        return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
    } else {
        return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
    }
}

constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
    return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
}
#else
static int ggml_cuda_highest_compiled_arch(const int arch) {
    return arch;
}
#endif // __CUDA_ARCH_LIST__
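
// Illustrative example (not part of the upstream header): if the build compiles device code for
// compute capabilities 6.0, 7.0, 7.5 and 8.0, __CUDA_ARCH_LIST__ expands to 600,700,750,800, so
//   ggml_cuda_has_arch(750)              == true
//   ggml_cuda_highest_compiled_arch(890) == 800 // highest compiled arch not exceeding the device arch
// The concrete arch list here is an assumption for the example; the real list depends on the build flags.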

// ---------------------------------------------------------------------------------------------------------

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn) \
    do { \
        auto err_ = (err); \
        if (err_ != (success)) { \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
        } \
    } while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
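
// Usage sketch (illustrative, not part of the upstream header): wrap any CUDA runtime call that
// returns a cudaError_t so failures abort with file/line information, e.g.
//   CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, stream));
// dst, src, nbytes and stream are placeholder names for this example.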

#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        return cublasGetStatusString(err);
    }
#else
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        switch (err) {
            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
            default: return "unknown error";
        }
    }
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
    do { \
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
        const int id = ggml_cuda_get_device(); \
        if (!shared_memory_limit_raised[id]) { \
            CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
            shared_memory_limit_raised[id] = true; \
        } \
    } while (0)
#else
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
    do { \
        GGML_UNUSED(nbytes); \
    } while (0)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
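
// Usage sketch (illustrative, not part of the upstream header): raise the opt-in limit for dynamic
// shared memory once per device before launching a kernel that needs more than the default 48 KiB, e.g.
//   CUDA_SET_SHARED_MEMORY_LIMIT(my_kernel<block_size>, nbytes_shared);
//   my_kernel<block_size><<<grid, block, nbytes_shared, stream>>>(args);
// my_kernel, block_size, nbytes_shared, grid, block, stream and args are placeholder names.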

#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11010

#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
#define GGML_USE_VMM
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))

#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

// The Volta instructions are in principle available on Turing or newer but they are effectively unusable:
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#define VOLTA_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define TURING_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define AMPERE_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define CP_ASYNC_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)

static bool fp16_available(const int cc) {
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
}

static bool fast_fp16_available(const int cc) {
    return GGML_CUDA_CC_IS_AMD(cc) ||
        (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && ggml_cuda_highest_compiled_arch(cc) != 610);
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fast_fp16_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fp16_mma_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
}

static bool bf16_mma_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
}

static bool fp32_mma_hardware_available(const int cc) {
    return GGML_CUDA_CC_IS_CDNA(cc);
}

static bool amd_mfma_available(const int cc) {
#if !defined(GGML_HIP_NO_MMQ_MFMA)
    return GGML_CUDA_CC_IS_CDNA(cc);
#else
    return false;
#endif // !defined(GGML_HIP_NO_MMQ_MFMA)
}

static bool volta_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
}

static bool turing_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}

static bool ampere_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
}

static bool cp_async_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
}

static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
    return 64;
#else
    return 32;
#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
}

// Maximum number of bytes that can be copied in a single instruction.
static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() {
#ifdef GGML_USE_HIP
    return 16;
#else
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    return 16;
#else
    return 8;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#endif // GGML_USE_HIP
}


[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if defined(GGML_USE_HIP)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIP)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning

#if defined(GGML_USE_MUSA)
    __builtin_unreachable();
#endif // defined(GGML_USE_MUSA)
}

#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

// The compiler is not always able to unroll loops if they contain continue expressions.
// In such cases loop unrolling can still be achieved via recursion:
template <int n>
struct ggml_cuda_unroll {
    template <typename Func, typename... Args>
    __device__ void operator()(const Func & f, Args... args) const {
        f(n - 1, args...);
        ggml_cuda_unroll<n - 1>{}(f, args...);
    }
};

template <>
struct ggml_cuda_unroll<1> {
    template <typename Func, typename... Args>
    __device__ void operator()(const Func & f, Args... args) const {
        f(0, args...);
    }
};
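
// Usage sketch (illustrative, not part of the upstream header): the functor is invoked with the
// indices n-1 down to 0, all expanded at compile time, e.g.
//   ggml_cuda_unroll<4>{}([&](const int i) {
//       if (mask & (1 << i)) { return; } // a continue-like early exit that #pragma unroll may reject
//       acc += data[i];
//   });
// mask, acc and data are placeholder names for this example.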

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
    }
    return a;
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
    }
    return a;

#else
    NO_DEVICE_CODE;
    return a;
#endif // FP16_AVAILABLE
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_all(int x) {
    if (width == ggml_cuda_get_physical_warp_size()) {
        return __all_sync(0xffffffff, x);
    } else {
#pragma unroll
        for (int offset = width/2; offset > 0; offset >>= 1) {
            x = __shfl_xor_sync(0xffffffff, x, offset, width) && x;
        }
        return x;
    }
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_any(int x) {
    if (width == ggml_cuda_get_physical_warp_size()) {
        return __any_sync(0xffffffff, x);
    } else {
#pragma unroll
        for (int offset = width/2; offset > 0; offset >>= 1) {
            x = __shfl_xor_sync(0xffffffff, x, offset, width) || x;
        }
        return x;
    }
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
}
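
// Usage sketch (illustrative, not part of the upstream header): the XOR-shuffle butterfly leaves
// the reduced value in every lane of the warp, so no broadcast is needed afterwards, e.g.
//   float partial = 0.0f;
//   for (int i = threadIdx.x; i < n; i += WARP_SIZE) {
//       partial += data[i];
//   }
//   const float total = warp_reduce_sum(partial); // identical in all 32 lanes
// n and data are placeholder names for this example.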

static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE

#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
    return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
    return __hmax(a, b);
#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX

#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#endif // FP16_AVAILABLE
}

static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if defined(GGML_USE_HIP)
    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
#elif CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
#endif
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
}

#if (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || \
    (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK)
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
    return mask_low | mask_high;
}
#endif // (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK)

static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP)
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) || defined(RDNA4)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(RDNA1) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;

#else // defined(GGML_USE_HIP)

#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
    return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)

#endif // defined(GGML_USE_HIP)
}
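
// Worked example (illustrative, not part of the upstream header): each int is treated as four
// packed signed bytes whose pairwise products are accumulated into c, e.g.
//   ggml_cuda_dp4a(0x01020304, 0x01010101, 0) == 1 + 2 + 3 + 4 == 10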

static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float v, const float u) {
    acc += v*u;
}

static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v, const float2 u) {
    acc += v.x*u.x;
    acc += v.y*u.y;
}

static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
    asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
#else
#ifdef FAST_FP16_AVAILABLE
    const float2 tmp = __half22float2(v*u);
    acc += tmp.x + tmp.y;
#else
    const float2 tmpv = __half22float2(v);
    const float2 tmpu = __half22float2(u);
    acc += tmpv.x * tmpu.x;
    acc += tmpv.y * tmpu.y;
#endif // FAST_FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
}

static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) {
#ifdef FAST_FP16_AVAILABLE
    acc += v*u;
#else
    const float2 tmpv = __half22float2(v);
    const float2 tmpu = __half22float2(u);
    float2 tmpacc = __half22float2(acc);
    tmpacc.x += tmpv.x * tmpu.x;
    tmpacc.y += tmpv.y * tmpu.y;
    acc = make_half2(tmpacc.x, tmpacc.y);
#endif // FAST_FP16_AVAILABLE
}

// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
// Important: do not use this function if dst and src both point at registers.
// Due to the strict aliasing rule the compiler can do incorrect optimizations if src and dst have different types.
// The function is intended for copies between registers and SRAM/VRAM to make the compiler emit the right instructions.
// If dst and src point at different address spaces then they are guaranteed to not be aliased.
template <int nbytes, int alignment = 0>
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
    if constexpr (alignment != 0) {
        static_assert(nbytes % alignment == 0, "bad alignment");
    }
    constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment;

#pragma unroll
    for (int i = 0; i < nbytes/nb_per_cpy; ++i) {
        if constexpr (nb_per_cpy == 1) {
            ((char *) dst)[i] = ((const char *) src)[i];
        } else if constexpr (nb_per_cpy == 2) {
            ((short *) dst)[i] = ((const short *) src)[i];
        } else if constexpr (nb_per_cpy == 4) {
            ((int *) dst)[i] = ((const int *) src)[i];
        } else if constexpr (nb_per_cpy == 8) {
            ((int2 *) dst)[i] = ((const int2 *) src)[i];
        } else if constexpr (nb_per_cpy == 16) {
            ((int4 *) dst)[i] = ((const int4 *) src)[i];
        } else {
            static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
        }
    }
}
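
// Usage sketch (illustrative, not part of the upstream header): load a 16-byte tile from global
// memory into registers with a single 128-bit transfer, e.g.
//   half2 tile[4];                               // 4 * 4 bytes = 16 bytes held in registers
//   ggml_cuda_memcpy_1<sizeof(tile)>(tile, src); // src points at 16-byte-aligned global memory
// src is a placeholder name; with the second template parameter the copy can be split into smaller
// aligned chunks, e.g. ggml_cuda_memcpy_1<16, 8>(tile, src) issues two 8-byte copies instead.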

static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
#if CUDART_VERSION >= 12080
    const nv_bfloat16 e = __nv_cvt_e8m0_to_bf16raw(x);
    return (float) e;
#else
    uint32_t bits;
    if (x == 0) {
        bits = 0x00400000;
    } else {
        bits = (uint32_t) x << 23;
    }

    float result;
    memcpy(&result, &bits, sizeof(float));
    return result;
#endif // CUDART_VERSION >= 12080
}

// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
static const uint3 init_fastdiv_values(uint64_t d_64) {
    GGML_ASSERT(d_64 != 0);
    GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());

    uint32_t d = (uint32_t)d_64;

    // compute L = ceil(log2(d));
    uint32_t L = 0;
    while (L < 32 && (uint32_t{ 1 } << L) < d) {
        L++;
    }

    uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
    // pack divisor as well to reduce error surface
    return make_uint3(mp, L, d);
}

static __device__ __forceinline__ uint32_t fastdiv(uint32_t n, const uint3 fastdiv_values) {
    // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z>
    // fastdiv_values.z is unused and optimized away by the compiler.
    // Compute high 32 bits of n * mp
    const uint32_t hi = __umulhi(n, fastdiv_values.x);
    // add n, apply bit shift
    return (hi + n) >> fastdiv_values.y;
}

static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fastdiv_values) {
    // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
    return n - fastdiv(n, fastdiv_values) * fastdiv_values.z;
}

// Calculate both division and modulo at once, returns <n/divisor, n%divisor>
static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) {
    // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
    const uint32_t div_val = fastdiv(n, fastdiv_values);
    const uint32_t mod_val = n - div_val * fastdiv_values.z;
    return make_uint2(div_val, mod_val);
}
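
// Worked example (illustrative, not part of the upstream header): for d = 10,
// L = ceil(log2(10)) = 4 and mp = floor(2^32 * (16 - 10) / 10) + 1 = 2576980378, so for n = 137:
//   hi = __umulhi(137, 2576980378) = 82,  (82 + 137) >> 4 = 13 = 137 / 10
//   fastmodulo: 137 - 13 * 10 = 7 = 137 % 10
// The constants are precomputed once on the host via init_fastdiv_values(10) and passed to kernels.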

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);

static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return powf(base, exph);
}
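
// Background sketch (illustrative, not part of the upstream header): following the ALiBi scheme, the
// callers are expected to pass m0 = 2^(-max_bias / n_head_log2) and m1 = 2^(-max_bias / (2*n_head_log2)),
// with n_head_log2 the largest power of two not exceeding the head count; head h then receives the
// slope base^exph computed above, e.g. with max_bias = 8 and n_head_log2 = 8, head 0 gets 2^-1 = 0.5.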

template <ggml_type type>
struct ggml_cuda_type_traits;

template<>
struct ggml_cuda_type_traits<GGML_TYPE_F16> {
    static constexpr int qk = 1;
    static constexpr int qr = 1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
    static constexpr int qk = QK4_0;
    static constexpr int qr = QR4_0;
    static constexpr int qi = QI4_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
    static constexpr int qk = QK4_1;
    static constexpr int qr = QR4_1;
    static constexpr int qi = QI4_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
    static constexpr int qk = QK5_0;
    static constexpr int qr = QR5_0;
    static constexpr int qi = QI5_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
    static constexpr int qk = QK5_1;
    static constexpr int qr = QR5_1;
    static constexpr int qi = QI5_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
    static constexpr int qk = QK8_0;
    static constexpr int qr = QR8_0;
    static constexpr int qi = QI8_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
    static constexpr int qk = QK_MXFP4;
    static constexpr int qr = QR_MXFP4;
    static constexpr int qi = QI_MXFP4;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_K;
    static constexpr int qi = QI2_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_K;
    static constexpr int qi = QI3_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_K;
    static constexpr int qi = QI4_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR5_K;
    static constexpr int qi = QI5_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR6_K;
    static constexpr int qi = QI6_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XXS;
    static constexpr int qi = QI2_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XS;
    static constexpr int qi = QI2_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_S;
    static constexpr int qi = QI2_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_XXS;
    static constexpr int qi = QI3_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_S;
    static constexpr int qi = QI1_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_M;
    static constexpr int qi = QI1_M;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_XS;
    static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_S;
    static constexpr int qi = QI3_S;
};

//////////////////////

struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
        int cc; // compute capability
        int nsm; // number of streaming multiprocessors
        size_t smpb; // max. shared memory per block
        size_t smpbo; // max. shared memory per block (with opt-in)
        bool integrated; // Device is integrated as opposed to discrete
        bool vmm; // virtual memory support
        size_t vmm_granularity; // granularity of virtual memory
        size_t total_vram;
        int warp_size; // Number of threads in a dispatch
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

const ggml_cuda_device_info & ggml_cuda_info();

void ggml_cuda_set_device(int device);
int ggml_cuda_get_device();

struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    ggml_cuda_pool_alloc() = default;

    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
};
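
// Usage sketch (illustrative, not part of the upstream header): RAII allocation of scratch memory
// from a backend pool, returned to the pool when the object goes out of scope, e.g.
//   ggml_cuda_pool_alloc<float> tmp(ctx.pool(), ne); // ne floats from the context's device pool
//   float * dst_d = tmp.get();
// ctx, ne and dst_d are placeholder names for this example.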


// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};


#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
#define USE_CUDA_GRAPH
#endif

struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t nb[GGML_MAX_DIMS];
    void * src_address[GGML_MAX_SRC];
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};

struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
    ~ggml_cuda_graph() {
        if (instance != nullptr) {
            CUDA_CHECK(cudaGraphExecDestroy(instance));
        }
        if (graph != nullptr) {
            CUDA_CHECK(cudaGraphDestroy(graph));
        }
    }
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
#endif
};

struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    std::unique_ptr<ggml_cuda_graph> cuda_graph;

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context();

    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    cudaStream_t stream() {
        return stream(device, 0);
    }

    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    ggml_cuda_pool & pool() {
        return pool(device);
    }
};

struct ggml_cuda_mm_fusion_args_host {
    const ggml_tensor * x_bias = nullptr;
    const ggml_tensor * gate = nullptr;
    const ggml_tensor * gate_bias = nullptr;
    ggml_glu_op glu_op;
};
struct ggml_cuda_mm_fusion_args_device {
    const void * x_bias = nullptr;
    const void * gate = nullptr;
    const void * gate_bias = nullptr;
    ggml_glu_op glu_op;
};