| 1 | #pragma once |
| 2 | |
| 3 | #include "ggml.h" |
| 4 | #include "ggml-impl.h" |
| 5 | #include "ggml-cuda.h" |
| 6 | |
| 7 | #include <cstdint> |
| 8 | #include <memory> |
| 9 | |
| 10 | #if defined(GGML_USE_HIP) |
| 11 | #define GGML_COMMON_DECL_HIP |
| 12 | #define GGML_COMMON_IMPL_HIP |
| 13 | #else |
| 14 | #define GGML_COMMON_DECL_CUDA |
| 15 | #define GGML_COMMON_IMPL_CUDA |
| 16 | #if defined(GGML_USE_MUSA) |
| 17 | #define GGML_COMMON_DECL_MUSA |
| 18 | #define GGML_COMMON_IMPL_MUSA |
| 19 | #endif |
| 20 | #endif |
| 21 | #include "ggml-common.h" |
| 22 | |
| 23 | #include <array> |
| 24 | #include <cassert> |
| 25 | #include <cfloat> |
| 26 | #include <cstdio> |
| 27 | #include <string> |
| 28 | #include <vector> |
| 29 | |
| 30 | #if defined(GGML_USE_HIP) |
| 31 | #include "vendors/hip.h" |
| 32 | #elif defined(GGML_USE_MUSA) |
| 33 | #include "vendors/musa.h" |
| 34 | #else |
| 35 | #include "vendors/cuda.h" |
| 36 | #endif // defined(GGML_USE_HIP) |
| 37 | |
| 38 | #define STRINGIZE_IMPL(...) #__VA_ARGS__ |
| 39 | #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) |
| 40 | |
| 41 | #define WARP_SIZE 32 |
| 42 | #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) |
| 43 | #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons |
| 44 | |
| 45 | #define GGML_CUDA_CC_PASCAL 600 |
| 46 | #define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products |
| 47 | #define GGML_CUDA_CC_VOLTA 700 |
| 48 | #define GGML_CUDA_CC_TURING 750 |
| 49 | #define GGML_CUDA_CC_AMPERE 800 |
| 50 | #define GGML_CUDA_CC_ADA_LOVELACE 890 |
| 51 | #define GGML_CUDA_CC_OFFSET_AMD 0x1000000 |
| 52 | #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000 |
| 53 | #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS) |
| 54 | |
| 55 | // AMD |
| 56 | // GCN/CDNA, wave size is 64 |
| 57 | #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16 |
| 58 | #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue |
| 59 | #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a |
| 60 | #define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers |
#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum for acc register renaming
| 62 | #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300 |
| 63 | |
| 64 | // RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32 |
| 65 | #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000 |
| 66 | #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a |
| 67 | #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA |
| 68 | #define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 |
| 69 | |
| 70 | #define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD) |
| 71 | #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) |
| 72 | #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) |
| 73 | #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) |
| 74 | #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4) |
| 75 | #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) |
| 76 | #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1) |
| 77 | #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1) |
| 78 | #define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2) |
| 79 | #define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3) |
| 80 | #define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) |
| 81 | |
| 82 | // Moore Threads |
| 83 | #define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons |
| 84 | |
| 85 | #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 |
| 86 | #define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 |
| 87 | #define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD |
| 88 | |
| 89 | #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD) |
| 90 | #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) |
| 91 | #define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG) |
| 92 | #define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG) |
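
// Example of how the vendor-specific compute capabilities are encoded (illustrative, derived from the macros above):
//   NVIDIA Ada Lovelace (sm_89)   -> cc = 890                                           (no offset)
//   AMD RX 7900 XTX (gfx1100)     -> cc = GGML_CUDA_CC_OFFSET_AMD      + 0x1100 == GGML_CUDA_CC_RDNA3
//   Moore Threads MTT S4000       -> cc = GGML_CUDA_CC_OFFSET_MTHREADS + 0x220  == GGML_CUDA_CC_QY2
// The GGML_CUDA_CC_IS_* checks above rely on these value ranges being disjoint.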
| 93 | |
| 94 | #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070 |
| 95 | # define GGML_CUDA_USE_CUB |
| 96 | #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070 |
| 97 | |
| 98 | #ifdef __CUDA_ARCH_LIST__ |
| 99 | constexpr bool ggml_cuda_has_arch_impl(int) { |
| 100 | return false; |
| 101 | } |
| 102 | |
| 103 | template<class ... Archs> |
| 104 | constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) { |
| 105 | return arch == first || ggml_cuda_has_arch_impl(arch, rest...); |
| 106 | } |
| 107 | |
| 108 | constexpr bool ggml_cuda_has_arch(const int arch) { |
| 109 | return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__); |
| 110 | } |
| 111 | |
| 112 | constexpr int ggml_cuda_highest_compiled_arch_impl(const int /*arch*/, const int cur) { |
| 113 | if (cur == 0) { |
| 114 | return -1; |
| 115 | } |
| 116 | return cur; |
| 117 | } |
| 118 | |
| 119 | template<class ... Archs> |
| 120 | constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) { |
| 121 | if (first <= arch && first > cur) { |
| 122 | return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...); |
| 123 | } else { |
| 124 | return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...); |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | constexpr int ggml_cuda_highest_compiled_arch(const int arch) { |
| 129 | return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__); |
| 130 | } |
| 131 | #else |
| 132 | static int ggml_cuda_highest_compiled_arch(const int arch) { |
| 133 | return arch; |
| 134 | } |
| 135 | #endif // __CUDA_ARCH_LIST__ |
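
// Illustrative example, assuming the translation unit is compiled such that __CUDA_ARCH_LIST__
// expands to 610,750,860:
//   ggml_cuda_has_arch(750)              == true
//   ggml_cuda_highest_compiled_arch(700) == 610  // highest compiled arch that does not exceed 700
//   ggml_cuda_highest_compiled_arch(890) == 860
//   ggml_cuda_highest_compiled_arch(520) == -1   // nothing was compiled for an arch this low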
| 136 | |
| 137 | // --------------------------------------------------------------------------------------------------------- |
| 138 | |
| 139 | #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses |
| 140 | |
| 141 | #define GGML_CUDA_MAX_STREAMS 8 |
| 142 | |
| 143 | [[noreturn]] |
| 144 | void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg); |
| 145 | |
| 146 | #define CUDA_CHECK_GEN(err, success, error_fn) \ |
| 147 | do { \ |
| 148 | auto err_ = (err); \ |
| 149 | if (err_ != (success)) { \ |
| 150 | ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \ |
| 151 | } \ |
| 152 | } while (0) |
| 153 | |
| 154 | #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) |
| 155 | |
| 156 | #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) |
| 157 | static const char * cublas_get_error_str(const cublasStatus_t err) { |
    return cublasGetStatusString(err);
| 159 | } |
| 160 | #else |
| 161 | static const char * cublas_get_error_str(const cublasStatus_t err) { |
| 162 | switch (err) { |
        case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
        default:                             return "unknown error";
| 173 | } |
| 174 | } |
#endif // CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
| 176 | |
| 177 | #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) |
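
// Typical usage (illustrative): wrap runtime API and cuBLAS calls so that a failure aborts with the
// file, line and a readable error string:
//   CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, stream));
//   CUBLAS_CHECK(cublasSetStream(handle, stream));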
| 178 | |
| 179 | #if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) |
| 180 | static const char * cu_get_error_str(CUresult err) { |
| 181 | const char * err_str; |
    cuGetErrorString(err, &err_str);
| 183 | return err_str; |
| 184 | } |
| 185 | #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) |
| 186 | #endif |
| 187 | |
| 188 | #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) |
| 189 | # define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \ |
| 190 | do { \ |
| 191 | static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \ |
| 192 | const int id = ggml_cuda_get_device(); \ |
| 193 | if (!shared_memory_limit_raised[id]) { \ |
| 194 | CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \ |
| 195 | shared_memory_limit_raised[id] = true; \ |
| 196 | } \ |
| 197 | } while (0) |
| 198 | #else |
| 199 | # define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \ |
| 200 | do { \ |
| 201 | GGML_UNUSED(nbytes); \ |
| 202 | } while (0) |
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
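
// Illustrative usage of CUDA_SET_SHARED_MEMORY_LIMIT from a host-side launcher (my_kernel, nbytes_shared,
// grid, block and stream are placeholders). Raising the limit is required before launching kernels that
// need more dynamic shared memory than the default limit (typically 48 KiB on NVIDIA GPUs):
//   CUDA_SET_SHARED_MEMORY_LIMIT((my_kernel<block_size>), nbytes_shared);
//   my_kernel<block_size><<<grid, block, nbytes_shared, stream>>>(/* args */);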
| 204 | |
| 205 | #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA) |
| 206 | #define GGML_CUDA_ASSUME(x) __builtin_assume(x) |
| 207 | #else |
| 208 | #define GGML_CUDA_ASSUME(x) |
| 209 | #endif // CUDART_VERSION >= 11010 |
| 210 | |
| 211 | #if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM)) |
| 212 | #define GGML_USE_VMM |
| 213 | #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM)) |
| 214 | |
| 215 | #if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL |
| 216 | #define FP16_AVAILABLE |
| 217 | #endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL |
| 218 | |
| 219 | #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 |
| 220 | #define FAST_FP16_AVAILABLE |
| 221 | #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 |
| 222 | |
| 223 | #if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA) |
| 224 | #define AMD_MFMA_AVAILABLE |
| 225 | #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA) |
| 226 | |
| 227 | // The Volta instructions are in principle available on Turing or newer but they are effectively unusable: |
| 228 | #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA |
| 229 | #define VOLTA_MMA_AVAILABLE |
| 230 | #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA |
| 231 | |
| 232 | #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING |
| 233 | #define TURING_MMA_AVAILABLE |
| 234 | #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING |
| 235 | |
| 236 | #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE |
| 237 | #define AMPERE_MMA_AVAILABLE |
| 238 | #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE |
| 239 | |
| 240 | #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE |
| 241 | #define CP_ASYNC_AVAILABLE |
| 242 | #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE |
| 243 | |
| 244 | #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) |
| 245 | #define FLASH_ATTN_AVAILABLE |
| 246 | #endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) |
| 247 | |
| 248 | static bool fp16_available(const int cc) { |
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
| 250 | } |
| 251 | |
| 252 | static bool fast_fp16_available(const int cc) { |
| 253 | return GGML_CUDA_CC_IS_AMD(cc) || |
        (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && ggml_cuda_highest_compiled_arch(cc) != 610);
| 255 | } |
| 256 | |
| 257 | // To be used for feature selection of external libraries, e.g. cuBLAS. |
| 258 | static bool fast_fp16_hardware_available(const int cc) { |
| 259 | return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) || |
| 260 | (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); |
| 261 | } |
| 262 | |
| 263 | // To be used for feature selection of external libraries, e.g. cuBLAS. |
| 264 | static bool fp16_mma_hardware_available(const int cc) { |
| 265 | return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || |
| 266 | GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) || |
| 267 | (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); |
| 268 | } |
| 269 | |
| 270 | static bool bf16_mma_hardware_available(const int cc) { |
| 271 | return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; |
| 272 | } |
| 273 | |
| 274 | static bool fp32_mma_hardware_available(const int cc) { |
| 275 | return GGML_CUDA_CC_IS_CDNA(cc); |
| 276 | } |
| 277 | |
| 278 | static bool amd_mfma_available(const int cc) { |
| 279 | #if !defined(GGML_HIP_NO_MMQ_MFMA) |
| 280 | return GGML_CUDA_CC_IS_CDNA(cc); |
| 281 | #else |
| 282 | return false; |
| 283 | #endif //!defined(GGML_HIP_NO_MMQ_MFMA) |
| 284 | } |
| 285 | |
| 286 | static bool volta_mma_available(const int cc) { |
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
| 288 | } |
| 289 | |
| 290 | static bool turing_mma_available(const int cc) { |
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
| 292 | } |
| 293 | |
| 294 | static bool ampere_mma_available(const int cc) { |
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
| 296 | } |
| 297 | |
| 298 | static bool cp_async_available(const int cc) { |
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
| 300 | } |
| 301 | |
| 302 | static constexpr __device__ int ggml_cuda_get_physical_warp_size() { |
| 303 | #if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__)) |
| 304 | return 64; |
| 305 | #else |
| 306 | return 32; |
| 307 | #endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__)) |
| 308 | } |
| 309 | |
| 310 | // Maximum number of bytes that can be copied in a single instruction. |
| 311 | static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() { |
| 312 | #ifdef GGML_USE_HIP |
| 313 | return 16; |
| 314 | #else |
| 315 | #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA |
| 316 | return 16; |
| 317 | #else |
| 318 | return 8; |
| 319 | #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA |
| 320 | #endif // GGML_USE_HIP |
| 321 | } |
| 322 | |
| 323 | |
| 324 | [[noreturn]] |
| 325 | static __device__ void no_device_code( |
| 326 | const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) { |
| 327 | |
| 328 | #if defined(GGML_USE_HIP) |
| 329 | printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n" , |
| 330 | file_name, line, function_name, arch); |
| 331 | GGML_UNUSED(arch_list); |
| 332 | #else |
| 333 | printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n" , |
| 334 | file_name, line, function_name, arch, arch_list); |
| 335 | #endif // defined(GGML_USE_HIP) |
| 336 | __trap(); |
| 337 | |
| 338 | GGML_UNUSED(no_device_code); // suppress unused function warning |
| 339 | |
| 340 | #if defined(GGML_USE_MUSA) |
| 341 | __builtin_unreachable(); |
| 342 | #endif // defined(GGML_USE_MUSA) |
| 343 | } |
| 344 | |
| 345 | #ifdef __CUDA_ARCH__ |
| 346 | #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__)) |
| 347 | #else |
| 348 | #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.") |
| 349 | #endif // __CUDA_ARCH__ |
| 350 | |
// The compiler is not always able to unroll loops if they contain continue expressions.
| 352 | // In such cases loop unrolling can still be achieved via recursion: |
| 353 | template <int n> |
| 354 | struct ggml_cuda_unroll { |
| 355 | template <typename Func, typename... Args> |
| 356 | __device__ void operator()(const Func & f, Args... args) const { |
| 357 | f(n - 1, args...); |
| 358 | ggml_cuda_unroll<n - 1>{}(f, args...); |
| 359 | } |
| 360 | }; |
| 361 | |
| 362 | template <> |
| 363 | struct ggml_cuda_unroll<1> { |
| 364 | template <typename Func, typename... Args> |
| 365 | __device__ void operator()(const Func & f, Args... args) const { |
| 366 | f(0, args...); |
| 367 | } |
| 368 | }; |
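
// Illustrative usage of ggml_cuda_unroll (skip, data and acc are placeholders): the functor invokes f
// with the indices n-1, n-2, ..., 0, so a loop body that uses `continue` can be expressed as a lambda
// with an early return:
//   ggml_cuda_unroll<4>{}([&](const int i) {
//       if (skip[i]) {
//           return; // takes the role of `continue`
//       }
//       acc += data[i];
//   });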
| 369 | |
| 370 | template<int width = WARP_SIZE> |
| 371 | static __device__ __forceinline__ int warp_reduce_sum(int x) { |
| 372 | #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE |
| 373 | return __reduce_add_sync(0xffffffff, x); |
| 374 | #else |
| 375 | #pragma unroll |
| 376 | for (int offset = width/2; offset > 0; offset >>= 1) { |
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
| 378 | } |
| 379 | return x; |
| 380 | #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE |
| 381 | } |
| 382 | |
| 383 | template<int width = WARP_SIZE> |
| 384 | static __device__ __forceinline__ float warp_reduce_sum(float x) { |
| 385 | #pragma unroll |
| 386 | for (int offset = width/2; offset > 0; offset >>= 1) { |
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
| 388 | } |
| 389 | return x; |
| 390 | } |
| 391 | |
| 392 | template<int width = WARP_SIZE> |
| 393 | static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { |
| 394 | #pragma unroll |
| 395 | for (int offset = width/2; offset > 0; offset >>= 1) { |
        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
| 398 | } |
| 399 | return a; |
| 400 | } |
| 401 | |
| 402 | template<int width = WARP_SIZE> |
| 403 | static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { |
| 404 | #ifdef FP16_AVAILABLE |
| 405 | #pragma unroll |
| 406 | for (int offset = width/2; offset > 0; offset >>= 1) { |
| 407 | a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width)); |
| 408 | } |
| 409 | return a; |
| 410 | |
| 411 | #else |
| 412 | NO_DEVICE_CODE; |
| 413 | return a; |
| 414 | #endif // FP16_AVAILABLE |
| 415 | } |
| 416 | |
| 417 | template<int width = WARP_SIZE> |
| 418 | static __device__ __forceinline__ int warp_reduce_all(int x) { |
| 419 | if (width == ggml_cuda_get_physical_warp_size()) { |
        return __all_sync(0xffffffff, x);
| 421 | } else { |
| 422 | #pragma unroll |
| 423 | for (int offset = width/2; offset > 0; offset >>= 1) { |
            x = __shfl_xor_sync(0xffffffff, x, offset, width) && x;
| 425 | } |
| 426 | return x; |
| 427 | } |
| 428 | } |
| 429 | |
| 430 | template<int width = WARP_SIZE> |
| 431 | static __device__ __forceinline__ int warp_reduce_any(int x) { |
| 432 | if (width == ggml_cuda_get_physical_warp_size()) { |
        return __any_sync(0xffffffff, x);
| 434 | } else { |
| 435 | #pragma unroll |
| 436 | for (int offset = width/2; offset > 0; offset >>= 1) { |
            x = __shfl_xor_sync(0xffffffff, x, offset, width) || x;
| 438 | } |
| 439 | return x; |
| 440 | } |
| 441 | } |
| 442 | |
| 443 | template<int width = WARP_SIZE> |
| 444 | static __device__ __forceinline__ float warp_reduce_max(float x) { |
| 445 | #pragma unroll |
| 446 | for (int offset = width/2; offset > 0; offset >>= 1) { |
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
| 448 | } |
| 449 | return x; |
| 450 | } |
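
// Illustrative usage of the warp_reduce_* helpers inside a kernel (vals is a placeholder): reduce one
// value per thread, after which every lane of the (sub-)warp holds the same result:
//   float v = vals[threadIdx.x];
//   v = warp_reduce_max(v);        // maximum over the full warp
//   v = warp_reduce_sum<16>(v);    // sum over groups of 16 consecutive lanes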
| 451 | |
| 452 | static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) { |
| 453 | #ifdef FP16_AVAILABLE |
| 454 | |
| 455 | #if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX |
| 456 | return __float2half(fmaxf(__half2float(a), __half2float(b))); |
| 457 | #else |
| 458 | return __hmax(a, b); |
| 459 | #endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX |
| 460 | |
| 461 | #else |
| 462 | NO_DEVICE_CODE; |
| 463 | GGML_UNUSED(b); |
| 464 | return a; |
| 465 | #endif // FP16_AVAILABLE |
| 466 | } |
| 467 | |
| 468 | static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) { |
| 469 | #if defined(GGML_USE_HIP) |
| 470 | return half2(__hmax(a.x, b.x), __hmax(a.y, b.y)); |
| 471 | #elif CUDART_VERSION >= CUDART_HMAX |
| 472 | return __hmax2(a, b); |
| 473 | #else |
| 474 | half2 ret; |
| 475 | reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b))); |
| 476 | reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b))); |
| 477 | return ret; |
| 478 | #endif |
| 479 | } |
| 480 | |
| 481 | template<int width = WARP_SIZE> |
| 482 | static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { |
| 483 | #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP) |
| 484 | #pragma unroll |
| 485 | for (int offset = width/2; offset > 0; offset >>= 1) { |
| 486 | x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width)); |
| 487 | } |
| 488 | return x; |
| 489 | #else |
| 490 | GGML_UNUSED(x); |
| 491 | NO_DEVICE_CODE; |
| 492 | #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP) |
| 493 | } |
| 494 | |
| 495 | #if (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || \ |
| 496 | (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK) |
| 497 | static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) { |
| 498 | const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b))); |
| 499 | const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b))); |
| 500 | return mask_low | mask_high; |
| 501 | } |
| 502 | #endif // (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK) |
| 503 | |
| 504 | static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { |
| 505 | #if defined(GGML_USE_HIP) |
| 506 | #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__) |
| 507 | c = __builtin_amdgcn_sdot4(a, b, c, false); |
| 508 | #elif defined(RDNA3) || defined(RDNA4) |
| 509 | c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); |
| 510 | #elif defined(RDNA1) || defined(__gfx900__) |
| 511 | int tmp1; |
| 512 | int tmp2; |
| 513 | asm("\n \ |
| 514 | v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ |
| 515 | v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ |
| 516 | v_add3_u32 %0, %1, %2, %0 \n \ |
| 517 | v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ |
| 518 | v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ |
| 519 | v_add3_u32 %0, %1, %2, %0 \n \ |
| 520 | " |
| 521 | : "+v" (c), "=&v" (tmp1), "=&v" (tmp2) |
| 522 | : "v" (a), "v" (b) |
| 523 | ); |
| 524 | #else |
| 525 | const int8x4_t va = reinterpret_cast<const int8x4_t&>(a); |
| 526 | const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b); |
| 527 | c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; |
| 528 | #endif |
| 529 | return c; |
| 530 | |
| 531 | #else // defined(GGML_USE_HIP) |
| 532 | |
| 533 | #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA) |
| 534 | return __dp4a(a, b, c); |
| 535 | #else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA) |
| 536 | const int8_t * a8 = (const int8_t *) &a; |
| 537 | const int8_t * b8 = (const int8_t *) &b; |
| 538 | return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3]; |
| 539 | #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA) |
| 540 | |
| 541 | #endif // defined(GGML_USE_HIP) |
| 542 | } |
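
// Example of the ggml_cuda_dp4a semantics: with a = 0x01020304 and b = 0x01010101, each interpreted as
// 4 signed bytes, ggml_cuda_dp4a(a, b, 10) == 10 + 4*1 + 3*1 + 2*1 + 1*1 == 20.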
| 543 | |
| 544 | static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float v, const float u) { |
| 545 | acc += v*u; |
| 546 | } |
| 547 | |
| 548 | static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v, const float2 u) { |
| 549 | acc += v.x*u.x; |
| 550 | acc += v.y*u.y; |
| 551 | } |
| 552 | |
| 553 | static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) { |
| 554 | #if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA)) |
| 555 | asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v" (acc) : "v" (v), "v" (u)); |
| 556 | #else |
| 557 | #ifdef FAST_FP16_AVAILABLE |
| 558 | const float2 tmp = __half22float2(v*u); |
| 559 | acc += tmp.x + tmp.y; |
| 560 | #else |
    const float2 tmpv = __half22float2(v);
    const float2 tmpu = __half22float2(u);
| 563 | acc += tmpv.x * tmpu.x; |
| 564 | acc += tmpv.y * tmpu.y; |
| 565 | #endif // FAST_FP16_AVAILABLE |
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
| 567 | } |
| 568 | |
| 569 | static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) { |
| 570 | #ifdef FAST_FP16_AVAILABLE |
| 571 | acc += v*u; |
| 572 | #else |
    const float2 tmpv = __half22float2(v);
    const float2 tmpu = __half22float2(u);
    float2 tmpacc = __half22float2(acc);
| 576 | tmpacc.x += tmpv.x * tmpu.x; |
| 577 | tmpacc.y += tmpv.y * tmpu.y; |
    acc = make_half2(tmpacc.x, tmpacc.y);
| 579 | #endif // FAST_FP16_AVAILABLE |
| 580 | } |
| 581 | |
| 582 | // Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD. |
| 583 | // Important: do not use this function if dst and src both point at registers. |
| 584 | // Due to the strict aliasing rule the compiler can do incorrect optimizations if src and dst have different types. |
| 585 | // The function is intended for copies between registers and SRAM/VRAM to make the compiler emit the right instructions. |
| 586 | // If dst and src point at different address spaces then they are guaranteed to not be aliased. |
| 587 | template <int nbytes, int alignment = 0> |
| 588 | static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) { |
| 589 | if constexpr (alignment != 0) { |
        static_assert(nbytes % alignment == 0, "bad alignment");
| 591 | } |
| 592 | constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment; |
| 593 | |
| 594 | #pragma unroll |
| 595 | for (int i = 0; i < nbytes/nb_per_cpy; ++i) { |
| 596 | if constexpr (nb_per_cpy == 1) { |
| 597 | ((char *) dst)[i] = ((const char *) src)[i]; |
| 598 | } else if constexpr (nb_per_cpy == 2) { |
| 599 | ((short *) dst)[i] = ((const short *) src)[i]; |
| 600 | } else if constexpr (nb_per_cpy == 4) { |
| 601 | ((int *) dst)[i] = ((const int *) src)[i]; |
| 602 | } else if constexpr (nb_per_cpy == 8) { |
| 603 | ((int2 *) dst)[i] = ((const int2 *) src)[i]; |
| 604 | } else if constexpr (nb_per_cpy == 16) { |
| 605 | ((int4 *) dst)[i] = ((const int4 *) src)[i]; |
| 606 | } else { |
            static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
| 608 | } |
| 609 | } |
| 610 | } |
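
// Illustrative usage of ggml_cuda_memcpy_1 (tile and offset are placeholders): copy 32 bytes from
// shared memory into registers as two 16-byte transfers:
//   float vals[8];
//   ggml_cuda_memcpy_1<sizeof(vals), 16>(vals, tile + offset);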
| 611 | |
| 612 | static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) { |
| 613 | #if CUDART_VERSION >= 12080 |
| 614 | const nv_bfloat16 e = __nv_cvt_e8m0_to_bf16raw(x); |
| 615 | return (float) e; |
| 616 | #else |
| 617 | uint32_t bits; |
| 618 | if (x == 0) { |
| 619 | bits = 0x00400000; |
| 620 | } else { |
| 621 | bits = (uint32_t) x << 23; |
| 622 | } |
| 623 | |
| 624 | float result; |
| 625 | memcpy(&result, &bits, sizeof(float)); |
| 626 | return result; |
#endif // CUDART_VERSION >= 12080
| 628 | } |
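
// E8M0 stores only an exponent: the decoded value is 2^(x - 127), with x == 0 mapping to 2^-127
// (encoded above as the subnormal 0x00400000). For example, ggml_cuda_e8m0_to_fp32(127) == 1.0f
// and ggml_cuda_e8m0_to_fp32(130) == 8.0f.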
| 629 | |
| 630 | // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. |
| 631 | // Precompute mp (m' in the paper) and L such that division |
| 632 | // can be computed using a multiply (high 32b of 64b result) |
| 633 | // and a shift: |
| 634 | // |
| 635 | // n/d = (mulhi(n, mp) + n) >> L; |
| 636 | static const uint3 init_fastdiv_values(uint64_t d_64) { |
| 637 | GGML_ASSERT(d_64 != 0); |
| 638 | GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max()); |
| 639 | |
| 640 | uint32_t d = (uint32_t)d_64; |
| 641 | |
| 642 | // compute L = ceil(log2(d)); |
| 643 | uint32_t L = 0; |
| 644 | while (L < 32 && (uint32_t{ 1 } << L) < d) { |
| 645 | L++; |
| 646 | } |
| 647 | |
| 648 | uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1); |
| 649 | // pack divisor as well to reduce error surface |
    return make_uint3(mp, L, d);
| 651 | } |
| 652 | |
| 653 | static __device__ __forceinline__ uint32_t fastdiv(uint32_t n, const uint3 fastdiv_values) { |
| 654 | // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> |
| 655 | // fastdiv_values.z is unused and optimized away by the compiler. |
| 656 | // Compute high 32 bits of n * mp |
    const uint32_t hi = __umulhi(n, fastdiv_values.x);
| 658 | // add n, apply bit shift |
| 659 | return (hi + n) >> fastdiv_values.y; |
| 660 | } |
| 661 | |
| 662 | static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fastdiv_values) { |
| 663 | // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values) |
| 664 | return n - fastdiv(n, fastdiv_values) * fastdiv_values.z; |
| 665 | } |
| 666 | |
| 667 | // Calculate both division and modulo at once, returns <n/divisor, n%divisor> |
| 668 | static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) { |
| 669 | // expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values) |
| 670 | const uint32_t div_val = fastdiv(n, fastdiv_values); |
| 671 | const uint32_t mod_val = n - div_val * fastdiv_values.z; |
    return make_uint2(div_val, mod_val);
| 673 | } |
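
// Illustrative usage of fastdiv/fastmodulo (ncols and i are placeholders): the divisor is fixed for a
// kernel launch, so the magic numbers are computed once on the host and passed to the kernel as a uint3:
//   const uint3 ncols_fd = init_fastdiv_values(ncols);   // host side
//   ...
//   const uint32_t row = fastdiv(i, ncols_fd);      // i / ncols
//   const uint32_t col = fastmodulo(i, ncols_fd);   // i % ncols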
| 674 | |
| 675 | typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v); |
| 676 | |
| 677 | static __device__ __forceinline__ float get_alibi_slope( |
| 678 | const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1 |
| 679 | ) { |
| 680 | if (max_bias <= 0.0f) { |
| 681 | return 1.0f; |
| 682 | } |
| 683 | const float base = h < n_head_log2 ? m0 : m1; |
| 684 | const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; |
| 685 | |
    return powf(base, exph);
| 687 | } |
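
// For max_bias > 0, get_alibi_slope follows the ALiBi head-dependent slope scheme: the first n_head_log2
// heads get a slope of m0^(h+1), the remaining heads get m1^(2*(h - n_head_log2) + 1).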
| 688 | |
| 689 | template <ggml_type type> |
| 690 | struct ggml_cuda_type_traits; |
| 691 | |
| 692 | template<> |
| 693 | struct ggml_cuda_type_traits<GGML_TYPE_F16> { |
| 694 | static constexpr int qk = 1; |
| 695 | static constexpr int qr = 1; |
| 696 | }; |
| 697 | |
| 698 | template<> |
| 699 | struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> { |
| 700 | static constexpr int qk = QK4_0; |
| 701 | static constexpr int qr = QR4_0; |
| 702 | static constexpr int qi = QI4_0; |
| 703 | }; |
| 704 | |
| 705 | template<> |
| 706 | struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> { |
| 707 | static constexpr int qk = QK4_1; |
| 708 | static constexpr int qr = QR4_1; |
| 709 | static constexpr int qi = QI4_1; |
| 710 | }; |
| 711 | |
| 712 | template<> |
| 713 | struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> { |
| 714 | static constexpr int qk = QK5_0; |
| 715 | static constexpr int qr = QR5_0; |
| 716 | static constexpr int qi = QI5_0; |
| 717 | }; |
| 718 | |
| 719 | template<> |
| 720 | struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> { |
| 721 | static constexpr int qk = QK5_1; |
| 722 | static constexpr int qr = QR5_1; |
| 723 | static constexpr int qi = QI5_1; |
| 724 | }; |
| 725 | |
| 726 | template<> |
| 727 | struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> { |
| 728 | static constexpr int qk = QK8_0; |
| 729 | static constexpr int qr = QR8_0; |
| 730 | static constexpr int qi = QI8_0; |
| 731 | }; |
| 732 | |
| 733 | template<> |
| 734 | struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> { |
| 735 | static constexpr int qk = QK_MXFP4; |
| 736 | static constexpr int qr = QR_MXFP4; |
| 737 | static constexpr int qi = QI_MXFP4; |
| 738 | }; |
| 739 | |
| 740 | template<> |
| 741 | struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> { |
| 742 | static constexpr int qk = QK_K; |
| 743 | static constexpr int qr = QR2_K; |
| 744 | static constexpr int qi = QI2_K; |
| 745 | }; |
| 746 | |
| 747 | template<> |
| 748 | struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> { |
| 749 | static constexpr int qk = QK_K; |
| 750 | static constexpr int qr = QR3_K; |
| 751 | static constexpr int qi = QI3_K; |
| 752 | }; |
| 753 | |
| 754 | template<> |
| 755 | struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> { |
| 756 | static constexpr int qk = QK_K; |
| 757 | static constexpr int qr = QR4_K; |
| 758 | static constexpr int qi = QI4_K; |
| 759 | }; |
| 760 | |
| 761 | template<> |
| 762 | struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> { |
| 763 | static constexpr int qk = QK_K; |
| 764 | static constexpr int qr = QR5_K; |
| 765 | static constexpr int qi = QI5_K; |
| 766 | }; |
| 767 | |
| 768 | template<> |
| 769 | struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> { |
| 770 | static constexpr int qk = QK_K; |
| 771 | static constexpr int qr = QR6_K; |
| 772 | static constexpr int qi = QI6_K; |
| 773 | }; |
| 774 | |
| 775 | template<> |
| 776 | struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> { |
| 777 | static constexpr int qk = QK_K; |
| 778 | static constexpr int qr = QR2_XXS; |
| 779 | static constexpr int qi = QI2_XXS; |
| 780 | }; |
| 781 | |
| 782 | template<> |
| 783 | struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> { |
| 784 | static constexpr int qk = QK_K; |
| 785 | static constexpr int qr = QR2_XS; |
| 786 | static constexpr int qi = QI2_XS; |
| 787 | }; |
| 788 | |
| 789 | template<> |
| 790 | struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> { |
| 791 | static constexpr int qk = QK_K; |
| 792 | static constexpr int qr = QR2_S; |
| 793 | static constexpr int qi = QI2_S; |
| 794 | }; |
| 795 | |
| 796 | template<> |
| 797 | struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> { |
| 798 | static constexpr int qk = QK_K; |
| 799 | static constexpr int qr = QR3_XXS; |
| 800 | static constexpr int qi = QI3_XXS; |
| 801 | }; |
| 802 | |
| 803 | template<> |
| 804 | struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> { |
| 805 | static constexpr int qk = QK_K; |
| 806 | static constexpr int qr = QR1_S; |
| 807 | static constexpr int qi = QI1_S; |
| 808 | }; |
| 809 | |
| 810 | template<> |
| 811 | struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> { |
| 812 | static constexpr int qk = QK_K; |
| 813 | static constexpr int qr = QR1_M; |
| 814 | static constexpr int qi = QI1_M; |
| 815 | }; |
| 816 | |
| 817 | template<> |
| 818 | struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> { |
| 819 | static constexpr int qk = QK4_NL; |
| 820 | static constexpr int qr = QR4_NL; |
| 821 | static constexpr int qi = QI4_NL; |
| 822 | }; |
| 823 | |
| 824 | template<> |
| 825 | struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> { |
| 826 | static constexpr int qk = QK_K; |
| 827 | static constexpr int qr = QR4_XS; |
| 828 | static constexpr int qi = QI4_XS; |
| 829 | }; |
| 830 | |
| 831 | template<> |
| 832 | struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> { |
| 833 | static constexpr int qk = QK_K; |
| 834 | static constexpr int qr = QR3_S; |
| 835 | static constexpr int qi = QI3_S; |
| 836 | }; |
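
// Illustrative usage of ggml_cuda_type_traits in a templated kernel (example_kernel is a placeholder):
// qk is the number of values per quantization block; qr and qi describe how those values are packed
// (roughly, values per byte of quantized data and 32-bit integers of quantized data per block, respectively).
//   template <ggml_type type>
//   static __global__ void example_kernel(/* ... */) {
//       constexpr int qk = ggml_cuda_type_traits<type>::qk;
//       constexpr int qi = ggml_cuda_type_traits<type>::qi;
//       // ...
//   }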
| 837 | |
| 838 | ////////////////////// |
| 839 | |
| 840 | struct ggml_cuda_device_info { |
| 841 | int device_count; |
| 842 | |
| 843 | struct cuda_device_info { |
| 844 | int cc; // compute capability |
| 845 | int nsm; // number of streaming multiprocessors |
| 846 | size_t smpb; // max. shared memory per block |
| 847 | size_t smpbo; // max. shared memory per block (with opt-in) |
| 848 | bool integrated; // Device is integrated as opposed to discrete |
| 849 | bool vmm; // virtual memory support |
| 850 | size_t vmm_granularity; // granularity of virtual memory |
| 851 | size_t total_vram; |
| 852 | int warp_size; // Number of threads in a dispatch |
| 853 | }; |
| 854 | |
| 855 | cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {}; |
| 856 | |
| 857 | std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {}; |
| 858 | }; |
| 859 | |
| 860 | const ggml_cuda_device_info & ggml_cuda_info(); |
| 861 | |
| 862 | void ggml_cuda_set_device(int device); |
| 863 | int ggml_cuda_get_device(); |
| 864 | |
| 865 | struct ggml_cuda_pool { |
| 866 | virtual ~ggml_cuda_pool() = default; |
| 867 | |
| 868 | virtual void * alloc(size_t size, size_t * actual_size) = 0; |
| 869 | virtual void free(void * ptr, size_t size) = 0; |
| 870 | }; |
| 871 | |
| 872 | template<typename T> |
| 873 | struct ggml_cuda_pool_alloc { |
| 874 | ggml_cuda_pool * pool = nullptr; |
| 875 | T * ptr = nullptr; |
| 876 | size_t actual_size = 0; |
| 877 | |
| 878 | ggml_cuda_pool_alloc() = default; |
| 879 | |
| 880 | explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) { |
| 881 | } |
| 882 | |
| 883 | ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) { |
| 884 | alloc(size); |
| 885 | } |
| 886 | |
| 887 | ~ggml_cuda_pool_alloc() { |
| 888 | if (ptr != nullptr) { |
            pool->free(ptr, actual_size);
| 890 | } |
| 891 | } |
| 892 | |
| 893 | // size is in number of elements |
| 894 | T * alloc(size_t size) { |
| 895 | GGML_ASSERT(pool != nullptr); |
| 896 | GGML_ASSERT(ptr == nullptr); |
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
| 898 | return ptr; |
| 899 | } |
| 900 | |
| 901 | T * alloc(ggml_cuda_pool & pool, size_t size) { |
| 902 | this->pool = &pool; |
| 903 | return alloc(size); |
| 904 | } |
| 905 | |
| 906 | T * get() { |
| 907 | return ptr; |
| 908 | } |
| 909 | |
| 910 | ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete; |
| 911 | ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete; |
| 912 | ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete; |
| 913 | ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete; |
| 914 | }; |
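
// Illustrative usage of ggml_cuda_pool_alloc (ctx and ne are placeholders): allocate a temporary device
// buffer that is returned to the pool automatically when the object goes out of scope:
//   ggml_cuda_pool_alloc<float> tmp(ctx.pool(), ne);   // ne is a number of elements, not bytes
//   float * d_tmp = tmp.get();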
| 915 | |
| 916 | |
| 917 | // backend interface |
| 918 | |
struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};
| 923 | |
| 924 | |
| 925 | #if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS) |
| 926 | #define USE_CUDA_GRAPH |
| 927 | #endif |
| 928 | |
| 929 | struct ggml_graph_node_properties { |
| 930 | void * node_address; |
| 931 | ggml_op node_op; |
| 932 | int64_t ne[GGML_MAX_DIMS]; |
| 933 | size_t nb[GGML_MAX_DIMS]; |
| 934 | void * src_address[GGML_MAX_SRC]; |
| 935 | int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; |
| 936 | }; |
| 937 | |
| 938 | struct ggml_cuda_graph { |
| 939 | #ifdef USE_CUDA_GRAPH |
| 940 | ~ggml_cuda_graph() { |
| 941 | if (instance != nullptr) { |
| 942 | CUDA_CHECK(cudaGraphExecDestroy(instance)); |
| 943 | } |
| 944 | if (graph != nullptr) { |
| 945 | CUDA_CHECK(cudaGraphDestroy(graph)); |
| 946 | } |
| 947 | } |
| 948 | cudaGraph_t graph = nullptr; |
| 949 | cudaGraphExec_t instance = nullptr; |
| 950 | size_t num_nodes = 0; |
| 951 | std::vector<cudaGraphNode_t> nodes; |
| 952 | std::vector<cudaKernelNodeParams> params; |
| 953 | bool disable_due_to_gpu_arch = false; |
| 954 | bool disable_due_to_too_many_updates = false; |
| 955 | bool disable_due_to_failed_graph_capture = false; |
| 956 | int number_consecutive_updates = 0; |
| 957 | std::vector<ggml_graph_node_properties> ggml_graph_properties; |
| 958 | #endif |
| 959 | }; |
| 960 | |
| 961 | struct ggml_backend_cuda_context { |
| 962 | int device; |
| 963 | std::string name; |
| 964 | cudaEvent_t copy_event = nullptr; |
| 965 | |
| 966 | cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } }; |
| 967 | cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; |
| 968 | |
| 969 | std::unique_ptr<ggml_cuda_graph> cuda_graph; |
| 970 | |
| 971 | explicit ggml_backend_cuda_context(int device) : |
| 972 | device(device), |
        name(GGML_CUDA_NAME + std::to_string(device)) {
| 974 | } |
| 975 | |
| 976 | ~ggml_backend_cuda_context(); |
| 977 | |
| 978 | cudaStream_t stream(int device, int stream) { |
| 979 | if (streams[device][stream] == nullptr) { |
| 980 | ggml_cuda_set_device(device); |
| 981 | CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking)); |
| 982 | } |
| 983 | return streams[device][stream]; |
| 984 | } |
| 985 | |
| 986 | cudaStream_t stream() { |
        return stream(device, 0);
| 988 | } |
| 989 | |
| 990 | cublasHandle_t cublas_handle(int device) { |
| 991 | if (cublas_handles[device] == nullptr) { |
| 992 | ggml_cuda_set_device(device); |
| 993 | CUBLAS_CHECK(cublasCreate(&cublas_handles[device])); |
| 994 | CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH)); |
| 995 | } |
| 996 | return cublas_handles[device]; |
| 997 | } |
| 998 | |
| 999 | cublasHandle_t cublas_handle() { |
| 1000 | return cublas_handle(device); |
| 1001 | } |
| 1002 | |
| 1003 | // pool |
| 1004 | std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES]; |
| 1005 | |
| 1006 | static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device); |
| 1007 | |
| 1008 | ggml_cuda_pool & pool(int device) { |
| 1009 | if (pools[device] == nullptr) { |
| 1010 | pools[device] = new_pool_for_device(device); |
| 1011 | } |
| 1012 | return *pools[device]; |
| 1013 | } |
| 1014 | |
| 1015 | ggml_cuda_pool & pool() { |
| 1016 | return pool(device); |
| 1017 | } |
| 1018 | }; |
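
// Illustrative usage of ggml_backend_cuda_context inside the backend implementation (device index 0 assumed):
//   ggml_backend_cuda_context ctx(0);
//   cudaStream_t     s = ctx.stream();          // lazily created non-blocking stream for device 0
//   cublasHandle_t   h = ctx.cublas_handle();   // lazily created cuBLAS handle for device 0
//   ggml_cuda_pool & p = ctx.pool();            // lazily created per-device memory pool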
| 1019 | |
| 1020 | struct ggml_cuda_mm_fusion_args_host { |
| 1021 | const ggml_tensor * x_bias = nullptr; |
| 1022 | const ggml_tensor * gate = nullptr; |
| 1023 | const ggml_tensor * gate_bias = nullptr; |
| 1024 | ggml_glu_op glu_op; |
| 1025 | }; |
| 1026 | struct ggml_cuda_mm_fusion_args_device { |
| 1027 | const void * x_bias = nullptr; |
| 1028 | const void * gate = nullptr; |
| 1029 | const void * gate_bias = nullptr; |
| 1030 | ggml_glu_op glu_op; |
| 1031 | }; |
| 1032 | |