// Simplified API for asynchronous data loading.

#include "common.cuh"

// Converts a generic pointer into the 32 bit address space of shared memory,
// for use as the dst argument of the cp.async helpers below.
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
#ifdef CP_ASYNC_AVAILABLE
    return __cvta_generic_to_shared(generic_ptr);
#else
    GGML_UNUSED(generic_ptr);
    NO_DEVICE_CODE;
    return 0;
#endif // CP_ASYNC_AVAILABLE
}
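
// Illustrative sketch (hypothetical names, not part of this file's API): converting
// the address of a shared memory tile so it can be passed to the helpers below.
//
//     __shared__ half tile[512];
//     const unsigned int tile_u32 = ggml_cuda_cvta_generic_to_shared(tile);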

// Copies data from global to shared memory, cg == cache global.
// Both the src and dst pointers must be aligned to 16 bytes.
// Shared memory uses 32 bit addressing, so the dst pointer is passed as an unsigned int.
// Generic pointers can be converted to 32 bit shared memory pointers using __cvta_generic_to_shared.
// The preload template parameter is an L2 prefetch hint in bytes; 0 disables prefetching.
// Only the 16 byte copy is exposed because 4 and 8 byte copies did not yield performance improvements.
template <int preload>
static __device__ __forceinline__ void cp_async_cg_16(const unsigned int dst, const void * src) {
    static_assert(preload == 0 || preload == 64 || preload == 128 || preload == 256, "preload must be 0, 64, 128, or 256");
#ifdef CP_ASYNC_AVAILABLE
#if CUDART_VERSION >= 11040
    if (preload == 256) {
        asm volatile("cp.async.cg.shared.global.L2::256B [%0], [%1], 16;"
            : : "r" (dst), "l" (src));
    } else if (preload == 128) {
        asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], 16;"
            : : "r" (dst), "l" (src));
    } else if (preload == 64) {
        asm volatile("cp.async.cg.shared.global.L2::64B [%0], [%1], 16;"
            : : "r" (dst), "l" (src));
    } else
#endif // CUDART_VERSION >= 11040
    {
        asm volatile("cp.async.cg.shared.global [%0], [%1], 16;"
            : : "r" (dst), "l" (src));
    }
#else
    GGML_UNUSED(dst);
    GGML_UNUSED(src);
    NO_DEVICE_CODE;
#endif // CP_ASYNC_AVAILABLE
}
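
// Illustrative sketch (hypothetical names): each of the 32 threads of a warp issues
// one 16 byte copy, so one call per warp fetches 512 contiguous bytes. src is assumed
// to be a const char * into global memory, aligned to 16 bytes:
//
//     const unsigned int dst = ggml_cuda_cvta_generic_to_shared(tile);
//     cp_async_cg_16<0>(dst + 16*threadIdx.x, src + 16*threadIdx.x);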

// Makes each thread wait until its asynchronous data copies are done.
// This does NOT provide any additional synchronization.
// In particular, when data is copied by multiple warps a subsequent call to __syncthreads
// is needed before other warps may safely read what those copies wrote.
static __device__ __forceinline__ void cp_async_wait_all() {
#ifdef CP_ASYNC_AVAILABLE
    asm volatile("cp.async.wait_all;");
#else
    NO_DEVICE_CODE;
#endif // CP_ASYNC_AVAILABLE
}
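
// Illustrative sketch (continuing the example above): completing a multi-warp tile load.
// cp_async_wait_all only covers the calling thread's own copies, so a trailing
// __syncthreads is required before the tile can be read across warps:
//
//     cp_async_cg_16<0>(dst + 16*threadIdx.x, src + 16*threadIdx.x);
//     cp_async_wait_all();
//     __syncthreads(); // The whole tile is now visible to every thread in the block.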