// Simplified API for asynchronous data loading.

#include "common.cuh"

// Converts a generic pointer into the 32 bit address space of shared memory,
// for use as the dst argument of the cp.async helpers below.
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
#ifdef CP_ASYNC_AVAILABLE
    return __cvta_generic_to_shared(generic_ptr);
#else
    GGML_UNUSED(generic_ptr);
    NO_DEVICE_CODE;
    return 0;
#endif // CP_ASYNC_AVAILABLE
}
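
// Illustrative sketch (hypothetical names, not part of this file's API): converting
// the address of a shared memory tile so it can be passed to the helpers below.
//
//     __shared__ half tile[512];
//     const unsigned int tile_u32 = ggml_cuda_cvta_generic_to_shared(tile);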

// Copies data from global to shared memory, cg == cache global.
// Both the src and dst pointers must be aligned to 16 bytes.
// Shared memory uses 32 bit addressing, so the dst pointer is passed as an unsigned int.
// Generic pointers can be converted to 32 bit shared memory pointers using __cvta_generic_to_shared.
// The preload template parameter is an L2 prefetch hint in bytes; 0 disables prefetching.
// Only the 16 byte copy is exposed because 4 and 8 byte copies did not yield performance improvements.
template <int preload>
static __device__ __forceinline__ void cp_async_cg_16(const unsigned int dst, const void * src) {
    static_assert(preload == 0 || preload == 64 || preload == 128 || preload == 256, "preload must be 0, 64, 128, or 256");
#ifdef CP_ASYNC_AVAILABLE
#if CUDART_VERSION >= 11040
    if (preload == 256) {
        asm volatile("cp.async.cg.shared.global.L2::256B [%0], [%1], 16;"
            : : "r" (dst), "l" (src));
    } else if (preload == 128) {
        asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], 16;"
            : : "r" (dst), "l" (src));
    } else if (preload == 64) {
        asm volatile("cp.async.cg.shared.global.L2::64B [%0], [%1], 16;"
            : : "r" (dst), "l" (src));
    } else
#endif // CUDART_VERSION >= 11040
    {
        asm volatile("cp.async.cg.shared.global [%0], [%1], 16;"
            : : "r" (dst), "l" (src));
    }
#else
    GGML_UNUSED(dst);
    GGML_UNUSED(src);
    NO_DEVICE_CODE;
#endif // CP_ASYNC_AVAILABLE
}
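
// Illustrative sketch (hypothetical names): each of the 32 threads of a warp issues
// one 16 byte copy, so one call per warp fetches 512 contiguous bytes. src is assumed
// to be a const char * into global memory, aligned to 16 bytes:
//
//     const unsigned int dst = ggml_cuda_cvta_generic_to_shared(tile);
//     cp_async_cg_16<0>(dst + 16*threadIdx.x, src + 16*threadIdx.x);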

// Makes each thread wait until its asynchronous data copies are done.
// This does NOT provide any additional synchronization.
// In particular, when data is copied by multiple warps a subsequent call to __syncthreads
// is needed before other warps may safely read what those copies wrote.
static __device__ __forceinline__ void cp_async_wait_all() {
#ifdef CP_ASYNC_AVAILABLE
    asm volatile("cp.async.wait_all;");
#else
    NO_DEVICE_CODE;
#endif // CP_ASYNC_AVAILABLE
}
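
// Illustrative sketch (continuing the example above): completing a multi-warp tile load.
// cp_async_wait_all only covers the calling thread's own copies, so a trailing
// __syncthreads is required before the tile can be read across warps:
//
//     cp_async_cg_16<0>(dst + 16*threadIdx.x, src + 16*threadIdx.x);
//     cp_async_wait_all();
//     __syncthreads(); // The whole tile is now visible to every thread in the block.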