| 1 | #include "cpy.cuh" |
| 2 | #include "dequantize.cuh" |
| 3 | #include "cpy-utils.cuh" |
| 4 | #if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY) |
| 5 | #include "ggml-musa/mudnn.cuh" |
| 6 | #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY |
| 7 | |
| 8 | typedef void (*cpy_kernel_t)(const char * cx, char * cdst); |
| 9 | |
| 10 | const int CUDA_CPY_TILE_DIM_2D = 32; // 2D tile dimension for transposed blocks |
| 11 | const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available |
| 12 | const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows |
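// The transpose kernel below launches 32x8 thread blocks: each block stages a 32x32 tile
// in shared memory, so every thread handles CUDA_CPY_TILE_DIM_2D/CUDA_CPY_BLOCK_ROWS = 4
// rows of the tile, and each block in z walks over up to CUDA_CPY_BLOCK_NM matrices.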

template <cpy_kernel_t cpy_1>
static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
        const int nb12, const int nb13) {
    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
    }

    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
    // then combine those indices with the corresponding byte offsets to get the total offsets
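    // example: with ne00 = 4, ne01 = 3, ne02 = 2 and i = 17 the decomposition below
    // yields i03 = 0, i02 = 1, i01 = 1, i00 = 1, since 17 = 0*24 + 1*12 + 1*4 + 1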
    const int64_t i03 = i/(ne00 * ne01 * ne02);
    const int64_t i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
    const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;

    const int64_t i13 = i/(ne10 * ne11 * ne12);
    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

    cpy_1(cx + x_offset, cdst + dst_offset);
}

template <typename T>
static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int ne,
        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
        const int nb12, const int nb13) {

    const T * src = reinterpret_cast<const T *>(cx);
    T       * dst = reinterpret_cast<T *>(cdst);

    const int64_t nmat = ne / (ne00 * ne01);
    const int64_t n    = ne00 * ne01;

    const int x  = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
    const int y  = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
    const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
    const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;

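    // The tile is padded by one float per row so that the column-wise accesses of the
    // transposed write phase fall into different shared-memory banks. It is declared in
    // floats and re-indexed as T with a sizeof(float)/sizeof(T) element stride, which
    // keeps a 4-byte slot per element for 16-bit types as well.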
    __shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D + 1];

    #pragma unroll
    for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {

        const unsigned int imat = blockIdx.z * CUDA_CPY_BLOCK_NM + i;
        if (imat >= nmat) {
            break;
        }

        #pragma unroll
        for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
            if (x < ne01 && y + j < ne00) {
                const int row = threadIdx.y + j;
                const int col = threadIdx.x * sizeof(float)/sizeof(T);
                T * tile2 = reinterpret_cast<T *>(tile[row]);
                tile2[col] = src[imat*n + (y + j)*ne01 + x];
            }
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
            if (ty + j < ne01 && tx < ne00) {
                const int col = (threadIdx.y + j)*sizeof(float)/sizeof(T);
                const T * tile2 = reinterpret_cast<const T *>(tile[threadIdx.x]);
                dst[imat*n + (ty + j)*ne00 + tx] = tile2[col];
            }
        }

        // make sure all reads of the tile have completed before the next iteration overwrites it
        __syncthreads();
    }
}

static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
    float * cdstf = (float *)(cdsti);

    #pragma unroll
    for (int j = 0; j < QK8_0; j += 2) {
        float2 dq;
        dequantize_q8_0(cxi, 0, j, dq);
        *(cdstf + j)     = dq.x;
        *(cdstf + j + 1) = dq.y;
    }
}

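// In contrast to cpy_blck_q8_0_f32 above, which writes adjacent output pairs (j, j+1),
// the generic dequantizers used below return value pairs belonging to positions j and
// j + qk/2 of the block, which is why dq.y is stored qk/2 elements apart from dq.x.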
template<dequantize_kernel_t dequant, int qk>
static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
    float * cdstf = (float *)(cdsti);

    #pragma unroll
    for (int j = 0; j < qk/2; j++) {
        float2 dq;
        dequant(cxi, 0, j, dq);
        *(cdstf + j)        = dq.x;
        *(cdstf + j + qk/2) = dq.y;
    }
}

template <cpy_kernel_t cpy_blck, int qk>
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
        const int nb12, const int nb13) {
    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

    const int i03 = i/(ne00 * ne01 * ne02);
    const int i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;

    const int i13 = i/(ne10 * ne11 * ne12);
    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

    cpy_blck(cx + x_offset, cdst + dst_offset);
}
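// Both cpy_f32_q above and cpy_q_f32 below process one quantized block of qk values per
// thread: the host wrappers further down launch them over ne/qk (respectively ne) blocks,
// the flattened index i is scaled by qk, and the quantized side is addressed in whole
// blocks via i10/qk or i00/qk.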

template <cpy_kernel_t cpy_blck, int qk>
static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
        const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
        const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
        const int nb12, const int nb13) {
    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

    const int i03 = i/(ne00 * ne01 * ne02);
    const int i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
    const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03*nb03;

    const int i13 = i/(ne10 * ne11 * ne12);
    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

    cpy_blck(cx + x_offset, cdst + dst_offset);
}

template<typename src_t, typename dst_t>
static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const int64_t ne) {
    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
    }

    const src_t * x = (const src_t *) cx;
    dst_t * dst = (dst_t *) cdst;

    dst[i] = ggml_cuda_cast<dst_t>(x[i]);
}

template<typename src_t, typename dst_t>
static void ggml_cpy_flt_contiguous_cuda(
    const char * cx, char * cdst, const int64_t ne,
    cudaStream_t stream) {

    const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_flt_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne);
}

template<typename src_t, typename dst_t, bool transposed = false>
static void ggml_cpy_flt_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    if (transposed) {
        GGML_ASSERT(ne == ne00*ne01*ne02); // assumes ne[3] == 1
        int ne00n, ne01n, ne02n;
        if (nb00 <= nb02) { // also covers the nb00 == nb02 case, which is most likely safe to handle here
            ne00n = ne00;
            ne01n = ne01;
            ne02n = ne02;
        } else { // nb00 > nb02: fold dims 1 and 2 into a single larger matrix
            ne00n = ne00;
            ne01n = ne01*ne02;
            ne02n = 1;
        }

        dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
                      (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
                      (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
        dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
        cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
            (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
    } else {
        const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
        cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
            (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
    }
}
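// Illustrative example (shapes chosen arbitrarily): a transposed F32 view with
// ne00 = 100, ne01 = 64, ne02 = 3 and nb00 <= nb02 gives ne = 19200, so the launch above
// uses dimBlock = (32, 8, 1) and dimGrid = ((64+31)/32, (100+31)/32, (3+7)/8) = (2, 4, 1).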

static void ggml_cpy_f32_q8_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK8_0 == 0);
    const int num_blocks = ne / QK8_0;
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q8_0_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_0 == 0);
    const int num_blocks = ne / QK4_0;
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q4_0_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02,
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_1 == 0);
    const int num_blocks = ne / QK4_1;
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q4_1_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02,
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_0 == 0);
    const int num_blocks = ne / QK5_0;
    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q5_0_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02,
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_1 == 0);
    const int num_blocks = ne / QK5_1;
    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q5_1_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02,
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_iq4_nl_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_NL == 0);
    const int num_blocks = ne / QK4_NL;
    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));

    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    //GGML_ASSERT(src0->ne[3] == 1);

    const int64_t nb00 = src0->nb[0];
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];
    const int64_t nb03 = src0->nb[3];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];

    //GGML_ASSERT(src1->ne[3] == 1);

    const int64_t nb10 = src1->nb[0];
    const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2];
    const int64_t nb13 = src1->nb[3];

    cudaStream_t main_stream = ctx.stream();

    char * src0_ddc = (char *) src0->data;
    char * src1_ddc = (char *) src1->data;

    const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
    // the transpose kernel is used when elements along dim 1 of src0 are contiguous
    // (i.e. dims 0 and 1 are swapped in memory) and there is no 4th dimension
    const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1;

    if (src0->type == src1->type && contiguous_srcs) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
        if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
            CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
        } else
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
        {
            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
        }
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        if (can_be_transposed) {
            ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        } else {
            ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
        if (contiguous_srcs) {
            ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
        } else {
            ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
        if (contiguous_srcs) {
            ggml_cpy_flt_contiguous_cuda<float, half> (src0_ddc, src1_ddc, ne, main_stream);
        } else {
            ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
        if (can_be_transposed) {
            ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        } else {
            ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
        if (contiguous_srcs) {
            ggml_cpy_flt_contiguous_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
        } else {
            ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        if (contiguous_srcs) {
            ggml_cpy_flt_contiguous_cuda<half, float> (src0_ddc, src1_ddc, ne, main_stream);
        } else {
            ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
        if (can_be_transposed) {
            ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        } else {
            ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
        if (contiguous_srcs) {
            ggml_cpy_flt_contiguous_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream);
        } else {
            ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
        if (contiguous_srcs) {
            ggml_cpy_flt_contiguous_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, main_stream);
        } else {
            ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
        if (contiguous_srcs) {
            ggml_cpy_flt_contiguous_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, main_stream);
        } else {
            ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
        if (contiguous_srcs) {
            ggml_cpy_flt_contiguous_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, main_stream);
        } else {
            ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
        }
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                   ggml_type_name(src0->type), ggml_type_name(src1->type));
    }
}

void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    ggml_cuda_cpy(ctx, src0, dst);
}