// concat.cu — source: llama.cpp/ggml/src/ggml-cuda/concat.cu

1	#include "concat.cuh"
2
3	// contiguous kernels
// Concatenates two densely packed f32 tensors along dimension 0 for one i3 slice.
//
// Launch layout (as used by concat_f32_cuda):
//   - blockIdx.x * blockDim.x + threadIdx.x -> index i0 inside a dst row (guarded by ne0)
//   - blockIdx.y                            -> row index i1 (gridDim.y == number of rows)
//   - blockIdx.z                            -> plane index i2
//
// x   : first source, rows of ne00 elements
// y   : second source, rows of (ne0 - ne00) elements
// dst : destination, rows of ne0 elements
//
// Offsets are accumulated in 64-bit so that slices with more than 2^31 elements
// do not wrap around (the row index times the row length can exceed 32 bits).
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
    const int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }

    // flattened (i1, i2) row number; identical for dst and both sources
    const int64_t row = (int64_t) blockIdx.y + (int64_t) blockIdx.z * gridDim.y;

    const int64_t offset_dst = nidx + row * ne0;

    if (nidx < ne00) { // src0
        const int64_t offset_src = nidx + row * ne00;
        dst[offset_dst] = x[offset_src];
    } else { // src1: columns are shifted left by ne00
        const int64_t ne10       = ne0 - ne00;
        const int64_t offset_src = (nidx - ne00) + row * ne10;
        dst[offset_dst] = y[offset_src];
    }
}
29
// Concatenates two densely packed f32 tensors along dimension 1 for one i3 slice.
//
// Launch layout (as used by concat_f32_cuda):
//   - blockIdx.x * blockDim.x + threadIdx.x -> index i0 inside a row (guarded by ne0)
//   - blockIdx.y                            -> dst row index i1 (gridDim.y == dst rows per plane)
//   - blockIdx.z                            -> plane index i2
//
// x   : first source, ne01 rows per plane
// y   : second source, (gridDim.y - ne01) rows per plane
// dst : destination, gridDim.y rows per plane
// All three share the row length ne0.
//
// Offsets are accumulated in 64-bit so that large slices do not wrap around.
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
    const int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }

    const int64_t i1  = blockIdx.y;
    const int64_t i2  = blockIdx.z;
    const int64_t ne1 = gridDim.y;

    const int64_t offset_dst = nidx + (i1 + i2 * ne1) * ne0;

    if (i1 < ne01) { // src0
        const int64_t offset_src = nidx + (i1 + i2 * ne01) * ne0;
        dst[offset_dst] = x[offset_src];
    } else { // src1: rows are shifted up by ne01
        const int64_t ne11       = ne1 - ne01;
        const int64_t offset_src = nidx + ((i1 - ne01) + i2 * ne11) * ne0;
        dst[offset_dst] = y[offset_src];
    }
}
55
// Concatenates two densely packed f32 tensors along dimension 2 for one i3 slice.
//
// Launch layout (as used by concat_f32_cuda):
//   - blockIdx.x * blockDim.x + threadIdx.x -> index i0 inside a row (guarded by ne0)
//   - blockIdx.y                            -> row index i1 (gridDim.y == rows per plane)
//   - blockIdx.z                            -> dst plane index i2
//
// x   : first source, ne02 planes
// y   : second source, supplies the planes with index >= ne02
// dst : destination
// All three share the row length ne0 and gridDim.y rows per plane.
//
// Offsets are accumulated in 64-bit so that large slices do not wrap around.
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
    const int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }

    const int64_t i1  = blockIdx.y;
    const int64_t i2  = blockIdx.z;
    const int64_t ne1 = gridDim.y;

    const int64_t offset_dst = nidx + (i1 + i2 * ne1) * ne0;

    if (i2 < ne02) { // src0: same layout as dst for the first ne02 planes
        dst[offset_dst] = x[offset_dst];
    } else { // src1: planes are shifted down by ne02
        const int64_t offset_src = nidx + (i1 + (i2 - ne02) * ne1) * ne0;
        dst[offset_dst] = y[offset_src];
    }
}
81
// Host-side launcher for the contiguous concat kernels (one 3D slice per call).
//
// x, y, dst        : device pointers to the two sources and the destination
// ne00, ne01, ne02 : extents of the first source in dimensions 0..2
// ne0, ne1, ne2    : extents of the destination in dimensions 0..2
// dim              : concat dimension; 0 and 1 select the matching kernel,
//                    any other value launches the dim-2 kernel
// stream           : stream the kernel is enqueued on
//
// The grid is (ceil(ne0 / CUDA_CONCAT_BLOCK_SIZE), ne1, ne2): x covers one dst row,
// y and z enumerate rows and planes.
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
    const int  num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
    const dim3 grid(num_blocks, ne1, ne2);

    switch (dim) {
        case 0:
            concat_f32_dim0<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
            break;
        case 1:
            concat_f32_dim1<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
            break;
        default:
            concat_f32_dim2<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
            break;
    }
}
95
96	// non-contiguous kernel (slow)
// Strided concat kernel: copies one f32 element at a time using explicit byte strides,
// so neither source nor the destination has to be densely packed.
//
// Template parameter:
//   dim - concat dimension (0..3), selects how the src1 index is shifted.
//
// Launch layout:
//   - blockIdx.x -> i1, blockIdx.y -> i2, blockIdx.z -> i3
//   - the threads of a block walk i0 in steps of blockDim.x
//
// Parameters:
//   src0, src1, dst : base pointers, addressed in bytes
//   ne00..ne03      : extents of src0
//   nb00..nb03      : byte strides of src0
//   (unnamed)       : extents of src1, unused
//   nb10..nb13      : byte strides of src1
//   ne0             : dst extent in dimension 0 (remaining dst extents are unused)
//   nb0..nb3        : byte strides of dst
template <int dim>
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
    concat_f32_non_cont(
        const char * src0,
        const char * src1,
              char * dst,
           int64_t   ne00,
           int64_t   ne01,
           int64_t   ne02,
           int64_t   ne03,
          uint64_t   nb00,
          uint64_t   nb01,
          uint64_t   nb02,
          uint64_t   nb03,
           int64_t /*ne10*/,
           int64_t /*ne11*/,
           int64_t /*ne12*/,
           int64_t /*ne13*/,
          uint64_t   nb10,
          uint64_t   nb11,
          uint64_t   nb12,
          uint64_t   nb13,
           int64_t   ne0,
           int64_t /*ne1*/,
           int64_t /*ne2*/,
           int64_t /*ne3*/,
          uint64_t   nb0,
          uint64_t   nb1,
          uint64_t   nb2,
          uint64_t   nb3){
    static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");

    const int64_t i3 = blockIdx.z;
    const int64_t i2 = blockIdx.y;
    const int64_t i1 = blockIdx.x;

    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
        const float * x;

        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
            // element lies inside src0's extents
            x = (const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
        } else {
            // element comes from src1: shift the index along the concat dimension
            if constexpr (dim == 0) {
                x = (const float *)(src1 + i3*nb13 + i2*nb12 + i1*nb11 + (i0 - ne00)*nb10);
            } else if constexpr (dim == 1) {
                x = (const float *)(src1 + i3*nb13 + i2*nb12 + (i1 - ne01)*nb11 + i0*nb10);
            } else if constexpr (dim == 2) {
                x = (const float *)(src1 + i3*nb13 + (i2 - ne02)*nb12 + i1*nb11 + i0*nb10);
            } else if constexpr (dim == 3) {
                x = (const float *)(src1 + (i3 - ne03)*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
            }
        }

        float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

        *y = *x;
    }
}
155
156
// CUDA implementation of the concat op: dst = concat(dst->src[0], dst->src[1]) along
// the dimension stored in dst->op_params[0].
//
// ctx : backend context; supplies the stream all work is enqueued on
// dst : destination tensor; its two sources and the concat dimension are read from it
//
// All three tensors must be F32 and the concat dimension must be in [0, 3].
// Dispatch:
//   - both sources contiguous, dim != 3 : one contiguous-kernel launch per i3 slice
//   - both sources contiguous, dim == 3 : two device-to-device async copies
//   - otherwise                         : strided kernel specialised on dim
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    cudaStream_t stream = ctx.stream();

    const int32_t dim = ((int32_t *) dst->op_params)[0];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
    // Reject invalid dims up front: the contiguous path below would otherwise
    // silently treat any unknown value as a dim-2 concat.
    GGML_ASSERT(dim >= 0 && dim <= 3);

    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        // NOTE(review): this path writes dst as densely packed within each i3 slice
        // (and entirely packed for dim == 3); dst contiguity is not checked here.
        const float * src0_d = (const float *) src0->data;
        const float * src1_d = (const float *) src1->data;

        float * dst_d = (float *) dst->data;

        if (dim != 3) {
            // nb[3] is a byte stride; convert to element counts for float pointer arithmetic
            const size_t src0_step = src0->nb[3] / sizeof(float);
            const size_t src1_step = src1->nb[3] / sizeof(float);
            const size_t dst_step  =  dst->nb[3] / sizeof(float);

            for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) {
                concat_f32_cuda(
                    src0_d + i3 * src0_step,
                    src1_d + i3 * src1_step,
                    dst_d  + i3 * dst_step,
                    src0->ne[0], src0->ne[1], src0->ne[2],
                    dst->ne[0],  dst->ne[1],  dst->ne[2], dim, stream);
            }
        } else {
            // concat along the outermost dimension: src1 simply follows src0 in memory
            const size_t size0 = ggml_nbytes(src0);
            const size_t size1 = ggml_nbytes(src1);

            CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/sizeof(float), src1_d, size1, cudaMemcpyDeviceToDevice, stream));
        }
    } else {
        // one block per (i1, i2, i3); threads of the block cover i0
        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
        auto launch_kernel = [&](auto dim_tag) {
            // take the dimension from the tag's type so it is a compile-time constant
            constexpr int kernel_dim = decltype(dim_tag)::value;
            concat_f32_non_cont<kernel_dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
                (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
                src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
                dst->ne[0],  dst->ne[1],  dst->ne[2],  dst->ne[3],
                dst->nb[0],  dst->nb[1],  dst->nb[2],  dst->nb[3]);
        };
        switch (dim) {
            case 0:
                launch_kernel(std::integral_constant<int, 0>{});
                break;
            case 1:
                launch_kernel(std::integral_constant<int, 1>{});
                break;
            case 2:
                launch_kernel(std::integral_constant<int, 2>{});
                break;
            case 3:
                launch_kernel(std::integral_constant<int, 3>{});
                break;
            default:
                GGML_ABORT("Invalid dim: %d", dim);
                break;
        }
    }
}
222

// Browse the source code of llama.cpp/ggml/src/ggml-cuda/concat.cu