#include "mmvq.cuh"
#include "quantize.cuh"
#include "unary.cuh"
#include "vecdotq.cuh"

#include <cstdint>

typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);

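// Maps a ggml quantization type to the device function that computes its dot product with a block_q8_1.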
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:    return vec_dot_q4_0_q8_1;
        case GGML_TYPE_Q4_1:    return vec_dot_q4_1_q8_1;
        case GGML_TYPE_Q5_0:    return vec_dot_q5_0_q8_1;
        case GGML_TYPE_Q5_1:    return vec_dot_q5_1_q8_1;
        case GGML_TYPE_Q8_0:    return vec_dot_q8_0_q8_1;
        case GGML_TYPE_MXFP4:   return vec_dot_mxfp4_q8_1;
        case GGML_TYPE_Q2_K:    return vec_dot_q2_K_q8_1;
        case GGML_TYPE_Q3_K:    return vec_dot_q3_K_q8_1;
        case GGML_TYPE_Q4_K:    return vec_dot_q4_K_q8_1;
        case GGML_TYPE_Q5_K:    return vec_dot_q5_K_q8_1;
        case GGML_TYPE_Q6_K:    return vec_dot_q6_K_q8_1;
        case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1;
        case GGML_TYPE_IQ2_XS:  return vec_dot_iq2_xs_q8_1;
        case GGML_TYPE_IQ2_S:   return vec_dot_iq2_s_q8_1;
        case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1;
        case GGML_TYPE_IQ1_S:   return vec_dot_iq1_s_q8_1;
        case GGML_TYPE_IQ1_M:   return vec_dot_iq1_m_q8_1;
        case GGML_TYPE_IQ4_NL:  return vec_dot_iq4_nl_q8_1;
        case GGML_TYPE_IQ4_XS:  return vec_dot_iq4_xs_q8_1;
        case GGML_TYPE_IQ3_S:   return vec_dot_iq3_s_q8_1;
        default:                return nullptr;
    }
}

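// VDR = vec dot ratio, i.e. how many quantized values are processed per vec_dot call (see vecdotq.cuh).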
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:    return VDR_Q4_0_Q8_1_MMVQ;
        case GGML_TYPE_Q4_1:    return VDR_Q4_1_Q8_1_MMVQ;
        case GGML_TYPE_Q5_0:    return VDR_Q5_0_Q8_1_MMVQ;
        case GGML_TYPE_Q5_1:    return VDR_Q5_1_Q8_1_MMVQ;
        case GGML_TYPE_Q8_0:    return VDR_Q8_0_Q8_1_MMVQ;
        case GGML_TYPE_MXFP4:   return VDR_MXFP4_Q8_1_MMVQ;
        case GGML_TYPE_Q2_K:    return VDR_Q2_K_Q8_1_MMVQ;
        case GGML_TYPE_Q3_K:    return VDR_Q3_K_Q8_1_MMVQ;
        case GGML_TYPE_Q4_K:    return VDR_Q4_K_Q8_1_MMVQ;
        case GGML_TYPE_Q5_K:    return VDR_Q5_K_Q8_1_MMVQ;
        case GGML_TYPE_Q6_K:    return VDR_Q6_K_Q8_1_MMVQ;
        case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ;
        case GGML_TYPE_IQ2_XS:  return VDR_IQ2_XS_Q8_1_MMVQ;
        case GGML_TYPE_IQ2_S:   return VDR_IQ2_S_Q8_1_MMVQ;
        case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
        case GGML_TYPE_IQ3_S:   return VDR_IQ3_S_Q8_1_MMVQ;
        case GGML_TYPE_IQ4_NL:  return VDR_IQ4_NL_Q8_1_MMVQ;
        case GGML_TYPE_IQ4_XS:  return VDR_IQ4_XS_Q8_1_MMVQ;
        default:                return 1;
    }
}

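// Kernel launch parameter tables, selected by GPU architecture (generic, AMD GCN/CDNA, AMD RDNA2+).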
enum mmvq_parameter_table_id {
    MMVQ_PARAMETERS_GENERIC = 0,
    MMVQ_PARAMETERS_GCN,
    MMVQ_PARAMETERS_RDNA2
};

static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
#if defined(RDNA2) || defined(RDNA3) || defined(RDNA4)
    return MMVQ_PARAMETERS_RDNA2;
#elif defined(GCN) || defined(CDNA)
    return MMVQ_PARAMETERS_GCN;
#else
    return MMVQ_PARAMETERS_GENERIC;
#endif
}

static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
    if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
        return MMVQ_PARAMETERS_RDNA2;
    }
    if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
        return MMVQ_PARAMETERS_GCN;
    }
    return MMVQ_PARAMETERS_GENERIC;
}

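// Number of warps per CUDA block as a function of the number of destination columns.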
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
    if (table_id == MMVQ_PARAMETERS_GENERIC) {
        switch (ncols_dst) {
            case 1:
            case 2:
            case 3:
            case 4:
                return 4;
            case 5:
            case 6:
            case 7:
            case 8:
                return 2;
            default:
                return 1;
        }
    } else if (table_id == MMVQ_PARAMETERS_GCN) {
        switch (ncols_dst) {
            case 1:
            case 2:
            case 3:
            case 4:
                return 2;
            case 5:
            case 6:
            case 7:
            case 8:
            default:
                return 1;
        }
    }
    return 1;
}

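// Number of destination rows computed per CUDA block as a function of the number of destination columns.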
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id) {
    if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
        switch (ncols_dst) {
            case 1:
                return 1;
            case 2:
            case 3:
            case 4:
            case 5:
            case 6:
            case 7:
            case 8:
                return 2;
            default:
                return 1;
        }
    }
    return 1;
}

// tell the compiler to use as many registers as it wants, see nwarps definition below
template <ggml_type type, int ncols_dst, bool has_fusion>
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q(
    const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
    const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
    const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
    const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
    const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst) {

    constexpr int qk = ggml_cuda_type_traits<type>::qk;
    constexpr int qi = ggml_cuda_type_traits<type>::qi;
    constexpr int vdr = get_vdr_mmvq(type);
    constexpr mmvq_parameter_table_id table_id = get_device_table_id();
    constexpr int nwarps = calc_nwarps(ncols_dst, table_id);
    constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id);
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

    constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);

    const int tid = warp_size*threadIdx.y + threadIdx.x;
    const int row0 = rows_per_cuda_block*blockIdx.x;
    const int blocks_per_row_x = ncols_x / qk;
    constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;

    // The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1.
    const uint32_t channel_dst = blockIdx.y;
    const uint32_t channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
    const uint32_t channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
    const uint32_t sample_dst = blockIdx.z;
    const uint32_t sample_x = fastdiv(sample_dst, sample_ratio);
    const uint32_t sample_y = sample_dst;
    bool use_gate = false;
    bool use_bias = false;
    bool use_gate_bias = false;
    const void * vgate = nullptr;
    const float * x_bias = nullptr;
    const float * gate_bias = nullptr;
    ggml_glu_op active_glu;

    if constexpr (has_fusion) {
        use_gate = fusion.gate != nullptr;
        use_bias = fusion.x_bias != nullptr;
        use_gate_bias = fusion.gate_bias != nullptr && use_gate;
        vgate = fusion.gate;
        x_bias = (const float *) fusion.x_bias;
        gate_bias = (const float *) fusion.gate_bias;
        active_glu = fusion.glu_op;
    }

    const uint32_t channel_bias = ids ? channel_x : channel_dst;

    float x_biases[ncols_dst] = { 0.0f };
    float gate_biases[ncols_dst] = { 0.0f };
    if constexpr (has_fusion) {
        if (use_bias) {
            x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
            // 1. Hide latency by prefetching bias and gate here
            // 2. load only on threads that won't die after partial sum calculation
            if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
                (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
#pragma unroll
                for (int j = 0; j < ncols_dst; ++j) {
                    x_biases[j] = x_bias[j * stride_col_dst + threadIdx.x];
                }
            }
        }
        if (use_gate_bias) {
            gate_bias = gate_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
            if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
                (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
#pragma unroll
                for (int j = 0; j < ncols_dst; ++j) {
                    gate_biases[j] = gate_bias[j * stride_col_dst + threadIdx.x];
                }
            }
        }
    }

    // partial sum for each thread
    float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}};
    float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}};

    const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
    const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;

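    // Iterate over the quantized blocks of x; each thread advances by blocks_per_iter blocks per loop iteration.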
    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx

        // x block quant index when casting the quants to int
        const int kqs = vdr * (tid % (qi/vdr));

#pragma unroll
        for (int j = 0; j < ncols_dst; ++j) {
#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp[j][i] += vec_dot_q_cuda(
                    vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
                if constexpr (has_fusion) {
                    if (use_gate) {
                        tmp_gate[j][i] += vec_dot_q_cuda(
                            vgate, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
                    }
                }
            }
        }
    }

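    // Shared memory for combining the partial sums of warps 1..nwarps-1 with those of warp 0.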
    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
    __shared__ float tmp_shared_gate[(has_fusion && (nwarps-1 > 0)) ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
    if constexpr (!has_fusion) {
        (void) tmp_shared_gate;
    } else if (!use_gate) {
        (void) tmp_shared_gate;
    }

    if (threadIdx.y > 0) {
#pragma unroll
        for (int j = 0; j < ncols_dst; ++j) {
#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
                if constexpr (has_fusion) {
                    if (use_gate) {
                        tmp_shared_gate[threadIdx.y-1][j][i][threadIdx.x] = tmp_gate[j][i];
                    }
                }
            }
        }
    }
    __syncthreads();
    if (threadIdx.y > 0) {
        return;
    }

    dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0;

    // sum up partial sums and write back result
#pragma unroll
    for (int j = 0; j < ncols_dst; ++j) {
#pragma unroll
        for (int i = 0; i < rows_per_cuda_block; ++i) {
#pragma unroll
            for (int l = 0; l < nwarps-1; ++l) {
                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
                if constexpr (has_fusion) {
                    if (use_gate) {
                        tmp_gate[j][i] += tmp_shared_gate[l][j][i][threadIdx.x];
                    }
                }
            }
            tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
            if constexpr (has_fusion) {
                if (use_gate) {
                    tmp_gate[j][i] = warp_reduce_sum<warp_size>(tmp_gate[j][i]);
                }
            }
        }

        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
            float result = tmp[j][threadIdx.x];
            if constexpr (has_fusion) {
                if (use_bias) {
                    result += x_biases[j];
                }
                if (use_gate) {
                    float gate_value = tmp_gate[j][threadIdx.x];
                    if (use_gate_bias) {
                        gate_value += gate_biases[j];
                    }
                    switch (active_glu) {
                        case GGML_GLU_OP_SWIGLU:
                            result *= ggml_cuda_op_silu_single(gate_value);
                            break;
                        case GGML_GLU_OP_GEGLU:
                            result *= ggml_cuda_op_gelu_single(gate_value);
                            break;
                        case GGML_GLU_OP_SWIGLU_OAI: {
                            result = ggml_cuda_op_swiglu_oai_single(gate_value, result);
                            break;
                        }
                        default:
                            result = result * gate_value;
                            break;
                    }
                }
            }
            dst[j*stride_col_dst + threadIdx.x] = result;
        }
    }

    if constexpr (!has_fusion) {
        GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, active_glu, gate_bias, x_bias, tmp_gate);
    }
}

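// Grid and block dimensions for a kernel launch with the given problem shape.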
static std::pair<dim3, dim3> calc_launch_params(
    const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y,
    const int warp_size, const mmvq_parameter_table_id table_id) {
    const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
    const dim3 block_nums(nblocks, nchannels_y, nsamples_y);
    const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1);
    return {block_nums, block_dims};
}

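// Launches either the fused or the non-fused kernel instantiation; fusion is only supported for ncols_dst == 1.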
template<ggml_type type, int c_ncols_dst>
static void mul_mat_vec_q_switch_fusion(
    const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
    const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
    const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
    const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
    const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst,
    const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared, cudaStream_t stream) {

    const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
    if constexpr (c_ncols_dst == 1) {
        if (has_fusion) {
            mul_mat_vec_q<type, c_ncols_dst, true><<<block_nums, block_dims, nbytes_shared, stream>>>
                (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
            return;
        }
    }

    GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");

    mul_mat_vec_q<type, c_ncols_dst, false><<<block_nums, block_dims, nbytes_shared, stream>>>
        (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
         channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
         sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
}

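// Dispatches on the number of destination columns, which is a compile-time parameter of the kernel.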
template <ggml_type type>
static void mul_mat_vec_q_switch_ncols_dst(
    const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
    const int ncols_x, const int nrows_x, const int ncols_dst,
    const int stride_row_x, const int stride_col_y, const int stride_col_dst,
    const int nchannels_x, const int nchannels_y, const int nchannels_dst,
    const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
    const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
    cudaStream_t stream) {

    GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
    GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE);

    const uint3 nchannels_y_fd = ids ? init_fastdiv_values(nchannels_y) : make_uint3(0, 0, 0);
    const uint3 channel_ratio_fd = ids ? make_uint3(0, 0, 0) : init_fastdiv_values(nchannels_dst / nchannels_x);
    const uint3 sample_ratio_fd = init_fastdiv_values(nsamples_dst / nsamples_x);

    const int device = ggml_cuda_get_device();
    const int warp_size = ggml_cuda_info().devices[device].warp_size;
    const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);

    const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;

    GGML_ASSERT(!ids || ncols_dst == 1);
    switch (ncols_dst) {
        case 1: {
            constexpr int c_ncols_dst = 1;
            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
            mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
                channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
                dims.first, dims.second, 0, stream);
        } break;
        case 2: {
            constexpr int c_ncols_dst = 2;
            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
            mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
                channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
                dims.first, dims.second, 0, stream);
        } break;
        case 3: {
            constexpr int c_ncols_dst = 3;
            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
            mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
                channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
                dims.first, dims.second, 0, stream);
        } break;
        case 4: {
            constexpr int c_ncols_dst = 4;
            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
            mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
                channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
                dims.first, dims.second, 0, stream);
        } break;
        case 5: {
            constexpr int c_ncols_dst = 5;
            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
            mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
                channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
                dims.first, dims.second, 0, stream);
        } break;
        case 6: {
            constexpr int c_ncols_dst = 6;
            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
            mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
                channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
                dims.first, dims.second, 0, stream);
        } break;
        case 7: {
            constexpr int c_ncols_dst = 7;
            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
            mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
                channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
                dims.first, dims.second, 0, stream);
        } break;
        case 8: {
            constexpr int c_ncols_dst = 8;
            std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
            mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
                channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
                dims.first, dims.second, 0, stream);
        } break;
        default:
            GGML_ABORT("fatal error");
            break;
    }

    GGML_UNUSED(has_fusion);
}
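
// Dispatches on the quantization type of the x matrix.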
static void mul_mat_vec_q_switch_type(
    const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
    const int ncols_x, const int nrows_x, const int ncols_dst,
    const int stride_row_x, const int stride_col_y, const int stride_col_dst,
    const int nchannels_x, const int nchannels_y, const int nchannels_dst,
    const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
    const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
    cudaStream_t stream) {
    switch (type_x) {
        case GGML_TYPE_Q4_0:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_Q4_1:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_1>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_Q5_0:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_0>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_Q5_1:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_1>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_Q8_0:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q8_0>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_MXFP4:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_MXFP4>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_Q2_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_Q3_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q3_K>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_Q4_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_K>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_K>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q6_K>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XXS>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XS>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_S>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_XXS>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_S>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_IQ1_M:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_M>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_NL>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_S>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}

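// Top-level entry point: validates the tensors, quantizes src1 to q8_1, and launches the mat-vec kernel.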
void ggml_cuda_mul_mat_vec_q(
    ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
    const ggml_cuda_mm_fusion_args_host * fusion) {
    GGML_ASSERT( src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.

    GGML_TENSOR_BINARY_OP_LOCALS;

    cudaStream_t stream = ctx.stream();

    const size_t ts_src0 = ggml_type_size(src0->type);
    const size_t ts_src1 = ggml_type_size(src1->type);
    const size_t ts_dst = ggml_type_size(dst->type);

    GGML_ASSERT( nb00 == ts_src0);
    GGML_ASSERT( nb10 == ts_src1);
    GGML_ASSERT( nb0 == ts_dst);
    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));

    GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.

    const float * src1_d = (const float *) src1->data;
    const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
    float * dst_d = (float *) dst->data;

    ggml_cuda_mm_fusion_args_device fusion_local{};

    if (fusion) {
        GGML_ASSERT( !ids || dst->ne[2] == 1);
        GGML_ASSERT( ids || dst->ne[1] == 1);

        if (fusion->x_bias) {
            GGML_ASSERT(fusion->x_bias->type == GGML_TYPE_F32);
            GGML_ASSERT(fusion->x_bias->ne[0] == dst->ne[0]);
            GGML_ASSERT(!ids || fusion->x_bias->ne[1] == src0->ne[2]);
            fusion_local.x_bias = fusion->x_bias->data;
        }
        if (fusion->gate) {
            GGML_ASSERT(fusion->gate->type == src0->type && ggml_are_same_stride(fusion->gate, src0));
            fusion_local.gate = fusion->gate->data;
        }
        if (fusion->gate_bias) {
            GGML_ASSERT(fusion->gate_bias->type == GGML_TYPE_F32);
            GGML_ASSERT(fusion->gate_bias->ne[0] == dst->ne[0]);
            GGML_ASSERT(!ids || fusion->gate_bias->ne[1] == src0->ne[2]);
            fusion_local.gate_bias = fusion->gate_bias->data;
        }
        fusion_local.glu_op = fusion->glu_op;
    }

    // If src0 is a temporary compute buffer, clear any potential padding.
    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
        const size_t size_data = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
            GGML_ASSERT(!src0->view_src);
            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
        }
    }

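    // Quantize src1 (F32) to q8_1, padding each row to a multiple of MATRIX_ROW_PADDING.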
    const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
    ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
    {
        const int64_t s11 = src1->nb[1] / ts_src1;
        const int64_t s12 = src1->nb[2] / ts_src1;
        const int64_t s13 = src1->nb[3] / ts_src1;
        quantize_row_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
    }

    const int64_t s01 = src0->nb[1] / ts_src0;
    const int64_t s11 = ne10_padded / QK8_1;
    const int64_t s1 = dst->nb[1] / ts_dst;
    const int64_t s02 = src0->nb[2] / ts_src0;
    const int64_t s2 = dst->nb[2] / ts_dst;
    const int64_t s03 = src0->nb[3] / ts_src0;
    const int64_t s3 = dst->nb[3] / ts_dst;

    const int64_t s12 = ne11*s11;
    const int64_t s13 = ne12*s12;

    // For MUL_MAT_ID the memory layout is different than for MUL_MAT:
    const int64_t ncols_dst = ids ? ne2 : ne1;
    const int64_t nchannels_y = ids ? ne11 : ne12;
    const int64_t nchannels_dst = ids ? ne1 : ne2;
    const int64_t stride_col_dst = ids ? s2 : s1;
    const int64_t stride_col_y = ids ? s12 : s11;
    const int64_t stride_channel_dst = ids ? s1 : s2;
    const int64_t stride_channel_y = ids ? s11 : s12;

    mul_mat_vec_q_switch_type(
        src0->data, src0->type, src1_q8_1.get(), ids_d, fusion_local, dst_d, ne00,
        ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
        ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
        ne03, ne3, s03, s13, s3, stream);
}

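// Variant used by the generic mul_mat path (e.g. for split buffers across multiple GPUs); src1 has already been quantized to q8_1.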
void ggml_cuda_op_mul_mat_vec_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    int id = ggml_cuda_get_device();

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    const int stride_row_x = ne00 / ggml_blck_size(src0->type);
    const int stride_col_y = src1_padded_row_size / QK8_1;

    ggml_cuda_mm_fusion_args_device fusion_local{};
    mul_mat_vec_q_switch_type(
        src0_dd_i, src0->type, src1_ddq_i, nullptr, fusion_local, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream);

    GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size);
}