sumrows.cu source code [llama.cpp/ggml/src/ggml-cuda/sumrows.cu]

1	#include "reduce_rows.cuh"
2	#include "sumrows.cuh"
3
// Launches reduce_rows_f32 (template argument norm = false) on `stream` with a
// grid of `nrows` blocks, forwarding `x`, `dst` and `ncols` to the kernel.
//
// Parameters:
//   x      - input pointer forwarded to the kernel
//   dst    - output pointer forwarded to the kernel
//   ncols  - column count forwarded to the kernel; also selects the block size
//   nrows  - number of blocks launched along grid.x
//   stream - CUDA stream the kernel is enqueued on
//
// The block size depends on how many rows there are relative to the number of
// SMs (nsm) of the current device:
//   nrows / nsm <  2 : 512 threads per block
//   nrows / nsm >= 2 : 32 threads if ncols < 1024, otherwise 128
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    const int  id  = ggml_cuda_get_device();
    const int  nsm = ggml_cuda_info().devices[id].nsm;
    const dim3 block_nums(nrows, 1, 1);
    if ((nrows / nsm) < 2) {
        // Few rows per SM: use large 512-thread blocks.
        const dim3 block_dims(512, 1, 1);
        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
    } else {
        // Many rows per SM: use smaller blocks, sized by the row length.
        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
    }
}
16
// Backend entry point for the sum-rows op: launches reduce_rows_f32
// (template argument norm = false) over dst->src[0] and writes into dst->data,
// on the stream provided by `ctx`.
//
// Preconditions (enforced with GGML_ASSERT):
//   - src0 and dst are both GGML_TYPE_F32
//   - src0 is contiguous
//
// The grid has ggml_nrows(src0) blocks; the column count passed to the kernel
// is src0->ne[0].
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0   = dst->src[0];
    const float *       src0_d = (const float *) src0->data;
    float *             dst_d  = (float *) dst->data;
    cudaStream_t        stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    // NOTE(review): nrows is int64_t and is implicitly narrowed by the dim3
    // constructor; ncols may likewise be narrowed by the kernel parameter —
    // confirm that callers never exceed those ranges.
    const dim3 block_nums(nrows, 1, 1);

    const int id  = ggml_cuda_get_device();
    const int nsm = ggml_cuda_info().devices[id].nsm;
    if ((nrows / nsm) < 2) {
        // Increase num threads to 512 for small nrows to better hide the latency
        const dim3 block_dims(512, 1, 1);
        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
    } else {
        // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
    }
}
44

Browse the source code of llama.cpp/ggml/src/ggml-cuda/sumrows.cu