gla.cu source code [llama.cpp/ggml/src/ggml-cuda/gla.cu]

1	#include "common.cuh"
2	#include "gla.cuh"
3
// Gated linear attention forward kernel (float32).
//
// Launch contract (see ggml_cuda_op_gated_linear_attn):
//   - gridDim.x  == B * H : one block per (batch, head) pair, decoded as bid / H and bid % H.
//   - blockDim.x == HEAD_SIZE : every thread indexes the shared buffers with threadIdx.x, so the
//     block must not be launched with more than HEAD_SIZE threads.
//   - static shared memory only (3 * HEAD_SIZE floats), no dynamic shared memory.
//
// Parameters:
//   B, T, C, H : sizes used for indexing; T / B tokens are processed per batch entry and C is the
//                stride between consecutive tokens in k, v, r, td and dst.
//   scale      : factor applied to every per-token output value.
//   k, v, r, td: per-token inputs, read at the same flat index as the output element.
//   s          : initial state, B * C * HEAD_SIZE floats; thread `tid` of a block loads the
//                elements [i * HEAD_SIZE + tid] (i = 0..HEAD_SIZE-1) of its head's state.
//   dst        : output; the first T * C floats receive the per-token results, the updated state
//                is stored after them starting at offset T * C, using the same layout as `s`.
template<int HEAD_SIZE>
static __global__ void gated_linear_attn_f32(const int B, const int T, const int C, const int H, const float scale,
        const float * k, const float * v, const float * r, const float * td, const float * s, float * dst) {
    // The inner loop consumes four floats per step through float4 views.
    static_assert(HEAD_SIZE % 4 == 0, "HEAD_SIZE must be a multiple of 4");

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    const int head_size    = HEAD_SIZE;
    const int batch_i      = bid / H;
    const int head_i       = bid % H;
    const int state_size   = C * head_size;
    const int n_seq_tokens = T / B;

    // These arrays are reinterpreted as float4 below. A plain float array only guarantees 4-byte
    // alignment, while float4 accesses require 16 bytes, so the alignment is requested explicitly.
    __align__(16) float state[head_size];
    __shared__ __align__(16) float _k[head_size];
    __shared__ __align__(16) float _r[head_size];
    __shared__ __align__(16) float _td[head_size];

    // Load this thread's slice of the incoming state.
    #pragma unroll
    for (int i = 0; i < head_size; i++) {
        state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
    }

    // Walk over the tokens of this batch entry; `t` is the flat index of this thread's element.
    const int t_begin = batch_i * n_seq_tokens * C + head_i * head_size + tid;
    const int t_end   = (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid;

    for (int t = t_begin; t < t_end; t += C) {
        // First barrier: nobody may still be reading the previous token's shared values.
        __syncthreads();
        _k[tid]  = k[t];
        _r[tid]  = r[t];
        _td[tid] = td[t];
        // Second barrier: the freshly staged values are visible to the whole block.
        __syncthreads();

        const float _v = v[t];
        float y = 0;
        for (int j = 0; j < head_size; j += 4) {
            const float4 & k4  = (float4 &)(_k[j]);
            const float4 & r4  = (float4 &)(_r[j]);
            const float4 & td4 = (float4 &)(_td[j]);
            float4 & s4 = (float4 &)(state[j]);
            float4 kv;

            kv.x = k4.x * _v;
            kv.y = k4.y * _v;
            kv.z = k4.z * _v;
            kv.w = k4.w * _v;

            // Decay the state by td and add the new k * v contribution.
            s4.x = s4.x * td4.x + kv.x;
            s4.y = s4.y * td4.y + kv.y;
            s4.z = s4.z * td4.z + kv.z;
            s4.w = s4.w * td4.w + kv.w;

            // Accumulate r . state using the already updated state.
            y += r4.x * s4.x;
            y += r4.y * s4.y;
            y += r4.z * s4.z;
            y += r4.w * s4.w;
        }
        dst[t] = y * scale;
    }

    // Store the final state behind the T * C per-token outputs.
    #pragma unroll
    for (int i = 0; i < head_size; i++) {
        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
    }
}
63
// Host-side launcher for the gated linear attention op.
//
// Reads the five float inputs from dst->src[0..4] (passed to the kernel as k, v, r, td and s),
// takes the output scale from the start of dst->op_params and enqueues gated_linear_attn_f32 on
// the stream of `ctx`. One block is launched per (B, H) pair with C / H threads each; only
// C / H == 64 or C / H == 128 is supported.
//
// ctx: CUDA backend context providing the stream to launch on.
// dst: destination tensor; its data buffer receives the kernel output.
void ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const float * k_d  = (const float *)dst->src[0]->data;
    const float * v_d  = (const float *)dst->src[1]->data;
    const float * r_d  = (const float *)dst->src[2]->data;
    const float * td_d = (const float *)dst->src[3]->data;
    const float * s_d  = (const float *)dst->src[4]->data;

    const int64_t B = dst->src[4]->ne[1];
    const int64_t T = dst->src[0]->ne[2];
    const int64_t C = dst->ne[0];
    const int64_t H = dst->src[0]->ne[1];

    // Copy exactly one float out of the op parameter storage. The size must be sizeof(float):
    // copying sizeof(float *) bytes would write past the end of `scale` on 64-bit hosts.
    // NOTE(review): assumes the scale is stored in the first 4 bytes of op_params - confirm.
    float scale;
    memcpy(&scale, dst->op_params, sizeof(float));

    float * dst_d = (float *)dst->data;

    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32);
    GGML_ASSERT(C % H == 0);
    GGML_ASSERT(C / H == 64 || C / H == 128);

    // The per-head size is a template parameter of the kernel, so dispatch on it explicitly.
    if (C / H == 64) {
        gated_linear_attn_f32<64><<<B * H, C / H, 0, stream>>>(B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
    } else {
        gated_linear_attn_f32<128><<<B * H, C / H, 0, stream>>>(B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
    }
}
94

Browse the source code of llama.cpp/ggml/src/ggml-cuda/gla.cu