#include "models.h"

llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    float kq_scale = 1.0f / sqrtf(float(n_embd_head));

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    ggml_tensor *inpL, *cur;
    inpL = build_inp_embd(model.tok_embd);

    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    // check ubatch to see if we have input tokens (text)
    // or an input embedding vector (image)
    bool is_text;
    if (ubatch.token) {
        is_text = true;
    } else {
        is_text = false;
    }

    for (int il = 0; il < n_layer; ++il) {
        // get either the text or image weight tensors
        ggml_tensor *wqkv, *wo;
        ggml_tensor *ffn_gate, *ffn_down, *ffn_up;

        if (is_text) {
            wqkv = model.layers[il].wqkv;
            wo = model.layers[il].wo;
            ffn_gate = model.layers[il].ffn_gate;
            ffn_down = model.layers[il].ffn_down;
            ffn_up = model.layers[il].ffn_up;
        } else {
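            // image (embedding) inputs are routed through the separate visual expert weights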
            wqkv = model.layers[il].visexp_attn_wqkv;
            wo = model.layers[il].visexp_attn_wo;
            ffn_gate = model.layers[il].visexp_ffn_gate;
            ffn_down = model.layers[il].visexp_ffn_down;
            ffn_up = model.layers[il].visexp_ffn_up;
        }

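        // keep the layer input around so it can be added back after attention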
        ggml_tensor * inpSA = inpL;
        cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);

        // build self attention
        {
            ggml_tensor * qkv = build_lora_mm(wqkv, cur);

            // split qkv into Q, K, V along the first dimension
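            // Q, K and V each occupy an n_embd-wide block, hence the offsets 0, n_embd and 2*n_embd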
            ggml_tensor * Qcur =
                ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), qkv->nb[1], 0);
            ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                                              qkv->nb[1], n_embd * ggml_element_size(qkv));
            ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                                              qkv->nb[1], 2 * n_embd * ggml_element_size(qkv));

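            // apply rotary position embeddings to the query and key views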
            Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, rope_type);
            Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);

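            // attention using the per-branch output projection wo (no bias)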
            cur = build_attn(inp_attn,
                    wo, nullptr,
                    Qcur, Kcur, Vcur,
                    nullptr, nullptr, nullptr,
                    kq_scale, il);
            cb(cur, "attn_out", il);
        }

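        // residual connection around the attention block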
        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

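        // SiLU-gated feed-forward (parallel gate) with the selected text or visual expert weights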
        cur = build_ffn(cur,
                ffn_up, NULL, NULL,
                ffn_gate, NULL, NULL,
                ffn_down, NULL, NULL,
                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);

        cur = ggml_add(ctx0, cur, ffn_inp);
        cb(cur, "ffn_out", il);

        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

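    // project to vocabulary logits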
    cur = build_lora_mm(model.output, cur);
    cb(cur, "result_output", -1);
    res->t_logits = cur;
    ggml_build_forward_expand(gf, cur);
}