#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
#include <numeric>

/**
 * This is the arbitrary data which will be passed to each callback.
 * Later on we could, for example, add an operation or tensor-name filter from a CLI arg, or a file descriptor to dump tensors to.
 */
struct callback_data {
    std::vector<uint8_t> data;
};

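// format a tensor's dimensions (ne) as a comma-separated string, e.g. "4096, 512, 1, 1"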
static std::string ggml_ne_string(const ggml_tensor * t) {
    std::string str;
    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
        str += std::to_string(t->ne[i]);
        if (i + 1 < GGML_MAX_DIMS) {
            str += ", ";
        }
    }
    return str;
}

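// convert a bf16 value to fp32 by shifting its 16 bits into the upper half of a 32-bit float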
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    union {
        float f;
        uint32_t i;
    } u;
    u.i = (uint32_t)h.bits << 16;
    return u.f;
}

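// read the element at (i0, i1, i2, i3) and convert it to float; nb holds the per-dimension strides in bytes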
static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
    float v;
    if (type == GGML_TYPE_F16) {
        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
    } else if (type == GGML_TYPE_F32) {
        v = *(float *) &data[i];
    } else if (type == GGML_TYPE_I64) {
        v = (float) *(int64_t *) &data[i];
    } else if (type == GGML_TYPE_I32) {
        v = (float) *(int32_t *) &data[i];
    } else if (type == GGML_TYPE_I16) {
        v = (float) *(int16_t *) &data[i];
    } else if (type == GGML_TYPE_I8) {
        v = (float) *(int8_t *) &data[i];
    } else if (type == GGML_TYPE_BF16) {
        v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
    } else {
        GGML_ABORT("fatal error");
    }
    return v;
}

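// print a tensor in a nested, numpy-like layout, showing at most n leading and n trailing
// elements per dimension, followed by the sum of all elements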
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
    GGML_ASSERT(n > 0);
    // first pass: accumulate the sum of all elements (also used below to detect NaNs)
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
                    sum += v;
                }
            }
        }
    }
    // second pass: print the values, eliding the middle of any dimension longer than 2*n
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        LOG(" [\n");
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2*n) {
                LOG(" ..., \n");
                i2 = ne[2] - n;
            }
            LOG(" [\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2*n) {
                    LOG(" ..., \n");
                    i1 = ne[1] - n;
                }
                LOG(" [");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2*n) {
                        LOG("..., ");
                        i0 = ne[0] - n;
                    }
                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
                    LOG("%12.4f", v);
                    if (i0 < ne[0] - 1) LOG(", ");
                }
                LOG("],\n");
            }
            LOG(" ],\n");
        }
        LOG(" ]\n");
        LOG(" sum = %f\n", sum);
    }

    // TODO: make this abort configurable/optional?
    if (std::isnan(sum)) {
        LOG_ERR("encountered NaN - aborting\n");
        exit(0);
    }
}

/**
 * GGML operations callback during the graph execution.
 *
 * @param t current tensor
 * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
 *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
 *            see ggml_backend_sched_eval_callback
 * @param user_data user data passed to each callback
 * @return true to receive data or continue the graph, false otherwise
 */
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];

    if (ask) {
        return true; // Always retrieve data
    }

    // format the optional second source operand as "name{ne}"
    char src1_str[128] = {0};
    if (src1) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
    }

    LOG("%s: %24s = (%s) %10s(%s{%s}, %s) = {%s}\n", __func__,
        t->name, ggml_type_name(t->type), ggml_op_desc(t),
        src0->name, ggml_ne_string(src0).c_str(),
        src1 ? src1_str : "",
        ggml_ne_string(t).c_str());

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(t->buffer);

    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
        cb_data->data.resize(n_bytes);
        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    // print only non-quantized tensors; quantized types would need dequantization first
    if (!ggml_is_quantized(t->type)) {
        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
    }

    return true;
}

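// tokenize the prompt and run a single decode pass over it; the eval callback fires for
// every node of the compute graph during llama_decode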
static bool run(llama_context * ctx, const common_params & params) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const bool add_bos = llama_vocab_get_add_bos(vocab);

    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

    if (tokens.empty()) {
        LOG_ERR("%s : there are no input tokens to process - (try to provide a prompt with '-p')\n", __func__);
        return false;
    }

    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        LOG_ERR("%s : failed to eval\n", __func__);
        return false;
    }

    return true;
}

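// entry point: parse the common params, initialize the backend, register ggml_debug as the
// graph-eval callback and decode the prompt once so every intermediate tensor gets printed.
// Typical invocation (the binary name may differ depending on how the example is built):
//   llama-eval-callback -m model.gguf -p "Hello my name is"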
int main(int argc, char ** argv) {
    callback_data cb_data;

    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    common_init();

    llama_backend_init();
    llama_numa_init(params.numa);

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = ggml_debug;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    // init
    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model.get();
    llama_context * ctx = llama_init.context.get();

    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("%s : failed to init\n", __func__);
        return 1;
    }

    // print system information
    {
        LOG_INF("\n");
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
        LOG_INF("\n");
    }

    bool OK = run(ctx, params);
    if (!OK) {
        return 1;
    }

    LOG("\n");
    llama_perf_context_print(ctx);

    llama_backend_free();

    return 0;
}