eval-callback.cpp source code [llama.cpp/examples/eval-callback/eval-callback.cpp]

#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <numeric>
#include <string>
#include <vector>
11
/**
 * This is the arbitrary data which will be passed to each callback.
 * Later on we can for example add an operation or tensor name filter from the CLI arg,
 * or a file descriptor to dump the tensor.
 */
struct callback_data {
    // Scratch buffer: ggml_debug() copies a tensor's bytes here when the tensor
    // does not live in a host buffer, so the values can be read on the CPU.
    std::vector<uint8_t> data;
};
19
20	static std::string ggml_ne_string(const ggml_tensor * t) {
21	std::string str;
22	for (int i = `0`; i < GGML_MAX_DIMS; ++i) {
23	str += std::to_string(val: t->ne[i]);
24	if (i + `1` < GGML_MAX_DIMS) {
25	str += ", ";
26	}
27	}
28	return str;
29	}
30
31	static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
32	union {
33	float f;
34	uint32_t i;
35	} u;
36	u.i = (uint32_t)h.bits << `16`;
37	return u.f;
38	}
39
40	static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
41	size_t i = i3 * nb[`3`] + i2 * nb[`2`] + i1 * nb[`1`] + i0 * nb[`0`];
42	float v;
43	if (type == GGML_TYPE_F16) {
44	v = ggml_fp16_to_fp32((ggml_fp16_t ) &data[i]);
45	} else if (type == GGML_TYPE_F32) {
46	v = (float* *) &data[i];
47	} else if (type == GGML_TYPE_I64) {
48	v = (float) (int64_t ) &data[i];
49	} else if (type == GGML_TYPE_I32) {
50	v = (float) (int32_t ) &data[i];
51	} else if (type == GGML_TYPE_I16) {
52	v = (float) (int16_t ) &data[i];
53	} else if (type == GGML_TYPE_I8) {
54	v = (float) (int8_t ) &data[i];
55	} else if (type == GGML_TYPE_BF16) {
56	v = ggml_compute_bf16_to_fp32(h: (ggml_bf16_t ) &data[i]);
57	} else {
58	GGML_ABORT("fatal error");
59	}
60	return v;
61	}
62
63	static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
64	GGML_ASSERT(n > `0`);
65	float sum = `0`;
66	for (int64_t i3 = `0`; i3 < ne[`3`]; i3++) {
67	for (int64_t i2 = `0`; i2 < ne[`2`]; i2++) {
68	for (int64_t i1 = `0`; i1 < ne[`1`]; i1++) {
69	for (int64_t i0 = `0`; i0 < ne[`0`]; i0++) {
70	const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
71	sum += v;
72	}
73	}
74	}
75	}
76	for (int64_t i3 = `0`; i3 < ne[`3`]; i3++) {
77	LOG(" [\n");
78	for (int64_t i2 = `0`; i2 < ne[`2`]; i2++) {
79	if (i2 == n && ne[`2`] > `2`*n) {
80	LOG(" ..., \n");
81	i2 = ne[`2`] - n;
82	}
83	LOG(" [\n");
84	for (int64_t i1 = `0`; i1 < ne[`1`]; i1++) {
85	if (i1 == n && ne[`1`] > `2`*n) {
86	LOG(" ..., \n");
87	i1 = ne[`1`] - n;
88	}
89	LOG(" [");
90	for (int64_t i0 = `0`; i0 < ne[`0`]; i0++) {
91	if (i0 == n && ne[`0`] > `2`*n) {
92	LOG("..., ");
93	i0 = ne[`0`] - n;
94	}
95	const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
96	LOG("%12.4f", v);
97	if (i0 < ne[`0`] - `1`) LOG(", ");
98	}
99	LOG("],\n");
100	}
101	LOG(" ],\n");
102	}
103	LOG(" ]\n");
104	LOG(" sum = %f\n", sum);
105	}
106
107	// TODO: make this abort configurable/optional?
108	if (std::isnan(x: sum)) {
109	LOG_ERR("encountered NaN - aborting\n");
110	exit(status: `0`);
111	}
112	}
113
114	/**
115	* GGML operations callback during the graph execution.
116	*
117	* @param t current tensor
118	* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
119	* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
120	* see ggml_backend_sched_eval_callback
121	* @param user_data user data to pass at each call back
122	* @return true to receive data or continue the graph, false otherwise
123	*/
124	static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
125	auto * cb_data = (callback_data *) user_data;
126
127	const struct ggml_tensor * src0 = t->src[`0`];
128	const struct ggml_tensor * src1 = t->src[`1`];
129
130	if (ask) {
131	return true; // Always retrieve data
132	}
133
134	char src1_str[`128`] = {`0`};
135	if (src1) {
136	snprintf(s: src1_str, maxlen: sizeof(src1_str), format: "%s{%s}", src1->name, ggml_ne_string(t: src1).c_str());
137	}
138
139	LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
140	t->name, ggml_type_name(t->type), ggml_op_desc(t),
141	src0->name, ggml_ne_string(src0).c_str(),
142	src1 ? src1_str : "",
143	ggml_ne_string(t).c_str());
144
145
146	// copy the data from the GPU memory if needed
147	const bool is_host = ggml_backend_buffer_is_host(buffer: t->buffer);
148
149	if (!is_host) {
150	auto n_bytes = ggml_nbytes(tensor: t);
151	cb_data->data.resize(new_size: n_bytes);
152	ggml_backend_tensor_get(tensor: t, data: cb_data->data.data(), offset: `0`, size: n_bytes);
153	}
154
155	if (!ggml_is_quantized(type: t->type)) {
156	uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
157	ggml_print_tensor(data, type: t->type, ne: t->ne, nb: t->nb, n: `3`);
158	}
159
160	return true;
161	}
162
163	static bool run(llama_context * ctx, const common_params & params) {
164	const llama_model * model = llama_get_model(ctx);
165	const llama_vocab * vocab = llama_model_get_vocab(model);
166
167	const bool add_bos = llama_vocab_get_add_bos(vocab);
168
169	std::vector<llama_token> tokens = common_tokenize(ctx, text: params.prompt, add_special: add_bos);
170
171	if (tokens.empty()) {
172	LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
173	return false;
174	}
175
176	if (llama_decode(ctx, batch: llama_batch_get_one(tokens: tokens.data(), n_tokens: tokens.size()))) {
177	LOG_ERR("%s : failed to eval\n", __func__);
178	return false;
179	}
180
181	return true;
182	}
183
184	int main(int argc, char ** argv) {
185	callback_data cb_data;
186
187	common_params params;
188
189	if (!common_params_parse(argc, argv, params, ex: LLAMA_EXAMPLE_COMMON)) {
190	return `1`;
191	}
192
193	common_init();
194
195	llama_backend_init();
196	llama_numa_init(numa: params.numa);
197
198	// pass the callback to the backend scheduler
199	// it will be executed for each node during the graph computation
200	params.cb_eval = ggml_debug;
201	params.cb_eval_user_data = &cb_data;
202	params.warmup = false;
203
204	// init
205	common_init_result llama_init = common_init_from_params(params);
206
207	llama_model * model = llama_init.model.get();
208	llama_context * ctx = llama_init.context.get();
209
210	if (model == nullptr \|\| ctx == nullptr) {
211	LOG_ERR("%s : failed to init\n", __func__);
212	return `1`;
213	}
214
215	// print system information
216	{
217	LOG_INF("\n");
218	LOG_INF("%s\n", common_params_get_system_info(params).c_str());
219	LOG_INF("\n");
220	}
221
222	bool OK = run(ctx, params);
223	if (!OK) {
224	return `1`;
225	}
226
227	LOG("\n");
228	llama_perf_context_print(ctx);
229
230	llama_backend_free();
231
232	return `0`;
233	}
234

Browse the source code of llama.cpp/examples/eval-callback/eval-callback.cpp