#include "arg.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"
#include "ggml.h"
#include "console.h"
#include "chat.h"
#include "mtmd.h"
#include "mtmd-helper.h"

#include <vector>
#include <limits.h>
#include <cinttypes>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif

// volatile, because of signal being an interrupt
static volatile bool g_is_generating  = false;
static volatile bool g_is_interrupted = false;

/**
 * Please note that this is NOT production-ready code.
 * It is a playground for trying out multimodal support in llama.cpp.
 * For contributors: please keep this code simple and easy to understand.
 */

static void show_additional_info(int /*argc*/, char ** argv) {
    LOG(
        "Experimental CLI for multimodal\n\n"
        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio <audio> -p <prompt>\n\n"
        "  -m and --mmproj are required\n"
        "  -hf user/repo can replace both -m and --mmproj in most cases\n"
        "  --image, --audio and -p are optional; if NOT provided, the CLI will run in chat mode\n"
        "  to disable GPU offload for the mmproj model, add --no-mmproj-offload\n",
        argv[0]
    );
}
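
// Illustrative invocations (the binary is typically named llama-mtmd-cli; the file names below are placeholders):
//   llama-mtmd-cli -m model.gguf --mmproj mmproj.gguf --image input.jpg -p "Describe this image"
// or, replacing -m and --mmproj with a HF repo:
//   llama-mtmd-cli -hf user/repo --image input.jpg -p "Describe this image"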

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (g_is_generating) {
            g_is_generating = false;
        } else {
            console::cleanup();
            if (g_is_interrupted) {
                _exit(1);
            }
            g_is_interrupted = true;
        }
    }
}
#endif

struct mtmd_cli_context {
    mtmd::context_ptr ctx_vision;
    common_init_result llama_init;

    llama_model * model;
    llama_context * lctx;
    const llama_vocab * vocab;
    common_sampler * smpl;
    llama_batch batch;
    int n_batch;

    mtmd::bitmaps bitmaps;

    // chat template
    common_chat_templates_ptr tmpls;
    std::vector<common_chat_msg> chat_history;
    bool use_jinja = false;
    // TODO: support for --system-prompt with /clear command

    // support for legacy templates (models not having EOT token)
    llama_tokens antiprompt_tokens;

    int n_threads = 1;
    llama_pos n_past = 0;

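    // load the text model, create the sampler, and set up the chat template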
    mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
        model = llama_init.model.get();
        lctx = llama_init.context.get();
        vocab = llama_model_get_vocab(model);
        smpl = common_sampler_init(model, params.sampling);
        n_threads = params.cpuparams.n_threads;
        batch = llama_batch_init(1, 0, 1); // batch for next token generation
        n_batch = params.n_batch;

        if (!model || !lctx) {
            exit(1);
        }

        if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
            LOG_ERR("Model does not have chat template.\n");
            LOG_ERR("  For old llava models, you may need to use '--chat-template vicuna'\n");
            LOG_ERR("  For MobileVLM models, use '--chat-template deepseek'\n");
            LOG_ERR("  For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
            exit(1);
        }

        tmpls = common_chat_templates_init(model, params.chat_template);
        use_jinja = params.use_jinja;
        chat_history.clear();
        LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());

        init_vision_context(params);

        // load antiprompt tokens for legacy templates
        if (params.chat_template == "vicuna") {
            antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true);
        } else if (params.chat_template == "deepseek") {
            antiprompt_tokens = common_tokenize(lctx, "###", false, true);
        }
    }

    ~mtmd_cli_context() {
        llama_batch_free(batch);
        common_sampler_free(smpl);
    }

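    // load the multimodal projector (mmproj) and create the mtmd context used to tokenize media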
    void init_vision_context(common_params & params) {
        const char * clip_path = params.mmproj.path.c_str();
        mtmd_context_params mparams = mtmd_context_params_default();
        mparams.use_gpu = params.mmproj_use_gpu;
        mparams.print_timings = true;
        mparams.n_threads = params.cpuparams.n_threads;
        mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
        mparams.flash_attn_type = params.flash_attn_type;
        mparams.image_min_tokens = params.image_min_tokens;
        mparams.image_max_tokens = params.image_max_tokens;
        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
        if (!ctx_vision.get()) {
            LOG_ERR("Failed to load vision model from %s\n", clip_path);
            exit(1);
        }
    }

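    // returns true if the generated tokens end with the antiprompt (used by legacy templates without an EOT token)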
    bool check_antiprompt(const llama_tokens & generated_tokens) {
        if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) {
            return false;
        }
        return std::equal(
            generated_tokens.end() - antiprompt_tokens.size(),
            generated_tokens.end(),
            antiprompt_tokens.begin()
        );
    }

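    // decode an image or audio file into a bitmap and queue it for the next message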
    bool load_media(const std::string & fname) {
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
        if (!bmp.ptr) {
            return false;
        }
        bitmaps.entries.push_back(std::move(bmp));
        return true;
    }
};

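// sample up to n_predict tokens one by one and stream them to stdout; generation stops on EOG,
// an antiprompt match, or Ctrl+C, and the full reply is appended to the chat history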
static int generate_response(mtmd_cli_context & ctx, int n_predict) {
    llama_tokens generated_tokens;
    for (int i = 0; i < n_predict; i++) {
        if (i > n_predict || !g_is_generating || g_is_interrupted) {
            LOG("\n");
            break;
        }

        llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, -1);
        generated_tokens.push_back(token_id);
        common_sampler_accept(ctx.smpl, token_id, true);

        if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
            LOG("\n");
            break; // end of generation
        }

        LOG("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
        fflush(stdout);

        if (g_is_interrupted) {
            LOG("\n");
            break;
        }

        // eval the token
        common_batch_clear(ctx.batch);
        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
        if (llama_decode(ctx.lctx, ctx.batch)) {
            LOG_ERR("failed to decode token\n");
            return 1;
        }
    }

    std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
    common_chat_msg msg;
    msg.role = "assistant";
    msg.content = generated_text;
    ctx.chat_history.push_back(std::move(msg));

    return 0;
}

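// format a new message with the chat template and append it to the chat history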
static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
    LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
        new_msg.role.c_str(), new_msg.content.c_str());
    auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
                                               new_msg, new_msg.role == "user",
                                               ctx.use_jinja);
    ctx.chat_history.push_back(new_msg);
    return formatted;
}

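// tokenize the formatted message together with any queued media bitmaps, then evaluate the resulting chunks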
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
    bool add_bos = ctx.chat_history.empty();
    auto formatted_chat = chat_add_and_format(ctx, msg);
    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());

    mtmd_input_text text;
    text.text          = formatted_chat.c_str();
    text.add_special   = add_bos;
    text.parse_special = true;

    if (g_is_interrupted) return 0;

    mtmd::input_chunks chunks(mtmd_input_chunks_init());
    auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
    int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
                                chunks.ptr.get(), // output
                                &text, // text
                                bitmaps_c_ptr.data(),
                                bitmaps_c_ptr.size());
    if (res != 0) {
        LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
        return 1;
    }

    ctx.bitmaps.entries.clear();

    llama_pos new_n_past;
    if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
                                ctx.lctx, // lctx
                                chunks.ptr.get(), // chunks
                                ctx.n_past, // n_past
                                0, // seq_id
                                ctx.n_batch, // n_batch
                                true, // logits_last
                                &new_n_past)) {
        LOG_ERR("Unable to eval prompt\n");
        return 1;
    }

    ctx.n_past = new_n_past;

    LOG("\n");

    return 0;
}

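// parse arguments, load the model and projector, install the Ctrl+C handler, then run either single-turn or chat mode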
int main(int argc, char ** argv) {
    ggml_time_init();

    common_params params;
    params.sampling.temp = 0.2; // lower temp by default for better quality

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
        return 1;
    }

    common_init();

    if (params.mmproj.path.empty()) {
        show_additional_info(argc, argv);
        LOG_ERR("ERR: Missing --mmproj argument\n");
        return 1;
    }

    mtmd_cli_context ctx(params);
    LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());

    bool is_single_turn = !params.prompt.empty() && !params.image.empty();

    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

    // Ctrl+C handling
    {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
        sigemptyset(&sigint_action.sa_mask);
        sigint_action.sa_flags = 0;
        sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
    }

    if (g_is_interrupted) return 130;

    if (is_single_turn) {
        g_is_generating = true;
        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
            for (size_t i = 0; i < params.image.size(); i++) {
                params.prompt += mtmd_default_marker();
            }
        }
        common_chat_msg msg;
        msg.role = "user";
        msg.content = params.prompt;
        for (const auto & image : params.image) {
            if (!ctx.load_media(image)) {
                return 1; // error is already printed by libmtmd
            }
        }
        if (eval_message(ctx, msg)) {
            return 1;
        }
        if (!g_is_interrupted && generate_response(ctx, n_predict)) {
            return 1;
        }

    } else {
        LOG("\n Running in chat mode, available commands:");
        if (mtmd_support_vision(ctx.ctx_vision.get())) {
            LOG("\n   /image <path>    load an image");
        }
        if (mtmd_support_audio(ctx.ctx_vision.get())) {
            LOG("\n   /audio <path>    load an audio file");
        }
        LOG("\n   /clear           clear the chat history");
        LOG("\n   /quit or /exit   exit the program");
        LOG("\n");

        std::string content;

        while (!g_is_interrupted) {
            g_is_generating = false;
            LOG("\n> ");
            console::set_display(console::user_input);
            std::string line;
            console::readline(line, false);
            if (g_is_interrupted) break;
            console::set_display(console::reset);
            line = string_strip(line);
            if (line.empty()) {
                continue;
            }
            if (line == "/quit" || line == "/exit") {
                break;
            }
            if (line == "/clear") {
                ctx.n_past = 0;
                ctx.chat_history.clear();
                llama_memory_clear(llama_get_memory(ctx.lctx), true);
                LOG("Chat history cleared\n\n");
                continue;
            }
            g_is_generating = true;
            bool is_image = line == "/image" || line.find("/image ") == 0;
            bool is_audio = line == "/audio" || line.find("/audio ") == 0;
            if (is_image || is_audio) {
                // the "/image " and "/audio " prefixes are 7 characters, so a valid command needs at least 8
                if (line.size() < 8) {
                    LOG_ERR("ERR: Missing media filename\n");
                    continue;
                }
                std::string media_path = line.substr(7);
                if (ctx.load_media(media_path)) {
                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
                    content += mtmd_default_marker();
                }
                // else, error is already printed by libmtmd
                continue;
            } else {
                content += line;
            }
            common_chat_msg msg;
            msg.role = "user";
            msg.content = content;
            int ret = eval_message(ctx, msg);
            if (ret) {
                return 1;
            }
            if (g_is_interrupted) break;
            if (generate_response(ctx, n_predict)) {
                return 1;
            }
            content.clear();
        }
    }
    if (g_is_interrupted) LOG("\nInterrupted by user\n");
    LOG("\n\n");
    llama_perf_context_print(ctx.lctx);
    return g_is_interrupted ? 130 : 0;
}