mtmd.cpp source code [llama.cpp/tools/mtmd/mtmd.cpp]

1	#include "clip.h"
2	#include "clip-impl.h"
3	#include "mtmd.h"
4	#include "mtmd-audio.h"
5
6	#include "llama.h"
7
8	// fix problem with std::min and std::max
9	#if defined(_WIN32)
10	#define WIN32_LEAN_AND_MEAN
11	#ifndef NOMINMAX
12	# define NOMINMAX
13	#endif
14	#include <windows.h>
15	#endif
16
17	#include <algorithm>
18	#include <cerrno>
19	#include <cstdio>
20	#include <cstdlib>
21	#include <cstring>
22	#include <vector>
23
// Raw media buffer handed to the tokenizer.
// - image: layout is RGBRGBRGB..., length of data must be nx * ny * 3
// - audio (is_audio == true): data holds the raw bytes of the float samples,
//   nx is the number of samples and ny is 1 (see mtmd_bitmap_init_from_audio)
struct mtmd_bitmap {
    uint32_t nx = 0;
    uint32_t ny = 0;
    std::vector<unsigned char> data;
    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
    bool is_audio = false; // true if the bitmap is audio
};
33
34	struct mtmd_image_tokens {
35	uint32_t nx; // number of tokens in x direction
36	uint32_t ny; // number of tokens in y direction
37	bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
38	uint32_t n_tokens() const { return nx * ny; }
39	clip_image_f32_batch batch_f32; // preprocessed image patches
40	std::string id; // optional user-defined ID, useful for KV cache tracking
41
42	mtmd_image_tokens clone() {
43	return mtmd_image_tokens{
44	.nx: nx,
45	.ny: ny,
46	.use_mrope_pos: use_mrope_pos,
47	.batch_f32: batch_f32.clone(),
48	.id: id
49	};
50	}
51	};
52	using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
53
54	struct mtmd_audio_tokens {
55	uint32_t n_tokens; // number of tokens
56	clip_image_f32_batch batch_f32; // preprocessed image patches
57	std::string id; // optional user-defined ID, useful for KV cache tracking
58
59	mtmd_audio_tokens clone() {
60	return mtmd_audio_tokens{
61	.n_tokens: n_tokens,
62	.batch_f32: batch_f32.clone(),
63	.id: id
64	};
65	}
66	};
67	using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;
68
69	struct mtmd_input_chunk {
70	mtmd_input_chunk_type type;
71	std::vector<llama_token> tokens_text;
72	mtmd_image_tokens_ptr tokens_image;
73	mtmd_audio_tokens_ptr tokens_audio;
74	};
75
76	struct mtmd_input_chunks {
77	std::vector<mtmd_input_chunk> entries;
78	};
79
// Slice template: tells the tokenizer how to interleave special tokens with the
// embeddings of an image that was split into an overview plus slices/tiles.
// It is used by some llava-uhd models; models not having it (llava-1.6) will
// process embeddings without any special tokens in-between.
enum mtmd_slice_tmpl {
    MTMD_SLICE_TMPL_NONE,         // no special tokens between slices
    MTMD_SLICE_TMPL_MINICPMV_2_5, // minicpmv 2.5 format
    MTMD_SLICE_TMPL_MINICPMV_2_6, // minicpmv 2.6 format
    MTMD_SLICE_TMPL_LLAMA4,       // llama 4 tile format
    MTMD_SLICE_TMPL_IDEFICS3,     // idefics3 row/col format
};
89
// Returns the default media marker string, i.e. the placeholder that stands for
// one media item inside the prompt text. The returned pointer refers to a
// string literal and must not be freed.
const char * mtmd_default_marker() {
    static const char * const default_marker = "<__media__>";
    return default_marker;
}
93
94	static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
95	switch (flash_attn_type) {
96	case LLAMA_FLASH_ATTN_TYPE_AUTO: return CLIP_FLASH_ATTN_TYPE_AUTO;
97	case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
98	case LLAMA_FLASH_ATTN_TYPE_ENABLED: return CLIP_FLASH_ATTN_TYPE_ENABLED;
99	}
100	return CLIP_FLASH_ATTN_TYPE_AUTO;
101	}
102
103	mtmd_context_params mtmd_context_params_default() {
104	mtmd_context_params params {
105	/ use_gpu / true,
106	/ print_timings / true,
107	/ n_threads / `4`,
108	/ verbosity / GGML_LOG_LEVEL_INFO,
109	/ image_marker / MTMD_DEFAULT_IMAGE_MARKER,
110	/ media_marker / mtmd_default_marker(),
111	/ flash_attn_type / LLAMA_FLASH_ATTN_TYPE_AUTO,
112	/ image_min_tokens / -`1`,
113	/ image_max_tokens / -`1`,
114	};
115	return params;
116	}
117
118	struct mtmd_context {
119	struct clip_ctx * ctx_v; // vision
120	struct clip_ctx * ctx_a; // audio
121	const struct llama_model * text_model;
122	std::vector<float> image_embd_v; // image embedding vector
123
124	bool print_timings;
125	int n_threads;
126	std::string media_marker;
127	const int n_embd_text;
128
129	// these are not token, but strings used to mark the beginning and end of image/audio embeddings
130	std::string img_beg;
131	std::string img_end;
132	std::string aud_beg;
133	std::string aud_end;
134
135	// for llava-uhd style models, we need special tokens in-between slices
136	// minicpmv calls them "slices", llama 4 calls them "tiles"
137	mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
138	std::vector<llama_token> tok_ov_img_start; // overview image
139	std::vector<llama_token> tok_ov_img_end; // overview image
140	std::vector<llama_token> tok_slices_start; // start of all slices
141	std::vector<llama_token> tok_slices_end; // end of all slices
142	std::vector<llama_token> tok_sli_img_start; // single slice start
143	std::vector<llama_token> tok_sli_img_end; // single slice end
144	std::vector<llama_token> tok_sli_img_mid; // between 2 slices
145	std::vector<llama_token> tok_row_end; // end of row
146	bool tok_row_end_trail = false;
147	bool ov_img_first = false;
148
149	bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
150
151	// string template for slice image delimiters with row/col (idefics3)
152	std::string sli_img_start_tmpl;
153
154	// for whisper, we pre-calculate the mel filter bank
155	whisper_preprocessor::whisper_filters w_filters;
156
157	// TODO @ngxson : add timings
158
159	mtmd_context(const char * mmproj_fname,
160	const llama_model * text_model,
161	const mtmd_context_params & ctx_params) :
162	text_model (text_model),
163	print_timings(ctx_params.print_timings),
164	n_threads (ctx_params.n_threads),
165	media_marker (ctx_params.media_marker),
166	n_embd_text (llama_model_n_embd_inp(model: text_model))
167	{
168	if (std::string (ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
169	throw std::runtime_error ("custom image_marker is not supported anymore, use media_marker instead");
170	}
171
172	if (media_marker.empty()) {
173	throw std::runtime_error ("media_marker must not be empty");
174	}
175
176	clip_context_params ctx_clip_params {
177	/ use_gpu / ctx_params.use_gpu,
178	/ verbosity / ctx_params.verbosity,
179	/ flash_attn_type / CLIP_FLASH_ATTN_TYPE_AUTO,
180	/ image_min_tokens / ctx_params.image_min_tokens,
181	/ image_max_tokens / ctx_params.image_max_tokens,
182	};
183
184	auto res = clip_init(fname: mmproj_fname, ctx_params: ctx_clip_params);
185	ctx_v = res.ctx_v;
186	ctx_a = res.ctx_a;
187	if (!ctx_v && !ctx_a) {
188	throw std::runtime_error (string_format(fmt: "Failed to load CLIP model from %s\n", mmproj_fname));
189	}
190
191	// if both vision and audio mmproj are present, we need to validate their n_embd
192	if (ctx_v && ctx_a) {
193	int n_embd_v = clip_n_mmproj_embd(ctx: ctx_v);
194	int n_embd_a = clip_n_mmproj_embd(ctx: ctx_a);
195	if (n_embd_v != n_embd_a) {
196	throw std::runtime_error (string_format(
197	fmt: "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
198	n_embd_v, n_embd_a));
199	}
200	}
201
202	// since we already validate n_embd of vision and audio mmproj,
203	// we can safely assume that they are the same
204	int n_embd_clip = clip_n_mmproj_embd(ctx: ctx_v ? ctx_v : ctx_a);
205	if (n_embd_text != n_embd_clip) {
206	throw std::runtime_error (string_format(
207	fmt: "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
208	"hint: you may be using wrong mmproj\n",
209	n_embd_text, n_embd_clip));
210	}
211	if (ctx_v) {
212	init_vision();
213	}
214	if (ctx_a) {
215	init_audio();
216	}
217	}
218
219	void init_vision() {
220	GGML_ASSERT(ctx_v != nullptr);
221	use_mrope = clip_is_qwen2vl(ctx: ctx_v);
222
223	projector_type proj = clip_get_projector_type(ctx: ctx_v);
224	int minicpmv_version = clip_is_minicpmv(ctx: ctx_v);
225	if (minicpmv_version == `2`) {
226	// minicpmv 2.5 format:
227	// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
228	slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
229	tok_ov_img_start = {lookup_token(token_text: "<image>")};
230	tok_ov_img_end = {lookup_token(token_text: "</image>")};
231	tok_slices_start = {lookup_token(token_text: "<slice>")};
232	tok_slices_end = {lookup_token(token_text: "</slice>")};
233	tok_sli_img_start = tok_ov_img_start;
234	tok_sli_img_end = tok_ov_img_end;
235	tok_row_end = {lookup_token(token_text: "\n")};
236	tok_row_end_trail = false; // no trailing end-of-row token
237	ov_img_first = true;
238
239	} else if (minicpmv_version == `3` \|\| minicpmv_version == `4` \|\| minicpmv_version == `5` \|\| minicpmv_version == `6`) {
240	// minicpmv 2.6 format:
241	// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
242	slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
243	tok_ov_img_start = {lookup_token(token_text: "<image>")};
244	tok_ov_img_end = {lookup_token(token_text: "</image>")};
245	tok_sli_img_start = {lookup_token(token_text: "<slice>")};
246	tok_sli_img_end = {lookup_token(token_text: "</slice>")};
247	tok_row_end = {lookup_token(token_text: "\n")};
248	tok_row_end_trail = false; // no trailing end-of-row token
249	ov_img_first = true;
250
251	} else if (minicpmv_version != `0`) {
252	GGML_ASSERT(false && "unsupported minicpmv version");
253	} else if (proj == PROJECTOR_TYPE_LLAMA4) {
254	// llama 4 format:
255	// <\|image_start\|>
256	// (slice) <\|tile_x_separator\|> (slice) <\|tile_x_separator\|> ... <\|tile_y_separator\|>
257	// (slice) <\|tile_x_separator\|> (slice) <\|tile_x_separator\|> ... <\|tile_y_separator\|>
258	// ... <\|tile_y_separator\|> <-- trailing end-of-row token
259	// <\|image\|> (overview) <-- overview image is last
260	// <\|image_end\|>
261	slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
262	tok_ov_img_start = {lookup_token(token_text: "<\|image\|>")};
263	tok_sli_img_mid = {lookup_token(token_text: "<\|tile_x_separator\|>")};
264	tok_row_end = {lookup_token(token_text: "<\|tile_y_separator\|>")};
265	tok_row_end_trail = true; // add trailing end-of-row token
266	ov_img_first = false; // overview image is last
267	}
268
269	// set boi/eoi
270	if (proj == PROJECTOR_TYPE_GEMMA3) {
271	// <start_of_image> ... (image embeddings) ... <end_of_image>
272	img_beg = "<start_of_image>";
273	img_end = "<end_of_image>";
274
275	} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
276	// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
277	slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
278	tok_ov_img_start = {lookup_token(token_text: "\n\n"), lookup_token(token_text: "<fake_token_around_image>"), lookup_token(token_text: "<global-img>")};
279	tok_ov_img_end = {lookup_token(token_text: "<fake_token_around_image>")};
280	tok_row_end = {lookup_token(token_text: "\n")};
281	sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
282
283	} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
284	// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
285	img_end = "[IMG_END]";
286
287	} else if (proj == PROJECTOR_TYPE_QWEN2VL \|\| proj == PROJECTOR_TYPE_QWEN25VL \|\| proj == PROJECTOR_TYPE_QWEN3VL) {
288	// <\|vision_start\|> ... (image embeddings) ... <\|vision_end\|>
289	img_beg = "<\|vision_start\|>";
290	img_end = "<\|vision_end\|>";
291
292	} else if (proj == PROJECTOR_TYPE_LLAMA4) {
293	// (more details in mtmd_context constructor)
294	img_beg = "<\|image_start\|>";
295	img_end = "<\|image_end\|>";
296	LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
297	" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
298
299	} else if (proj == PROJECTOR_TYPE_INTERNVL) {
300	// <img> ... (image embeddings) ... </img>
301	img_beg = "<img>";
302	img_end = "</img>";
303
304	} else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
305	// <\|im_start\|> ... (image embeddings) ... <\|im_end\|>
306	img_beg = "<\|im_start\|>";
307	img_end = "<\|im_end\|>";
308
309	}
310	}
311
312	void init_audio() {
313	GGML_ASSERT(ctx_a != nullptr);
314	projector_type proj = clip_get_projector_type(ctx: ctx_a);
315
316	if (clip_has_whisper_encoder(ctx: ctx_a)) {
317	// TODO @ngxson : check if model n_mel is 128 or 80
318	w_filters = whisper_precalc_filters::get_128_bins();
319	}
320
321	LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
322	" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
323
324	if (proj == PROJECTOR_TYPE_QWEN2A) {
325	// <\|audio_bos\|> ... (embeddings) ... <\|audio_eos\|>
326	aud_beg = "<\|audio_bos\|>";
327	aud_end = "<\|audio_eos\|>";
328
329	} else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
330	// [BEGIN_AUDIO] ... (embeddings) ...
331	aud_beg = "[BEGIN_AUDIO]";
332
333	}
334	}
335
336	// get clip ctx based on chunk type
337	clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
338	if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
339	return ctx_v;
340	} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
341	return ctx_a;
342	}
343	GGML_ABORT("unknown chunk type");
344	}
345
346	projector_type proj_type_v() const {
347	return ctx_v ? clip_get_projector_type(ctx: ctx_v) : PROJECTOR_TYPE_UNKNOWN;
348	}
349
350	projector_type proj_type_a() const {
351	return ctx_a ? clip_get_projector_type(ctx: ctx_a) : PROJECTOR_TYPE_UNKNOWN;
352	}
353
354	~mtmd_context() {
355	clip_free(ctx: ctx_a);
356	clip_free(ctx: ctx_v);
357	}
358
359	private:
360	llama_token lookup_token(const std::string & token_text) {
361	const llama_vocab * vocab = llama_model_get_vocab(model: text_model);
362	const int n_vocab = llama_vocab_n_tokens(vocab);
363	for (int i = `0`; i < n_vocab; i++) {
364	if (token_to_piece(vocab, token: i, special: true) == token_text) {
365	return i;
366	}
367	}
368	return LLAMA_TOKEN_NULL;
369	}
370
371	std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
372	std::string piece;
373	piece.resize(n: piece.capacity()); // using string internal cache, 15 bytes + '\n'
374	const int n_chars = llama_token_to_piece(vocab, token, buf: &piece [`0`], length: piece.size(), lstrip: `0`, special);
375	if (n_chars < `0`) {
376	piece.resize(n: -n_chars);
377	int check = llama_token_to_piece(vocab, token, buf: &piece [`0`], length: piece.size(), lstrip: `0`, special);
378	GGML_ASSERT(check == -n_chars);
379	} else {
380	piece.resize(n: n_chars);
381	}
382	return piece;
383	}
384	};
385
386	mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
387	const struct llama_model * text_model,
388	const struct mtmd_context_params ctx_params) {
389	try {
390	return new mtmd_context (mmproj_fname, text_model, ctx_params);
391	} catch (const std::exception & e) {
392	LOG_ERR("%s: error: %s\n", __func__, e.what());
393	return nullptr;
394	}
395	}
396
397	void mtmd_free(mtmd_context * ctx) {
398	delete ctx;
399	}
400
401	struct mtmd_tokenizer {
402	mtmd_context * ctx;
403	std::vector<const mtmd_bitmap *> bitmaps;
404
405	std::string input_text;
406	bool add_special;
407	bool parse_special;
408	const llama_vocab * vocab;
409
410	mtmd_input_chunks cur;
411
412	mtmd_tokenizer(mtmd_context * ctx,
413	const mtmd_input_text * text,
414	const mtmd_bitmap ** bitmaps,
415	size_t n_bitmaps) : ctx(ctx), bitmaps (bitmaps, bitmaps + n_bitmaps) {
416	add_special = text->add_special;
417	parse_special = text->parse_special;
418	input_text = text->text;
419	vocab = llama_model_get_vocab(model: ctx->text_model);
420
421	// for compatibility, we convert image marker to media marker
422	string_replace_all(s&: input_text, MTMD_DEFAULT_IMAGE_MARKER, replace: ctx->media_marker);
423	}
424
425	int32_t tokenize(mtmd_input_chunks * output) {
426	cur.entries.clear();
427	std::vector<std::string> parts = split_text(input: input_text, delimiter: ctx->media_marker);
428	size_t i_bm = `0`; // index of the current bitmap
429	for (auto & part : parts) {
430	if (part == ctx->media_marker) {
431	// this is a marker, we should add the next bitmap
432	if (i_bm >= bitmaps.size()) {
433	LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
434	__func__, bitmaps.size(), parts.size() - `1`);
435	return `1`;
436	}
437	const mtmd_bitmap * bitmap = bitmaps [i_bm++];
438	int32_t res = add_media(bitmap);
439	if (res != `0`) {
440	return res;
441	}
442	} else {
443	// this is a text part, we should add it as text
444	add_text(txt: part, parse_special);
445	}
446	}
447
448	if (add_special && llama_vocab_get_add_bos(vocab)) {
449	// if first chunk is text, we add BOS token to first text chunk
450	// otherwise, create a new text chunk with BOS token
451	if (!cur.entries.empty() && cur.entries [`0`].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
452	// add BOS token to the beginning of first text chunk
453	cur.entries [`0`].tokens_text.insert(position: cur.entries [`0`].tokens_text.begin(), x: llama_vocab_bos(vocab));
454	} else {
455	// create a new text chunk with BOS token at the beginning
456	mtmd_input_chunk bos_chunk{
457	.type: MTMD_INPUT_CHUNK_TYPE_TEXT,
458	.tokens_text: {llama_vocab_bos(vocab)},
459	.tokens_image: nullptr, // image tokens
460	.tokens_audio: nullptr, // audio tokens
461	};
462	cur.entries.insert(position: cur.entries.begin(), x: std::move(bos_chunk));
463	}
464	}
465
466	if (add_special && llama_vocab_get_add_eos(vocab)) {
467	// if last chunk is text, we add EOS token to it
468	add_text(tokens: {llama_vocab_eos(vocab)});
469	}
470
471	if (i_bm != bitmaps.size()) {
472	LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
473	__func__, bitmaps.size(), parts.size() - `1`);
474	return `1`;
475	}
476
477	*output = std::move(cur);
478
479	return `0`;
480	}
481
482	void add_text(const std::string & txt, bool parse_special) {
483	LOG_DBG("%s: %s\n", __func__, txt.c_str());
484	auto tokens = mtmd_tokenize_text_internal(vocab, text: txt, / add_special / false, parse_special);
485	add_text(tokens);
486	}
487
488	void add_text(const std::vector<llama_token> & tokens) {
489	if (tokens.empty()) {
490	return;
491	}
492	// if last entry is also a text chunk, add tokens to it instead of creating new chunk
493	if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
494	cur.entries.back().tokens_text.insert(
495	position: cur.entries.back().tokens_text.end(),
496	first: tokens.begin(),
497	last: tokens.end());
498	} else {
499	mtmd_input_chunk chunk{
500	.type: MTMD_INPUT_CHUNK_TYPE_TEXT,
501	.tokens_text: tokens,
502	.tokens_image: nullptr, // image tokens
503	.tokens_audio: nullptr, // audio tokens
504	};
505	cur.entries.emplace_back(args: std::move(chunk));
506	}
507	}
508
509	int32_t add_media(const mtmd_bitmap * bitmap) {
510	if (!bitmap->is_audio) {
511	// handle image
512
513	if (!ctx->ctx_v) {
514	LOG_ERR("%s: error: model does not support vision input\n", __func__);
515	return `2`;
516	}
517
518	if (!ctx->img_beg.empty()) {
519	add_text(txt: ctx->img_beg, parse_special: true); // add image begin token
520	}
521
522	// convert mtmd_bitmap to clip_image_u8
523	clip_image_u8_ptr img_u8(clip_image_u8_init());
524	img_u8 ->nx = bitmap->nx;
525	img_u8 ->ny = bitmap->ny;
526	img_u8 ->buf.resize(new_size: bitmap->data.size());
527	std::memcpy(dest: img_u8 ->buf.data(), src: bitmap->data.data(), n: img_u8 ->nx * img_u8 ->ny * `3`);
528
529	// preprocess image
530	clip_image_f32_batch batch_f32;
531	bool ok = clip_image_preprocess(ctx: ctx->ctx_v, img: img_u8.get(), res_imgs: &batch_f32);
532	if (!ok) {
533	LOG_ERR("Unable to preprocess image\n");
534	return `2`;
535	}
536
537	// handle llava-uhd style preprocessing
538	if (
539	ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
540	\|\| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
541	\|\| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
542	\|\| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
543	) {
544	const int n_col = batch_f32.grid_x;
545	const int n_row = batch_f32.grid_y;
546	// split batch into chunks of single images
547	// NOTE: batch_f32 will be invalidated after this call
548	auto chunks = split_batch_to_chunk(batch_f32: std::move(batch_f32), id: bitmap->id);
549	GGML_ASSERT(chunks.size() > `0`);
550
551	auto ov_chunk = std::move(chunks.front());
552	chunks.erase(position: chunks.begin());
553
554	// add overview image (first)
555	if (ctx->ov_img_first) {
556	add_text(tokens: ctx->tok_ov_img_start);
557	cur.entries.emplace_back(args: std::move(ov_chunk));
558	add_text(tokens: ctx->tok_ov_img_end);
559	}
560
561	// add slices (or tiles)
562	if (!chunks.empty()) {
563	GGML_ASSERT((int)chunks.size() == n_row * n_col);
564	add_text(tokens: ctx->tok_slices_start);
565	for (int y = `0`; y < n_row; y++) {
566	for (int x = `0`; x < n_col; x++) {
567	const bool is_last_in_row = (x == n_col - `1`);
568	if (!ctx->tok_sli_img_start.empty()) {
569	add_text(tokens: ctx->tok_sli_img_start);
570	} else if (!ctx->sli_img_start_tmpl.empty()) {
571	// If using a template to preceed a slice image
572	const size_t sz = std::snprintf(s: nullptr, maxlen: `0`, format: ctx->sli_img_start_tmpl.c_str(), y+`1`, x+`1`) + `1`;
573	std::unique_ptr<char[]> buf(new char[sz]);
574	std::snprintf(s: buf.get(), maxlen: sz, format: ctx->sli_img_start_tmpl.c_str(), y+`1`, x+`1`);
575	add_text(txt: std::string (buf.get(), buf.get() + sz - `1`), parse_special: true);
576	}
577	cur.entries.emplace_back(args: std::move(chunks [y * n_col + x]));
578	add_text(tokens: ctx->tok_sli_img_end);
579	if (!is_last_in_row) {
580	add_text(tokens: ctx->tok_sli_img_mid);
581	}
582	}
583	if ((y != n_row - `1` \|\| ctx->tok_row_end_trail)) {
584	add_text(tokens: ctx->tok_row_end);
585	}
586	}
587	add_text(tokens: ctx->tok_slices_end);
588	}
589
590	// add overview image (last)
591	if (!ctx->ov_img_first) {
592	add_text(tokens: ctx->tok_ov_img_start);
593	cur.entries.emplace_back(args: std::move(ov_chunk));
594	add_text(tokens: ctx->tok_ov_img_end);
595	}
596
597	} else {
598	size_t n_tokens = `0`;
599	for (const auto & entry : batch_f32.entries) {
600	n_tokens += clip_n_output_tokens(ctx: ctx->ctx_v, img: entry.get());
601	}
602
603	mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
604	if (ctx->use_mrope) {
605	// for Qwen2VL, we need this information for M-RoPE decoding positions
606	image_tokens ->nx = clip_n_output_tokens_x(ctx: ctx->ctx_v, img: batch_f32.entries [`0`].get());
607	image_tokens ->ny = clip_n_output_tokens_y(ctx: ctx->ctx_v, img: batch_f32.entries [`0`].get());
608	image_tokens ->use_mrope_pos = true;
609	} else {
610	// other models, we only need the total number of tokens
611	image_tokens ->nx = n_tokens;
612	image_tokens ->ny = `1`;
613	}
614	image_tokens ->batch_f32 = std::move(batch_f32);
615	image_tokens ->id = bitmap->id; // optional
616
617	LOG_DBG("image_tokens->nx = %d\n", image_tokens ->nx);
618	LOG_DBG("image_tokens->ny = %d\n", image_tokens ->ny);
619	LOG_DBG("batch_f32 size = %d\n", (int)image_tokens ->batch_f32.entries.size());
620
621	mtmd_input_chunk chunk{
622	.type: MTMD_INPUT_CHUNK_TYPE_IMAGE,
623	.tokens_text: {}, // text tokens
624	.tokens_image: std::move(image_tokens),
625	.tokens_audio: nullptr, // audio tokens
626	};
627	cur.entries.emplace_back(args: std::move(chunk));
628	}
629
630	if (!ctx->img_end.empty()) {
631	add_text(txt: ctx->img_end, parse_special: true); // add image end token
632	}
633
634	} else {
635	// handle audio
636
637	if (!ctx->ctx_a) {
638	LOG_ERR("%s: error: model does not support audio input\n", __func__);
639	return `2`;
640	}
641
642	if (bitmap->data.size() == `0`) {
643	LOG_ERR("%s: error: empty audio data\n", __func__);
644	return `2`;
645	}
646
647	if (!ctx->aud_beg.empty()) {
648	add_text(txt: ctx->aud_beg, parse_special: true); // add audio begin token
649	}
650
651	// preprocess audio
652	GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
653	std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
654	const float * samples = (const float *)bitmap->data.data();
655	size_t n_samples = bitmap->data.size() / sizeof(float);
656	bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, filters: ctx->w_filters, output&: mel_spec_chunks);
657	if (!ok) {
658	LOG_ERR("Unable to preprocess audio\n");
659	return `2`;
660	}
661
662	// consider each mel_spec as a separate audio chunk
663	// TODO: maybe support batching, but this may come with memory cost
664	for (auto & mel_spec : mel_spec_chunks) {
665	clip_image_f32_ptr mel_f32(clip_image_f32_init());
666	mel_f32 ->nx = mel_spec.n_len;
667	mel_f32 ->ny = mel_spec.n_mel;
668	mel_f32 ->buf = std::move(mel_spec.data);
669	size_t n_tokens = clip_n_output_tokens(ctx: ctx->ctx_a, img: mel_f32.get());
670
671	clip_image_f32_batch batch_f32;
672	batch_f32.is_audio = true;
673	batch_f32.entries.push_back(x: std::move(mel_f32));
674
675	mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
676	audio_tokens ->n_tokens = n_tokens;
677	audio_tokens ->batch_f32 = std::move(batch_f32);
678	audio_tokens ->id = bitmap->id; // optional
679
680	LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens ->n_tokens);
681
682	mtmd_input_chunk chunk{
683	.type: MTMD_INPUT_CHUNK_TYPE_AUDIO,
684	.tokens_text: {}, // text tokens
685	.tokens_image: nullptr, // image tokens
686	.tokens_audio: std::move(audio_tokens),
687	};
688	cur.entries.emplace_back(args: std::move(chunk));
689	}
690
691	if (!ctx->aud_end.empty()) {
692	add_text(txt: ctx->aud_end, parse_special: true); // add audio end token
693	}
694	}
695
696	return `0`;
697	}
698
699	std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
700	std::vector<mtmd_input_chunk> chunks;
701
702	for (auto & entry : batch_f32.entries) {
703	mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
704	image_tokens ->nx = clip_n_output_tokens(ctx: ctx->ctx_v, img: entry.get());
705	image_tokens ->ny = `1`;
706	image_tokens ->batch_f32.entries.push_back(x: std::move(entry));
707	image_tokens ->id = id;
708
709	mtmd_input_chunk chunk{
710	.type: MTMD_INPUT_CHUNK_TYPE_IMAGE,
711	.tokens_text: {}, // text tokens
712	.tokens_image: std::move(image_tokens),
713	.tokens_audio: nullptr, // audio tokens
714	};
715	chunks.emplace_back(args: std::move(chunk));
716	}
717
718	return chunks;
719	}
720
721	// for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
722	static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
723	std::vector<std::string> result;
724	if (input.empty()) {
725	return result;
726	}
727	size_t start = `0`;
728	size_t pos = `0`;
729	while ((pos = input.find(str: delimiter, pos: start)) != std::string::npos) {
730	if (pos > start) {
731	result.push_back(x: input.substr(pos: start, n: pos - start));
732	}
733	result.push_back(x: delimiter);
734	start = pos + delimiter.length();
735	}
736	if (start < input.length()) {
737	result.push_back(x: input.substr(pos: start));
738	}
739	return result;
740	}
741
742	// copied from common_tokenize
743	static std::vector<llama_token> mtmd_tokenize_text_internal(
744	const struct llama_vocab * vocab,
745	const std::string & text,
746	bool add_special,
747	bool parse_special) {
748	// upper limit for the number of tokens
749	int n_tokens = text.length() + `2` * add_special;
750	std::vector<llama_token> result(n_tokens);
751	n_tokens = llama_tokenize(vocab, text: text.data(), text_len: text.length(), tokens: result.data(), n_tokens_max: result.size(), add_special, parse_special);
752	if (n_tokens < `0`) {
753	result.resize(new_size: -n_tokens);
754	int check = llama_tokenize(vocab, text: text.data(), text_len: text.length(), tokens: result.data(), n_tokens_max: result.size(), add_special, parse_special);
755	GGML_ASSERT(check == -n_tokens);
756	} else {
757	result.resize(new_size: n_tokens);
758	}
759	return result;
760	}
761	};
762
763	int32_t mtmd_tokenize(mtmd_context * ctx,
764	mtmd_input_chunks * output,
765	const mtmd_input_text * text,
766	const mtmd_bitmap ** bitmaps,
767	size_t n_bitmaps) {
768	mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
769	return tokenizer.tokenize(output);
770	}
771
772	int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
773	if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
774	LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
775	return `0`;
776	} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
777	if (!ctx->ctx_v) {
778	LOG_ERR("%s: model does not support vision input\n", __func__);
779	return `1`;
780	}
781	return mtmd_encode(ctx, image_tokens: chunk->tokens_image.get());
782	} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
783	if (!ctx->ctx_a) {
784	LOG_ERR("%s: model does not support audio input\n", __func__);
785	return `1`;
786	}
787	int n_mmproj_embd = ctx->n_embd_text;
788	ctx->image_embd_v.resize(new_size: chunk->tokens_audio ->n_tokens * n_mmproj_embd);
789	bool ok = clip_image_batch_encode(
790	ctx: ctx->ctx_a,
791	n_threads: ctx->n_threads,
792	imgs: &chunk->tokens_audio ->batch_f32,
793	vec: ctx->image_embd_v.data());
794	return ok ? `0` : `1`;
795	}
796
797	LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
798	return `1`;
799	}
800
801	int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
802	clip_ctx * ctx_clip = ctx->ctx_v;
803	if (!ctx_clip) {
804	LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
805	return `1`;
806	}
807	int n_mmproj_embd = clip_n_mmproj_embd(ctx: ctx_clip);
808	ctx->image_embd_v.resize(new_size: image_tokens->n_tokens() * n_mmproj_embd);
809	bool ok = false;
810
811	if (clip_is_llava(ctx: ctx_clip)
812	\|\| clip_is_minicpmv(ctx: ctx_clip)
813	\|\| clip_is_glm(ctx: ctx_clip)) {
814	// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
815	const auto & entries = image_tokens->batch_f32.entries;
816	for (size_t i = `0`; i < entries.size(); i++) {
817	int n_tokens_per_image = clip_n_output_tokens(ctx: ctx_clip, img: entries [i].get());
818	ok = clip_image_encode(
819	ctx: ctx_clip,
820	n_threads: ctx->n_threads,
821	img: entries [i].get(),
822	vec: ctx->image_embd_v.data() + in_mmproj_embdn_tokens_per_image);
823	}
824	} else {
825	ok = clip_image_batch_encode(
826	ctx: ctx_clip,
827	n_threads: ctx->n_threads,
828	imgs: &image_tokens->batch_f32,
829	vec: ctx->image_embd_v.data());
830	}
831
832	return ok ? `0` : `1`;
833	}
834
835	float * mtmd_get_output_embd(mtmd_context * ctx) {
836	return ctx->image_embd_v.data();
837	}
838
839	bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
840	if (ctx->ctx_v && clip_get_projector_type(ctx: ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
841	return true;
842	}
843	return false;
844	}
845
846	bool mtmd_decode_use_mrope(mtmd_context * ctx) {
847	return ctx->use_mrope;
848	}
849
850	bool mtmd_support_vision(mtmd_context * ctx) {
851	return ctx->ctx_v != nullptr;
852	}
853
854	bool mtmd_support_audio(mtmd_context * ctx) {
855	return ctx->ctx_a != nullptr;
856	}
857
858	int mtmd_get_audio_bitrate(mtmd_context * ctx) {
859	if (!ctx->ctx_a) {
860	return -`1`;
861	}
862	// for now, we assume that all audio models have the same bitrate
863	return `16000`; // 16kHz
864	}
865
866	//
867	// public API functions
868	//
869
870	// mtmd_bitmap
871
872	mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
873	uint32_t ny,
874	const unsigned char * data) {
875	mtmd_bitmap * bitmap = new mtmd_bitmap;
876	bitmap->nx = nx;
877	bitmap->ny = ny;
878	size_t data_size = (size_t)nx * ny * `3`;
879	bitmap->data.resize(new_size: data_size);
880	std::memcpy(dest: bitmap->data.data(), src: data, n: data_size);
881	return bitmap;
882	}
883
884	mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
885	const float * data) {
886	mtmd_bitmap * bitmap = new mtmd_bitmap;
887	bitmap->nx = n_samples;
888	bitmap->ny = `1`;
889	bitmap->is_audio = true;
890	size_t data_size = n_samples * sizeof(float);
891	bitmap->data.resize(new_size: data_size);
892	std::memcpy(dest: bitmap->data.data(), src: data, n: data_size);
893	return bitmap;
894	}
895
896	uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
897	return bitmap->nx;
898	}
899
900	uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
901	return bitmap->ny;
902	}
903
904	const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
905	return bitmap->data.data();
906	}
907
908	size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
909	return bitmap->data.size();
910	}
911
912	bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
913	return bitmap->is_audio;
914	}
915
916	const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
917	return bitmap->id.c_str();
918	}
919
920	void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
921	if (id) {
922	bitmap->id = std::string (id);
923	} else {
924	bitmap->id.clear();
925	}
926	}
927
928	void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
929	if (bitmap) {
930	delete bitmap;
931	}
932	}
933
934	// mtmd_input_chunks
935
936	mtmd_input_chunks * mtmd_input_chunks_init() {
937	return new mtmd_input_chunks;
938	}
939
940	size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks) {
941	return chunks->entries.size();
942	}
943
944	const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx) {
945	if (idx >= chunks->entries.size()) {
946	return nullptr;
947	}
948	return &chunks->entries [idx];
949	}
950
951	void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
952	if (chunks) {
953	delete chunks;
954	}
955	}
956
957	// mtmd_input_chunk
958
959	enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk) {
960	return chunk->type;
961	}
962
963	const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output) {
964	if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
965	*n_tokens_output = chunk->tokens_text.size();
966	return chunk->tokens_text.data();
967	}
968	*n_tokens_output = `0`;
969	return nullptr;
970	}
971
972	const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk) {
973	if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
974	return chunk->tokens_image.get();
975	}
976	return nullptr;
977	}
978
979	size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
980	if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
981	return chunk->tokens_text.size();
982	} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
983	return mtmd_image_tokens_get_n_tokens(image_tokens: chunk->tokens_image.get());
984	} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
985	return chunk->tokens_audio ->n_tokens;
986	} else {
987	GGML_ABORT("invalid chunk type");
988	}
989	}
990
991	llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
992	if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
993	return chunk->tokens_text.size();
994	} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
995	return mtmd_image_tokens_get_n_pos(image_tokens: chunk->tokens_image.get());
996	} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
997	return chunk->tokens_audio ->n_tokens;
998	} else {
999	GGML_ABORT("invalid chunk type");
1000	}
1001	}
1002
1003	const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
1004	if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
1005	return chunk->tokens_image ->id.c_str();
1006	} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
1007	return chunk->tokens_audio ->id.c_str();
1008	}
1009	return nullptr;
1010	}
1011
1012	mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
1013	mtmd_input_chunk * copy = new mtmd_input_chunk{
1014	.type: chunk->type,
1015	.tokens_text: chunk->tokens_text,
1016	.tokens_image: nullptr,
1017	.tokens_audio: nullptr,
1018	};
1019	if (chunk->tokens_image) {
1020	// copy the image tokens
1021	copy->tokens_image = mtmd_image_tokens_ptr (new mtmd_image_tokens ());
1022	*copy->tokens_image = chunk->tokens_image ->clone();
1023	}
1024	if (chunk->tokens_audio) {
1025	// copy the audio tokens
1026	copy->tokens_audio = mtmd_audio_tokens_ptr (new mtmd_audio_tokens ());
1027	*copy->tokens_audio = chunk->tokens_audio ->clone();
1028	}
1029	return copy;
1030	}
1031
1032	void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
1033	if (chunk) {
1034	delete chunk;
1035	}
1036	}
1037
1038	// mtmd_image_tokens
1039
1040	size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
1041	return image_tokens->n_tokens();
1042	}
1043
1044	size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens) {
1045	return image_tokens->nx;
1046	}
1047
1048	size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
1049	return image_tokens->ny;
1050	}
1051
1052	const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
1053	return image_tokens->id.c_str();
1054	}
1055
1056	llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
1057	if (image_tokens->use_mrope_pos) {
1058	// for M-RoPE, temporal dimension = max(t,h,w)
1059	// t is omitted as we don't support video input
1060	return std::max(a: image_tokens->nx, b: image_tokens->ny);
1061	}
1062	return image_tokens->n_tokens();
1063	}
1064
1065	// test function
1066
1067	mtmd_input_chunks * mtmd_test_create_input_chunks() {
1068	mtmd_input_chunks * chunks = mtmd_input_chunks_init();
1069	if (!chunks) {
1070	return nullptr;
1071	}
1072
1073	// create a text chunk
1074	std::vector<llama_token> tokens_text = { `1`, `2`, `3`, `4`, `5` };
1075	mtmd_input_chunk chunk_text{
1076	.type: MTMD_INPUT_CHUNK_TYPE_TEXT,
1077	.tokens_text: std::move(tokens_text),
1078	.tokens_image: nullptr, // image tokens
1079	.tokens_audio: nullptr, // audio tokens
1080	};
1081	chunks->entries.emplace_back(args: std::move(chunk_text));
1082
1083	// create an image chunk
1084	mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
1085	image_tokens ->nx = `4`;
1086	image_tokens ->ny = `4`;
1087	image_tokens ->batch_f32.entries.resize(new_size: `16`);
1088	image_tokens ->id = "image_1";
1089	mtmd_input_chunk chunk_image{
1090	.type: MTMD_INPUT_CHUNK_TYPE_IMAGE,
1091	.tokens_text: {}, // text tokens
1092	.tokens_image: std::move(image_tokens),
1093	.tokens_audio: nullptr, // audio tokens
1094	};
1095	chunks->entries.emplace_back(args: std::move(chunk_image));
1096
1097	return chunks;
1098	}
1099

Browse the source code of llama.cpp/tools/mtmd/mtmd.cpp