mtmd.h source code [llama.cpp/tools/mtmd/mtmd.h]

1	#ifndef MTMD_H
2	#define MTMD_H
3
4	#include "ggml.h"
5	#include "llama.h"
6
7	#include <stddef.h>
8	#include <stdint.h>
9	#include <stdbool.h>
10
11	#ifdef __cplusplus
12	#include <string>
13	#include <vector>
14	#include <cinttypes>
15	#include <memory>
16	#endif
17
18	/**
19	* libmtmd: A library for multimodal support in llama.cpp.
20	*
21	* WARNING: This API is experimental and subject to many BREAKING CHANGES.
22	* Issues related to API usage may receive lower priority support.
23	*
24	* For the usage, see an example in mtmd-cli.cpp
25	*/
26
// MTMD_API: decoration placed in front of every public libmtmd function.
//   - LLAMA_SHARED not defined: expands to nothing
//   - LLAMA_SHARED on Windows (MinGW excluded): dllexport when LLAMA_BUILD is
//     defined, dllimport otherwise
//   - LLAMA_SHARED on every other toolchain: default symbol visibility attribute
#if !defined(LLAMA_SHARED)
#    define MTMD_API
#elif defined(_WIN32) && !defined(__MINGW32__)
#    if defined(LLAMA_BUILD)
#        define MTMD_API __declspec(dllexport)
#    else
#        define MTMD_API __declspec(dllimport)
#    endif
#else
#    define MTMD_API __attribute__ ((visibility ("default")))
#endif
40
41	// deprecated marker, use mtmd_default_marker() instead
42	#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
43
44	#ifdef __cplusplus
45	extern "C" {
46	#endif
47
// Kind of payload carried by one mtmd_input_chunk.
// The values are spelled out explicitly; they match the implicit numbering.
enum mtmd_input_chunk_type {
    MTMD_INPUT_CHUNK_TYPE_TEXT  = 0, // text tokens
    MTMD_INPUT_CHUNK_TYPE_IMAGE = 1, // image tokens
    MTMD_INPUT_CHUNK_TYPE_AUDIO = 2, // audio tokens
};
53
// opaque types
// (forward declarations only: this header never defines their layout, so they
//  are always handled through pointers and the functions declared below)
struct mtmd_context;      // library context, created by mtmd_init_from_file()
struct mtmd_bitmap;       // raw image or audio input data
struct mtmd_image_tokens; // tokens of a media chunk, constructed via mtmd_tokenize()
struct mtmd_input_chunk;  // a single chunk, constructed via mtmd_tokenize()
struct mtmd_input_chunks; // a list of mtmd_input_chunk
60
// Text part of a prompt, handed to mtmd_tokenize().
struct mtmd_input_text {
    // prompt string; the media markers it contains are replaced by media chunks
    const char * text;
    // NOTE(review): presumably forwarded to the text tokenizer as its
    // "add special tokens" switch - confirm against the implementation
    bool add_special;
    // NOTE(review): presumably forwarded to the text tokenizer as its
    // "parse special tokens" switch - confirm against the implementation
    bool parse_special;
};
66
67	//
68	// C API
69	//
70
// Typedef every struct to its own tag name, so that C code can refer to the
// types without repeating the `struct` keyword.
typedef struct mtmd_context      mtmd_context;
typedef struct mtmd_bitmap       mtmd_bitmap;
typedef struct mtmd_image_tokens mtmd_image_tokens;
typedef struct mtmd_input_chunk  mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;
typedef struct mtmd_input_text   mtmd_input_text;
77
78	struct mtmd_context_params {
79	bool use_gpu;
80	bool print_timings;
81	int n_threads;
82	enum ggml_log_level verbosity;
83	const char * image_marker; // deprecated, use media_marker instead
84	const char * media_marker;
85	enum llama_flash_attn_type flash_attn_type;
86
87	// limit number of image tokens, only for vision models with dynamic resolution
88	int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
89	int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
90	};
91
92	MTMD_API const char * mtmd_default_marker(void);
93
94	MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
95
96	// initialize the mtmd context
97	// return nullptr on failure
98	MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
99	const struct llama_model * text_model,
100	const struct mtmd_context_params ctx_params);
101
102	MTMD_API void mtmd_free(mtmd_context * ctx);
103
104	// whether we need to set non-causal mask before llama_decode
105	MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
106
107	// whether the current model use M-RoPE for llama_decode
108	MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
109
110	// whether the current model supports vision input
111	MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
112
113	// whether the current model supports audio input
114	MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
115
116	// get audio bitrate in Hz, for example 16000 for Whisper
117	// return -1 if audio is not supported
118	MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
119
120	// mtmd_bitmap
121	//
122	// if bitmap is image:
123	// length of data must be nx ny * 3*
124	// the data is in RGBRGBRGB... format
125	// if bitmap is audio:
126	// length of data must be n_samples sizeof(float)*
127	// the data is in float format (PCM F32)
128	MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
129	MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
130	MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
131	MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
132	MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
133	MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
134	MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
135	MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
136	// bitmap ID is optional, but useful for KV cache tracking
137	// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
138	MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
139	MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
140
141
142	// mtmd_input_chunks
143	//
144	// this is simply a list of mtmd_input_chunk
145	// the elements can only be populated via mtmd_tokenize()
146	MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
147	MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
148	MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
149	MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
150
151	// mtmd_input_chunk
152	//
153	// the instance will be constructed via mtmd_tokenize()
154	// it will be freed along with mtmd_input_chunks
155	MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
156	MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
157	MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
158	MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
159	// returns nullptr for ID on text chunk
160	MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
161	// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
162	MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
163
164	// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
165	// you can move the chunk ownership to your own code by copying it
166	// remember to free the chunk when you are done with it
167	MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
168	MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
169
170
171	// mtmd_image_tokens
172	//
173	// the instance will be constructed via mtmd_tokenize()
174	// it will be freed along with mtmd_input_chunk
175	MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
176	MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
177	MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
178	MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
179	// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
180	MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
181
182	// tokenize an input text prompt and a list of bitmaps (images/audio)
183	// the prompt must have the input image marker (default: "<__media__>") in it
184	// the default marker is defined by mtmd_default_marker()
185	// the marker will be replaced with the image/audio chunk
186	// for example:
187	// "here is an image: <__media__>\ndescribe it in detail."
188	// this will gives 3 chunks:
189	// 1. "here is an image: <start_of_image>"
190	// 2. (image/audio tokens)
191	// 3. "<end_of_image>\ndescribe it in detail."
192	// number of bitmaps must be equal to the number of markers in the prompt
193	// this function is thread-safe (shared ctx)
194	// return values:
195	// 0 on success
196	// 1 on number of bitmaps not matching the number of markers
197	// 2 on image preprocessing error
198	MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
199	mtmd_input_chunks * output,
200	const mtmd_input_text * text,
201	const mtmd_bitmap ** bitmaps,
202	size_t n_bitmaps);
203
204	// returns 0 on success
205	// TODO: deprecate
206	MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
207	const mtmd_image_tokens * image_tokens);
208
209	// returns 0 on success
210	MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
211	const mtmd_input_chunk * chunk);
212
213	// get output embeddings from the last encode pass
214	// the reading size (in bytes) is equal to:
215	// llama_model_n_embd(model) mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)*
216	MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
217
218	/////////////////////////////////////////
219
220	// test function, to be used in test-mtmd-c-api.c
221	MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
222
223	#ifdef __cplusplus
224	} // extern "C"
225	#endif
226
227	//
228	// C++ wrappers
229	//
230
231	#ifdef __cplusplus
232
233	namespace mtmd {
234
235	struct mtmd_context_deleter {
236	void operator()(mtmd_context * val) { mtmd_free(val); }
237	};
238	using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
239
240	struct mtmd_bitmap_deleter {
241	void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
242	};
243	using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
244
245	struct mtmd_input_chunks_deleter {
246	void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
247	};
248	using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
249
250	struct mtmd_input_chunk_deleter {
251	void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
252	};
253	using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
254
255	struct bitmap {
256	bitmap_ptr ptr;
257	bitmap() : ptr(nullptr) {}
258	bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
259	bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
260	bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
261	ptr.reset(mtmd_bitmap_init(nx, ny, data));
262	}
263	~bitmap() = default;
264	uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
265	uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
266	const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
267	size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
268	std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
269	void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
270	};
271
272	struct bitmaps {
273	std::vector<bitmap> entries;
274	~bitmaps() = default;
275	// return list of pointers to mtmd_bitmap
276	// example:
277	// auto bitmaps_c_ptr = bitmaps.c_ptr();
278	// int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
279	std::vector<const mtmd_bitmap *> c_ptr() {
280	std::vector<const mtmd_bitmap *> res(entries.size());
281	for (size_t i = `0`; i < entries.size(); i++) {
282	res[i] = entries[i].ptr.get();
283	}
284	return res;
285	}
286	};
287
288	struct input_chunks {
289	input_chunks_ptr ptr;
290	input_chunks() = default;
291	input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
292	~input_chunks() = default;
293	size_t size() { return mtmd_input_chunks_size(ptr.get()); }
294	const mtmd_input_chunk * operator[](size_t idx) {
295	return mtmd_input_chunks_get(ptr.get(), idx);
296	}
297	};
298
299	} // namespace mtmd
300
301	#endif
302
303	#endif
304

Browse the source code of llama.cpp/tools/mtmd/mtmd.h