1#ifndef MTMD_H
2#define MTMD_H
3
4#include "ggml.h"
5#include "llama.h"
6
7#include <stddef.h>
8#include <stdint.h>
9#include <stdbool.h>
10
11#ifdef __cplusplus
12#include <string>
13#include <vector>
14#include <cinttypes>
15#include <memory>
16#endif
17
18/**
19 * libmtmd: A library for multimodal support in llama.cpp.
20 *
21 * WARNING: This API is experimental and subject to many BREAKING CHANGES.
22 * Issues related to API usage may receive lower priority support.
23 *
24 * For the usage, see an example in mtmd-cli.cpp
25 */
26
27#ifdef LLAMA_SHARED
28# if defined(_WIN32) && !defined(__MINGW32__)
29# ifdef LLAMA_BUILD
30# define MTMD_API __declspec(dllexport)
31# else
32# define MTMD_API __declspec(dllimport)
33# endif
34# else
35# define MTMD_API __attribute__ ((visibility ("default")))
36# endif
37#else
38# define MTMD_API
39#endif
40
41// deprecated marker, use mtmd_default_marker() instead
42#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
43
44#ifdef __cplusplus
45extern "C" {
46#endif
47
// Kind of payload carried by one mtmd_input_chunk.
// The numeric values are the ones the compiler would assign implicitly;
// they are written out so the mapping is visible at a glance.
enum mtmd_input_chunk_type {
    MTMD_INPUT_CHUNK_TYPE_TEXT  = 0,
    MTMD_INPUT_CHUNK_TYPE_IMAGE = 1,
    MTMD_INPUT_CHUNK_TYPE_AUDIO = 2,
};
53
// Opaque handle types.
// Their layout is not defined in this header; the API only exchanges pointers to them.
// Each typedef also introduces the struct tag, so both `struct mtmd_context`
// and plain `mtmd_context` name the same type.
typedef struct mtmd_context      mtmd_context;
typedef struct mtmd_bitmap       mtmd_bitmap;
typedef struct mtmd_image_tokens mtmd_image_tokens;
typedef struct mtmd_input_chunk  mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;

// Text prompt handed to mtmd_tokenize().
typedef struct mtmd_input_text {
    const char * text;
    // NOTE(review): the two flags are presumably forwarded to the text tokenizer
    // (add special tokens / parse special tokens) - confirm against the implementation
    bool add_special;
    bool parse_special;
} mtmd_input_text;

//
// C API
//
77
78struct mtmd_context_params {
79 bool use_gpu;
80 bool print_timings;
81 int n_threads;
82 enum ggml_log_level verbosity;
83 const char * image_marker; // deprecated, use media_marker instead
84 const char * media_marker;
85 enum llama_flash_attn_type flash_attn_type;
86
87 // limit number of image tokens, only for vision models with dynamic resolution
88 int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
89 int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
90};
91
92MTMD_API const char * mtmd_default_marker(void);
93
94MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
95
96// initialize the mtmd context
97// return nullptr on failure
98MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
99 const struct llama_model * text_model,
100 const struct mtmd_context_params ctx_params);
101
102MTMD_API void mtmd_free(mtmd_context * ctx);
103
104// whether we need to set non-causal mask before llama_decode
105MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
106
107// whether the current model use M-RoPE for llama_decode
108MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
109
110// whether the current model supports vision input
111MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
112
113// whether the current model supports audio input
114MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
115
116// get audio bitrate in Hz, for example 16000 for Whisper
117// return -1 if audio is not supported
118MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
119
120// mtmd_bitmap
121//
122// if bitmap is image:
123// length of data must be nx * ny * 3
124// the data is in RGBRGBRGB... format
125// if bitmap is audio:
126// length of data must be n_samples * sizeof(float)
127// the data is in float format (PCM F32)
128MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
129MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
130MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
131MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
132MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
133MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
134MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
135MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
136// bitmap ID is optional, but useful for KV cache tracking
137// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
138MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
139MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
140
141
142// mtmd_input_chunks
143//
144// this is simply a list of mtmd_input_chunk
145// the elements can only be populated via mtmd_tokenize()
146MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
147MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
148MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
149MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
150
151// mtmd_input_chunk
152//
153// the instance will be constructed via mtmd_tokenize()
154// it will be freed along with mtmd_input_chunks
155MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
156MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
157MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
158MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
159// returns nullptr for ID on text chunk
160MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
161// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
162MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
163
164// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
165// you can move the chunk ownership to your own code by copying it
166// remember to free the chunk when you are done with it
167MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
168MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
169
170
171// mtmd_image_tokens
172//
173// the instance will be constructed via mtmd_tokenize()
174// it will be freed along with mtmd_input_chunk
175MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
176MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
177MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
178MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
179// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
180MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
181
182// tokenize an input text prompt and a list of bitmaps (images/audio)
183// the prompt must have the input image marker (default: "<__media__>") in it
184// the default marker is defined by mtmd_default_marker()
185// the marker will be replaced with the image/audio chunk
186// for example:
187// "here is an image: <__media__>\ndescribe it in detail."
188// this will gives 3 chunks:
189// 1. "here is an image: <start_of_image>"
190// 2. (image/audio tokens)
191// 3. "<end_of_image>\ndescribe it in detail."
192// number of bitmaps must be equal to the number of markers in the prompt
193// this function is thread-safe (shared ctx)
194// return values:
195// 0 on success
196// 1 on number of bitmaps not matching the number of markers
197// 2 on image preprocessing error
198MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
199 mtmd_input_chunks * output,
200 const mtmd_input_text * text,
201 const mtmd_bitmap ** bitmaps,
202 size_t n_bitmaps);
203
204// returns 0 on success
205// TODO: deprecate
206MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
207 const mtmd_image_tokens * image_tokens);
208
209// returns 0 on success
210MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
211 const mtmd_input_chunk * chunk);
212
213// get output embeddings from the last encode pass
214// the reading size (in bytes) is equal to:
215// llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
216MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
217
218/////////////////////////////////////////
219
220// test function, to be used in test-mtmd-c-api.c
221MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
222
223#ifdef __cplusplus
224} // extern "C"
225#endif
226
227//
228// C++ wrappers
229//
230
231#ifdef __cplusplus
232
233namespace mtmd {
234
235struct mtmd_context_deleter {
236 void operator()(mtmd_context * val) { mtmd_free(val); }
237};
238using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
239
240struct mtmd_bitmap_deleter {
241 void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
242};
243using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
244
245struct mtmd_input_chunks_deleter {
246 void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
247};
248using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
249
250struct mtmd_input_chunk_deleter {
251 void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
252};
253using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
254
255struct bitmap {
256 bitmap_ptr ptr;
257 bitmap() : ptr(nullptr) {}
258 bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
259 bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
260 bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
261 ptr.reset(mtmd_bitmap_init(nx, ny, data));
262 }
263 ~bitmap() = default;
264 uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
265 uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
266 const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
267 size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
268 std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
269 void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
270};
271
272struct bitmaps {
273 std::vector<bitmap> entries;
274 ~bitmaps() = default;
275 // return list of pointers to mtmd_bitmap
276 // example:
277 // auto bitmaps_c_ptr = bitmaps.c_ptr();
278 // int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
279 std::vector<const mtmd_bitmap *> c_ptr() {
280 std::vector<const mtmd_bitmap *> res(entries.size());
281 for (size_t i = 0; i < entries.size(); i++) {
282 res[i] = entries[i].ptr.get();
283 }
284 return res;
285 }
286};
287
288struct input_chunks {
289 input_chunks_ptr ptr;
290 input_chunks() = default;
291 input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
292 ~input_chunks() = default;
293 size_t size() { return mtmd_input_chunks_size(ptr.get()); }
294 const mtmd_input_chunk * operator[](size_t idx) {
295 return mtmd_input_chunks_get(ptr.get(), idx);
296 }
297};
298
299} // namespace mtmd
300
301#endif
302
303#endif
304