| 1 | #ifndef MTMD_H |
| 2 | #define MTMD_H |
| 3 | |
| 4 | #include "ggml.h" |
| 5 | #include "llama.h" |
| 6 | |
| 7 | #include <stddef.h> |
| 8 | #include <stdint.h> |
| 9 | #include <stdbool.h> |
| 10 | |
| 11 | #ifdef __cplusplus |
| 12 | #include <string> |
| 13 | #include <vector> |
| 14 | #include <cinttypes> |
| 15 | #include <memory> |
| 16 | #endif |
| 17 | |
| 18 | /** |
| 19 | * libmtmd: A library for multimodal support in llama.cpp. |
| 20 | * |
| 21 | * WARNING: This API is experimental and subject to many BREAKING CHANGES. |
| 22 | * Issues related to API usage may receive lower priority support. |
| 23 | * |
| 24 | * For the usage, see an example in mtmd-cli.cpp |
| 25 | */ |
| 26 | |
// MTMD_API decorates every symbol that is part of the public libmtmd interface.
// It only expands to something when LLAMA_SHARED is defined:
//   - Windows (non-MinGW): dllexport when LLAMA_BUILD is defined, dllimport otherwise
//   - everything else:     default symbol visibility
#if !defined(LLAMA_SHARED)
#    define MTMD_API
#elif defined(_WIN32) && !defined(__MINGW32__)
#    if defined(LLAMA_BUILD)
#        define MTMD_API __declspec(dllexport)
#    else
#        define MTMD_API __declspec(dllimport)
#    endif
#else
#    define MTMD_API __attribute__ ((visibility ("default")))
#endif
| 40 | |
// deprecated: kept for existing users only, new code should call mtmd_default_marker() instead
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
| 43 | |
| 44 | #ifdef __cplusplus |
| 45 | extern "C" { |
| 46 | #endif |
| 47 | |
// kind of content held by a mtmd_input_chunk
// (values are spelled out explicitly; they match the implicit numbering 0, 1, 2)
enum mtmd_input_chunk_type {
    MTMD_INPUT_CHUNK_TYPE_TEXT  = 0,
    MTMD_INPUT_CHUNK_TYPE_IMAGE = 1,
    MTMD_INPUT_CHUNK_TYPE_AUDIO = 2,
};
| 53 | |
// opaque types
// each line both declares the (incomplete) struct and gives it a tag-less alias,
// so the same spelling works from C and from C++
typedef struct mtmd_context      mtmd_context;
typedef struct mtmd_bitmap       mtmd_bitmap;
typedef struct mtmd_image_tokens mtmd_image_tokens;
typedef struct mtmd_input_chunk  mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;

// text prompt handed to mtmd_tokenize()
typedef struct mtmd_input_text {
    const char * text;
    bool add_special;   // NOTE(review): presumably forwarded to the text tokenizer - confirm
    bool parse_special; // NOTE(review): presumably forwarded to the text tokenizer - confirm
} mtmd_input_text;

//
// C API
//
| 77 | |
| 78 | struct mtmd_context_params { |
| 79 | bool use_gpu; |
| 80 | bool print_timings; |
| 81 | int n_threads; |
| 82 | enum ggml_log_level verbosity; |
| 83 | const char * image_marker; // deprecated, use media_marker instead |
| 84 | const char * media_marker; |
| 85 | enum llama_flash_attn_type flash_attn_type; |
| 86 | |
| 87 | // limit number of image tokens, only for vision models with dynamic resolution |
| 88 | int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) |
| 89 | int image_max_tokens; // maximum number of tokens for image input (default: read from metadata) |
| 90 | }; |
| 91 | |
| 92 | MTMD_API const char * mtmd_default_marker(void); |
| 93 | |
| 94 | MTMD_API struct mtmd_context_params mtmd_context_params_default(void); |
| 95 | |
| 96 | // initialize the mtmd context |
| 97 | // return nullptr on failure |
| 98 | MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, |
| 99 | const struct llama_model * text_model, |
| 100 | const struct mtmd_context_params ctx_params); |
| 101 | |
| 102 | MTMD_API void mtmd_free(mtmd_context * ctx); |
| 103 | |
| 104 | // whether we need to set non-causal mask before llama_decode |
| 105 | MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); |
| 106 | |
| 107 | // whether the current model use M-RoPE for llama_decode |
| 108 | MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); |
| 109 | |
| 110 | // whether the current model supports vision input |
| 111 | MTMD_API bool mtmd_support_vision(mtmd_context * ctx); |
| 112 | |
| 113 | // whether the current model supports audio input |
| 114 | MTMD_API bool mtmd_support_audio(mtmd_context * ctx); |
| 115 | |
| 116 | // get audio bitrate in Hz, for example 16000 for Whisper |
| 117 | // return -1 if audio is not supported |
| 118 | MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx); |
| 119 | |
| 120 | // mtmd_bitmap |
| 121 | // |
| 122 | // if bitmap is image: |
| 123 | // length of data must be nx * ny * 3 |
| 124 | // the data is in RGBRGBRGB... format |
| 125 | // if bitmap is audio: |
| 126 | // length of data must be n_samples * sizeof(float) |
| 127 | // the data is in float format (PCM F32) |
| 128 | MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); |
| 129 | MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); |
| 130 | MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); |
| 131 | MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap); |
| 132 | MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap); |
| 133 | MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap); |
| 134 | MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap); |
| 135 | MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); |
| 136 | // bitmap ID is optional, but useful for KV cache tracking |
| 137 | // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data() |
| 138 | MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); |
| 139 | MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id); |
| 140 | |
| 141 | |
| 142 | // mtmd_input_chunks |
| 143 | // |
| 144 | // this is simply a list of mtmd_input_chunk |
| 145 | // the elements can only be populated via mtmd_tokenize() |
| 146 | MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); |
| 147 | MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); |
| 148 | MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx); |
| 149 | MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); |
| 150 | |
| 151 | // mtmd_input_chunk |
| 152 | // |
| 153 | // the instance will be constructed via mtmd_tokenize() |
| 154 | // it will be freed along with mtmd_input_chunks |
| 155 | MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk); |
| 156 | MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output); |
| 157 | MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk); |
| 158 | MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk); |
| 159 | // returns nullptr for ID on text chunk |
| 160 | MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk); |
| 161 | // number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise) |
| 162 | MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk); |
| 163 | |
| 164 | // in case you want to use custom logic to handle the chunk (i.e. KV cache management) |
| 165 | // you can move the chunk ownership to your own code by copying it |
| 166 | // remember to free the chunk when you are done with it |
| 167 | MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); |
| 168 | MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); |
| 169 | |
| 170 | |
| 171 | // mtmd_image_tokens |
| 172 | // |
| 173 | // the instance will be constructed via mtmd_tokenize() |
| 174 | // it will be freed along with mtmd_input_chunk |
| 175 | MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate |
| 176 | MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); |
| 177 | MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); |
| 178 | MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate |
| 179 | // number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise) |
| 180 | MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate |
| 181 | |
| 182 | // tokenize an input text prompt and a list of bitmaps (images/audio) |
| 183 | // the prompt must have the input image marker (default: "<__media__>") in it |
| 184 | // the default marker is defined by mtmd_default_marker() |
| 185 | // the marker will be replaced with the image/audio chunk |
| 186 | // for example: |
| 187 | // "here is an image: <__media__>\ndescribe it in detail." |
| 188 | // this will gives 3 chunks: |
| 189 | // 1. "here is an image: <start_of_image>" |
| 190 | // 2. (image/audio tokens) |
| 191 | // 3. "<end_of_image>\ndescribe it in detail." |
| 192 | // number of bitmaps must be equal to the number of markers in the prompt |
| 193 | // this function is thread-safe (shared ctx) |
| 194 | // return values: |
| 195 | // 0 on success |
| 196 | // 1 on number of bitmaps not matching the number of markers |
| 197 | // 2 on image preprocessing error |
| 198 | MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, |
| 199 | mtmd_input_chunks * output, |
| 200 | const mtmd_input_text * text, |
| 201 | const mtmd_bitmap ** bitmaps, |
| 202 | size_t n_bitmaps); |
| 203 | |
| 204 | // returns 0 on success |
| 205 | // TODO: deprecate |
| 206 | MTMD_API int32_t mtmd_encode(mtmd_context * ctx, |
| 207 | const mtmd_image_tokens * image_tokens); |
| 208 | |
| 209 | // returns 0 on success |
| 210 | MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, |
| 211 | const mtmd_input_chunk * chunk); |
| 212 | |
| 213 | // get output embeddings from the last encode pass |
| 214 | // the reading size (in bytes) is equal to: |
| 215 | // llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) |
| 216 | MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); |
| 217 | |
| 218 | ///////////////////////////////////////// |
| 219 | |
| 220 | // test function, to be used in test-mtmd-c-api.c |
| 221 | MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); |
| 222 | |
| 223 | #ifdef __cplusplus |
| 224 | } // extern "C" |
| 225 | #endif |
| 226 | |
| 227 | // |
| 228 | // C++ wrappers |
| 229 | // |
| 230 | |
| 231 | #ifdef __cplusplus |
| 232 | |
| 233 | namespace mtmd { |
| 234 | |
| 235 | struct mtmd_context_deleter { |
| 236 | void operator()(mtmd_context * val) { mtmd_free(val); } |
| 237 | }; |
| 238 | using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>; |
| 239 | |
| 240 | struct mtmd_bitmap_deleter { |
| 241 | void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); } |
| 242 | }; |
| 243 | using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>; |
| 244 | |
| 245 | struct mtmd_input_chunks_deleter { |
| 246 | void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); } |
| 247 | }; |
| 248 | using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>; |
| 249 | |
| 250 | struct mtmd_input_chunk_deleter { |
| 251 | void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); } |
| 252 | }; |
| 253 | using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>; |
| 254 | |
| 255 | struct bitmap { |
| 256 | bitmap_ptr ptr; |
| 257 | bitmap() : ptr(nullptr) {} |
| 258 | bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {} |
| 259 | bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {} |
| 260 | bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) { |
| 261 | ptr.reset(mtmd_bitmap_init(nx, ny, data)); |
| 262 | } |
| 263 | ~bitmap() = default; |
| 264 | uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); } |
| 265 | uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); } |
| 266 | const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); } |
| 267 | size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); } |
| 268 | std::string id() { return mtmd_bitmap_get_id(ptr.get()); } |
| 269 | void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); } |
| 270 | }; |
| 271 | |
| 272 | struct bitmaps { |
| 273 | std::vector<bitmap> entries; |
| 274 | ~bitmaps() = default; |
| 275 | // return list of pointers to mtmd_bitmap |
| 276 | // example: |
| 277 | // auto bitmaps_c_ptr = bitmaps.c_ptr(); |
| 278 | // int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size()); |
| 279 | std::vector<const mtmd_bitmap *> c_ptr() { |
| 280 | std::vector<const mtmd_bitmap *> res(entries.size()); |
| 281 | for (size_t i = 0; i < entries.size(); i++) { |
| 282 | res[i] = entries[i].ptr.get(); |
| 283 | } |
| 284 | return res; |
| 285 | } |
| 286 | }; |
| 287 | |
| 288 | struct input_chunks { |
| 289 | input_chunks_ptr ptr; |
| 290 | input_chunks() = default; |
| 291 | input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {} |
| 292 | ~input_chunks() = default; |
| 293 | size_t size() { return mtmd_input_chunks_size(ptr.get()); } |
| 294 | const mtmd_input_chunk * operator[](size_t idx) { |
| 295 | return mtmd_input_chunks_get(ptr.get(), idx); |
| 296 | } |
| 297 | }; |
| 298 | |
| 299 | } // namespace mtmd |
| 300 | |
| 301 | #endif |
| 302 | |
| 303 | #endif |
| 304 | |