// NOTE: This is modified from clip.cpp only for LLaVA,
// so there might still be unnecessary artifacts hanging around
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()), we still get a significant difference in the resulting embeddings compared to PyTorch
#include "clip.h"
#include "clip-impl.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include <cinttypes>
#include <limits>
#include <array>
#include <functional>

// TODO: allow passing a callback from user code
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
29
30enum ffn_op_type {
31 FFN_GELU,
32 FFN_GELU_ERF,
33 FFN_SILU,
34 FFN_GELU_QUICK,
35};
36
37enum norm_type {
38 NORM_TYPE_NORMAL,
39 NORM_TYPE_RMS,
40};
41
42//#define CLIP_DEBUG_FUNCTIONS
43
44#ifdef CLIP_DEBUG_FUNCTIONS
45static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
46 std::ofstream file(filename, std::ios::binary);
47 if (!file.is_open()) {
48 LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
49 return;
50 }
51
52 // PPM header: P6 format, width, height, and max color value
53 file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
54
55 // Write pixel data
56 for (size_t i = 0; i < img.buf.size(); i += 3) {
57 // PPM expects binary data in RGB format, which matches our image buffer
58 file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
59 }
60
61 file.close();
62}
63
64static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
65 std::ofstream file(filename, std::ios::binary);
66 if (!file.is_open()) {
67 LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
68 return;
69 }
70
71 int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
72 int bytesPerPixel = 3;
73 int widthInBytes = img.nx * bytesPerPixel;
74 int paddingAmount = (4 - (widthInBytes % 4)) % 4;
75 int stride = widthInBytes + paddingAmount;
76
77 // Bitmap file header
78 unsigned char fileHeader[14] = {
79 'B','M', // Signature
80 0,0,0,0, // Image file size in bytes
81 0,0,0,0, // Reserved
82 54,0,0,0 // Start of pixel array
83 };
84
85 // Total file size
86 fileSize = 54 + (stride * img.ny);
87 fileHeader[2] = (unsigned char)(fileSize);
88 fileHeader[3] = (unsigned char)(fileSize >> 8);
89 fileHeader[4] = (unsigned char)(fileSize >> 16);
90 fileHeader[5] = (unsigned char)(fileSize >> 24);
91
92 // Bitmap information header (BITMAPINFOHEADER)
93 unsigned char infoHeader[40] = {
94 40,0,0,0, // Size of this header (40 bytes)
95 0,0,0,0, // Image width
96 0,0,0,0, // Image height
97 1,0, // Number of color planes
98 24,0, // Bits per pixel
99 0,0,0,0, // No compression
100 0,0,0,0, // Image size (can be 0 for no compression)
101 0,0,0,0, // X pixels per meter (not specified)
102 0,0,0,0, // Y pixels per meter (not specified)
103 0,0,0,0, // Total colors (color table not used)
104 0,0,0,0 // Important colors (all are important)
105 };
106
107 // Width and height in the information header
108 infoHeader[4] = (unsigned char)(img.nx);
109 infoHeader[5] = (unsigned char)(img.nx >> 8);
110 infoHeader[6] = (unsigned char)(img.nx >> 16);
111 infoHeader[7] = (unsigned char)(img.nx >> 24);
112 infoHeader[8] = (unsigned char)(img.ny);
113 infoHeader[9] = (unsigned char)(img.ny >> 8);
114 infoHeader[10] = (unsigned char)(img.ny >> 16);
115 infoHeader[11] = (unsigned char)(img.ny >> 24);
116
117 // Write file headers
118 file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
119 file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
120
121 // Pixel data
122 std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
123 for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
124 for (int x = 0; x < img.nx; ++x) {
125 // Each pixel
126 size_t pixelIndex = (y * img.nx + x) * 3;
127 unsigned char pixel[3] = {
128 img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
129 img.buf[pixelIndex + 1],
130 img.buf[pixelIndex]
131 };
132 file.write(reinterpret_cast<char*>(pixel), 3);
133 }
134 // Write padding for the row
135 file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
136 }
137
138 file.close();
139}
140
141// debug function to convert f32 to u8
142static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
143 dst.nx = src.nx;
144 dst.ny = src.ny;
145 dst.buf.resize(3 * src.nx * src.ny);
146 for (size_t i = 0; i < src.buf.size(); ++i) {
147 dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
148 }
149}
150#endif
151
152
153//
154// clip layers
155//
156
157enum patch_merge_type {
158 PATCH_MERGE_FLAT,
159 PATCH_MERGE_SPATIAL_UNPAD,
160};
161
162struct clip_hparams {
163 int32_t image_size;
164 int32_t patch_size;
165 int32_t n_embd;
166 int32_t n_ff;
167 int32_t projection_dim;
168 int32_t n_head;
169 int32_t n_layer;
170 // idefics3
171 int32_t image_longest_edge = 0;
172 int32_t image_min_pixels = -1;
173 int32_t image_max_pixels = -1;
174 int32_t n_merge = 0; // number of patch merges **per-side**
175
176 float image_mean[3];
177 float image_std[3];
178
    // for models using dynamic image size, we need a smaller image size for warmup,
    // otherwise the user will get an OOM every time they load the model
181 int32_t warmup_image_size = 0;
182 int32_t warmup_audio_size = 3000;
183
184 ffn_op_type ffn_op = FFN_GELU;
185
186 patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
187
188 float eps = 1e-6;
189 float rope_theta = 0.0;
190
191 std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
192 int32_t image_crop_resolution;
193 std::unordered_set<int32_t> vision_feature_layer;
194 int32_t attn_window_size = 0;
195 int32_t n_wa_pattern = 0;
196
197 // audio
198 int32_t n_mel_bins = 0; // whisper preprocessor
199 int32_t proj_stack_factor = 0; // ultravox
200
201 // legacy
202 bool has_llava_projector = false;
203 int minicpmv_version = 0;
204 int32_t minicpmv_query_num = 0; // MiniCPM-V query number
205
    // custom values provided by the user; -1 (the default) means not set
207 int32_t custom_image_min_tokens = -1;
208 int32_t custom_image_max_tokens = -1;
209
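    // convert a [min, max] token budget into a pixel budget: with n_merge patches merged per side,
    // one output token covers a (patch_size * n_merge)^2 pixel area.
    // Illustrative numbers (not taken from any specific model): patch_size = 14 and n_merge = 2 give
    // 28 x 28 = 784 pixels per token, so a 1024-token limit corresponds to 802,816 pixels.
    // warmup_image_size is set to the side length of the largest square image within that budget.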
210 void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
211 const int cur_merge = n_merge == 0 ? 1 : n_merge;
212 const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
213 image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
214 image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
215 warmup_image_size = static_cast<int>(std::sqrt(x: image_max_pixels));
216 }
217
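    // pick a square warmup image whose side yields exactly n_tokens output tokens:
    // side = sqrt(n_tokens) * patch_size * n_merge (n_tokens must therefore be a perfect square)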
218 void set_warmup_n_tokens(int n_tokens) {
219 int n_tok_per_side = static_cast<int>(std::sqrt(x: n_tokens));
220 GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
221 const int cur_merge = n_merge == 0 ? 1 : n_merge;
222 warmup_image_size = n_tok_per_side * patch_size * cur_merge;
223 // TODO: support warmup size for custom token numbers
224 }
225};
226
227struct clip_layer {
228 // attention
229 ggml_tensor * k_w = nullptr;
230 ggml_tensor * k_b = nullptr;
231 ggml_tensor * q_w = nullptr;
232 ggml_tensor * q_b = nullptr;
233 ggml_tensor * v_w = nullptr;
234 ggml_tensor * v_b = nullptr;
235 ggml_tensor * qkv_w = nullptr;
236 ggml_tensor * qkv_b = nullptr;
237
238 ggml_tensor * o_w = nullptr;
239 ggml_tensor * o_b = nullptr;
240
241 ggml_tensor * k_norm = nullptr;
242 ggml_tensor * q_norm = nullptr;
243
244 // layernorm 1
245 ggml_tensor * ln_1_w = nullptr;
246 ggml_tensor * ln_1_b = nullptr;
247
248 ggml_tensor * ff_up_w = nullptr;
249 ggml_tensor * ff_up_b = nullptr;
250 ggml_tensor * ff_gate_w = nullptr;
251 ggml_tensor * ff_gate_b = nullptr;
252 ggml_tensor * ff_down_w = nullptr;
253 ggml_tensor * ff_down_b = nullptr;
254
255 // layernorm 2
256 ggml_tensor * ln_2_w = nullptr;
257 ggml_tensor * ln_2_b = nullptr;
258
259 // layer scale (no bias)
260 ggml_tensor * ls_1_w = nullptr;
261 ggml_tensor * ls_2_w = nullptr;
262
263 // qwen3vl deepstack merger
264 ggml_tensor * deepstack_norm_w = nullptr;
265 ggml_tensor * deepstack_norm_b = nullptr;
266 ggml_tensor * deepstack_fc1_w = nullptr;
267 ggml_tensor * deepstack_fc1_b = nullptr;
268 ggml_tensor * deepstack_fc2_w = nullptr;
269 ggml_tensor * deepstack_fc2_b = nullptr;
270
271 bool has_deepstack() const {
272 return deepstack_fc1_w != nullptr;
273 }
274};
275
276struct clip_model {
277 clip_modality modality = CLIP_MODALITY_VISION;
278 projector_type proj_type = PROJECTOR_TYPE_MLP;
279 clip_hparams hparams;
280
281 // embeddings
282 ggml_tensor * class_embedding = nullptr;
283 ggml_tensor * patch_embeddings_0 = nullptr;
    ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along the temporal dimension (Qwen2VL)
285 ggml_tensor * patch_bias = nullptr;
286 ggml_tensor * position_embeddings = nullptr;
287
288 ggml_tensor * pre_ln_w = nullptr;
289 ggml_tensor * pre_ln_b = nullptr;
290
291 std::vector<clip_layer> layers;
292
293 int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
294
295 ggml_tensor * post_ln_w;
296 ggml_tensor * post_ln_b;
297
298 ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
299 ggml_tensor * mm_fc_w;
300 ggml_tensor * mm_fc_b;
301
302 // LLaVA projection
303 ggml_tensor * mm_input_norm_w = nullptr;
304 ggml_tensor * mm_input_norm_b = nullptr;
305 ggml_tensor * mm_0_w = nullptr;
306 ggml_tensor * mm_0_b = nullptr;
307 ggml_tensor * mm_2_w = nullptr;
308 ggml_tensor * mm_2_b = nullptr;
309
310 ggml_tensor * image_newline = nullptr;
311
312 // Yi type models with mlp+normalization projection
313 ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
314 ggml_tensor * mm_1_b = nullptr;
315 ggml_tensor * mm_3_w = nullptr;
316 ggml_tensor * mm_3_b = nullptr;
317 ggml_tensor * mm_4_w = nullptr;
318 ggml_tensor * mm_4_b = nullptr;
319
320 // GLMV-Edge projection
321 ggml_tensor * mm_model_adapter_conv_w = nullptr;
322 ggml_tensor * mm_model_adapter_conv_b = nullptr;
323
324 // MobileVLM projection
325 ggml_tensor * mm_model_mlp_1_w = nullptr;
326 ggml_tensor * mm_model_mlp_1_b = nullptr;
327 ggml_tensor * mm_model_mlp_3_w = nullptr;
328 ggml_tensor * mm_model_mlp_3_b = nullptr;
329 ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
330 ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
331 ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
332 ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
333 ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
334 ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
335 ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
336 ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
337 ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
338 ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
339 ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
340 ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
341 ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
342 ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
343 ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
344 ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
345 ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
346 ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
347 ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
348 ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
349
350 // MobileVLM_V2 projection
351 ggml_tensor * mm_model_mlp_0_w = nullptr;
352 ggml_tensor * mm_model_mlp_0_b = nullptr;
353 ggml_tensor * mm_model_mlp_2_w = nullptr;
354 ggml_tensor * mm_model_mlp_2_b = nullptr;
355 ggml_tensor * mm_model_peg_0_w = nullptr;
356 ggml_tensor * mm_model_peg_0_b = nullptr;
357
358 // MINICPMV projection
359 ggml_tensor * mm_model_pos_embed_k = nullptr;
360 ggml_tensor * mm_model_query = nullptr;
361 ggml_tensor * mm_model_proj = nullptr;
362 ggml_tensor * mm_model_kv_proj = nullptr;
363 ggml_tensor * mm_model_attn_q_w = nullptr;
364 ggml_tensor * mm_model_attn_q_b = nullptr;
365 ggml_tensor * mm_model_attn_k_w = nullptr;
366 ggml_tensor * mm_model_attn_k_b = nullptr;
367 ggml_tensor * mm_model_attn_v_w = nullptr;
368 ggml_tensor * mm_model_attn_v_b = nullptr;
369 ggml_tensor * mm_model_attn_o_w = nullptr;
370 ggml_tensor * mm_model_attn_o_b = nullptr;
371 ggml_tensor * mm_model_ln_q_w = nullptr;
372 ggml_tensor * mm_model_ln_q_b = nullptr;
373 ggml_tensor * mm_model_ln_kv_w = nullptr;
374 ggml_tensor * mm_model_ln_kv_b = nullptr;
375 ggml_tensor * mm_model_ln_post_w = nullptr;
376 ggml_tensor * mm_model_ln_post_b = nullptr;
377
378 // gemma3
379 ggml_tensor * mm_input_proj_w = nullptr;
380 ggml_tensor * mm_soft_emb_norm_w = nullptr;
381
382 // pixtral
383 ggml_tensor * token_embd_img_break = nullptr;
384 ggml_tensor * mm_patch_merger_w = nullptr;
385
386 // ultravox / whisper encoder
387 ggml_tensor * conv1d_1_w = nullptr;
388 ggml_tensor * conv1d_1_b = nullptr;
389 ggml_tensor * conv1d_2_w = nullptr;
390 ggml_tensor * conv1d_2_b = nullptr;
391 ggml_tensor * mm_norm_pre_w = nullptr;
392 ggml_tensor * mm_norm_mid_w = nullptr;
393
394 // cogvlm
395 ggml_tensor * mm_post_fc_norm_w = nullptr;
396 ggml_tensor * mm_post_fc_norm_b = nullptr;
397 ggml_tensor * mm_h_to_4h_w = nullptr;
398 ggml_tensor * mm_gate_w = nullptr;
399 ggml_tensor * mm_4h_to_h_w = nullptr;
400 ggml_tensor * mm_boi = nullptr;
401 ggml_tensor * mm_eoi = nullptr;
402
403 bool audio_has_avgpool() const {
404 return proj_type == PROJECTOR_TYPE_QWEN2A
405 || proj_type == PROJECTOR_TYPE_VOXTRAL;
406 }
407
408 bool audio_has_stack_frames() const {
409 return proj_type == PROJECTOR_TYPE_ULTRAVOX
410 || proj_type == PROJECTOR_TYPE_VOXTRAL;
411 }
412};
413
414struct clip_ctx {
415 clip_model model;
416
417 gguf_context_ptr ctx_gguf;
418 ggml_context_ptr ctx_data;
419
420 std::vector<uint8_t> buf_compute_meta;
421
422 std::vector<ggml_backend_t> backend_ptrs;
423 std::vector<ggml_backend_buffer_type_t> backend_buft;
424
425 ggml_backend_t backend = nullptr;
426 ggml_backend_t backend_cpu = nullptr;
427 ggml_backend_buffer_ptr buf;
428
429 int max_nodes = 8192;
430 ggml_backend_sched_ptr sched;
431 clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
432
433 // for debugging
434 bool debug_graph = false;
435 std::vector<ggml_tensor *> debug_print_tensors;
436
437 clip_ctx(clip_context_params & ctx_params) {
438 flash_attn_type = ctx_params.flash_attn_type;
439 debug_graph = std::getenv(name: "MTMD_DEBUG_GRAPH") != nullptr;
440 backend_cpu = ggml_backend_init_by_type(type: GGML_BACKEND_DEVICE_TYPE_CPU, params: nullptr);
441 if (!backend_cpu) {
442 throw std::runtime_error("failed to initialize CPU backend");
443 }
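        // backend selection order: a device named via the MTMD_BACKEND_DEVICE env var,
        // then the default GPU backend, then an integrated GPU, and finally the CPU backend as fallback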
444 if (ctx_params.use_gpu) {
445 auto backend_name = std::getenv(name: "MTMD_BACKEND_DEVICE");
446 if (backend_name != nullptr) {
447 backend = ggml_backend_init_by_name(name: backend_name, params: nullptr);
448 if (!backend) {
449 LOG_WRN("%s: Warning: Failed to initialize \"%s\" backend, falling back to default GPU backend\n", __func__, backend_name);
450 }
451 }
452 if (!backend) {
453 backend = ggml_backend_init_by_type(type: GGML_BACKEND_DEVICE_TYPE_GPU, params: nullptr);
454 backend = backend ? backend : ggml_backend_init_by_type(type: GGML_BACKEND_DEVICE_TYPE_IGPU, params: nullptr);
455 }
456 }
457
458 if (backend) {
459 LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
460 backend_ptrs.push_back(x: backend);
461 backend_buft.push_back(x: ggml_backend_get_default_buffer_type(backend));
462 } else {
463 backend = backend_cpu;
464 LOG_INF("%s: CLIP using CPU backend\n", __func__);
465 }
466
467 if (ctx_params.image_min_tokens > 0) {
468 model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens;
469 }
470 if (ctx_params.image_max_tokens > 0) {
471 model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
472 }
473
474 backend_ptrs.push_back(x: backend_cpu);
475 backend_buft.push_back(x: ggml_backend_get_default_buffer_type(backend: backend_cpu));
476
477 sched.reset(
478 p: ggml_backend_sched_new(backends: backend_ptrs.data(), bufts: backend_buft.data(), n_backends: backend_ptrs.size(), graph_size: 8192, parallel: false, op_offload: true)
479 );
480 }
481
482 ~clip_ctx() {
483 ggml_backend_free(backend);
484 if (backend != backend_cpu) {
485 ggml_backend_free(backend: backend_cpu);
486 }
487 }
488
489 // this function is added so that we don't change too much of the existing code
490 projector_type proj_type() const {
491 return model.proj_type;
492 }
493};
494
495struct clip_graph {
496 clip_ctx * ctx;
497 const clip_model & model;
498 const clip_hparams & hparams;
499
    // we only support a single image per batch
501 const clip_image_f32 & img;
502
503 const int patch_size;
504 const int n_patches_x;
505 const int n_patches_y;
506 const int n_patches;
507 const int n_embd;
508 const int n_head;
509 const int d_head;
510 const int n_layer;
511 const float eps;
512 const float kq_scale;
513
514 ggml_context_ptr ctx0_ptr;
515 ggml_context * ctx0;
516 ggml_cgraph * gf;
517
518 clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
519 ctx(ctx),
520 model(ctx->model),
521 hparams(model.hparams),
522 img(img),
523 patch_size(hparams.patch_size),
524 n_patches_x(img.nx / patch_size),
525 n_patches_y(img.ny / patch_size),
526 n_patches(n_patches_x * n_patches_y),
527 n_embd(hparams.n_embd),
528 n_head(hparams.n_head),
529 d_head(n_embd / n_head),
530 n_layer(hparams.n_layer),
531 eps(hparams.eps),
532 kq_scale(1.0f / sqrtf(x: (float)d_head)) {
533 struct ggml_init_params params = {
534 /*.mem_size =*/ ctx->buf_compute_meta.size(),
535 /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
536 /*.no_alloc =*/ true,
537 };
538 ctx0_ptr.reset(p: ggml_init(params));
539 ctx0 = ctx0_ptr.get();
540 gf = ggml_new_graph_custom(ctx: ctx0, size: ctx->max_nodes, grads: false);
541 }
542
543 ggml_cgraph * build_siglip() {
544 ggml_tensor * inp = build_inp();
545
546 ggml_tensor * learned_pos_embd = model.position_embeddings;
547 if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
548 learned_pos_embd = resize_position_embeddings();
549 }
550
551 ggml_tensor * cur = build_vit(
552 inp, n_pos: n_patches,
553 norm_t: NORM_TYPE_NORMAL,
554 ffn_t: hparams.ffn_op,
555 learned_pos_embd,
556 add_pos: nullptr);
557
558 if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
559 const int batch_size = 1;
560 GGML_ASSERT(n_patches_x == n_patches_y);
561 const int patches_per_image = n_patches_x;
562 const int kernel_size = hparams.n_merge;
563
564 cur = ggml_transpose(ctx: ctx0, a: cur);
565 cur = ggml_cont_4d(ctx: ctx0, a: cur, ne0: patches_per_image, ne1: patches_per_image, ne2: n_embd, ne3: batch_size);
566
567 // doing a pool2d to reduce the number of output tokens
568 cur = ggml_pool_2d(ctx: ctx0, a: cur, op: GGML_OP_POOL_AVG, k0: kernel_size, k1: kernel_size, s0: kernel_size, s1: kernel_size, p0: 0, p1: 0);
569 cur = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: cur->ne[0] * cur->ne[0], ne1: n_embd, ne2: batch_size);
570 cur = ggml_cont(ctx: ctx0, a: ggml_transpose(ctx: ctx0, a: cur));
571
572 // apply norm before projection
573 cur = ggml_rms_norm(ctx: ctx0, a: cur, eps);
574 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_soft_emb_norm_w);
575
576 // apply projection
577 cur = ggml_mul_mat(ctx: ctx0,
578 a: ggml_cont(ctx: ctx0, a: ggml_transpose(ctx: ctx0, a: model.mm_input_proj_w)),
579 b: cur);
580
581 } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
582 // pixel_shuffle
583 // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
584 const int scale_factor = model.hparams.n_merge;
585 cur = build_patch_merge_permute(cur, scale_factor);
586 cur = ggml_mul_mat(ctx: ctx0, a: model.projection, b: cur);
587
588 } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
589 // pixel unshuffle block
590 const int scale_factor = model.hparams.n_merge;
591 cur = build_patch_merge_permute(cur, scale_factor);
592
593 // projection
594 cur = ggml_norm(ctx: ctx0, a: cur, eps: 1e-5); // default nn.LayerNorm
595 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_input_norm_w);
596 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_input_norm_b);
597
598 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
599 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_1_b);
600 cur = ggml_gelu(ctx: ctx0, a: cur);
601 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
602 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_2_b);
603
604 } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
605 cur = build_ffn(cur,
606 up: model.mm_0_w, up_b: model.mm_0_b,
607 gate: nullptr, gate_b: nullptr,
608 down: model.mm_1_w, down_b: model.mm_1_b,
609 type_op: hparams.ffn_op,
610 il: -1);
611
612 } else {
613 GGML_ABORT("SigLIP: Unsupported projector type");
614 }
615
616 // build the graph
617 ggml_build_forward_expand(cgraph: gf, tensor: cur);
618
619 return gf;
620 }
621
622 ggml_cgraph * build_pixtral() {
623 const int n_merge = hparams.n_merge;
624
625 // 2D input positions
626 ggml_tensor * pos_h = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
627 ggml_set_name(tensor: pos_h, name: "pos_h");
628 ggml_set_input(tensor: pos_h);
629
630 ggml_tensor * pos_w = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
631 ggml_set_name(tensor: pos_w, name: "pos_w");
632 ggml_set_input(tensor: pos_w);
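        // pos_h / pos_w hold the row / column index of each patch (filled in when the graph inputs are set)
        // and drive the 2D RoPE applied via the add_pos callback below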
633
634 auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
635 return build_rope_2d(ctx0, cur, pos_a: pos_h, pos_b: pos_w, freq_base: hparams.rope_theta, interleave_freq: true);
636 };
637
638 ggml_tensor * inp = build_inp();
639 ggml_tensor * cur = build_vit(
640 inp, n_pos: n_patches,
641 norm_t: NORM_TYPE_RMS,
642 ffn_t: hparams.ffn_op,
643 learned_pos_embd: nullptr, // no learned pos embd
644 add_pos);
645
646 // mistral small 3.1 patch merger
647 // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
648 if (model.mm_patch_merger_w) {
649 GGML_ASSERT(hparams.n_merge > 0);
650
651 cur = ggml_mul(ctx: ctx0, a: ggml_rms_norm(ctx: ctx0, a: cur, eps), b: model.mm_input_norm_w);
652
653 // reshape image tokens to 2D grid
654 cur = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd, ne1: n_patches_x, ne2: n_patches_y);
655 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 2, axis1: 0, axis2: 1, axis3: 3); // [x, y, n_embd]
656 cur = ggml_cont(ctx: ctx0, a: cur);
657
658 // torch.nn.functional.unfold is just an im2col under the hood
659 // we just need a dummy kernel to make it work
660 ggml_tensor * kernel = ggml_view_3d(ctx: ctx0, a: cur, ne0: n_merge, ne1: n_merge, ne2: cur->ne[2], nb1: 0, nb2: 0, offset: 0);
661 cur = ggml_im2col(ctx: ctx0, a: kernel, b: cur, s0: n_merge, s1: n_merge, p0: 0, p1: 0, d0: 1, d1: 1, is_2D: true, dst_type: inp->type);
662
663 // project to n_embd
664 cur = ggml_reshape_2d(ctx: ctx0, a: cur, ne0: cur->ne[0], ne1: cur->ne[1] * cur->ne[2]);
665 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_patch_merger_w, b: cur);
666 }
667
668 // LlavaMultiModalProjector (always using GELU activation)
669 {
670 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
671 if (model.mm_1_b) {
672 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_1_b);
673 }
674
675 cur = ggml_gelu(ctx: ctx0, a: cur);
676 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
677 if (model.mm_2_b) {
678 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_2_b);
679 }
680 }
681
682 // arrangement of the [IMG_BREAK] token
683 if (model.token_embd_img_break) {
684 // not efficient, but works
685 // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
686 // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
687 // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
688
689 const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
690 const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
691 const int p_total = p_x * p_y;
692 const int n_embd_text = cur->ne[0];
693 const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
694
695 ggml_tensor * tmp = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd_text, ne1: p_x, ne2: p_y);
696 ggml_tensor * tok = ggml_new_tensor_3d(ctx: ctx0, type: tmp->type, ne0: n_embd_text, ne1: 1, ne2: p_y);
697 tok = ggml_scale(ctx: ctx0, a: tok, s: 0.0); // clear the tensor
698 tok = ggml_add(ctx: ctx0, a: tok, b: model.token_embd_img_break);
699 tmp = ggml_concat(ctx: ctx0, a: tmp, b: tok, dim: 1);
700 cur = ggml_view_2d(ctx: ctx0, a: tmp,
701 ne0: n_embd_text, ne1: n_tokens_output,
702 nb1: ggml_row_size(type: tmp->type, ne: n_embd_text), offset: 0);
703 }
704
705 // build the graph
706 ggml_build_forward_expand(cgraph: gf, tensor: cur);
707
708 return gf;
709 }
710
711 // Qwen2VL and Qwen2.5VL use M-RoPE
712 ggml_cgraph * build_qwen2vl() {
713 GGML_ASSERT(model.patch_bias == nullptr);
714 GGML_ASSERT(model.class_embedding == nullptr);
715
716 const int batch_size = 1;
717 const bool use_window_attn = hparams.n_wa_pattern > 0;
718 const int n_wa_pattern = hparams.n_wa_pattern;
719 const int n_pos = n_patches;
720 const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
721
722 norm_type norm_t = ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
723 ? NORM_TYPE_RMS // qwen 2.5 vl
724 : NORM_TYPE_NORMAL; // qwen 2 vl
725
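        // M-RoPE: the rotary dimensions are split into 4 equal sections, one per position
        // component packed into the positions tensor (hence num_position_ids = n_pos * 4)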
726 int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
727
728 ggml_tensor * inp_raw = build_inp_raw();
729 ggml_tensor * inp = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_0, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
730
731 GGML_ASSERT(img.nx % (patch_size * 2) == 0);
732 GGML_ASSERT(img.ny % (patch_size * 2) == 0);
733
734 // second conv dimension
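        // patch_embeddings_0/1 are the two temporal halves of the original Conv3D patch embedding;
        // their outputs are summed, then the permute/reshape sequence below reorders the tokens so
        // that each 2x2 block of neighbouring patches ends up in 4 consecutive rows, ready for the
        // 4x spatial merge (n_embd * 4) done later in the projector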
735 {
736 auto inp_1 = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_1, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
737 inp = ggml_add(ctx: ctx0, a: inp, b: inp_1);
738
739 inp = ggml_permute(ctx: ctx0, a: inp, axis0: 1, axis1: 2, axis2: 0, axis3: 3); // [w, h, c, b] -> [c, w, h, b]
740 inp = ggml_cont_4d(
741 ctx: ctx0, a: inp,
742 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: n_patches_y, ne3: batch_size);
743 inp = ggml_reshape_4d(
744 ctx: ctx0, a: inp,
745 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: 2, ne3: batch_size * (n_patches_y / 2));
746 inp = ggml_permute(ctx: ctx0, a: inp, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
747 inp = ggml_cont_3d(
748 ctx: ctx0, a: inp,
749 ne0: n_embd, ne1: n_patches_x * n_patches_y, ne2: batch_size);
750 }
751
752 ggml_tensor * inpL = inp;
753 ggml_tensor * window_mask = nullptr;
754 ggml_tensor * window_idx = nullptr;
755 ggml_tensor * inv_window_idx = nullptr;
756
757 ggml_tensor * positions = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: num_position_ids);
758 ggml_set_name(tensor: positions, name: "positions");
759 ggml_set_input(tensor: positions);
760
761 // pre-layernorm
762 if (model.pre_ln_w) {
763 inpL = build_norm(cur: inpL, mw: model.pre_ln_w, mb: model.pre_ln_b, type: norm_t, norm_eps: eps, il: -1);
764 }
765
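        // Qwen2.5VL window attention: patches are permuted into window-major order (inv_window_idx)
        // and cross-window attention is blocked with window_mask; only every n_wa_pattern-th layer
        // uses full attention. The original patch order is restored with window_idx after the projector.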
766 if (use_window_attn) {
767 // handle window attention inputs
768 inv_window_idx = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos / 4);
769 ggml_set_name(tensor: inv_window_idx, name: "inv_window_idx");
770 ggml_set_input(tensor: inv_window_idx);
771 // mask for window attention
772 window_mask = ggml_new_tensor_2d(ctx: ctx0, type: GGML_TYPE_F32, ne0: n_pos, ne1: n_pos);
773 ggml_set_name(tensor: window_mask, name: "window_mask");
774 ggml_set_input(tensor: window_mask);
775
776 // if flash attn is used, we need to pad the mask and cast to f16
777 if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
778 int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1];
779 if (n_pad > 0) {
780 window_mask = ggml_pad(ctx: ctx0, a: window_mask, p0: 0, p1: n_pad, p2: 0, p3: 0);
781 }
782 window_mask = ggml_cast(ctx: ctx0, a: window_mask, type: GGML_TYPE_F16);
783 }
784
785 // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
786 GGML_ASSERT(batch_size == 1);
787 inpL = ggml_reshape_2d(ctx: ctx0, a: inpL, ne0: n_embd * 4, ne1: n_patches_x * n_patches_y * batch_size / 4);
788 inpL = ggml_get_rows(ctx: ctx0, a: inpL, b: inv_window_idx);
789 inpL = ggml_reshape_3d(ctx: ctx0, a: inpL, ne0: n_embd, ne1: n_patches_x * n_patches_y, ne2: batch_size);
790 }
791
792 // loop over layers
793 for (int il = 0; il < n_layer; il++) {
794 auto & layer = model.layers[il];
795 const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
796
797 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
798
799 // layernorm1
800 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: norm_t, norm_eps: eps, il);
801 cb(cur0: cur, name: "ln1", il);
802
803 // self-attention
804 {
805 ggml_tensor * Qcur = ggml_add(ctx: ctx0,
806 a: ggml_mul_mat(ctx: ctx0, a: layer.q_w, b: cur), b: layer.q_b);
807 ggml_tensor * Kcur = ggml_add(ctx: ctx0,
808 a: ggml_mul_mat(ctx: ctx0, a: layer.k_w, b: cur), b: layer.k_b);
809 ggml_tensor * Vcur = ggml_add(ctx: ctx0,
810 a: ggml_mul_mat(ctx: ctx0, a: layer.v_w, b: cur), b: layer.v_b);
811
812 Qcur = ggml_reshape_3d(ctx: ctx0, a: Qcur, ne0: d_head, ne1: n_head, ne2: n_patches);
813 Kcur = ggml_reshape_3d(ctx: ctx0, a: Kcur, ne0: d_head, ne1: n_head, ne2: n_patches);
814 Vcur = ggml_reshape_3d(ctx: ctx0, a: Vcur, ne0: d_head, ne1: n_head, ne2: n_patches);
815
816 cb(cur0: Qcur, name: "Qcur", il);
817 cb(cur0: Kcur, name: "Kcur", il);
818 cb(cur0: Vcur, name: "Vcur", il);
819
820 // apply M-RoPE
821 Qcur = ggml_rope_multi(
822 ctx: ctx0, a: Qcur, b: positions, c: nullptr,
823 n_dims: d_head/2, sections: mrope_sections, GGML_ROPE_TYPE_VISION, n_ctx_orig: 32768, freq_base: 10000, freq_scale: 1, ext_factor: 0, attn_factor: 1, beta_fast: 32, beta_slow: 1);
824 Kcur = ggml_rope_multi(
825 ctx: ctx0, a: Kcur, b: positions, c: nullptr,
826 n_dims: d_head/2, sections: mrope_sections, GGML_ROPE_TYPE_VISION, n_ctx_orig: 32768, freq_base: 10000, freq_scale: 1, ext_factor: 0, attn_factor: 1, beta_fast: 32, beta_slow: 1);
827
828 cb(cur0: Qcur, name: "Qcur_rope", il);
829 cb(cur0: Kcur, name: "Kcur_rope", il);
830
831 ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
832
833 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
834 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: attn_mask, kq_scale, il);
835 cb(cur0: cur, name: "attn_out", il);
836 }
837
        // re-add the layer input, i.e., the residual connection
839 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
840
841 inpL = cur; // inpL = residual, cur = hidden_states
842
843 cb(cur0: cur, name: "ffn_inp", il);
844
845 // layernorm2
846 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: norm_t, norm_eps: eps, il);
847 cb(cur0: cur, name: "ffn_inp_normed", il);
848
849 // ffn
850 cur = build_ffn(cur,
851 up: layer.ff_up_w, up_b: layer.ff_up_b,
852 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
853 down: layer.ff_down_w, down_b: layer.ff_down_b,
854 type_op: hparams.ffn_op, il);
855
856 cb(cur0: cur, name: "ffn_out", il);
857
858 // residual 2
859 cur = ggml_add(ctx: ctx0, a: inpL, b: cur);
860 cb(cur0: cur, name: "layer_out", il);
861
862 inpL = cur;
863 }
864
865 // post-layernorm
866 if (model.post_ln_w) {
867 inpL = build_norm(cur: inpL, mw: model.post_ln_w, mb: model.post_ln_b, type: norm_t, norm_eps: eps, il: n_layer);
868 }
869
870 // multimodal projection
871 ggml_tensor * embeddings = inpL;
872 embeddings = ggml_reshape_3d(ctx: ctx0, a: embeddings, ne0: n_embd * 4, ne1: n_pos / 4, ne2: batch_size);
873
874 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_0_w, b: embeddings);
875 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_0_b);
876
877 // GELU activation
878 embeddings = ggml_gelu(ctx: ctx0, a: embeddings);
879
880 // Second linear layer
881 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: embeddings);
882 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_1_b);
883
884 if (use_window_attn) {
885 window_idx = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos / 4);
886 ggml_set_name(tensor: window_idx, name: "window_idx");
887 ggml_set_input(tensor: window_idx);
888
889 // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
890 GGML_ASSERT(batch_size == 1);
891 embeddings = ggml_reshape_2d(ctx: ctx0, a: embeddings, ne0: hparams.projection_dim, ne1: n_patches_x * n_patches_y / 4);
892 embeddings = ggml_get_rows(ctx: ctx0, a: embeddings, b: window_idx);
893 embeddings = ggml_reshape_3d(ctx: ctx0, a: embeddings, ne0: hparams.projection_dim, ne1: n_patches_x * n_patches_y / 4, ne2: batch_size);
894 }
895
896 // build the graph
897 ggml_build_forward_expand(cgraph: gf, tensor: embeddings);
898
899 return gf;
900 }
901
902 // Qwen3VL
903 ggml_cgraph * build_qwen3vl() {
904 GGML_ASSERT(model.patch_bias != nullptr);
905 GGML_ASSERT(model.position_embeddings != nullptr);
906 GGML_ASSERT(model.class_embedding == nullptr);
907
908 const int batch_size = 1;
909 const int n_pos = n_patches;
910 const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
911
912 norm_type norm_t = NORM_TYPE_NORMAL;
913
914 int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
915
916 ggml_tensor * inp_raw = build_inp_raw();
917 ggml_tensor * inp = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_0, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
918
919 GGML_ASSERT(img.nx % (patch_size * 2) == 0);
920 GGML_ASSERT(img.ny % (patch_size * 2) == 0);
921
922 // second conv dimension
923 {
924 auto inp_1 = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_1, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
925 inp = ggml_add(ctx: ctx0, a: inp, b: inp_1);
926
927 inp = ggml_permute(ctx: ctx0, a: inp, axis0: 1, axis1: 2, axis2: 0, axis3: 3); // [w, h, c, b] -> [c, w, h, b]
928 inp = ggml_cont_4d(
929 ctx: ctx0, a: inp,
930 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: n_patches_y, ne3: batch_size);
931 inp = ggml_reshape_4d(
932 ctx: ctx0, a: inp,
933 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: 2, ne3: batch_size * (n_patches_y / 2));
934 inp = ggml_permute(ctx: ctx0, a: inp, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
935 inp = ggml_cont_3d(
936 ctx: ctx0, a: inp,
937 ne0: n_embd, ne1: n_patches_x * n_patches_y, ne2: batch_size);
938 }
939
940 // add patch bias
941 if (model.patch_bias != nullptr) {
942 inp = ggml_add(ctx: ctx0, a: inp, b: model.patch_bias);
943 cb(cur0: inp, name: "patch_bias", il: -1);
944 }
945
946 // calculate absolute position embedding and apply
947 ggml_tensor * learned_pos_embd = resize_position_embeddings();
948 learned_pos_embd = ggml_cont_4d(
949 ctx: ctx0, a: learned_pos_embd,
950 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: n_patches_y, ne3: batch_size);
951 learned_pos_embd = ggml_reshape_4d(
952 ctx: ctx0, a: learned_pos_embd,
953 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: 2, ne3: batch_size * (n_patches_y / 2));
954 learned_pos_embd = ggml_permute(ctx: ctx0, a: learned_pos_embd, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
955 learned_pos_embd = ggml_cont_3d(
956 ctx: ctx0, a: learned_pos_embd,
957 ne0: n_embd, ne1: n_patches_x * n_patches_y, ne2: batch_size);
958 inp = ggml_add(ctx: ctx0, a: inp, b: learned_pos_embd);
959 cb(cur0: inp, name: "inp_pos_emb", il: -1);
960
961 ggml_tensor * inpL = inp;
962
963 ggml_tensor * positions = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: num_position_ids);
964 ggml_set_name(tensor: positions, name: "positions");
965 ggml_set_input(tensor: positions);
966
967 // pre-layernorm
968 if (model.pre_ln_w) {
969 inpL = build_norm(cur: inpL, mw: model.pre_ln_w, mb: model.pre_ln_b, type: norm_t, norm_eps: eps, il: -1);
970 }
971
972 // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
973 ggml_tensor * deepstack_features = nullptr;
974 const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
975
976 // loop over layers
977 for (int il = 0; il < n_layer; il++) {
978 auto & layer = model.layers[il];
979
980 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
981
982 // layernorm1
983 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: norm_t, norm_eps: eps, il);
984 cb(cur0: cur, name: "ln1", il);
985
986 // self-attention
987 {
988 cur = ggml_mul_mat(ctx: ctx0, a: layer.qkv_w, b: cur);
989 cur = ggml_add(ctx: ctx0, a: cur, b: layer.qkv_b);
990
991 ggml_tensor * Qcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
992 nb2: cur->nb[1], offset: 0);
993 ggml_tensor * Kcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
994 nb2: cur->nb[1], offset: n_embd * sizeof(float));
995 ggml_tensor * Vcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
996 nb2: cur->nb[1], offset: 2 * n_embd * sizeof(float));
997
998 cb(cur0: Qcur, name: "Qcur", il);
999 cb(cur0: Kcur, name: "Kcur", il);
1000 cb(cur0: Vcur, name: "Vcur", il);
1001
1002 // apply M-RoPE
1003 Qcur = ggml_rope_multi(
1004 ctx: ctx0, a: Qcur, b: positions, c: nullptr,
1005 n_dims: d_head/2, sections: mrope_sections, GGML_ROPE_TYPE_VISION, n_ctx_orig: 32768, freq_base: 10000, freq_scale: 1, ext_factor: 0, attn_factor: 1, beta_fast: 32, beta_slow: 1);
1006 Kcur = ggml_rope_multi(
1007 ctx: ctx0, a: Kcur, b: positions, c: nullptr,
1008 n_dims: d_head/2, sections: mrope_sections, GGML_ROPE_TYPE_VISION, n_ctx_orig: 32768, freq_base: 10000, freq_scale: 1, ext_factor: 0, attn_factor: 1, beta_fast: 32, beta_slow: 1);
1009
1010 cb(cur0: Qcur, name: "Qcur_rope", il);
1011 cb(cur0: Kcur, name: "Kcur_rope", il);
1012
1013 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
1014 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: nullptr, kq_scale, il);
1015 cb(cur0: cur, name: "attn_out", il);
1016 }
1017
        // re-add the layer input, i.e., the residual connection
1019 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
1020
1021 inpL = cur; // inpL = residual, cur = hidden_states
1022
1023 cb(cur0: cur, name: "ffn_inp", il);
1024
1025 // layernorm2
1026 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: norm_t, norm_eps: eps, il);
1027 cb(cur0: cur, name: "ffn_inp_normed", il);
1028
1029 // ffn
1030 cur = build_ffn(cur,
1031 up: layer.ff_up_w, up_b: layer.ff_up_b,
1032 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
1033 down: layer.ff_down_w, down_b: layer.ff_down_b,
1034 type_op: hparams.ffn_op, il);
1035
1036 cb(cur0: cur, name: "ffn_out", il);
1037
1038 // residual 2
1039 cur = ggml_add(ctx: ctx0, a: inpL, b: cur);
1040 cb(cur0: cur, name: "layer_out", il);
1041
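            // deepstack layers emit an auxiliary feature map: the hidden state is spatially merged
            // (merge_factor patches per token), normalized and passed through a small FFN; all such
            // maps are concatenated to the projector output along the feature dimension at the end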
1042 if (layer.has_deepstack()) {
1043 ggml_tensor * feat = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd * merge_factor, ne1: n_pos / merge_factor, ne2: batch_size);
1044 feat = build_norm(cur: feat, mw: layer.deepstack_norm_w, mb: layer.deepstack_norm_b, type: norm_t, norm_eps: eps, il);
1045 feat = build_ffn(cur: feat,
1046 up: layer.deepstack_fc1_w, up_b: layer.deepstack_fc1_b,
1047 gate: nullptr, gate_b: nullptr,
1048 down: layer.deepstack_fc2_w, down_b: layer.deepstack_fc2_b,
1049 type_op: ffn_op_type::FFN_GELU, il);
1050
1051 if(!deepstack_features) {
1052 deepstack_features = feat;
1053 } else {
1054 // concat along the feature dimension
1055 deepstack_features = ggml_concat(ctx: ctx0, a: deepstack_features, b: feat, dim: 0);
1056 }
1057 }
1058
1059 inpL = cur;
1060 }
1061
1062 // post-layernorm
1063 if (model.post_ln_w) {
1064 inpL = build_norm(cur: inpL, mw: model.post_ln_w, mb: model.post_ln_b, type: norm_t, norm_eps: eps, il: n_layer);
1065 }
1066
1067 // multimodal projection
1068 ggml_tensor * embeddings = inpL;
1069 embeddings = ggml_reshape_3d(ctx: ctx0, a: embeddings, ne0: n_embd * 4, ne1: n_pos / 4, ne2: batch_size);
1070
1071 embeddings = build_ffn(cur: embeddings,
1072 up: model.mm_0_w, up_b: model.mm_0_b,
1073 gate: nullptr, gate_b: nullptr,
1074 down: model.mm_1_w, down_b: model.mm_1_b,
1075 type_op: ffn_op_type::FFN_GELU, il: -1);
1076
1077 embeddings = ggml_concat(ctx: ctx0, a: embeddings, b: deepstack_features, dim: 0); // concat along the feature dimension
1078
1079 // build the graph
1080 ggml_build_forward_expand(cgraph: gf, tensor: embeddings);
1081
1082 return gf;
1083 }
1084
1085 ggml_cgraph * build_minicpmv() {
1086 GGML_ASSERT(model.class_embedding == nullptr);
1087 const int n_pos = n_patches;
1088 const int n_embd_proj = clip_n_mmproj_embd(ctx);
1089
1090 // position embeddings for the projector (not for ViT)
1091 // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
1092 // base frequency omega
1093 ggml_tensor * omega = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_F32, ne0: n_embd_proj / 4);
1094 ggml_set_name(tensor: omega, name: "omega");
1095 ggml_set_input(tensor: omega);
1096
1097 // 2D input positions (using float for sinusoidal embeddings)
1098 ggml_tensor * pos_h = ggml_new_tensor_2d(ctx: ctx0, type: GGML_TYPE_F32, ne0: 1, ne1: n_pos);
1099 ggml_set_name(tensor: pos_h, name: "pos_h");
1100 ggml_set_input(tensor: pos_h);
1101 ggml_tensor * pos_w = ggml_new_tensor_2d(ctx: ctx0, type: GGML_TYPE_F32, ne0: 1, ne1: n_pos);
1102 ggml_set_name(tensor: pos_w, name: "pos_w");
1103 ggml_set_input(tensor: pos_w);
1104
1105 // for selecting learned pos embd, used by ViT
1106 struct ggml_tensor * positions = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos);
1107 ggml_set_name(tensor: positions, name: "positions");
1108 ggml_set_input(tensor: positions);
1109
1110 ggml_tensor * learned_pos_embd = ggml_get_rows(ctx: ctx0, a: model.position_embeddings, b: positions);
1111
1112 ggml_tensor * inp = build_inp();
1113 ggml_tensor * embeddings = build_vit(
1114 inp, n_pos,
1115 norm_t: NORM_TYPE_NORMAL,
1116 ffn_t: hparams.ffn_op,
1117 learned_pos_embd,
1118 add_pos: nullptr);
1119
1120 // resampler projector (it is just another transformer)
1121
1122 ggml_tensor * q = model.mm_model_query;
1123 ggml_tensor * v = ggml_mul_mat(ctx: ctx0, a: model.mm_model_kv_proj, b: embeddings);
1124
1125 // norm
1126 q = build_norm(cur: q, mw: model.mm_model_ln_q_w, mb: model.mm_model_ln_q_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1127 v = build_norm(cur: v, mw: model.mm_model_ln_kv_w, mb: model.mm_model_ln_kv_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1128
1129 // calculate sinusoidal pos embd
1130 ggml_tensor * pos_embed = nullptr;
1131 {
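            // build a 2D sin/cos table: for each position, the first half encodes pos_w and the
            // second half encodes pos_h, each as [sin(omega * p), cos(omega * p)] over the omega
            // frequencies, giving an n_embd_proj-dim embedding that is added to the keys below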
1132 // outer product
1133 ggml_tensor * omega_b = ggml_repeat_4d(ctx: ctx0, a: omega, ne0: omega->ne[0], ne1: n_pos, ne2: 1, ne3: 1); // n_pos rows
1134 ggml_tensor * theta_x = ggml_mul(ctx: ctx0, a: omega_b, b: pos_w);
1135 ggml_tensor * theta_y = ggml_mul(ctx: ctx0, a: omega_b, b: pos_h);
1136 // sin and cos
1137 ggml_tensor * pos_embd_x = ggml_concat(
1138 ctx: ctx0,
1139 a: ggml_sin(ctx: ctx0, a: theta_x),
1140 b: ggml_cos(ctx: ctx0, a: theta_x),
1141 dim: 0 // concat on first dim
1142 );
1143 ggml_tensor * pos_embd_y = ggml_concat(
1144 ctx: ctx0,
1145 a: ggml_sin(ctx: ctx0, a: theta_y),
1146 b: ggml_cos(ctx: ctx0, a: theta_y),
1147 dim: 0 // concat on first dim
1148 );
1149 pos_embed = ggml_concat(ctx: ctx0, a: pos_embd_x, b: pos_embd_y, dim: 0);
1150 }
1151
1152 // k = v + pos_embed
1153 ggml_tensor * k = ggml_add(ctx: ctx0, a: v, b: pos_embed);
1154
1155 // attention
1156 {
1157 const int d_head = 128;
1158 int n_head = n_embd_proj/d_head;
            // the number of learned query tokens comes from the model config (minicpmv_query_num)
1160 int num_query = ctx->model.hparams.minicpmv_query_num;
1161 ggml_tensor * Q = ggml_add(ctx: ctx0,
1162 a: ggml_mul_mat(ctx: ctx0, a: model.mm_model_attn_q_w, b: q),
1163 b: model.mm_model_attn_q_b);
1164 ggml_tensor * K = ggml_add(ctx: ctx0,
1165 a: ggml_mul_mat(ctx: ctx0, a: model.mm_model_attn_k_w, b: k),
1166 b: model.mm_model_attn_k_b);
1167 ggml_tensor * V = ggml_add(ctx: ctx0,
1168 a: ggml_mul_mat(ctx: ctx0, a: model.mm_model_attn_v_w, b: v),
1169 b: model.mm_model_attn_v_b);
1170
1171 Q = ggml_reshape_3d(ctx: ctx0, a: Q, ne0: d_head, ne1: n_head, ne2: num_query);
1172 K = ggml_reshape_3d(ctx: ctx0, a: K, ne0: d_head, ne1: n_head, ne2: n_pos);
1173 V = ggml_reshape_3d(ctx: ctx0, a: V, ne0: d_head, ne1: n_head, ne2: n_pos);
1174
1175 cb(cur0: Q, name: "resampler_Q", il: -1);
1176 cb(cur0: K, name: "resampler_K", il: -1);
1177 cb(cur0: V, name: "resampler_V", il: -1);
1178
1179 embeddings = build_attn(
1180 wo: model.mm_model_attn_o_w,
1181 wo_b: model.mm_model_attn_o_b,
1182 q_cur: Q, k_cur: K, v_cur: V, kq_mask: nullptr, kq_scale, il: -1);
1183 cb(cur0: embeddings, name: "resampler_attn_out", il: -1);
1184 }
1185 // layernorm
1186 embeddings = build_norm(cur: embeddings, mw: model.mm_model_ln_post_w, mb: model.mm_model_ln_post_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1187
1188 // projection
1189 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_model_proj, b: embeddings);
1190
1191 // build the graph
1192 ggml_build_forward_expand(cgraph: gf, tensor: embeddings);
1193
1194 return gf;
1195 }
1196
1197 ggml_cgraph * build_internvl() {
1198 GGML_ASSERT(model.class_embedding != nullptr);
1199 GGML_ASSERT(model.position_embeddings != nullptr);
1200
1201 const int n_pos = n_patches + 1;
1202 ggml_tensor * inp = build_inp();
1203
1204 // add CLS token
1205 inp = ggml_concat(ctx: ctx0, a: inp, b: model.class_embedding, dim: 1);
1206
1207 // The larger models use a different ViT, which uses RMS norm instead of layer norm
1208 // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
1209 norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
1210 ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
1211 : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
1212
1213 ggml_tensor * cur = build_vit(
1214 inp, n_pos,
1215 norm_t,
1216 ffn_t: hparams.ffn_op,
1217 learned_pos_embd: model.position_embeddings,
1218 add_pos: nullptr);
1219
1220 // remove CLS token
1221 cur = ggml_view_2d(ctx: ctx0, a: cur,
1222 ne0: n_embd, ne1: n_patches,
1223 nb1: ggml_row_size(type: cur->type, ne: n_embd), offset: 0);
1224
1225 // pixel shuffle
1226 {
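            // pixel shuffle trades spatial resolution for channel depth: each scale_factor x scale_factor
            // group of patches is folded into a single token of n_embd * scale_factor^2 channels,
            // reducing the token count by scale_factor^2 before the MLP projector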
1227 const int scale_factor = model.hparams.n_merge;
1228 const int bsz = 1; // batch size, always 1 for now since we don't support batching
1229 const int height = n_patches_y;
1230 const int width = n_patches_x;
1231 GGML_ASSERT(scale_factor > 0);
1232 cur = ggml_reshape_4d(ctx: ctx0, a: cur, ne0: n_embd * scale_factor, ne1: height / scale_factor, ne2: width, ne3: bsz);
1233 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
1234 cur = ggml_cont_4d(ctx: ctx0, a: cur,
1235 ne0: n_embd * scale_factor * scale_factor,
1236 ne1: height / scale_factor,
1237 ne2: width / scale_factor,
1238 ne3: bsz);
1239 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
1240 // flatten to 2D
1241 cur = ggml_cont_2d(ctx: ctx0, a: cur,
1242 ne0: n_embd * scale_factor * scale_factor,
1243 ne1: cur->ne[1] * cur->ne[2]);
1244 }
1245
1246 // projector (always using GELU activation)
1247 {
1248 // projector LayerNorm uses pytorch's default eps = 1e-5
1249 // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
1250 cur = build_norm(cur, mw: model.mm_0_w, mb: model.mm_0_b, type: NORM_TYPE_NORMAL, norm_eps: 1e-5, il: -1);
1251 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
1252 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_1_b);
1253 cur = ggml_gelu(ctx: ctx0, a: cur);
1254 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_3_w, b: cur);
1255 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_3_b);
1256 }
1257
1258 // build the graph
1259 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1260
1261 return gf;
1262 }
1263
1264 ggml_cgraph * build_llama4() {
1265 GGML_ASSERT(model.class_embedding != nullptr);
1266 GGML_ASSERT(model.position_embeddings != nullptr);
1267
1268 const int n_pos = n_patches + 1; // +1 for [CLS]
1269
1270 // 2D input positions
1271 ggml_tensor * pos_h = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos);
1272 ggml_set_name(tensor: pos_h, name: "pos_h");
1273 ggml_set_input(tensor: pos_h);
1274
1275 ggml_tensor * pos_w = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos);
1276 ggml_set_name(tensor: pos_w, name: "pos_w");
1277 ggml_set_input(tensor: pos_w);
1278
1279 ggml_tensor * inp = build_inp_raw();
1280
1281 // Llama4UnfoldConvolution
1282 {
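            // patch embedding implemented as unfold (im2col) + linear projection: im2col lays out each
            // patch_size x patch_size x 3 patch as a column, and the matmul with patch_embeddings_0
            // projects it to n_embd (equivalent to a Conv2D with stride == kernel_size == patch_size)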
1283 ggml_tensor * kernel = ggml_reshape_4d(ctx: ctx0, a: model.patch_embeddings_0,
1284 ne0: patch_size, ne1: patch_size, ne2: 3, ne3: n_embd);
1285 inp = ggml_im2col(ctx: ctx0, a: kernel, b: inp, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1, is_2D: true, dst_type: inp->type);
1286 inp = ggml_mul_mat(ctx: ctx0, a: model.patch_embeddings_0, b: inp);
1287 inp = ggml_reshape_2d(ctx: ctx0, a: inp, ne0: n_embd, ne1: n_patches);
1288 cb(cur0: inp, name: "patch_conv", il: -1);
1289 }
1290
1291 // add CLS token
1292 inp = ggml_concat(ctx: ctx0, a: inp, b: model.class_embedding, dim: 1);
1293
1294 // build ViT with 2D position embeddings
1295 auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
1296 // first half is X axis and second half is Y axis
1297 // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
1298 // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
1299 return build_rope_2d(ctx0, cur, pos_a: pos_w, pos_b: pos_h, freq_base: hparams.rope_theta, interleave_freq: false);
1300 };
1301 ggml_tensor * cur = build_vit(
1302 inp, n_pos,
1303 norm_t: NORM_TYPE_NORMAL,
1304 ffn_t: hparams.ffn_op,
1305 learned_pos_embd: model.position_embeddings,
1306 add_pos);
1307
1308 // remove CLS token
1309 cur = ggml_view_2d(ctx: ctx0, a: cur,
1310 ne0: n_embd, ne1: n_patches,
1311 nb1: ggml_row_size(type: cur->type, ne: n_embd), offset: 0);
1312
1313 // pixel shuffle
1314 // based on Llama4VisionPixelShuffleMLP
1315 // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
1316 {
1317 const int scale_factor = model.hparams.n_merge;
1318 const int bsz = 1; // batch size, always 1 for now since we don't support batching
1319 GGML_ASSERT(scale_factor > 0);
1320 GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
1321 cur = ggml_reshape_4d(ctx: ctx0, a: cur,
1322 ne0: n_embd * scale_factor,
1323 ne1: n_patches_x / scale_factor,
1324 ne2: n_patches_y,
1325 ne3: bsz);
1326 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
1327 cur = ggml_cont_4d(ctx: ctx0, a: cur,
1328 ne0: n_embd * scale_factor * scale_factor,
1329 ne1: n_patches_x / scale_factor,
1330 ne2: n_patches_y / scale_factor,
1331 ne3: bsz);
1332 //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
1333 // flatten to 2D
1334 cur = ggml_cont_2d(ctx: ctx0, a: cur,
1335 ne0: n_embd * scale_factor * scale_factor,
1336 ne1: n_patches / scale_factor / scale_factor);
1337 cb(cur0: cur, name: "pixel_shuffle", il: -1);
1338 }
1339
1340 // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
1341 {
1342 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_1_w, b: cur);
1343 cur = ggml_gelu(ctx: ctx0, a: cur);
1344 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_2_w, b: cur);
1345 cur = ggml_gelu(ctx: ctx0, a: cur);
1346 cb(cur0: cur, name: "adapter_mlp", il: -1);
1347 }
1348
1349 // Llama4MultiModalProjector
1350 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_model_proj, b: cur);
1351 cb(cur0: cur, name: "projected", il: -1);
1352
1353 // build the graph
1354 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1355
1356 return gf;
1357 }
1358
1359 ggml_cgraph * build_kimivl() {
1360 // 2D input positions
1361 ggml_tensor * pos_h = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
1362 ggml_set_name(tensor: pos_h, name: "pos_h");
1363 ggml_set_input(tensor: pos_h);
1364
1365 ggml_tensor * pos_w = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
1366 ggml_set_name(tensor: pos_w, name: "pos_w");
1367 ggml_set_input(tensor: pos_w);
1368
1369 ggml_tensor * learned_pos_embd = resize_position_embeddings();
1370
1371 // build ViT with 2D position embeddings
1372 auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
1373 // first half is X axis and second half is Y axis
1374 return build_rope_2d(ctx0, cur, pos_a: pos_w, pos_b: pos_h, freq_base: hparams.rope_theta, interleave_freq: false);
1375 };
1376
1377 ggml_tensor * inp = build_inp();
1378 ggml_tensor * cur = build_vit(
1379 inp, n_pos: n_patches,
1380 norm_t: NORM_TYPE_NORMAL,
1381 ffn_t: hparams.ffn_op,
1382 learned_pos_embd,
1383 add_pos);
1384
1385 cb(cur0: cur, name: "vit_out", il: -1);
1386
1387 {
1388 // patch_merger
1389 const int scale_factor = model.hparams.n_merge;
1390 cur = build_patch_merge_permute(cur, scale_factor);
1391
1392 // projection norm
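            // the merged token (n_embd * scale_factor^2 wide) is temporarily viewed as scale_factor^2
            // rows of n_embd so that the LayerNorm is applied over each original n_embd slice,
            // then viewed back to the merged width before the projection MLP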
1393 int proj_inp_dim = cur->ne[0];
1394 cur = ggml_view_2d(ctx: ctx0, a: cur,
1395 ne0: n_embd, ne1: cur->ne[1] * scale_factor * scale_factor,
1396 nb1: ggml_row_size(type: cur->type, ne: n_embd), offset: 0);
1397 cur = ggml_norm(ctx: ctx0, a: cur, eps: 1e-5); // default nn.LayerNorm
1398 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_input_norm_w);
1399 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_input_norm_b);
1400 cur = ggml_view_2d(ctx: ctx0, a: cur,
1401 ne0: proj_inp_dim, ne1: cur->ne[1] / scale_factor / scale_factor,
1402 nb1: ggml_row_size(type: cur->type, ne: proj_inp_dim), offset: 0);
1403 cb(cur0: cur, name: "proj_inp_normed", il: -1);
1404
1405 // projection mlp
1406 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
1407 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_1_b);
1408 cur = ggml_gelu(ctx: ctx0, a: cur);
1409 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
1410 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_2_b);
1411 cb(cur0: cur, name: "proj_out", il: -1);
1412 }
1413
1414 // build the graph
1415 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1416
1417 return gf;
1418 }
1419
1420 // this graph is used by llava, granite and glm
1421 // due to having embedding_stack (used by granite), we cannot reuse build_vit
1422 ggml_cgraph * build_llava() {
1423 const int batch_size = 1;
1424 const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
1425
1426 GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
1427
1428 // Calculate the deepest feature layer based on hparams and projector type
1429 int max_feature_layer = n_layer;
1430 {
1431 // Get the index of the second to last layer; this is the default for models that have a llava projector
1432 int il_last = hparams.n_layer - 1;
1433 int deepest_feature_layer = -1;
1434
1435 if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV || ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
1436 il_last += 1;
1437 }
1438
1439 // If we set explicit vision feature layers, only go up to the deepest one
1440 // NOTE: only used by granite-vision models for now
1441 for (const auto & feature_layer : hparams.vision_feature_layer) {
1442 if (feature_layer > deepest_feature_layer) {
1443 deepest_feature_layer = feature_layer;
1444 }
1445 }
1446 max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
1447 }
1448
1449 ggml_tensor * inp = build_inp();
1450
1451 // concat class_embeddings and patch_embeddings
1452 if (model.class_embedding) {
1453 inp = ggml_concat(ctx: ctx0, a: inp, b: model.class_embedding, dim: 1);
1454 }
1455
1456 ggml_tensor * positions = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos);
1457 ggml_set_name(tensor: positions, name: "positions");
1458 ggml_set_input(tensor: positions);
1459
1460 inp = ggml_add(ctx: ctx0, a: inp, b: ggml_get_rows(ctx: ctx0, a: model.position_embeddings, b: positions));
1461
1462 ggml_tensor * inpL = inp;
1463
1464 // pre-layernorm
1465 if (model.pre_ln_w) {
1466 inpL = build_norm(cur: inpL, mw: model.pre_ln_w, mb: model.pre_ln_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1467 cb(cur0: inpL, name: "pre_ln", il: -1);
1468 }
1469
1470 std::vector<ggml_tensor *> embedding_stack;
1471 const auto & vision_feature_layer = hparams.vision_feature_layer;
1472
1473 // loop over layers
1474 for (int il = 0; il < max_feature_layer; il++) {
1475 auto & layer = model.layers[il];
1476 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
1477
1478 // If this is an embedding feature layer, save the output.
1479 // NOTE: 0 index here refers to the input to the encoder.
1480 if (vision_feature_layer.find(x: il) != vision_feature_layer.end()) {
1481 embedding_stack.push_back(x: cur);
1482 }
1483
1484 // layernorm1
1485 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il);
1486 cb(cur0: cur, name: "layer_inp_normed", il);
1487
1488 // self-attention
1489 {
1490 ggml_tensor * Qcur = ggml_mul_mat(ctx: ctx0, a: layer.q_w, b: cur);
1491 if (layer.q_b) {
1492 Qcur = ggml_add(ctx: ctx0, a: Qcur, b: layer.q_b);
1493 }
1494
1495 ggml_tensor * Kcur = ggml_mul_mat(ctx: ctx0, a: layer.k_w, b: cur);
1496 if (layer.k_b) {
1497 Kcur = ggml_add(ctx: ctx0, a: Kcur, b: layer.k_b);
1498 }
1499
1500 ggml_tensor * Vcur = ggml_mul_mat(ctx: ctx0, a: layer.v_w, b: cur);
1501 if (layer.v_b) {
1502 Vcur = ggml_add(ctx: ctx0, a: Vcur, b: layer.v_b);
1503 }
1504
1505 Qcur = ggml_reshape_3d(ctx: ctx0, a: Qcur, ne0: d_head, ne1: n_head, ne2: n_pos);
1506 Kcur = ggml_reshape_3d(ctx: ctx0, a: Kcur, ne0: d_head, ne1: n_head, ne2: n_pos);
1507 Vcur = ggml_reshape_3d(ctx: ctx0, a: Vcur, ne0: d_head, ne1: n_head, ne2: n_pos);
1508
1509 cb(cur0: Qcur, name: "Qcur", il);
1510 cb(cur0: Kcur, name: "Kcur", il);
1511 cb(cur0: Vcur, name: "Vcur", il);
1512
1513 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
1514 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: nullptr, kq_scale, il);
1515 cb(cur0: cur, name: "attn_out", il);
1516 }
1517
1518 // re-add the layer input, i.e., the residual connection
1519 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
1520
1521 inpL = cur; // inpL = residual, cur = hidden_states
1522
1523 cb(cur0: cur, name: "ffn_inp", il);
1524
1525 // layernorm2
1526 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il);
1527 cb(cur0: cur, name: "ffn_inp_normed", il);
1528
1529 // ffn
1530 cur = build_ffn(cur,
1531 up: layer.ff_up_w, up_b: layer.ff_up_b,
1532 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
1533 down: layer.ff_down_w, down_b: layer.ff_down_b,
1534 type_op: hparams.ffn_op, il);
1535
1536 cb(cur0: cur, name: "ffn_out", il);
1537
1538 // residual 2
1539 cur = ggml_add(ctx: ctx0, a: inpL, b: cur);
1540 cb(cur0: cur, name: "layer_out", il);
1541
1542 inpL = cur;
1543 }
1544
1545 // post-layernorm
1546 if (model.post_ln_w) {
1547 inpL = build_norm(cur: inpL, mw: model.post_ln_w, mb: model.post_ln_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1548 }
1549
1550 ggml_tensor * embeddings = inpL;
1551
1552 // process vision feature layers (used by granite)
1553 {
1554 // final layer is a vision feature layer
1555 if (vision_feature_layer.find(x: max_feature_layer) != vision_feature_layer.end()) {
1556 embedding_stack.push_back(x: inpL);
1557 }
1558
1559 // If feature layers are explicitly set, stack them (if we have multiple)
1560 if (!embedding_stack.empty()) {
1561 embeddings = embedding_stack[0];
1562 for (size_t i = 1; i < embedding_stack.size(); i++) {
1563 embeddings = ggml_concat(ctx: ctx0, a: embeddings, b: embedding_stack[i], dim: 0);
1564 }
1565 }
1566 }
1567
1568 // llava projector (also used by granite)
1569 if (ctx->model.hparams.has_llava_projector) {
1570 embeddings = ggml_reshape_2d(ctx: ctx0, a: embeddings, ne0: embeddings->ne[0], ne1: embeddings->ne[1]);
1571
1572 ggml_tensor * patches = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
1573 ggml_set_name(tensor: patches, name: "patches");
1574 ggml_set_input(tensor: patches);
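// 'patches' holds the indices of the tokens to keep; for models with a class embedding these indices are
// expected to skip the CLS position so that only patch tokens reach the projector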
1575
1576 // shape [1, 576, 1024]
1577 // ne is whcn, ne = [1024, 576, 1, 1]
1578 embeddings = ggml_get_rows(ctx: ctx0, a: embeddings, b: patches);
1579
1580 // print_tensor_info(embeddings, "embeddings");
1581
1582 // llava projector
1583 if (ctx->proj_type() == PROJECTOR_TYPE_MLP) {
1584 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_0_w, b: embeddings);
1585 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_0_b);
1586
1587 embeddings = ggml_gelu(ctx: ctx0, a: embeddings);
1588 if (model.mm_2_w) {
1589 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: embeddings);
1590 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_2_b);
1591 }
1592 }
1593 else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) {
1594 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_0_w, b: embeddings);
1595 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_0_b);
1596 // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
1597 // First LayerNorm
1598 embeddings = ggml_norm(ctx: ctx0, a: embeddings, eps);
1599 embeddings = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: embeddings, b: model.mm_1_w),
1600 b: model.mm_1_b);
1601
1602 // GELU activation
1603 embeddings = ggml_gelu(ctx: ctx0, a: embeddings);
1604
1605 // Second linear layer
1606 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_3_w, b: embeddings);
1607 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_3_b);
1608
1609 // Second LayerNorm
1610 embeddings = ggml_norm(ctx: ctx0, a: embeddings, eps);
1611 embeddings = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: embeddings, b: model.mm_4_w),
1612 b: model.mm_4_b);
1613 }
1614 else if (ctx->proj_type() == PROJECTOR_TYPE_LDP) {
1615 // MobileVLM projector
1616 int n_patch = 24;
1617 ggml_tensor * mlp_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_1_w, b: embeddings);
1618 mlp_1 = ggml_add(ctx: ctx0, a: mlp_1, b: model.mm_model_mlp_1_b);
1619 mlp_1 = ggml_gelu(ctx: ctx0, a: mlp_1);
1620 ggml_tensor * mlp_3 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_3_w, b: mlp_1);
1621 mlp_3 = ggml_add(ctx: ctx0, a: mlp_3, b: model.mm_model_mlp_3_b);
1622 // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
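// the two blocks below follow the MobileVLM LDP design: depthwise 3x3 conv + layernorm, a squeeze-excitation
// style gate (global avg pool -> fc -> relu -> fc -> hardsigmoid), and a pointwise projection + layernorm;
// the second block additionally downsamples with stride 2. n_patch = 24 assumes a 24x24 patch grid
// (e.g. a 336px input with 14px patches)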
1623
1624 // block 1
1625 ggml_tensor * block_1 = nullptr;
1626 {
1627 // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
1628 mlp_3 = ggml_permute(ctx: ctx0, a: mlp_3, axis0: 1, axis1: 0, axis2: 2, axis3: 3);
1629 mlp_3 = ggml_cont_4d(ctx: ctx0, a: mlp_3, ne0: n_patch, ne1: n_patch, ne2: mlp_3->ne[1], ne3: mlp_3->ne[2]);
1630 // stride = 1, padding = 1, bias is nullptr
1631 block_1 = ggml_conv_2d_dw(ctx: ctx0, a: model.mm_model_block_1_block_0_0_w, b: mlp_3, s0: 1, s1: 1, p0: 1, p1: 1, d0: 1, d1: 1);
1632
1633 // layer norm
1634 // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1635 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 1, axis1: 2, axis2: 0, axis3: 3));
1636 // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1637 block_1 = ggml_norm(ctx: ctx0, a: block_1, eps);
1638 block_1 = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: block_1, b: model.mm_model_block_1_block_0_1_w), b: model.mm_model_block_1_block_0_1_b);
1639 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 2, axis1: 0, axis2: 1, axis3: 3));
1640
1641 // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1642 // hardswish
1643 ggml_tensor * block_1_hw = ggml_hardswish(ctx: ctx0, a: block_1);
1644
1645 block_1 = ggml_pool_2d(ctx: ctx0, a: block_1_hw, op: GGML_OP_POOL_AVG, k0: block_1_hw->ne[0], k1: block_1_hw->ne[1], s0: block_1_hw->ne[0], s1: block_1_hw->ne[1], p0: 0, p1: 0);
1646 // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1647 // pointwise conv
1648 block_1 = ggml_reshape_2d(ctx: ctx0, a: block_1, ne0: block_1->ne[0]*block_1->ne[1]*block_1->ne[2], ne1: block_1->ne[3]);
1649 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_1_block_1_fc1_w, b: block_1);
1650 block_1 = ggml_add(ctx: ctx0, a: block_1, b: model.mm_model_block_1_block_1_fc1_b);
1651 block_1 = ggml_relu(ctx: ctx0, a: block_1);
1652 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_1_block_1_fc2_w, b: block_1);
1653 block_1 = ggml_add(ctx: ctx0, a: block_1, b: model.mm_model_block_1_block_1_fc2_b);
1654 block_1 = ggml_hardsigmoid(ctx: ctx0, a: block_1);
1655 // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
1656 block_1 = ggml_reshape_4d(ctx: ctx0, a: block_1, ne0: 1, ne1: 1, ne2: block_1->ne[0], ne3: block_1->ne[1]);
1657 block_1 = ggml_mul(ctx: ctx0, a: block_1_hw, b: block_1);
1658
1659 int w = block_1->ne[0], h = block_1->ne[1];
1660 block_1 = ggml_reshape_3d(ctx: ctx0, a: block_1, ne0: w*h, ne1: block_1->ne[2], ne2: block_1->ne[3]);
1661 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 1, axis1: 0, axis2: 2, axis3: 3));
1662
1663 // block_1 shape = [1, 24*24, 2048], ne = [2048, 24*24, 1]
1664 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_1_block_2_0_w, b: block_1);
1665 block_1 = ggml_reshape_4d(ctx: ctx0, a: block_1, ne0: block_1->ne[0], ne1: w, ne2: h, ne3: block_1->ne[3]);
1666
1667 // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1668 block_1 = ggml_norm(ctx: ctx0, a: block_1, eps);
1669 block_1 = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: block_1, b: model.mm_model_block_1_block_2_1_w), b: model.mm_model_block_1_block_2_1_b);
1670 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 2, axis1: 0, axis2: 1, axis3: 3));
1671 // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1672 // residual
1673 block_1 = ggml_add(ctx: ctx0, a: mlp_3, b: block_1);
1674 }
1675
1676 // block_2
1677 {
1678 // stride = 2
1679 block_1 = ggml_conv_2d_dw(ctx: ctx0, a: model.mm_model_block_2_block_0_0_w, b: block_1, s0: 2, s1: 2, p0: 1, p1: 1, d0: 1, d1: 1);
1680
1681 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1682 // layer norm
1683 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 1, axis1: 2, axis2: 0, axis3: 3));
1684 // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1685 block_1 = ggml_norm(ctx: ctx0, a: block_1, eps);
1686 block_1 = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: block_1, b: model.mm_model_block_2_block_0_1_w), b: model.mm_model_block_2_block_0_1_b);
1687 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 2, axis1: 0, axis2: 1, axis3: 3));
1688 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1689 // hardswish
1690 ggml_tensor * block_1_hw = ggml_hardswish(ctx: ctx0, a: block_1);
1691
1692 // not sure the parameters are right for global average pooling
1693 block_1 = ggml_pool_2d(ctx: ctx0, a: block_1_hw, op: GGML_OP_POOL_AVG, k0: block_1_hw->ne[0], k1: block_1_hw->ne[1], s0: block_1_hw->ne[0], s1: block_1_hw->ne[1], p0: 0, p1: 0);
1694 // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1695 // pointwise conv
1696 block_1 = ggml_reshape_2d(ctx: ctx0, a: block_1, ne0: block_1->ne[0]*block_1->ne[1]*block_1->ne[2], ne1: block_1->ne[3]);
1697 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_2_block_1_fc1_w, b: block_1);
1698 block_1 = ggml_add(ctx: ctx0, a: block_1, b: model.mm_model_block_2_block_1_fc1_b);
1699 block_1 = ggml_relu(ctx: ctx0, a: block_1);
1700 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_2_block_1_fc2_w, b: block_1);
1701 block_1 = ggml_add(ctx: ctx0, a: block_1, b: model.mm_model_block_2_block_1_fc2_b);
1702 block_1 = ggml_hardsigmoid(ctx: ctx0, a: block_1);
1703
1704 // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1705 block_1 = ggml_reshape_4d(ctx: ctx0, a: block_1, ne0: 1, ne1: 1, ne2: block_1->ne[0], ne3: block_1->ne[1]);
1706 block_1 = ggml_mul(ctx: ctx0, a: block_1_hw, b: block_1);
1707
1708 int w = block_1->ne[0], h = block_1->ne[1];
1709 block_1 = ggml_reshape_3d(ctx: ctx0, a: block_1, ne0: w*h, ne1: block_1->ne[2], ne2: block_1->ne[3]);
1710 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 1, axis1: 0, axis2: 2, axis3: 3));
1711 // block_1 shape = [1, 12*12, 2048], ne = [2048, 12*12, 1]
1712 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_2_block_2_0_w, b: block_1);
1713 block_1 = ggml_reshape_4d(ctx: ctx0, a: block_1, ne0: block_1->ne[0], ne1: w, ne2: h, ne3: block_1->ne[3]);
1714
1715
1716 // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1717 block_1 = ggml_norm(ctx: ctx0, a: block_1, eps);
1718 block_1 = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: block_1, b: model.mm_model_block_2_block_2_1_w), b: model.mm_model_block_2_block_2_1_b);
1719 block_1 = ggml_reshape_3d(ctx: ctx0, a: block_1, ne0: block_1->ne[0], ne1: block_1->ne[1] * block_1->ne[2], ne2: block_1->ne[3]);
1720 // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
1721 }
1722 embeddings = block_1;
1723 }
1724 else if (ctx->proj_type() == PROJECTOR_TYPE_LDPV2)
1725 {
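// LDPv2 projector: a 2-layer MLP, 2x2 average pooling (24x24 patches -> 12x12), then a depthwise-conv
// positional encoding (PEG) added back as a residual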
1726 int n_patch = 24;
1727 ggml_tensor * mlp_0 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_0_w, b: embeddings);
1728 mlp_0 = ggml_add(ctx: ctx0, a: mlp_0, b: model.mm_model_mlp_0_b);
1729 mlp_0 = ggml_gelu(ctx: ctx0, a: mlp_0);
1730 ggml_tensor * mlp_2 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_2_w, b: mlp_0);
1731 mlp_2 = ggml_add(ctx: ctx0, a: mlp_2, b: model.mm_model_mlp_2_b);
1732 // mlp_2 ne = [2048, 576, 1, 1]
1733 // avg pool layer 2x2, stride 2
1734 mlp_2 = ggml_permute(ctx: ctx0, a: mlp_2, axis0: 1, axis1: 0, axis2: 2, axis3: 3);
1735 // mlp_2 ne = [576, 2048, 1, 1]
1736 mlp_2 = ggml_cont_4d(ctx: ctx0, a: mlp_2, ne0: n_patch, ne1: n_patch, ne2: mlp_2->ne[1], ne3: mlp_2->ne[2]);
1737 // mlp_2 ne = [24, 24, 2048, 1]
1738 mlp_2 = ggml_pool_2d(ctx: ctx0, a: mlp_2, op: GGML_OP_POOL_AVG, k0: 2, k1: 2, s0: 2, s1: 2, p0: 0, p1: 0);
1739 // weight ne = [3, 3, 2048, 1]
1740 ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx: ctx0, a: model.mm_model_peg_0_w, b: mlp_2, s0: 1, s1: 1, p0: 1, p1: 1, d0: 1, d1: 1);
1741 peg_0 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: peg_0, axis0: 1, axis1: 2, axis2: 0, axis3: 3));
1742 peg_0 = ggml_add(ctx: ctx0, a: peg_0, b: model.mm_model_peg_0_b);
1743 mlp_2 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: mlp_2, axis0: 1, axis1: 2, axis2: 0, axis3: 3));
1744 peg_0 = ggml_add(ctx: ctx0, a: peg_0, b: mlp_2);
1745 peg_0 = ggml_reshape_3d(ctx: ctx0, a: peg_0, ne0: peg_0->ne[0], ne1: peg_0->ne[1] * peg_0->ne[2], ne2: peg_0->ne[3]);
1746 embeddings = peg_0;
1747 }
1748 else {
1749 GGML_ABORT("fatal error");
1750 }
1751 }
1752
1753 // glm projector
1754 else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
1755 size_t gridsz = (size_t)sqrt(x: embeddings->ne[1]);
1756 embeddings = ggml_permute(ctx: ctx0, a: embeddings, axis0: 1, axis1: 0, axis2: 2, axis3: 3);
1757 embeddings = ggml_cont_3d(ctx: ctx0, a: embeddings, ne0: gridsz, ne1: gridsz, ne2: embeddings->ne[1]);
1758 embeddings = ggml_conv_2d(ctx: ctx0, a: model.mm_model_adapter_conv_w, b: embeddings, s0: 2, s1: 2, p0: 0, p1: 0, d0: 1, d1: 1);
1759 embeddings = ggml_reshape_3d(ctx: ctx0, a: embeddings, ne0: embeddings->ne[0]*embeddings->ne[1], ne1: embeddings->ne[2], ne2: batch_size);
1760 embeddings = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: embeddings, axis0: 1, axis1: 0, axis2: 2, axis3: 3));
1761 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_model_adapter_conv_b);
1762 // GLU
1763 {
1764 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_0_w, b: embeddings);
1765 embeddings = ggml_norm(ctx: ctx0, a: embeddings, eps);
1766 embeddings = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: embeddings, b: model.mm_model_ln_q_w), b: model.mm_model_ln_q_b);
1767 embeddings = ggml_gelu_inplace(ctx: ctx0, a: embeddings);
1768 ggml_tensor * x = embeddings;
1769 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_2_w, b: embeddings);
1770 x = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_1_w, b: x);
1771 embeddings = ggml_swiglu_split(ctx: ctx0, a: embeddings, b: x);
1772 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_3_w, b: embeddings);
1773 }
1774 // arrangement of BOI/EOI token embeddings
1775 // note: these embeddings are not present in the text model, hence we cannot process them as text tokens
1776 // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
1777 {
1778 embeddings = ggml_concat(ctx: ctx0, a: model.mm_boi, b: embeddings, dim: 1); // BOI
1779 embeddings = ggml_concat(ctx: ctx0, a: embeddings, b: model.mm_eoi, dim: 1); // EOI
1780 }
1781 }
1782
1783 else {
1784 GGML_ABORT("llava: unknown projector type");
1785 }
1786
1787 // build the graph
1788 ggml_build_forward_expand(cgraph: gf, tensor: embeddings);
1789
1790 return gf;
1791 }

1792 // whisper encoder with custom projector
1793 ggml_cgraph * build_whisper_enc() {
1794 const int n_frames = img.nx;
1795 const int n_pos = n_frames / 2;
1796 GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
1797
1798 ggml_tensor * inp = build_inp_raw(channels: 1);
1799
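// the conv1d block downsamples the time dimension by 2 (the second conv uses stride 2),
// which is why n_pos = n_frames / 2 above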
1800 // conv1d block
1801 {
1802 // convolution + gelu
1803 ggml_tensor * cur = ggml_conv_1d_ph(ctx: ctx0, a: model.conv1d_1_w, b: inp, s: 1, d: 1);
1804 cur = ggml_add(ctx: ctx0, a: cur, b: model.conv1d_1_b);
1805
1806 cur = ggml_gelu_erf(ctx: ctx0, a: cur);
1807
1808 cur = ggml_conv_1d_ph(ctx: ctx0, a: model.conv1d_2_w, b: cur, s: 2, d: 1);
1809 cur = ggml_add(ctx: ctx0, a: cur, b: model.conv1d_2_b);
1810
1811 cur = ggml_gelu_erf(ctx: ctx0, a: cur);
1812 // transpose
1813 inp = ggml_cont(ctx: ctx0, a: ggml_transpose(ctx: ctx0, a: cur));
1814 cb(cur0: inp, name: "after_conv1d", il: -1);
1815 }
1816
1817 // sanity check (only check one layer, but it should be the same for all)
1818 GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
1819 GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
1820 GGML_ASSERT(model.layers[0].q_b);
1821 GGML_ASSERT(model.layers[0].v_b);
1822 GGML_ASSERT(!model.layers[0].k_b); // no bias for k
1823 GGML_ASSERT(model.post_ln_w && model.post_ln_b);
1824
1825 ggml_tensor * pos_embd_selected = ggml_view_2d(
1826 ctx: ctx0, a: model.position_embeddings,
1827 ne0: model.position_embeddings->ne[0], ne1: n_pos,
1828 nb1: model.position_embeddings->nb[1], offset: 0
1829 );
1830 ggml_tensor * cur = build_vit(
1831 inp, n_pos,
1832 norm_t: NORM_TYPE_NORMAL,
1833 ffn_t: hparams.ffn_op,
1834 learned_pos_embd: pos_embd_selected,
1835 add_pos: nullptr);
1836
1837 cb(cur0: cur, name: "after_transformer", il: -1);
1838
1839 if (model.audio_has_stack_frames()) {
1840 // StackAudioFrames
1841 // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
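// stack proj_stack_factor consecutive frames along the embedding dimension; the sequence is
// zero-padded first so that its total element count is divisible by the stack stride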
1842 int64_t stride = n_embd * hparams.proj_stack_factor;
1843 int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
1844 int64_t pad = padded_len - ggml_nelements(tensor: cur);
1845 if (pad > 0) {
1846 cur = ggml_view_1d(ctx: ctx0, a: cur, ne0: ggml_nelements(tensor: cur), offset: 0);
1847 cur = ggml_pad(ctx: ctx0, a: cur, p0: pad, p1: 0, p2: 0, p3: 0);
1848 }
1849 cur = ggml_view_2d(ctx: ctx0, a: cur, ne0: stride, ne1: padded_len / stride,
1850 nb1: ggml_row_size(type: cur->type, ne: stride), offset: 0);
1851 cb(cur0: cur, name: "after_stacked", il: -1);
1852 }
1853
1854 if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) {
1855 // UltravoxProjector
1856 // pre-norm
1857 cur = ggml_rms_norm(ctx: ctx0, a: cur, eps: 1e-6);
1858 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_norm_pre_w);
1859
1860 // ffn in
1861 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
1862
1863 // swiglu
1864 // see SwiGLU in ultravox_model.py: the second half of the tensor is passed through silu, not the first half
1865 cur = ggml_swiglu_swapped(ctx: ctx0, a: cur);
1866
1867 // mid-norm
1868 cur = ggml_rms_norm(ctx: ctx0, a: cur, eps: 1e-6);
1869 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_norm_mid_w);
1870
1871 // ffn out
1872 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
1873
1874 } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) {
1875 // projector
1876 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_fc_w, b: cur);
1877 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_fc_b);
1878
1879 } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
1880 // projector
1881 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
1882 cur = ggml_gelu_erf(ctx: ctx0, a: cur);
1883 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
1884
1885 } else {
1886 GGML_ABORT("%s: unknown projector type", __func__);
1887 }
1888
1889 cb(cur0: cur, name: "projected", il: -1);
1890
1891 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1892
1893 return gf;
1894 }
1895
1896 // cogvlm vision encoder
1897 ggml_cgraph * build_cogvlm() {
1898 GGML_ASSERT(model.class_embedding != nullptr);
1899 GGML_ASSERT(model.position_embeddings != nullptr);
1900
1901 const int n_pos = n_patches + 1; // +1 for [CLS]
1902
1903 // build input and concatenate class embedding
1904 ggml_tensor * inp = build_inp();
1905 inp = ggml_concat(ctx: ctx0, a: inp, b: model.class_embedding, dim: 1);
1906
1907 inp = ggml_add(ctx: ctx0, a: inp, b: model.position_embeddings);
1908 cb(cur0: inp, name: "inp_pos", il: -1);
1909
1910 ggml_tensor * inpL = inp;
1911
1912 for (int il = 0; il < n_layer; il++) {
1913 auto & layer = model.layers[il];
1914 ggml_tensor * cur = inpL;
1915
1916 cur = ggml_mul_mat(ctx: ctx0, a: layer.qkv_w, b: cur);
1917
1918 cur = ggml_add(ctx: ctx0, a: cur, b: layer.qkv_b);
1919
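// split the fused QKV projection into Q, K and V views along dim 0 (at offsets 0, n_embd and 2*n_embd)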
1920 ggml_tensor * Qcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
1921 nb2: cur->nb[1], offset: 0);
1922 ggml_tensor * Kcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
1923 nb2: cur->nb[1], offset: n_embd * sizeof(float));
1924 ggml_tensor * Vcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
1925 nb2: cur->nb[1], offset: 2 * n_embd * sizeof(float));
1926
1927 cb(cur0: Qcur, name: "Qcur", il);
1928 cb(cur0: Kcur, name: "Kcur", il);
1929 cb(cur0: Vcur, name: "Vcur", il);
1930
1931 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
1932 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: nullptr, kq_scale, il);
1933 cb(cur0: cur, name: "attn_out", il);
1934
1935 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il);
1936 cb(cur0: cur, name: "attn_post_norm", il);
1937
1938 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
1939 inpL = cur;
1940
1941 cur = build_ffn(cur,
1942 up: layer.ff_up_w, up_b: layer.ff_up_b,
1943 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
1944 down: layer.ff_down_w, down_b: layer.ff_down_b,
1945 type_op: hparams.ffn_op, il);
1946
1947 cb(cur0: cur, name: "ffn_out", il);
1948
1949 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il);
1950 cb(cur0: cur, name: "ffn_post_norm", il);
1951
1952 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
1953 cb(cur0: cur, name: "layer_out", il);
1954 inpL = cur;
1955
1956 }
1957
1958 // remove CLS token (like build_llama4 does)
1959 ggml_tensor * cur = ggml_view_2d(ctx: ctx0, a: inpL,
1960 ne0: n_embd, ne1: n_patches,
1961 nb1: ggml_row_size(type: inpL->type, ne: n_embd), offset: 0);
1962
1963 // Multiply with mm_model_proj
1964 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_model_proj, b: cur);
1965
1966 // Apply layernorm, weight, bias
1967 cur = build_norm(cur, mw: model.mm_post_fc_norm_w, mb: model.mm_post_fc_norm_b, type: NORM_TYPE_NORMAL, norm_eps: 1e-5, il: -1);
1968
1969 // Apply GELU
1970 cur = ggml_gelu_inplace(ctx: ctx0, a: cur);
1971
1972 // Branch 1: multiply with mm_h_to_4h_w
1973 ggml_tensor * h_to_4h = ggml_mul_mat(ctx: ctx0, a: model.mm_h_to_4h_w, b: cur);
1974
1975 // Branch 2: multiply with mm_gate_w
1976 ggml_tensor * gate = ggml_mul_mat(ctx: ctx0, a: model.mm_gate_w, b: cur);
1977
1978 // Apply silu
1979 gate = ggml_swiglu_split(ctx: ctx0, a: gate, b: h_to_4h);
1980
1981 // Apply mm_4h_to_h_w
1982 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_4h_to_h_w, b: gate);
1983
1984 // Concatenate with boi and eoi
1985 cur = ggml_concat(ctx: ctx0, a: model.mm_boi, b: cur, dim: 1);
1986 cur = ggml_concat(ctx: ctx0, a: cur, b: model.mm_eoi, dim: 1);
1987
1988 // build the graph
1989 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1990
1991 return gf;
1992 }
1993
1994private:
1995 //
1996 // utility functions
1997 //
1998
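// debug callback: when debug_graph is enabled, copy the tensor under a readable name and mark it as a
// graph output so that its values can be inspected after compute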
1999 void cb(ggml_tensor * cur0, const char * name, int il) const {
2000 if (ctx->debug_graph) {
2001 ggml_tensor * cur = ggml_cpy(ctx: ctx0, a: cur0, b: ggml_dup_tensor(ctx: ctx0, src: cur0));
2002 std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(val: il) : name;
2003 ggml_set_name(tensor: cur, name: cur_name.c_str());
2004 ggml_set_output(tensor: cur);
2005 ggml_build_forward_expand(cgraph: gf, tensor: cur);
2006 ctx->debug_print_tensors.push_back(x: cur);
2007 }
2008 }
2009
2010 // siglip2 naflex
2011 ggml_tensor * resize_position_embeddings() {
2012 ggml_tensor * pos_embd = model.position_embeddings;
2013 const int height = img.ny / patch_size;
2014 const int width = img.nx / patch_size;
2015 const uint32_t mode = GGML_SCALE_MODE_BILINEAR;
2016 GGML_ASSERT(pos_embd);
2017 const int n_per_side = (int)std::sqrt(x: pos_embd->ne[1]);
2018 
2020 if (height == n_per_side && width == n_per_side) {
2021 return pos_embd;
2022 }
2023
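// otherwise, bilinearly resize the learned (n_per_side x n_per_side) position-embedding grid to (width x height)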
2024 pos_embd = ggml_reshape_3d(ctx: ctx0, a: pos_embd, ne0: n_embd, ne1: n_per_side, ne2: n_per_side); // -> (n_embd, n_per_side, n_per_side)
2025 pos_embd = ggml_permute(ctx: ctx0, a: pos_embd, axis0: 2, axis1: 0, axis2: 1, axis3: 3); // -> (n_per_side, n_per_side, n_embd)
2026 pos_embd = ggml_interpolate(ctx: ctx0, a: pos_embd, ne0: width, ne1: height, ne2: n_embd, ne3: 1, mode); // -> (width, height, n_embd)
2027 pos_embd = ggml_permute(ctx: ctx0, a: pos_embd, axis0: 1, axis1: 2, axis2: 0, axis3: 3); // -> (n_embd, width, height)
2028 pos_embd = ggml_cont_2d(ctx: ctx0, a: pos_embd, ne0: n_embd, ne1: width * height); // -> (n_embd, width * height)
2029
2030 return pos_embd;
2031 }
2032
2033 // build vision transformer (ViT) cgraph
2034 // this function should cover most of the models
2035 // if your model has specific features, you should probably duplicate this function
2036 ggml_tensor * build_vit(
2037 ggml_tensor * inp,
2038 int64_t n_pos,
2039 norm_type norm_t,
2040 ffn_op_type ffn_t,
2041 ggml_tensor * learned_pos_embd,
2042 std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
2043 ) {
2044 if (learned_pos_embd) {
2045 inp = ggml_add(ctx: ctx0, a: inp, b: learned_pos_embd);
2046 cb(cur0: inp, name: "pos_embed", il: -1);
2047 }
2048
2049 ggml_tensor * inpL = inp;
2050
2051 // pre-layernorm
2052 if (model.pre_ln_w) {
2053 inpL = build_norm(cur: inpL, mw: model.pre_ln_w, mb: model.pre_ln_b, type: norm_t, norm_eps: eps, il: -1);
2054 cb(cur0: inpL, name: "pre_ln", il: -1);
2055 }
2056
2057 // loop over layers
2058 for (int il = 0; il < n_layer; il++) {
2059 auto & layer = model.layers[il];
2060 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
2061
2062 // layernorm1
2063 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: norm_t, norm_eps: eps, il);
2064 cb(cur0: cur, name: "layer_inp_normed", il);
2065
2066 // self-attention
2067 {
2068 ggml_tensor * Qcur = ggml_mul_mat(ctx: ctx0, a: layer.q_w, b: cur);
2069 if (layer.q_b) {
2070 Qcur = ggml_add(ctx: ctx0, a: Qcur, b: layer.q_b);
2071 }
2072
2073 ggml_tensor * Kcur = ggml_mul_mat(ctx: ctx0, a: layer.k_w, b: cur);
2074 if (layer.k_b) {
2075 Kcur = ggml_add(ctx: ctx0, a: Kcur, b: layer.k_b);
2076 }
2077
2078 ggml_tensor * Vcur = ggml_mul_mat(ctx: ctx0, a: layer.v_w, b: cur);
2079 if (layer.v_b) {
2080 Vcur = ggml_add(ctx: ctx0, a: Vcur, b: layer.v_b);
2081 }
2082
2083 if (layer.q_norm) {
2084 Qcur = build_norm(cur: Qcur, mw: layer.q_norm, NULL, type: norm_t, norm_eps: eps, il);
2085 cb(cur0: Qcur, name: "Qcur_norm", il);
2086 }
2087
2088 if (layer.k_norm) {
2089 Kcur = build_norm(cur: Kcur, mw: layer.k_norm, NULL, type: norm_t, norm_eps: eps, il);
2090 cb(cur0: Kcur, name: "Kcur_norm", il);
2091 }
2092
2093 Qcur = ggml_reshape_3d(ctx: ctx0, a: Qcur, ne0: d_head, ne1: n_head, ne2: n_pos);
2094 Kcur = ggml_reshape_3d(ctx: ctx0, a: Kcur, ne0: d_head, ne1: n_head, ne2: n_pos);
2095 Vcur = ggml_reshape_3d(ctx: ctx0, a: Vcur, ne0: d_head, ne1: n_head, ne2: n_pos);
2096
2097 cb(cur0: Qcur, name: "Qcur", il);
2098 cb(cur0: Kcur, name: "Kcur", il);
2099 cb(cur0: Vcur, name: "Vcur", il);
2100
2101 if (add_pos) {
2102 Qcur = add_pos(Qcur, layer);
2103 Kcur = add_pos(Kcur, layer);
2104 cb(cur0: Qcur, name: "Qcur_pos", il);
2105 cb(cur0: Kcur, name: "Kcur_pos", il);
2106 }
2107
2108 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
2109 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: nullptr, kq_scale, il);
2110 cb(cur0: cur, name: "attn_out", il);
2111 }
2112
2113 if (layer.ls_1_w) {
2114 cur = ggml_mul(ctx: ctx0, a: cur, b: layer.ls_1_w);
2115 cb(cur0: cur, name: "attn_out_scaled", il);
2116 }
2117
2118 // re-add the layer input, i.e., the residual connection
2119 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
2120
2121 inpL = cur; // inpL = residual, cur = hidden_states
2122
2123 cb(cur0: cur, name: "ffn_inp", il);
2124
2125 // layernorm2
2126 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: norm_t, norm_eps: eps, il);
2127 cb(cur0: cur, name: "ffn_inp_normed", il);
2128
2129 // ffn
2130 cur = build_ffn(cur,
2131 up: layer.ff_up_w, up_b: layer.ff_up_b,
2132 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
2133 down: layer.ff_down_w, down_b: layer.ff_down_b,
2134 type_op: ffn_t, il);
2135
2136 cb(cur0: cur, name: "ffn_out", il);
2137
2138 if (layer.ls_2_w) {
2139 cur = ggml_mul(ctx: ctx0, a: cur, b: layer.ls_2_w);
2140 cb(cur0: cur, name: "ffn_out_scaled", il);
2141 }
2142
2143 // residual 2
2144 cur = ggml_add(ctx: ctx0, a: inpL, b: cur);
2145 cb(cur0: cur, name: "layer_out", il);
2146
2147 inpL = cur;
2148 }
2149
2150 if (ctx->model.audio_has_avgpool()) {
2151 ggml_tensor * cur = inpL;
2152 cur = ggml_transpose(ctx: ctx0, a: cur);
2153 cur = ggml_cont(ctx: ctx0, a: cur);
2154 cur = ggml_pool_1d(ctx: ctx0, a: cur, op: GGML_OP_POOL_AVG, k0: 2, s0: 2, p0: 0);
2155 cur = ggml_transpose(ctx: ctx0, a: cur);
2156 cur = ggml_cont(ctx: ctx0, a: cur);
2157 inpL = cur;
2158 }
2159
2160 // post-layernorm
2161 if (model.post_ln_w) {
2162 inpL = build_norm(cur: inpL, mw: model.post_ln_w, mb: model.post_ln_b, type: norm_t, norm_eps: eps, il: -1);
2163 }
2164 return inpL;
2165 }
2166
2167 // build the input after conv2d (inp_raw --> patches)
2168 // returns tensor with shape [n_embd, n_patches]
2169 ggml_tensor * build_inp() {
2170 ggml_tensor * inp_raw = build_inp_raw();
2171 ggml_tensor * inp = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_0, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
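// conv output ne = [n_patches_x, n_patches_y, n_embd]; flatten to [n_patches, n_embd], then transpose to [n_embd, n_patches]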
2172 inp = ggml_reshape_2d(ctx: ctx0, a: inp, ne0: n_patches, ne1: n_embd);
2173 inp = ggml_cont(ctx: ctx0, a: ggml_transpose(ctx: ctx0, a: inp));
2174 if (model.patch_bias) {
2175 inp = ggml_add(ctx: ctx0, a: inp, b: model.patch_bias);
2176 cb(cur0: inp, name: "patch_bias", il: -1);
2177 }
2178 return inp;
2179 }
2180
2181 ggml_tensor * build_inp_raw(int channels = 3) {
2182 ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx: ctx0, type: GGML_TYPE_F32, ne0: img.nx, ne1: img.ny, ne2: channels);
2183 ggml_set_name(tensor: inp_raw, name: "inp_raw");
2184 ggml_set_input(tensor: inp_raw);
2185 return inp_raw;
2186 }
2187
2188 ggml_tensor * build_norm(
2189 ggml_tensor * cur,
2190 ggml_tensor * mw,
2191 ggml_tensor * mb,
2192 norm_type type,
2193 float norm_eps,
2194 int il) const {
2195
2196 cur = type == NORM_TYPE_RMS
2197 ? ggml_rms_norm(ctx: ctx0, a: cur, eps: norm_eps)
2198 : ggml_norm(ctx: ctx0, a: cur, eps: norm_eps);
2199
2200 if (mw || mb) {
2201 cb(cur0: cur, name: "norm", il);
2202 }
2203
2204 if (mw) {
2205 cur = ggml_mul(ctx: ctx0, a: cur, b: mw);
2206 if (mb) {
2207 cb(cur0: cur, name: "norm_w", il);
2208 }
2209 }
2210
2211 if (mb) {
2212 cur = ggml_add(ctx: ctx0, a: cur, b: mb);
2213 }
2214
2215 return cur;
2216 }
2217
2218 ggml_tensor * build_ffn(
2219 ggml_tensor * cur,
2220 ggml_tensor * up,
2221 ggml_tensor * up_b,
2222 ggml_tensor * gate,
2223 ggml_tensor * gate_b,
2224 ggml_tensor * down,
2225 ggml_tensor * down_b,
2226 ffn_op_type type_op,
2227 int il) const {
2228
2229 ggml_tensor * tmp = up ? ggml_mul_mat(ctx: ctx0, a: up, b: cur) : cur;
2230 cb(cur0: tmp, name: "ffn_up", il);
2231
2232 if (up_b) {
2233 tmp = ggml_add(ctx: ctx0, a: tmp, b: up_b);
2234 cb(cur0: tmp, name: "ffn_up_b", il);
2235 }
2236
2237 if (gate) {
2238 cur = ggml_mul_mat(ctx: ctx0, a: gate, b: cur);
2239 cb(cur0: cur, name: "ffn_gate", il);
2240
2241 if (gate_b) {
2242 cur = ggml_add(ctx: ctx0, a: cur, b: gate_b);
2243 cb(cur0: cur, name: "ffn_gate_b", il);
2244 }
2245 } else {
2246 cur = tmp;
2247 }
2248
2249 // we only support parallel ffn for now
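// when a gate projection is present, the activation acts as a GLU over the gate and up branches;
// otherwise it is applied directly to the up branch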
2250 switch (type_op) {
2251 case FFN_SILU:
2252 if (gate) {
2253 cur = ggml_swiglu_split(ctx: ctx0, a: cur, b: tmp);
2254 cb(cur0: cur, name: "ffn_swiglu", il);
2255 } else {
2256 cur = ggml_silu(ctx: ctx0, a: cur);
2257 cb(cur0: cur, name: "ffn_silu", il);
2258 } break;
2259 case FFN_GELU:
2260 if (gate) {
2261 cur = ggml_geglu_split(ctx: ctx0, a: cur, b: tmp);
2262 cb(cur0: cur, name: "ffn_geglu", il);
2263 } else {
2264 cur = ggml_gelu(ctx: ctx0, a: cur);
2265 cb(cur0: cur, name: "ffn_gelu", il);
2266 } break;
2267 case FFN_GELU_ERF:
2268 if (gate) {
2269 cur = ggml_geglu_erf_split(ctx: ctx0, a: cur, b: tmp);
2270 cb(cur0: cur, name: "ffn_geglu_erf", il);
2271 } else {
2272 cur = ggml_gelu_erf(ctx: ctx0, a: cur);
2273 cb(cur0: cur, name: "ffn_gelu_erf", il);
2274 } break;
2275 case FFN_GELU_QUICK:
2276 if (gate) {
2277 cur = ggml_geglu_quick_split(ctx: ctx0, a: cur, b: tmp);
2278 cb(cur0: cur, name: "ffn_geglu_quick", il);
2279 } else {
2280 cur = ggml_gelu_quick(ctx: ctx0, a: cur);
2281 cb(cur0: cur, name: "ffn_gelu_quick", il);
2282 } break;
2283 }
2284
2285 if (down) {
2286 cur = ggml_mul_mat(ctx: ctx0, a: down, b: cur);
2287 }
2288
2289 if (down_b) {
2290 cb(cur0: cur, name: "ffn_down", il);
2291 cur = ggml_add(ctx: ctx0, a: cur, b: down_b);
2292 }
2296
2297 return cur;
2298 }
2299
2300 ggml_tensor * build_attn(
2301 ggml_tensor * wo,
2302 ggml_tensor * wo_b,
2303 ggml_tensor * q_cur,
2304 ggml_tensor * k_cur,
2305 ggml_tensor * v_cur,
2306 ggml_tensor * kq_mask,
2307 float kq_scale,
2308 int il) const {
2309 // these nodes are added to the graph together so that they are not reordered
2310 // by doing so, the number of splits in the graph is reduced
2311 ggml_build_forward_expand(cgraph: gf, tensor: q_cur);
2312 ggml_build_forward_expand(cgraph: gf, tensor: k_cur);
2313 ggml_build_forward_expand(cgraph: gf, tensor: v_cur);
2314
2315 ggml_tensor * q = ggml_permute(ctx: ctx0, a: q_cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2316 //cb(q, "q", il);
2317
2318 ggml_tensor * k = ggml_permute(ctx: ctx0, a: k_cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2319 //cb(k, "k", il);
2320
2321 ggml_tensor * cur;
2322
2323 if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
2324 ggml_tensor * v = ggml_permute(ctx: ctx0, a: v_cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2325
2326 k = ggml_cast(ctx: ctx0, a: k, type: GGML_TYPE_F16);
2327 v = ggml_cast(ctx: ctx0, a: v, type: GGML_TYPE_F16);
2328
2329 cur = ggml_flash_attn_ext(ctx: ctx0, q, k, v, mask: kq_mask, scale: kq_scale, max_bias: 0.0f, logit_softcap: 0.0f);
2330 ggml_flash_attn_ext_set_prec(a: cur, prec: GGML_PREC_F32);
2331
2332 cur = ggml_reshape_2d(ctx: ctx0, a: cur, ne0: cur->ne[0]*cur->ne[1], ne1: cur->ne[2]*cur->ne[3]);
2333
2334 } else {
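// fallback path: compute attention explicitly as softmax(scale * QK^T) V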
2335 ggml_tensor * v = ggml_permute(ctx: ctx0, a: v_cur, axis0: 1, axis1: 2, axis2: 0, axis3: 3);
2336 v = ggml_cont(ctx: ctx0, a: v);
2337
2338 const auto n_tokens = q->ne[1];
2339 const auto n_head = q->ne[2];
2340
2341 ggml_tensor * kq = ggml_mul_mat(ctx: ctx0, a: k, b: q);
2342 // F32 may not be needed for vision encoders?
2343 // ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
2344
2345 kq = ggml_soft_max_ext(ctx: ctx0, a: kq, mask: kq_mask, scale: kq_scale, max_bias: 0.0f);
2346
2347 ggml_tensor * kqv = ggml_mul_mat(ctx: ctx0, a: v, b: kq);
2348 cur = ggml_permute(ctx: ctx0, a: kqv, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2349 cur = ggml_cont_2d(ctx: ctx0, a: cur, ne0: cur->ne[0]*n_head, ne1: n_tokens);
2350 }
2351
2352 cb(cur0: cur, name: "kqv_out", il);
2353
2354 if (wo) {
2355 cur = ggml_mul_mat(ctx: ctx0, a: wo, b: cur);
2356 }
2357
2358 if (wo_b) {
2359 cur = ggml_add(ctx: ctx0, a: cur, b: wo_b);
2360 }
2361
2362 return cur;
2363 }
2364
2365 // implementation of the 2D RoPE without adding a new op in ggml
2366 // this is not efficient (it uses double the memory), but it works on all backends
2367 // TODO: there was a more efficient implementation which relied on ggml_view and ggml_rope_ext_inplace, but rope inplace does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
2368 static ggml_tensor * build_rope_2d(
2369 ggml_context * ctx0,
2370 ggml_tensor * cur,
2371 ggml_tensor * pos_a, // first half
2372 ggml_tensor * pos_b, // second half
2373 const float freq_base,
2374 const bool interleave_freq
2375 ) {
2376 const int64_t n_dim = cur->ne[0];
2377 const int64_t n_head = cur->ne[1];
2378 const int64_t n_pos = cur->ne[2];
2379
2380 // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
2381 // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
2382 // first half of cur will use 1e-0, 1e-2 (even)
2383 // second half of cur will use 1e-1, 1e-3 (odd)
2384 // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
2385 // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
2386 // then for the second half, we use freq_scale to shift the inv_freq
2387 // ^ why? replace (2i) with (2i+1) in the above equation
2388 const float freq_scale_odd = interleave_freq
2389 ? std::pow(x: freq_base, y: (float)-2/n_dim)
2390 : 1.0;
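// worked example with n_dim = 8 and freq_base = 10000:
// the full-dim inv_freq list would be 1e0, 1e-1, 1e-2, 1e-3; rotating only n_dim/2 dims yields 1e0, 1e-2 (the even ones),
// and multiplying by freq_scale_odd = 10000^(-2/8) = 1e-1 shifts them to 1e-1, 1e-3 (the odd ones)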
2391
2392 // first half
2393 ggml_tensor * first;
2394 {
2395 first = ggml_view_3d(ctx: ctx0, a: cur,
2396 ne0: n_dim/2, ne1: n_head, ne2: n_pos,
2397 nb1: ggml_row_size(type: cur->type, ne: n_dim),
2398 nb2: ggml_row_size(type: cur->type, ne: n_dim*n_head),
2399 offset: 0);
2400 first = ggml_rope_ext(
2401 ctx: ctx0,
2402 a: first,
2403 b: pos_a, // positions
2404 c: nullptr, // freq factors
2405 n_dims: n_dim/2, // n_dims
2406 mode: 0, n_ctx_orig: 0, freq_base,
2407 freq_scale: 1.0f, ext_factor: 0.0f, attn_factor: 1.0f, beta_fast: 0.0f, beta_slow: 0.0f
2408 );
2409 }
2410
2411 // second half
2412 ggml_tensor * second;
2413 {
2414 second = ggml_view_3d(ctx: ctx0, a: cur,
2415 ne0: n_dim/2, ne1: n_head, ne2: n_pos,
2416 nb1: ggml_row_size(type: cur->type, ne: n_dim),
2417 nb2: ggml_row_size(type: cur->type, ne: n_dim*n_head),
2418 offset: n_dim/2 * ggml_element_size(tensor: cur));
2419 second = ggml_rope_ext(
2420 ctx: ctx0,
2421 a: second,
2422 b: pos_b, // positions
2423 c: nullptr, // freq factors
2424 n_dims: n_dim/2, // n_dims
2425 mode: 0, n_ctx_orig: 0, freq_base,
2426 freq_scale: freq_scale_odd,
2427 ext_factor: 0.0f, attn_factor: 1.0f, beta_fast: 0.0f, beta_slow: 0.0f
2428 );
2429 }
2430
2431 cur = ggml_concat(ctx: ctx0, a: first, b: second, dim: 0);
2432 return cur;
2433 }
2434
2435 // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
2436 // support dynamic resolution
2437 ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
2438 GGML_ASSERT(scale_factor > 1);
2439
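// collapses each scale_factor x scale_factor block of patches into a single token along the embedding dim:
// [n_embd, width*height] -> [n_embd * scale_factor^2, (width/scale_factor) * (height/scale_factor)]
// e.g. with scale_factor = 2 on a 24x24 grid: [n_embd, 576] -> [4*n_embd, 144]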
2440 const int n_embd = cur->ne[0];
2441 int width = img.nx / patch_size;
2442 int height = img.ny / patch_size;
2443
2444 // pad width and height to factor
2445 const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
2446 const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
2447 cur = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd, ne1: width, ne2: height);
2448 if (pad_width || pad_height) {
2449 cur = ggml_pad(ctx: ctx0, a: cur, p0: 0, p1: pad_width, p2: pad_height, p3: 0);
2450 width += pad_width;
2451 height += pad_height;
2452 }
2453
2454 // unshuffle h
2455 cur = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd * scale_factor, ne1: width / scale_factor, ne2: height);
2456 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2457
2458 // unshuffle w
2459 cur = ggml_cont_3d(ctx: ctx0, a: cur, ne0: n_embd * scale_factor * scale_factor, ne1: height / scale_factor, ne2: width / scale_factor);
2460 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2461
2462 cur = ggml_cont_2d(ctx: ctx0, a: cur, ne0: cur->ne[0], ne1: cur->ne[1] * cur->ne[2]);
2463 cb(cur0: cur, name: "pixel_shuffle", il: -1);
2464
2465 return cur;
2466 }
2467
2468};
2469
2470static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
2471 GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
2472 clip_graph graph(ctx, *imgs.entries[0]);
2473
2474 ggml_cgraph * res;
2475
2476 switch (ctx->proj_type()) {
2477 case PROJECTOR_TYPE_GEMMA3:
2478 case PROJECTOR_TYPE_IDEFICS3:
2479 case PROJECTOR_TYPE_LFM2:
2480 {
2481 res = graph.build_siglip();
2482 } break;
2483 case PROJECTOR_TYPE_PIXTRAL:
2484 case PROJECTOR_TYPE_LIGHTONOCR:
2485 {
2486 res = graph.build_pixtral();
2487 } break;
2488 case PROJECTOR_TYPE_QWEN2VL:
2489 case PROJECTOR_TYPE_QWEN25VL:
2490 {
2491 res = graph.build_qwen2vl();
2492 } break;
2493 case PROJECTOR_TYPE_QWEN3VL:
2494 {
2495 res = graph.build_qwen3vl();
2496 } break;
2497 case PROJECTOR_TYPE_MINICPMV:
2498 {
2499 res = graph.build_minicpmv();
2500 } break;
2501 case PROJECTOR_TYPE_INTERNVL:
2502 {
2503 res = graph.build_internvl();
2504 } break;
2505 case PROJECTOR_TYPE_LLAMA4:
2506 {
2507 res = graph.build_llama4();
2508 } break;
2509 case PROJECTOR_TYPE_ULTRAVOX:
2510 case PROJECTOR_TYPE_VOXTRAL:
2511 case PROJECTOR_TYPE_QWEN2A:
2512 {
2513 res = graph.build_whisper_enc();
2514 } break;
2515 case PROJECTOR_TYPE_KIMIVL:
2516 {
2517 res = graph.build_kimivl();
2518 } break;
2519 case PROJECTOR_TYPE_JANUS_PRO:
2520 {
2521 res = graph.build_siglip();
2522 } break;
2523 case PROJECTOR_TYPE_COGVLM:
2524 {
2525 res = graph.build_cogvlm();
2526 } break;
2527 default:
2528 {
2529 res = graph.build_llava();
2530 } break;
2531 }
2532 return res;
2533}
2534
2535struct clip_model_loader {
2536 ggml_context_ptr ctx_meta;
2537 gguf_context_ptr ctx_gguf;
2538
2539 std::string fname;
2540
2541 size_t model_size = 0; // in bytes
2542
2543 bool has_vision = false;
2544 bool has_audio = false;
2545
2546 // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
2547 clip_model_loader(const char * fname) : fname(fname) {
2548 struct ggml_context * meta = nullptr;
2549
2550 struct gguf_init_params params = {
2551 /*.no_alloc = */ true,
2552 /*.ctx = */ &meta,
2553 };
2554
2555 ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
2556 if (!ctx_gguf.get()) {
2557 throw std::runtime_error(string_format(fmt: "%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
2558 }
2559
2560 ctx_meta.reset(p: meta);
2561
2562 const int n_tensors = gguf_get_n_tensors(ctx: ctx_gguf.get());
2563
2564 // print gguf info
2565 {
2566 std::string name;
2567 get_string(KEY_NAME, output&: name, required: false);
2568 std::string description;
2569 get_string(KEY_DESCRIPTION, output&: description, required: false);
2570 LOG_INF("%s: model name: %s\n", __func__, name.c_str());
2571 LOG_INF("%s: description: %s\n", __func__, description.c_str());
2572 LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
2573 LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
2574 LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
2575 LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
2576 LOG_INF("\n");
2577 }
2578
2579 // modalities
2580 {
2581 get_bool(KEY_HAS_VISION_ENC, output&: has_vision, required: false);
2582 get_bool(KEY_HAS_AUDIO_ENC, output&: has_audio, required: false);
2583
2584 if (has_vision) {
2585 LOG_INF("%s: has vision encoder\n", __func__);
2586 }
2587 if (has_audio) {
2588 LOG_INF("%s: has audio encoder\n", __func__);
2589 }
2590 }
2591
2592 // tensors
2593 {
2594 for (int i = 0; i < n_tensors; ++i) {
2595 const char * name = gguf_get_tensor_name(ctx: ctx_gguf.get(), tensor_id: i);
2596 const size_t offset = gguf_get_tensor_offset(ctx: ctx_gguf.get(), tensor_id: i);
2597 enum ggml_type type = gguf_get_tensor_type(ctx: ctx_gguf.get(), tensor_id: i);
2598 ggml_tensor * cur = ggml_get_tensor(ctx: meta, name);
2599 size_t tensor_size = ggml_nbytes(tensor: cur);
2600 model_size += tensor_size;
2601 LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
2602 __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
2603 }
2604 }
2605 }
2606
2607 void load_hparams(clip_model & model, clip_modality modality) {
2608 auto & hparams = model.hparams;
2609 std::string log_ffn_op; // for logging
2610
2611 // sanity check
2612 if (modality == CLIP_MODALITY_VISION) {
2613 GGML_ASSERT(has_vision);
2614 } else if (modality == CLIP_MODALITY_AUDIO) {
2615 GGML_ASSERT(has_audio);
2616 }
2617 model.modality = modality;
2618 
2620 // projector type
2621 std::string proj_type;
2622 {
2623 // default key
2624 get_string(KEY_PROJ_TYPE, output&: proj_type, required: false);
2625
2626 // for models with mixed modalities
2627 if (proj_type.empty()) {
2628 if (modality == CLIP_MODALITY_VISION) {
2629 get_string(KEY_VISION_PROJ_TYPE, output&: proj_type, required: false);
2630 } else if (modality == CLIP_MODALITY_AUDIO) {
2631 get_string(KEY_AUDIO_PROJ_TYPE, output&: proj_type, required: false);
2632 } else {
2633 GGML_ABORT("unknown modality");
2634 }
2635 }
2636
2637 model.proj_type = clip_projector_type_from_string(str: proj_type);
2638
2639 if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
2640 throw std::runtime_error(string_format(fmt: "%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
2641 }
2642
2643 // correct arch for multimodal models (legacy method)
2644 if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
2645 model.proj_type = modality == CLIP_MODALITY_VISION
2646 ? PROJECTOR_TYPE_QWEN25VL
2647 : PROJECTOR_TYPE_QWEN2A;
2648 }
2649 }
2650
2651 const bool is_vision = model.modality == CLIP_MODALITY_VISION;
2652 const bool is_audio = model.modality == CLIP_MODALITY_AUDIO;
2653
2654 // other hparams
2655 {
2656 const char * prefix = is_vision ? "vision" : "audio";
2657 get_u32(key: string_format(KEY_N_EMBD, prefix), output&: hparams.n_embd);
2658 get_u32(key: string_format(KEY_N_HEAD, prefix), output&: hparams.n_head);
2659 get_u32(key: string_format(KEY_N_FF, prefix), output&: hparams.n_ff);
2660 get_u32(key: string_format(KEY_N_BLOCK, prefix), output&: hparams.n_layer);
2661 get_u32(key: string_format(KEY_PROJ_DIM, prefix), output&: hparams.projection_dim);
2662 get_f32(key: string_format(KEY_LAYER_NORM_EPS, prefix), output&: hparams.eps);
2663
2664 if (is_vision) {
2665 get_u32(KEY_IMAGE_SIZE, output&: hparams.image_size);
2666 get_u32(KEY_PATCH_SIZE, output&: hparams.patch_size);
2667 get_u32(KEY_IMAGE_CROP_RESOLUTION, output&: hparams.image_crop_resolution, required: false);
2668 get_i32(KEY_MINICPMV_VERSION, output&: hparams.minicpmv_version, required: false); // legacy
2669 get_u32(KEY_MINICPMV_QUERY_NUM, output&: hparams.minicpmv_query_num, required: false);
2670 if (hparams.minicpmv_query_num == 0) {
2671 // Fallback to hardcoded values for legacy models
2672 if (hparams.minicpmv_version == 3) {
2673 hparams.minicpmv_query_num = 64;
2674 } else if (hparams.minicpmv_version == 4) {
2675 hparams.minicpmv_query_num = 64;
2676 } else if (hparams.minicpmv_version == 5) {
2677 hparams.minicpmv_query_num = 64;
2678 } else if (hparams.minicpmv_version == 6) {
2679 hparams.minicpmv_query_num = 64;
2680 } else {
2681 hparams.minicpmv_query_num = 96;
2682 }
2683 }
2684 } else if (is_audio) {
2685 get_u32(KEY_A_NUM_MEL_BINS, output&: hparams.n_mel_bins);
2686
2687 } else {
2688 GGML_ASSERT(false && "unknown modality");
2689 }
2690
2691 // for pinpoints, we need to convert them into a list of resolution candidates
2692 {
2693 std::vector<int> pinpoints;
2694 get_arr_int(KEY_IMAGE_GRID_PINPOINTS, output&: pinpoints, required: false);
2695 if (!pinpoints.empty()) {
2696 for (size_t i = 0; i < pinpoints.size(); i += 2) {
2697 hparams.image_res_candidates.push_back(x: {
2698 .width: pinpoints[i],
2699 .height: pinpoints[i+1],
2700 });
2701 }
2702 }
2703 }
2704
2705 // default warmup value
2706 hparams.warmup_image_size = hparams.image_size;
2707
2708 hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP
2709 || model.proj_type == PROJECTOR_TYPE_MLP_NORM
2710 || model.proj_type == PROJECTOR_TYPE_LDP
2711 || model.proj_type == PROJECTOR_TYPE_LDPV2;
2712
2713 {
2714 bool use_gelu = false;
2715 bool use_silu = false;
2716 get_bool(KEY_USE_GELU, output&: use_gelu, required: false);
2717 get_bool(KEY_USE_SILU, output&: use_silu, required: false);
2718 if (use_gelu && use_silu) {
2719 throw std::runtime_error(string_format(fmt: "%s: both use_gelu and use_silu are set to true\n", __func__));
2720 }
2721 if (use_gelu) {
2722 hparams.ffn_op = FFN_GELU;
2723 log_ffn_op = "gelu";
2724 } else if (use_silu) {
2725 hparams.ffn_op = FFN_SILU;
2726 log_ffn_op = "silu";
2727 } else {
2728 hparams.ffn_op = FFN_GELU_QUICK;
2729 log_ffn_op = "gelu_quick";
2730 }
2731 }
2732
2733 {
2734 std::string mm_patch_merge_type;
2735 get_string(KEY_MM_PATCH_MERGE_TYPE, output&: mm_patch_merge_type, required: false);
2736 if (mm_patch_merge_type == "spatial_unpad") {
2737 hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
2738 }
2739 }
2740
2741 if (is_vision) {
2742 int idx_mean = gguf_find_key(ctx: ctx_gguf.get(), KEY_IMAGE_MEAN);
2743 int idx_std = gguf_find_key(ctx: ctx_gguf.get(), KEY_IMAGE_STD);
2744 GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
2745 GGML_ASSERT(idx_std >= 0 && "image_std not found");
2746 const float * mean_data = (const float *) gguf_get_arr_data(ctx: ctx_gguf.get(), key_id: idx_mean);
2747 const float * std_data = (const float *) gguf_get_arr_data(ctx: ctx_gguf.get(), key_id: idx_std);
2748 for (int i = 0; i < 3; ++i) {
2749 hparams.image_mean[i] = mean_data[i];
2750 hparams.image_std[i] = std_data[i];
2751 }
2752 }
2753
2754 // Load the vision feature layer indices if they are explicitly provided;
2755 // if multiple vision feature layers are present, the values will be concatenated
2756 // to form the final visual features.
2757 // NOTE: gguf conversions should standardize the values of the vision feature layer to
2758 // be non-negative, since we use -1 to mark values as unset here.
2759 std::vector<int> vision_feature_layer;
2760 get_arr_int(KEY_FEATURE_LAYER, output&: vision_feature_layer, required: false);
2761 // convert std::vector to std::unordered_set
2762 for (auto & layer : vision_feature_layer) {
2763 hparams.vision_feature_layer.insert(x: layer);
2764 }
2765
2766 // model-specific params
2767 switch (model.proj_type) {
2768 case PROJECTOR_TYPE_MINICPMV:
2769 {
2770 if (hparams.minicpmv_version == 0) {
2771 hparams.minicpmv_version = 2; // default to 2 if not set
2772 }
2773 } break;
2774 case PROJECTOR_TYPE_INTERNVL:
2775 {
2776 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2777 } break;
2778 case PROJECTOR_TYPE_IDEFICS3:
2779 {
2780 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2781 get_u32(KEY_PREPROC_IMAGE_SIZE, output&: hparams.image_longest_edge, required: false);
2782 } break;
2783 case PROJECTOR_TYPE_LFM2:
2784 {
2785 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2786 // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
2787 hparams.set_limit_image_tokens(n_tokens_min: 64, n_tokens_max: 256);
2788 } break;
2789 case PROJECTOR_TYPE_PIXTRAL:
2790 case PROJECTOR_TYPE_LIGHTONOCR:
2791 {
2792 // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
2793 // TODO: verify the image_min_tokens
2794 hparams.n_merge = 1; // the original pixtral does not use patch merging
2795 hparams.rope_theta = 10000.0f;
2796 get_u32(KEY_SPATIAL_MERGE_SIZE, output&: hparams.n_merge, required: false);
2797 hparams.set_limit_image_tokens(n_tokens_min: 8, n_tokens_max: 1024);
2798 hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
2799 } break;
2800 case PROJECTOR_TYPE_KIMIVL:
2801 {
2802 hparams.rope_theta = 10000.0f;
2803 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2804 // TODO: check kimivl preprocessor for exact values
2805 hparams.set_limit_image_tokens(n_tokens_min: 8, n_tokens_max: 1024);
2806 hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
2807 } break;
2808 case PROJECTOR_TYPE_GEMMA3:
2809 {
2810 // default value (used by all model sizes in gemma 3 family)
2811 // number of patches for each **side** is reduced by a factor of 4
2812 hparams.n_merge = 4;
2813 // the test model (tinygemma3) uses a different value, which we optionally read
2814 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2815 } break;
2816 case PROJECTOR_TYPE_QWEN2VL:
2817 case PROJECTOR_TYPE_QWEN25VL:
2818 case PROJECTOR_TYPE_QWEN3VL:
2819 {
2820 hparams.n_merge = 2; // default value for Qwen 2 and 2.5
2821 get_u32(KEY_SPATIAL_MERGE_SIZE, output&: hparams.n_merge, required: false);
2822 get_u32(KEY_WIN_ATTN_PATTERN, output&: hparams.n_wa_pattern, required: model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
2823 // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
2824 hparams.set_limit_image_tokens(n_tokens_min: 8, n_tokens_max: 4096);
2825 hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
2826 const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
2827 if (hparams.image_min_pixels < warn_min_pixels) {
2828 LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
2829 LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__);
2830 LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
2831 }
2832 } break;
2833 case PROJECTOR_TYPE_LLAMA4:
2834 {
2835 hparams.rope_theta = 10000.0f;
2836 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2837 set_llava_uhd_res_candidates(model, max_patches_per_side: 3);
2838 } break;
2839 case PROJECTOR_TYPE_ULTRAVOX:
2840 case PROJECTOR_TYPE_QWEN2A:
2841 case PROJECTOR_TYPE_VOXTRAL:
2842 {
2843 bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
2844 model.proj_type == PROJECTOR_TYPE_VOXTRAL;
2845 get_u32(KEY_A_PROJ_STACK_FACTOR, output&: hparams.proj_stack_factor, required: require_stack);
2846 if (hparams.n_mel_bins != 128) {
2847 throw std::runtime_error(string_format(fmt: "%s: only 128 mel bins are supported for ultravox\n", __func__));
2848 }
2849 hparams.ffn_op = FFN_GELU_ERF;
2850 log_ffn_op = "gelu_erf"; // temporary solution for logging
2851 } break;
2852 default:
2853 break;
2854 }
2855
2856 // sanity check
2857 {
2858 if (hparams.image_max_pixels < hparams.image_min_pixels) {
2859 throw std::runtime_error(string_format(fmt: "%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
2860 }
2861 }
2862
2863 LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
2864 LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
2865 LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
2866 LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
2867 LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
2868 LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
2869 LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
2870 if (is_vision) {
2871 LOG_INF("\n--- vision hparams ---\n");
2872 LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
2873 LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
2874 LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
2875 LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
2876 LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
2877 LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
2878 if (hparams.image_min_pixels > 0) {
2879 LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
2880 }
2881 if (hparams.image_max_pixels > 0) {
2882 LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : "");
2883 }
2884 } else if (is_audio) {
2885 LOG_INF("\n--- audio hparams ---\n");
2886 LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
2887 LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
2888 }
2889 LOG_INF("\n");
2890 LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
2891 LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
2892 }
2893 }
2894
2895 void load_tensors(clip_ctx & ctx_clip) {
2896 auto & model = ctx_clip.model;
2897 auto & hparams = model.hparams;
2898 std::map<std::string, size_t> tensor_offset;
2899 std::vector<ggml_tensor *> tensors_to_load;
2900
2901 // TODO @ngxson : support both audio and video in the future
2902 const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
2903
2904 // get offsets
2905 for (int64_t i = 0; i < gguf_get_n_tensors(ctx: ctx_gguf.get()); ++i) {
2906 const char * name = gguf_get_tensor_name(ctx: ctx_gguf.get(), tensor_id: i);
2907 tensor_offset[name] = gguf_get_data_offset(ctx: ctx_gguf.get()) + gguf_get_tensor_offset(ctx: ctx_gguf.get(), tensor_id: i);
2908 }
2909
2910 // create data context
2911 struct ggml_init_params params = {
2912 /*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(ctx: ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
2913 /*.mem_buffer =*/ NULL,
2914 /*.no_alloc =*/ true,
2915 };
2916 ctx_clip.ctx_data.reset(p: ggml_init(params));
2917 if (!ctx_clip.ctx_data) {
2918 throw std::runtime_error(string_format(fmt: "%s: failed to init ggml context\n", __func__));
2919 }
2920
2921 // helper function
2922 auto get_tensor = [&](const std::string & name, bool required = true) {
2923 ggml_tensor * cur = ggml_get_tensor(ctx: ctx_meta.get(), name: name.c_str());
2924 if (!cur && required) {
2925 throw std::runtime_error(string_format(fmt: "%s: unable to find tensor %s\n", __func__, name.c_str()));
2926 }
2927 if (cur) {
2928 tensors_to_load.push_back(x: cur);
2929 // add tensors to context
2930 ggml_tensor * data_tensor = ggml_dup_tensor(ctx: ctx_clip.ctx_data.get(), src: cur);
2931 ggml_set_name(tensor: data_tensor, name: cur->name);
2932 cur = data_tensor;
2933 }
2934 return cur;
2935 };
2936
2937 model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
2938
2939 model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
2940 model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
2941
2942 model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
2943 model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
2944
2945 model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
2946 model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
2947 model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
2948
2949 model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
2950
2951 // layers
2952 model.layers.resize(new_size: hparams.n_layer);
2953 for (int il = 0; il < hparams.n_layer; ++il) {
2954 auto & layer = model.layers[il];
2955 layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
2956 layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
2957 layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
2958 layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
2959 layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
2960 layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
2961 layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
2962 layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
2963 layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
2964 layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
2965 layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
2966
2967 layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
2968 layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
2969 layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
2970 layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
2971 layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false);
2972 layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
2973 layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
2974
2975 // ffn
2976 layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
2977 layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false);
2978 layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
2979 layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
2980 layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
2981 layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
2982
2983
2984 // qwen3vl deepstack layer
2985 layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
2986 layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
2987 layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
2988 layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
2989 layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
2990 layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
2991 if (layer.has_deepstack()) {
2992 model.n_deepstack_layers++;
2993 }
2994
// some models were exported with legacy (incorrect) naming, which is quite messy; fix it here
2996 // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
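// detection relies on the fact that in ggml the first dim (ne[0]) of a matmul weight is its input size:
// a correct ff_down takes n_ff as input, so ff_down_w->ne[0] == n_embd means up/down were exported swapped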
2997 bool is_ffn_swapped = (
2998 // only old models need this fix
2999 model.proj_type == PROJECTOR_TYPE_MLP
3000 || model.proj_type == PROJECTOR_TYPE_MLP_NORM
3001 || model.proj_type == PROJECTOR_TYPE_LDP
3002 || model.proj_type == PROJECTOR_TYPE_LDPV2
3003 || model.proj_type == PROJECTOR_TYPE_QWEN2VL
3004 || model.proj_type == PROJECTOR_TYPE_QWEN25VL
3005 || model.proj_type == PROJECTOR_TYPE_GLM_EDGE
3006 || model.proj_type == PROJECTOR_TYPE_GEMMA3
3007 || model.proj_type == PROJECTOR_TYPE_IDEFICS3
3008 || model.proj_type == PROJECTOR_TYPE_MINICPMV
3009 ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
3010 if (is_ffn_swapped) {
3011 // swap up and down weights
3012 ggml_tensor * tmp = layer.ff_up_w;
3013 layer.ff_up_w = layer.ff_down_w;
3014 layer.ff_down_w = tmp;
3015 // swap up and down biases
3016 tmp = layer.ff_up_b;
3017 layer.ff_up_b = layer.ff_down_b;
3018 layer.ff_down_b = tmp;
3019 if (il == 0) {
3020 LOG_WRN("%s: ffn up/down are swapped\n", __func__);
3021 }
3022 }
3023 }
3024
3025 switch (model.proj_type) {
3026 case PROJECTOR_TYPE_MLP:
3027 case PROJECTOR_TYPE_MLP_NORM:
3028 {
3029 // LLaVA projection
3030 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
3031 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
3032 // Yi-type llava
3033 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
3034 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
3035 // missing in Yi-type llava
3036 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
3037 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
3038 // Yi-type llava
3039 model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
3040 model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
3041 model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
3042 model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
3043 if (model.mm_3_w) {
3044 // TODO: this is a hack to support Yi-type llava
3045 model.proj_type = PROJECTOR_TYPE_MLP_NORM;
3046 }
3047 model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
3048 } break;
3049 case PROJECTOR_TYPE_LDP:
3050 {
3051 // MobileVLM projection
3052 model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
3053 model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
3054 model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
3055 model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
3056 model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
3057 model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
3058 model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
3059 model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
3060 model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
3061 model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
3062 model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
3063 model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
3064 model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
3065 model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
3066 model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
3067 model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
3068 model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
3069 model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
3070 model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
3071 model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
3072 model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
3073 model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
3074 model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
3075 model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
3076 } break;
3077 case PROJECTOR_TYPE_LDPV2:
3078 {
// MobileVLM_V2 projection
3080 model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
3081 model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
3082 model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
3083 model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
3084 model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
3085 model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
3086 } break;
3087 case PROJECTOR_TYPE_MINICPMV:
3088 {
3089 // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
3090 model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
3091 model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
3092 model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
3093 model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
3094 model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
3095 model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
3096 model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
3097 model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
3098 model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
3099 model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
3100 model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
3101 model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
3102 model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
3103 model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
3104 model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
3105 model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
3106 model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
3107 model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
3108 } break;
3109 case PROJECTOR_TYPE_GLM_EDGE:
3110 {
3111 model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
3112 model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
3113 model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
3114 model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
3115 model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
3116 model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
3117 model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
3118 model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
3119 model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
3120 model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
3121 } break;
3122 case PROJECTOR_TYPE_QWEN2VL:
3123 case PROJECTOR_TYPE_QWEN25VL:
3124 {
3125 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
3126 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
3127 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3128 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
3129 } break;
3130 case PROJECTOR_TYPE_QWEN3VL:
3131 {
3132 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
3133 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
3134 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3135 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
3136 } break;
3137 case PROJECTOR_TYPE_GEMMA3:
3138 {
3139 model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
3140 model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
3141 } break;
3142 case PROJECTOR_TYPE_IDEFICS3:
3143 {
3144 model.projection = get_tensor(TN_MM_PROJECTOR);
3145 } break;
3146 case PROJECTOR_TYPE_LFM2:
3147 case PROJECTOR_TYPE_KIMIVL:
3148 {
3149 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
3150 model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
3151 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
3152 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
3153 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3154 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
3155 } break;
3156 case PROJECTOR_TYPE_PIXTRAL:
3157 {
3158 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
3159 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
3160 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3161 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
3162 // [IMG_BREAK] token embedding
3163 model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
3164 // for mistral small 3.1
3165 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
3166 model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
3167 } break;
3168 case PROJECTOR_TYPE_LIGHTONOCR:
3169 {
3170 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
3171 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
3172 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3173 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
3174 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
3175 model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
3176 } break;
3177 case PROJECTOR_TYPE_ULTRAVOX:
3178 {
3179 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
3180 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
3181 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
3182 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
3183 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
3184 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
3185 model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
3186 model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
3187 } break;
3188 case PROJECTOR_TYPE_QWEN2A:
3189 {
3190 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
3191 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
3192 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
3193 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
3194 model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
3195 model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
3196 } break;
3197 case PROJECTOR_TYPE_VOXTRAL:
3198 {
3199 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
3200 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
3201 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
3202 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
3203 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
3204 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
3205 } break;
3206 case PROJECTOR_TYPE_INTERNVL:
3207 {
3208 model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
3209 model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
3210 model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
3211 model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
3212 model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
3213 model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
3214 } break;
3215 case PROJECTOR_TYPE_LLAMA4:
3216 {
3217 model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
3218 model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
3219 model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
3220 } break;
3221 case PROJECTOR_TYPE_COGVLM:
3222 {
3223 model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
3224 model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight"));
3225 model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias"));
3226 model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight"));
3227 model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
3228 model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight"));
3229 model.mm_boi = get_tensor(TN_TOK_BOI);
3230 model.mm_eoi = get_tensor(TN_TOK_EOI);
3231 } break;
3232 case PROJECTOR_TYPE_JANUS_PRO:
3233 {
3234 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
3235 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
3236 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
3237 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
3238 } break;
3239 default:
3240 GGML_ASSERT(false && "unknown projector type");
3241 }
3242
3243 // load data
3244 {
3245 std::vector<uint8_t> read_buf;
3246
3247 auto fin = std::ifstream(fname, std::ios::binary);
3248 if (!fin) {
3249 throw std::runtime_error(string_format(fmt: "%s: failed to open %s\n", __func__, fname.c_str()));
3250 }
3251
3252 // alloc memory and offload data
3253 ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend: ctx_clip.backend);
3254 ctx_clip.buf.reset(p: ggml_backend_alloc_ctx_tensors_from_buft(ctx: ctx_clip.ctx_data.get(), buft));
3255 ggml_backend_buffer_set_usage(buffer: ctx_clip.buf.get(), usage: GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
3256 for (auto & t : tensors_to_load) {
3257 ggml_tensor * cur = ggml_get_tensor(ctx: ctx_clip.ctx_data.get(), name: t->name);
3258 const size_t offset = tensor_offset[t->name];
3259 fin.seekg(offset, std::ios::beg);
3260 if (!fin) {
3261 throw std::runtime_error(string_format(fmt: "%s: failed to seek for tensor %s\n", __func__, t->name));
3262 }
3263 size_t num_bytes = ggml_nbytes(tensor: cur);
3264 if (ggml_backend_buft_is_host(buft)) {
3265 // for the CPU and Metal backend, we can read directly into the tensor
3266 fin.read(s: reinterpret_cast<char *>(cur->data), n: num_bytes);
3267 } else {
3268 // read into a temporary buffer first, then copy to device memory
3269 read_buf.resize(new_size: num_bytes);
3270 fin.read(s: reinterpret_cast<char *>(read_buf.data()), n: num_bytes);
3271 ggml_backend_tensor_set(tensor: cur, data: read_buf.data(), offset: 0, size: num_bytes);
3272 }
3273 }
3274 fin.close();
3275
3276 LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
3277 }
3278 }
3279
3280 struct support_info_op {
3281 ggml_tensor * op;
3282
3283 // true if the op runs on the accelerated ctx_clip.backend
3284 bool is_accel = true;
3285 };
3286
3287 struct support_info_graph {
3288 // whether the clip_ctx.backend supports flash attention
3289 bool fattn = true;
3290 ggml_tensor * fattn_op = nullptr; // for debugging
3291
3292 std::vector<support_info_op> ops;
3293 };
3294
3295 static void warmup(clip_ctx & ctx_clip) {
3296 support_info_graph info;
3297
3298 if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
3299 // try to enable flash attention to see if it's supported
3300 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
3301 info = alloc_compute_meta(ctx_clip);
3302 if (!info.fattn && info.fattn_op) {
3303 auto op = info.fattn_op;
3304 LOG_WRN("%s: *****************************************************************\n", __func__);
3305 LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend));
LOG_WRN("%s: op params:\n", __func__);
static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) {
    LOG_WRN("%s: %s: type = %s, ne = [%lld %lld %lld %lld], nb = [%zu %zu %zu %zu]\n", fn,
        name, ggml_type_name(t->type),
        (long long) t->ne[0], (long long) t->ne[1], (long long) t->ne[2], (long long) t->ne[3],
        t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
3312 };
3313 print_shape(__func__, " dst", op);
3314 print_shape(__func__, "src0", op->src[0]);
3315 print_shape(__func__, "src1", op->src[1]);
3316 print_shape(__func__, "src2", op->src[2]);
3317 LOG_WRN("%s: please report this on github as an issue\n", __func__);
3318 LOG_WRN("%s: *****************************************************************\n", __func__);
3319 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
3320 alloc_compute_meta(ctx_clip);
3321 }
3322 } else {
3323 info = alloc_compute_meta(ctx_clip);
3324 if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
3325 LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
3326 }
3327 }
3328
3329 LOG_INF("%s: flash attention is %s\n", __func__,
3330 (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
3331
3332 // print ops that are not supported by the GPU backend (if there is one)
3333 if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
3334 std::vector<support_info_op> unsupported_ops;
3335 for (const auto & op : info.ops) {
3336 if (!op.is_accel) {
3337 unsupported_ops.push_back(x: op);
3338 }
3339 }
3340 if (!unsupported_ops.empty()) {
3341 LOG_WRN("%s: *****************************************************************\n", __func__);
LOG_WRN("%s: WARNING: the CLIP graph uses operators that are not supported by the backend\n", __func__);
LOG_WRN("%s: performance will be suboptimal\n", __func__);
3344 LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend));
3345 for (const auto & op : unsupported_ops) {
LOG_WRN("%s: %16s: type = %s, ne = [%lld %lld %lld %lld]\n", __func__,
    ggml_op_name(op.op->op),
    ggml_type_name(op.op->type),
    (long long) op.op->ne[0], (long long) op.op->ne[1], (long long) op.op->ne[2], (long long) op.op->ne[3]);
3350 }
3351 LOG_WRN("%s: flash attention is %s\n", __func__,
3352 (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
3353 LOG_WRN("%s: please report this on github as an issue\n", __func__);
3354 LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
3355 LOG_WRN("%s: *****************************************************************\n", __func__);
3356 }
3357 }
3358 }
3359
3360 static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip) {
3361 const auto & hparams = ctx_clip.model.hparams;
3362 ctx_clip.buf_compute_meta.resize(new_size: ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
3363
3364 // create a fake batch
3365 clip_image_f32_batch batch;
3366 clip_image_f32_ptr img(clip_image_f32_init());
3367 if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
3368 img->nx = hparams.warmup_image_size;
3369 img->ny = hparams.warmup_image_size;
3370 LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
3371 } else {
3372 img->nx = hparams.warmup_audio_size;
3373 img->ny = hparams.n_mel_bins;
3374 LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
3375 }
3376 batch.entries.push_back(x: std::move(img));
3377
3378 ggml_cgraph * gf = clip_image_build_graph(ctx: &ctx_clip, imgs: batch);
3379 ggml_backend_sched_reserve(sched: ctx_clip.sched.get(), measure_graph: gf);
3380
3381 for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
3382 ggml_backend_t backend = ctx_clip.backend_ptrs[i];
3383 ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
3384 size_t size = ggml_backend_sched_get_buffer_size(sched: ctx_clip.sched.get(), backend);
3385 if (size > 1) {
3386 LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
3387 ggml_backend_buft_name(buft),
3388 size / 1024.0 / 1024.0);
3389 }
3390 }
3391
3392 const int n_splits = ggml_backend_sched_get_n_splits(sched: ctx_clip.sched.get());
3393 const int n_nodes = ggml_graph_n_nodes(cgraph: gf);
3394
3395 LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);
3396
3397 support_info_graph res {
3398 /*.fattn = */ true,
3399 /*.fattn_op = */ nullptr,
3400 /*.ops = */ {},
3401 };
3402
3403 // check op support
3404 for (int i = 0; i < ggml_graph_n_nodes(cgraph: gf); i++) {
3405 ggml_tensor * node = ggml_graph_node(cgraph: gf, i);
3406 res.ops.push_back(x: {.op: node, .is_accel: true});
3407 if (!ggml_backend_supports_op(backend: ctx_clip.backend, op: node)) {
3408 res.ops.back().is_accel = false;
3409 if (node->op == GGML_OP_FLASH_ATTN_EXT) {
3410 res.fattn = false;
3411 res.fattn_op = node;
3412 }
3413 }
3414 }
3415
3416 return res;
3417 }
3418
3419 void get_bool(const std::string & key, bool & output, bool required = true) const {
3420 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3421 if (i < 0) {
3422 if (required) {
3423 throw std::runtime_error("Key not found: " + key);
3424 }
3425 return;
3426 }
3427 output = gguf_get_val_bool(ctx: ctx_gguf.get(), key_id: i);
3428 }
3429
3430 void get_i32(const std::string & key, int & output, bool required = true) const {
3431 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3432 if (i < 0) {
3433 if (required) {
3434 throw std::runtime_error("Key not found: " + key);
3435 }
3436 return;
3437 }
3438 output = gguf_get_val_i32(ctx: ctx_gguf.get(), key_id: i);
3439 }
3440
3441 void get_u32(const std::string & key, int & output, bool required = true) const {
3442 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3443 if (i < 0) {
3444 if (required) {
3445 throw std::runtime_error("Key not found: " + key);
3446 }
3447 return;
3448 }
3449 output = gguf_get_val_u32(ctx: ctx_gguf.get(), key_id: i);
3450 }
3451
3452 void get_f32(const std::string & key, float & output, bool required = true) const {
3453 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3454 if (i < 0) {
3455 if (required) {
3456 throw std::runtime_error("Key not found: " + key);
3457 }
3458 return;
3459 }
3460 output = gguf_get_val_f32(ctx: ctx_gguf.get(), key_id: i);
3461 }
3462
3463 void get_string(const std::string & key, std::string & output, bool required = true) const {
3464 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3465 if (i < 0) {
3466 if (required) {
3467 throw std::runtime_error("Key not found: " + key);
3468 }
3469 return;
3470 }
3471 output = std::string(gguf_get_val_str(ctx: ctx_gguf.get(), key_id: i));
3472 }
3473
3474 void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
3475 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3476 if (i < 0) {
3477 if (required) {
3478 throw std::runtime_error("Key not found: " + key);
3479 }
3480 return;
3481 }
3482 int n = gguf_get_arr_n(ctx: ctx_gguf.get(), key_id: i);
3483 output.resize(new_size: n);
3484 const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx: ctx_gguf.get(), key_id: i);
3485 for (int i = 0; i < n; ++i) {
3486 output[i] = values[i];
3487 }
3488 }
3489
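// builds the list of resolution candidates used by LLaVA-UHD style slicing (see llava_uhd below);
// e.g. (illustrative values) with max_patches_per_side = 3 and image_size = 336, this produces the
// 8 candidates (x*336, y*336) for x, y in 1..3, excluding the 1x1 base resolution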
3490 static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
3491 auto & hparams = model.hparams;
3492 for (int x = 1; x <= max_patches_per_side; x++) {
3493 for (int y = 1; y <= max_patches_per_side; y++) {
3494 if (x == 1 && y == 1) {
continue; // skip 1x1, which is just the base image size
3496 }
3497 hparams.image_res_candidates.push_back(x: clip_image_size{
3498 .width: x*hparams.image_size,
3499 .height: y*hparams.image_size,
3500 });
3501 }
3502 }
3503 }
3504};
3505
3506struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
3507 g_logger_state.verbosity_thold = ctx_params.verbosity;
3508 clip_ctx * ctx_vision = nullptr;
3509 clip_ctx * ctx_audio = nullptr;
3510
3511 try {
3512 clip_model_loader loader(fname);
3513
3514 if (loader.has_vision) {
3515 ctx_vision = new clip_ctx(ctx_params);
3516 loader.load_hparams(model&: ctx_vision->model, modality: CLIP_MODALITY_VISION);
3517 loader.load_tensors(ctx_clip&: *ctx_vision);
3518 loader.warmup(ctx_clip&: *ctx_vision);
3519 }
3520
3521 if (loader.has_audio) {
3522 ctx_audio = new clip_ctx(ctx_params);
3523 loader.load_hparams(model&: ctx_audio->model, modality: CLIP_MODALITY_AUDIO);
3524 loader.load_tensors(ctx_clip&: *ctx_audio);
3525 loader.warmup(ctx_clip&: *ctx_audio);
3526 }
3527
3528 } catch (const std::exception & e) {
3529 LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
3530
3531 delete ctx_vision;
3532 delete ctx_audio;
3533
3534 return {.ctx_v: nullptr, .ctx_a: nullptr};
3535 }
3536
3537 return {.ctx_v: ctx_vision, .ctx_a: ctx_audio};
3538}
3539
3540struct clip_image_size * clip_image_size_init() {
3541 struct clip_image_size * load_image_size = new struct clip_image_size();
3542 load_image_size->width = 448;
3543 load_image_size->height = 448;
3544 return load_image_size;
3545}
3546
3547struct clip_image_u8 * clip_image_u8_init() {
3548 return new clip_image_u8();
3549}
3550
3551struct clip_image_f32 * clip_image_f32_init() {
3552 return new clip_image_f32();
3553}
3554
3555struct clip_image_f32_batch * clip_image_f32_batch_init() {
3556 return new clip_image_f32_batch();
3557}
3558
3559unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
3560 if (nx) *nx = img->nx;
3561 if (ny) *ny = img->ny;
3562 return img->buf.data();
3563}
3564
3565void clip_image_size_free(struct clip_image_size * load_image_size) {
3566 if (load_image_size == nullptr) {
3567 return;
3568 }
3569 delete load_image_size;
3570}
3571void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
3572void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
3573void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
3574void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }
3575
3576size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
3577 return batch->entries.size();
3578}
3579
3580size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
3581 if (idx < 0 || idx >= (int)batch->entries.size()) {
3582 LOG_ERR("%s: invalid index %d\n", __func__, idx);
3583 return 0;
3584 }
3585 return batch->entries[idx]->nx;
3586}
3587
3588size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
3589 if (idx < 0 || idx >= (int)batch->entries.size()) {
3590 LOG_ERR("%s: invalid index %d\n", __func__, idx);
3591 return 0;
3592 }
3593 return batch->entries[idx]->ny;
3594}
3595
3596clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
3597 if (idx < 0 || idx >= (int)batch->entries.size()) {
3598 LOG_ERR("%s: invalid index %d\n", __func__, idx);
3599 return nullptr;
3600 }
3601 return batch->entries[idx].get();
3602}
3603
3604void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
3605 img->nx = nx;
3606 img->ny = ny;
3607 img->buf.resize(new_size: 3 * nx * ny);
3608 memcpy(dest: img->buf.data(), src: rgb_pixels, n: img->buf.size());
3609}
3610
// Normalize image to float32 - be careful with pytorch's .to(model.device, dtype=torch.float16): it sometimes reduces precision (32->16->32), sometimes not
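// sanity-check example (mean/std values here are illustrative, the real ones come from hparams.image_mean/image_std):
// a pixel value of 255 with mean 0.5 and std 0.5 maps to (255/255 - 0.5) / 0.5 = 1.0, and a value of 0 maps to -1.0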
3612static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
3613 dst.nx = src.nx;
3614 dst.ny = src.ny;
3615 dst.buf.resize(new_size: src.buf.size());
3616
3617 // TODO @ngxson : seems like this could be done more efficiently on cgraph
3618 for (size_t i = 0; i < src.buf.size(); ++i) {
3619 int c = i % 3; // rgb
3620 dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
3621 }
3622}
3623
// set of tools to manipulate images
// in the future, we can add HW acceleration by allowing this struct to access 3rd-party libraries like ImageMagick or OpenCV
3626struct img_tool {
3627 enum resize_algo {
3628 RESIZE_ALGO_BILINEAR,
3629 RESIZE_ALGO_BICUBIC,
3630 // RESIZE_ALGO_LANCZOS, // TODO
3631 };
3632
3633 static void resize(
3634 const clip_image_u8 & src,
3635 clip_image_u8 & dst,
3636 const clip_image_size & target_resolution,
3637 resize_algo algo,
3638 bool add_padding = true, // TODO: define the behavior for add_padding = false
3639 std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
3640 dst.nx = target_resolution.width;
3641 dst.ny = target_resolution.height;
3642 dst.buf.resize(new_size: 3 * dst.nx * dst.ny);
3643
3644 if (dst.nx == src.nx && dst.ny == src.ny) {
3645 // no resize needed, simple copy
3646 dst.buf = src.buf;
3647 return;
3648 }
3649
3650 if (!add_padding) {
3651 // direct resize
3652 switch (algo) {
3653 case RESIZE_ALGO_BILINEAR:
3654 resize_bilinear(src, dst, target_width: target_resolution.width, target_height: target_resolution.height);
3655 break;
3656 case RESIZE_ALGO_BICUBIC:
3657 resize_bicubic(img: src, dst, target_width: target_resolution.width, target_height: target_resolution.height);
3658 break;
3659 default:
3660 throw std::runtime_error("Unsupported resize algorithm");
3661 }
3662 } else {
3663 // resize with padding
3664 clip_image_u8 resized_image;
3665 float scale_w = static_cast<float>(target_resolution.width) / src.nx;
3666 float scale_h = static_cast<float>(target_resolution.height) / src.ny;
3667 float scale = std::min(a: scale_w, b: scale_h);
3668 int new_width = std::min(a: static_cast<int>(std::ceil(x: src.nx * scale)), b: target_resolution.width);
3669 int new_height = std::min(a: static_cast<int>(std::ceil(x: src.ny * scale)), b: target_resolution.height);
3670
3671 switch (algo) {
3672 case RESIZE_ALGO_BILINEAR:
3673 resize_bilinear(src, dst&: resized_image, target_width: new_width, target_height: new_height);
3674 break;
3675 case RESIZE_ALGO_BICUBIC:
3676 resize_bicubic(img: src, dst&: resized_image, target_width: new_width, target_height: new_height);
3677 break;
3678 default:
3679 throw std::runtime_error("Unsupported resize algorithm");
3680 }
3681
3682 // fill dst with pad_color
3683 fill(img&: dst, color: pad_color);
3684
3685 int offset_x = (target_resolution.width - new_width) / 2;
3686 int offset_y = (target_resolution.height - new_height) / 2;
3687
3688 composite(dst, src: resized_image, offset_x, offset_y);
3689 }
3690 }
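
// illustrative usage (the numbers are made up): letterbox an arbitrary image into a square canvas,
// filling the borders with a neutral gray:
//
//   clip_image_u8 canvas;
//   img_tool::resize(src, canvas, /*target_resolution=*/{448, 448},
//                    img_tool::RESIZE_ALGO_BILINEAR,
//                    /*add_padding=*/true, /*pad_color=*/{122, 116, 104});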
3691
3692 static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
3693 dst.nx = w;
3694 dst.ny = h;
3695 dst.buf.resize(new_size: 3 * w * h);
3696
3697 for (int i = 0; i < h; ++i) {
3698 for (int j = 0; j < w; ++j) {
3699 int src_idx = 3 * ((y + i)*image.nx + (x + j));
3700 int dst_idx = 3 * (i*w + j);
3701 dst.buf[dst_idx] = image.buf[src_idx];
3702 dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
3703 dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
3704 }
3705 }
3706 }
3707
3708 // calculate the size of the **resized** image, while preserving the aspect ratio
// the calculated size is rounded up to a multiple of align_size
// the longer edge is scaled to (approximately) longest_edge; after alignment the result can slightly exceed it
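// worked example (illustrative numbers): align_size = 28, longest_edge = 1024, input 3000 x 1500:
//   scale = min(1024/3000, 1024/1500) = 0.3413 -> 1024.0 x 512.0 -> aligned up to 1036 x 532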
3711 static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
3712 GGML_ASSERT(align_size > 0);
3713 if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
3714 return {.width: 0, .height: 0};
3715 }
3716
3717 float scale = std::min(a: static_cast<float>(longest_edge) / inp_size.width,
3718 b: static_cast<float>(longest_edge) / inp_size.height);
3719
3720 float target_width_f = static_cast<float>(inp_size.width) * scale;
3721 float target_height_f = static_cast<float>(inp_size.height) * scale;
3722
3723 auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x: x / static_cast<float>(f))) * f; };
3724 int aligned_width = ceil_by_factor(target_width_f);
3725 int aligned_height = ceil_by_factor(target_height_f);
3726
3727 return {.width: aligned_width, .height: aligned_height};
3728 }
3729
3730 // calculate the size of the **resized** image, while preserving the aspect ratio
3731 // the calculated size will have min_pixels <= W*H <= max_pixels
// this is referred to as "smart_resize" in the transformers code
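// worked example (illustrative numbers): align_size = 28, max_pixels = 200000, input 1000 x 500:
//   aligning up gives 1008 x 504 = 508032 pixels > max_pixels, so beta = sqrt(1000*500/200000) ~= 1.58;
//   flooring 1000/1.58 and 500/1.58 to multiples of 28 gives 616 x 308 = 189728 pixels, within the limit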
3733 static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
3734 GGML_ASSERT(align_size > 0);
3735 const int width = inp_size.width;
3736 const int height = inp_size.height;
3737
3738 auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x: x / static_cast<float>(f))) * f; };
3739 auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x: x / static_cast<float>(f))) * f; };
3740
3741 // always align up first
3742 int h_bar = std::max(a: align_size, b: ceil_by_factor(height));
3743 int w_bar = std::max(a: align_size, b: ceil_by_factor(width));
3744
3745 if (h_bar * w_bar > max_pixels) {
3746 const auto beta = std::sqrt(x: static_cast<float>(height * width) / max_pixels);
3747 h_bar = std::max(a: align_size, b: floor_by_factor(height / beta));
3748 w_bar = std::max(a: align_size, b: floor_by_factor(width / beta));
3749 } else if (h_bar * w_bar < min_pixels) {
3750 const auto beta = std::sqrt(x: static_cast<float>(min_pixels) / (height * width));
3751 h_bar = ceil_by_factor(height * beta);
3752 w_bar = ceil_by_factor(width * beta);
3753 }
3754
3755 return {.width: w_bar, .height: h_bar};
3756 }
3757
3758 // draw src image into dst image at offset (offset_x, offset_y)
3759 static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
3760 for (int y = 0; y < src.ny; ++y) {
3761 for (int x = 0; x < src.nx; ++x) {
3762 int dx = x + offset_x;
3763 int dy = y + offset_y;
3764 // skip pixels that would be out of bounds in the destination
3765 if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
3766 continue;
3767 }
3768 size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
3769 size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
3770 dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
3771 dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
3772 dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
3773 }
3774 }
3775 }
3776
3777 // fill the image with a solid color
3778 static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
3779 for (size_t i = 0; i < img.buf.size(); i += 3) {
3780 img.buf[i] = color[0];
3781 img.buf[i + 1] = color[1];
3782 img.buf[i + 2] = color[2];
3783 }
3784 }
3785
3786private:
3787 // Bilinear resize function
3788 static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
3789 dst.nx = target_width;
3790 dst.ny = target_height;
3791 dst.buf.resize(new_size: 3 * target_width * target_height);
3792
3793 float x_ratio = static_cast<float>(src.nx - 1) / target_width;
3794 float y_ratio = static_cast<float>(src.ny - 1) / target_height;
3795
3796 for (int y = 0; y < target_height; y++) {
3797 for (int x = 0; x < target_width; x++) {
3798 float px = x_ratio * x;
3799 float py = y_ratio * y;
3800 int x_floor = static_cast<int>(px);
3801 int y_floor = static_cast<int>(py);
3802 float x_lerp = px - x_floor;
3803 float y_lerp = py - y_floor;
3804
3805 for (int c = 0; c < 3; c++) {
3806 float top = lerp(
3807 s: static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
3808 e: static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
3809 t: x_lerp
3810 );
3811 float bottom = lerp(
3812 s: static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
3813 e: static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
3814 t: x_lerp
3815 );
3816 dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(s: top, e: bottom, t: y_lerp));
3817 }
3818 }
3819 }
3820 }
3821
3822 // Bicubic resize function
3823 // part of image will be cropped if the aspect ratio is different
3824 static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
3825 const int nx = img.nx;
3826 const int ny = img.ny;
3827
3828 dst.nx = target_width;
3829 dst.ny = target_height;
3830 dst.buf.resize(new_size: 3 * target_width * target_height);
3831
3832 float Cc;
3833 float C[5] = {};
3834 float d0, d2, d3, a0, a1, a2, a3;
3835 int i, j, k, jj;
3836 int x, y;
3837 float dx, dy;
3838 float tx, ty;
3839
3840 tx = (float)nx / (float)target_width;
3841 ty = (float)ny / (float)target_height;
3842
// Bicubic interpolation; adapted from ViT.cpp, inspired by:
3844 // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
3845 // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
3846
3847 for (i = 0; i < target_height; i++) {
3848 for (j = 0; j < target_width; j++) {
3849 x = (int)(tx * j);
3850 y = (int)(ty * i);
3851
3852 dx = tx * j - x;
3853 dy = ty * i - y;
3854
3855 for (k = 0; k < 3; k++) {
3856 for (jj = 0; jj <= 3; jj++) {
3857 d0 = img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x: x - 1, lower: 0, upper: nx - 1)) * 3 + k] - img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x, lower: 0, upper: nx - 1)) * 3 + k];
3858 d2 = img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x: x + 1, lower: 0, upper: nx - 1)) * 3 + k] - img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x, lower: 0, upper: nx - 1)) * 3 + k];
3859 d3 = img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x: x + 2, lower: 0, upper: nx - 1)) * 3 + k] - img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x, lower: 0, upper: nx - 1)) * 3 + k];
3860 a0 = img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x, lower: 0, upper: nx - 1)) * 3 + k];
3861
3862 a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
3863 a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
3864 a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
3865
3866 C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
3867
3868 d0 = C[0] - C[1];
3869 d2 = C[2] - C[1];
3870 d3 = C[3] - C[1];
3871 a0 = C[1];
3872 a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
3873 a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
3874 a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
3875 Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
3876
3877 const uint8_t Cc2 = std::min(a: std::max(a: std::round(x: Cc), b: 0.0f), b: 255.0f);
3878 dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
3879 }
3880 }
3881 }
3882 }
3883
3884 return true;
3885 }
3886
3887 static inline int clip(int x, int lower, int upper) {
3888 return std::max(a: lower, b: std::min(a: x, b: upper));
3889 }
3890
3891 // Linear interpolation between two points
3892 static inline float lerp(float s, float e, float t) {
3893 return s + (e - s) * t;
3894 }
3895};
3896
3897/**
3898 * implementation of LLaVA-UHD:
3899 * - https://arxiv.org/pdf/2403.11703
3900 * - https://github.com/thunlp/LLaVA-UHD
3901 * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
3902 *
3903 * overview:
 * - an image always has a single overview (downscaled image)
3905 * - an image can have 0 or multiple slices, depending on the image size
3906 * - each slice can then be considered as a separate image
3907 *
3908 * for example:
3909 *
3910 * [overview] --> [slice 1] --> [slice 2]
3911 * | |
3912 * +--> [slice 3] --> [slice 4]
3913 */
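// worked example (hypothetical numbers): with slice_size = 448, a refined size of 896 x 1344 gives a
// 2 x 3 grid, i.e. 6 slices plus the single overview image, so 7 images are fed to the encoder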
3914struct llava_uhd {
3915 struct slice_coordinates {
3916 int x;
3917 int y;
3918 clip_image_size size;
3919 };
3920
3921 struct slice_instructions {
3922 clip_image_size overview_size; // size of downscaled image
3923 clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
3924 clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
3925 std::vector<slice_coordinates> slices;
bool padding_refined = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6)
3927 };
3928
3929 static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
3930 slice_instructions res;
3931 const int patch_size = clip_get_patch_size(ctx);
3932 const int slice_size = clip_get_image_size(ctx);
3933 const int original_width = original_size.width;
3934 const int original_height = original_size.height;
3935
3936 const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
3937 const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
3938
3939 if (!has_slices) {
3940 // skip slicing logic
3941 res.overview_size = clip_image_size{.width: slice_size, .height: slice_size};
3942 res.refined_size = clip_image_size{.width: 0, .height: 0};
3943 res.grid_size = clip_image_size{.width: 0, .height: 0};
3944
3945 return res;
3946 }
3947
3948 if (has_pinpoints) {
3949 // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
3950 auto refine_size = llava_uhd::select_best_resolution(
3951 original_size,
3952 possible_resolutions: ctx->model.hparams.image_res_candidates);
3953 res.overview_size = clip_image_size{.width: slice_size, .height: slice_size};
3954 res.refined_size = refine_size;
3955 res.grid_size = clip_image_size{.width: 0, .height: 0};
3956 res.padding_refined = true;
3957
3958 LOG_DBG("%s: using pinpoints for slicing\n", __func__);
3959 LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
3960 __func__, original_width, original_height,
3961 res.overview_size.width, res.overview_size.height,
3962 res.refined_size.width, res.refined_size.height);
3963
3964 for (int y = 0; y < refine_size.height; y += slice_size) {
3965 for (int x = 0; x < refine_size.width; x += slice_size) {
3966 slice_coordinates slice;
3967 slice.x = x;
3968 slice.y = y;
3969 slice.size.width = std::min(a: slice_size, b: refine_size.width - x);
3970 slice.size.height = std::min(a: slice_size, b: refine_size.height - y);
3971 res.slices.push_back(x: slice);
3972 LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
3973 __func__, (int)res.slices.size() - 1,
3974 slice.x, slice.y, slice.size.width, slice.size.height);
3975 }
3976 }
3977
3978 res.grid_size.height = refine_size.height / slice_size;
3979 res.grid_size.width = refine_size.width / slice_size;
3980 LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
3981
3982 return res;
3983 }
3984
3985 // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
3986
3987 auto best_size = get_best_resize(original_size, scale_resolution: slice_size, patch_size, allow_upscale: !has_slices);
3988 res.overview_size = best_size;
3989
3990 {
3991 const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
3992 const float log_ratio = log(x: (float)original_width / original_height);
3993 const float ratio = (float)original_width * original_height / (slice_size * slice_size);
3994 const int multiple = fmin(x: ceil(x: ratio), y: max_slice_nums);
3995
3996 auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
3997 auto refine_size = get_refine_size(original_size, grid: best_grid, scale_resolution: slice_size, patch_size, allow_upscale: true);
3998 res.grid_size = best_grid;
3999 res.refined_size = refine_size;
4000
4001 LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
4002 __func__, original_width, original_height,
4003 res.overview_size.width, res.overview_size.height,
4004 res.refined_size.width, res.refined_size.height,
4005 res.grid_size.width, res.grid_size.height);
4006
4007 int width = refine_size.width;
4008 int height = refine_size.height;
4009 int grid_x = int(width / best_grid.width);
4010 int grid_y = int(height / best_grid.height);
4011 for (int patches_y = 0, ic = 0;
4012 patches_y < refine_size.height && ic < best_grid.height;
4013 patches_y += grid_y, ic += 1) {
4014 for (int patches_x = 0, jc = 0;
4015 patches_x < refine_size.width && jc < best_grid.width;
4016 patches_x += grid_x, jc += 1) {
4017 slice_coordinates slice;
4018 slice.x = patches_x;
4019 slice.y = patches_y;
4020 slice.size.width = grid_x;
4021 slice.size.height = grid_y;
4022 res.slices.push_back(x: slice);
4023 LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
4024 __func__, (int)res.slices.size() - 1,
4025 slice.x, slice.y, slice.size.width, slice.size.height);
4026 }
4027 }
4028 }
4029
4030 return res;
4031 }
4032
4033 static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
4034 std::vector<clip_image_u8_ptr> output;
4035 img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable
4036
4037 // resize to overview size
4038 clip_image_u8_ptr resized_img(clip_image_u8_init());
4039 img_tool::resize(src: *img, dst&: *resized_img, target_resolution: inst.overview_size, algo: interpolation);
4040 output.push_back(x: std::move(resized_img));
4041 if (inst.slices.empty()) {
4042 // no slices, just return the resized image
4043 return output;
4044 }
4045
4046 // resize to refined size
4047 clip_image_u8_ptr refined_img(clip_image_u8_init());
4048 if (inst.padding_refined) {
4049 img_tool::resize(src: *img, dst&: *refined_img, target_resolution: inst.refined_size, algo: interpolation);
4050 } else {
4051 // only algo bicubic preserves the ratio; old models rely on this behavior
4052 // TODO: do we need to support other algos here?
4053 img_tool::resize(src: *img, dst&: *refined_img, target_resolution: inst.refined_size, algo: img_tool::RESIZE_ALGO_BICUBIC, add_padding: false);
4054 }
4055
4056 // create slices
4057 for (const auto & slice : inst.slices) {
4058 int x = slice.x;
4059 int y = slice.y;
4060 int w = slice.size.width;
4061 int h = slice.size.height;
4062
4063 clip_image_u8_ptr img_slice(clip_image_u8_init());
4064 img_tool::crop(image: *refined_img, dst&: *img_slice, x, y, w, h);
4065 output.push_back(x: std::move(img_slice));
4066 }
4067
4068 return output;
4069 }
4070
4071private:
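// when the image area exceeds scale_resolution^2 (or allow_upscale is set), rescale so the area is roughly
// scale_resolution^2 while keeping the aspect ratio, then round each side to the nearest multiple of patch_size;
// e.g. (illustrative) 1000 x 2000 with scale_resolution = 448 and patch_size = 14 -> 316 x 633 -> 322 x 630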
4072 static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
4073 int width = original_size.width;
4074 int height = original_size.height;
4075 if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
4076 float r = static_cast<float>(width) / height;
4077 height = static_cast<int>(scale_resolution / std::sqrt(x: r));
4078 width = static_cast<int>(height * r);
4079 }
4080 clip_image_size res;
4081 res.width = ensure_divide(length: width, patch_size);
4082 res.height = ensure_divide(length: height, patch_size);
4083 return res;
4084 }
4085
4086 static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
4087 float scale_width = static_cast<float>(target_max.width) / orig.width;
4088 float scale_height = static_cast<float>(target_max.height) / orig.height;
4089 float scale = std::min(a: scale_width, b: scale_height);
4090 return clip_image_size{
4091 .width: static_cast<int>(orig.width * scale),
4092 .height: static_cast<int>(orig.height * scale),
4093 };
4094 }
4095
4096 /**
4097 * Selects the best resolution from a list of possible resolutions based on the original size.
4098 *
4099 * For example, when given a list of resolutions:
4100 * - 100x100
4101 * - 200x100
4102 * - 100x200
4103 * - 200x200
4104 *
 * And an input image of size 111x200, the candidate keeping the highest effective resolution is chosen (ties broken by least wasted area); here that is 200x200, which preserves the full 111x200.
4106 *
4107 * @param original_size The original size of the image
4108 * @param possible_resolutions A list of possible resolutions
4109 * @return The best fit resolution
4110 */
4111 static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
4112 clip_image_size best_fit;
4113 int min_wasted_area = std::numeric_limits<int>::max();
4114 int max_effective_resolution = 0;
4115
4116 for (const clip_image_size & candidate : possible_resolutions) {
4117 auto target_size = resize_maintain_aspect_ratio(orig: original_size, target_max: candidate);
4118 int effective_resolution = std::min(
4119 a: target_size.width * target_size.height,
4120 b: original_size.width * original_size.height);
4121 int wasted_area = (candidate.width * candidate.height) - effective_resolution;
4122
4123 if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
4124 max_effective_resolution = effective_resolution;
4125 min_wasted_area = wasted_area;
4126 best_fit = candidate;
4127 }
4128
4129 LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
4130 }
4131
4132 return best_fit;
4133 }
4134
4135 static int ensure_divide(int length, int patch_size) {
4136 return std::max(a: static_cast<int>(std::round(x: static_cast<float>(length) / patch_size) * patch_size), b: patch_size);
4137 }
4138
4139 static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
4140 int width = original_size.width;
4141 int height = original_size.height;
4142 int grid_x = grid.width;
4143 int grid_y = grid.height;
4144
        int refine_width  = ensure_divide(width, grid_x);
        int refine_height = ensure_divide(height, grid_y);
4147
4148 clip_image_size grid_size;
4149 grid_size.width = refine_width / grid_x;
4150 grid_size.height = refine_height / grid_y;
4151
        auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
4153 int best_grid_width = best_grid_size.width;
4154 int best_grid_height = best_grid_size.height;
4155
4156 clip_image_size refine_size;
4157 refine_size.width = best_grid_width * grid_x;
4158 refine_size.height = best_grid_height * grid_y;
4159 return refine_size;
4160 }
4161
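    // pick the grid layout (columns x rows) whose aspect ratio is closest, in log space,
    // to the original image aspect ratio, considering grids with about `multiple` cells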
4162 static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
4163 std::vector<int> candidate_split_grids_nums;
4164 for (int i : {multiple - 1, multiple, multiple + 1}) {
4165 if (i == 1 || i > max_slice_nums) {
4166 continue;
4167 }
            candidate_split_grids_nums.push_back(i);
4169 }
4170
4171 std::vector<clip_image_size> candidate_grids;
4172 for (int split_grids_nums : candidate_split_grids_nums) {
4173 int m = 1;
4174 while (m <= split_grids_nums) {
4175 if (split_grids_nums % m == 0) {
                candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
4177 }
4178 ++m;
4179 }
4180 }
4181
        clip_image_size best_grid{1, 1};
        float min_error = std::numeric_limits<float>::infinity();
        for (const auto & grid : candidate_grids) {
            float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
4186 if (error < min_error) {
4187 best_grid = grid;
4188 min_error = error;
4189 }
4190 }
4191 return best_grid;
4192 }
4193};
4194
// returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6) it returns the normalized image patch tensors as a vector
// memory for res_imgs is allocated here; previous allocations will be freed if found
4197bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
    clip_image_size original_size{img->nx, img->ny};
4199 auto & params = ctx->model.hparams;
4200
4201 switch (ctx->proj_type()) {
4202 case PROJECTOR_TYPE_MINICPMV:
4203 {
4204 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
4205 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
4206
4207 for (size_t i = 0; i < imgs.size(); ++i) {
4208 // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
4209 clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
4212 }
4213
4214 res_imgs->grid_x = inst.grid_size.width;
4215 res_imgs->grid_y = inst.grid_size.height;
4216 } break;
4217
4218 case PROJECTOR_TYPE_QWEN2VL:
4219 case PROJECTOR_TYPE_QWEN25VL:
4220 case PROJECTOR_TYPE_QWEN3VL:
4221 {
4222 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
4223 clip_image_u8 resized;
                const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * 2,
                    params.image_min_pixels,
                    params.image_max_pixels);
                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
                // clip_image_save_to_bmp(resized, "preproc.bmp");
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                // clip_image_f32_ptr res(clip_image_f32_init());
                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
                // res_imgs->data[0] = *res;
                res_imgs->entries.push_back(std::move(img_f32));
4236 } break;
4237
4238 case PROJECTOR_TYPE_IDEFICS3:
4239 {
4240 // The refined size has two steps:
4241 // 1. Resize w/ aspect-ratio preserving such that the longer side is
4242 // the preprocessor longest size
4243 // 2. Resize w/out preserving aspect ratio such that both sides are
4244 // multiples of image_size (always rounding up)
4245 //
4246 // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
                const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
                    original_size, params.image_size, params.image_longest_edge);
                // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
                //     __func__, original_size.width, original_size.height,
                //     refined_size.width, refined_size.height);

                llava_uhd::slice_instructions instructions;
                instructions.overview_size = clip_image_size{params.image_size, params.image_size};
                instructions.refined_size = refined_size;
                instructions.grid_size = clip_image_size{
                    static_cast<int>(std::ceil(static_cast<float>(refined_size.width)  / params.image_size)),
                    static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
                };
                for (int y = 0; y < refined_size.height; y += params.image_size) {
                    for (int x = 0; x < refined_size.width; x += params.image_size) {
                        // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
                        instructions.slices.push_back(llava_uhd::slice_coordinates{
                            /* x    */ x,
                            /* y    */ y,
                            /* size */ clip_image_size{
                                std::min(params.image_size, refined_size.width  - x),
                                std::min(params.image_size, refined_size.height - y)
                            }
                        });
                    }
                }
                auto imgs = llava_uhd::slice_image(img, instructions);
4274
4275 // cast and normalize to f32
4276 for (size_t i = 0; i < imgs.size(); ++i) {
4277 // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
4278 clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
4281 }
4282
4283 res_imgs->grid_x = instructions.grid_size.width;
4284 res_imgs->grid_y = instructions.grid_size.height;
4285 } break;
4286
4287 case PROJECTOR_TYPE_GLM_EDGE:
4288 case PROJECTOR_TYPE_GEMMA3:
4289 case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
4290 {
4291 clip_image_u8 resized_image;
4292 int sz = params.image_size;
                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                //clip_image_save_to_bmp(resized_image, "resized.bmp");
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
4298 } break;
4299
4300 case PROJECTOR_TYPE_JANUS_PRO:
4301 {
4302 // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
4303 const std::array<uint8_t, 3> pad_color = {127, 127, 127};
4304 clip_image_u8 resized_image;
4305 int sz = params.image_size;
                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
4310 } break;
4311
4312 case PROJECTOR_TYPE_PIXTRAL:
4313 case PROJECTOR_TYPE_LIGHTONOCR:
4314 {
4315 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
4316 clip_image_u8 resized_image;
4317 // the original pixtral model doesn't have n_merge
4318 const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * cur_merge,
                    params.image_min_pixels,
                    params.image_max_pixels);
                img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
4328 } break;
4329
4330 case PROJECTOR_TYPE_LLAMA4:
4331 {
4332 GGML_ASSERT(!params.image_res_candidates.empty());
4333 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
4334 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
4335
4336 for (size_t i = 0; i < imgs.size(); ++i) {
4337 clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
4340 }
4341
4342 res_imgs->grid_x = inst.grid_size.width;
4343 res_imgs->grid_y = inst.grid_size.height;
4344 } break;
4345
4346 case PROJECTOR_TYPE_LFM2:
4347 case PROJECTOR_TYPE_KIMIVL:
4348 {
4349 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * params.n_merge,
                    params.image_min_pixels,
                    params.image_max_pixels);
                const std::array<uint8_t, 3> pad_color = {122, 116, 104};

                clip_image_u8 resized_img;
                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
                clip_image_f32_ptr res(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(res));
4362 } break;
4363
4364 case PROJECTOR_TYPE_MLP:
4365 case PROJECTOR_TYPE_MLP_NORM:
4366 case PROJECTOR_TYPE_LDP:
4367 case PROJECTOR_TYPE_LDPV2:
4368 case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
4369 {
4370 // TODO @ngxson : refactor the code below to avoid duplicated logic
4371
4372 // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
4373 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
4374
4375 clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
4376
                // The model config contains everything we need to decide how to preprocess; here we automatically switch to the new llava-1.6 preprocessing
4378 if (params.image_res_candidates.empty()) { // pad_to_square
4379 // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
4380 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
                    const int longer_side = std::max(img->nx, img->ny);
                    temp->nx = longer_side;
                    temp->ny = longer_side;
                    temp->buf.resize(3 * longer_side * longer_side);

                    // background color in RGB from LLaVA (this is the mean rgb color * 255)
                    const std::array<uint8_t, 3> pad_color = {122, 116, 104};

                    // resize the image to the target_size
                    img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);

                    clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
4395
4396 } else {
4397 // "spatial_unpad" with "anyres" processing for llava-1.6
4398 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
4399 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
4400
4401 for (size_t i = 0; i < imgs.size(); ++i) {
4402 // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
4403 clip_image_f32_ptr res(clip_image_f32_init());
                        normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                        res_imgs->entries.push_back(std::move(res));
4406 }
4407 }
4408 } break;
4409
4410 default:
4411 LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
4412 return false;
4413 }
4414
4415 return true;
4416}
4417
4418ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
4419 return ctx->model.image_newline;
4420}
4421
4422void clip_free(clip_ctx * ctx) {
4423 if (ctx == nullptr) {
4424 return;
4425 }
4426 delete ctx;
4427}
4428
4429// deprecated
4430size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
4431 const int32_t nx = ctx->model.hparams.image_size;
4432 const int32_t ny = ctx->model.hparams.image_size;
    return clip_embd_nbytes_by_img(ctx, nx, ny);
4434}
4435
4436size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
4437 clip_image_f32 img;
4438 img.nx = img_w;
4439 img.ny = img_h;
    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
4441}
4442
4443int32_t clip_get_image_size(const struct clip_ctx * ctx) {
4444 return ctx->model.hparams.image_size;
4445}
4446
4447int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
4448 return ctx->model.hparams.patch_size;
4449}
4450
4451int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
4452 return ctx->model.hparams.n_embd;
4453}
4454
4455const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
4456 return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
4457}
4458
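// number of output token columns; for Qwen2/2.5/3-VL this is the 2D token grid width,
// for all other projectors it equals the total token count (with clip_n_output_tokens_y() == 1)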
4459int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
4460 const auto & params = ctx->model.hparams;
4461 const int n_total = clip_n_output_tokens(ctx, img);
4462 if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
4463 return img->nx / (params.patch_size * 2);
4464 }
4465 return n_total;
4466}
4467
4468int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
4469 const auto & params = ctx->model.hparams;
4470 if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
4471 return img->ny / (params.patch_size * 2);
4472 }
4473 return 1;
4474}
4475
4476int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
4477 const auto & params = ctx->model.hparams;
4478
    // for models with a fixed image size, the input image has already been pre-processed and resized to a square
4480 int patch_size = params.patch_size;
4481 int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
4482
4483 projector_type proj = ctx->proj_type();
4484
4485 switch (proj) {
4486 case PROJECTOR_TYPE_MLP:
4487 case PROJECTOR_TYPE_MLP_NORM:
4488 case PROJECTOR_TYPE_JANUS_PRO:
4489 {
4490 // do nothing
4491 } break;
4492 case PROJECTOR_TYPE_LDP:
4493 case PROJECTOR_TYPE_LDPV2:
4494 case PROJECTOR_TYPE_GLM_EDGE:
4495 {
4496 n_patches /= 4;
4497 if (ctx->model.mm_boi) {
4498 n_patches += 2; // for BOI and EOI token embeddings
4499 }
4500 } break;
4501 case PROJECTOR_TYPE_MINICPMV:
4502 {
4503 // Use actual config value if available, otherwise fall back to hardcoded values
4504 if (params.minicpmv_query_num > 0) {
4505 n_patches = params.minicpmv_query_num;
4506 } else {
4507 // Fallback to hardcoded values for legacy models
4508 if (params.minicpmv_version == 2) {
4509 n_patches = 96;
4510 } else if (params.minicpmv_version == 3) {
4511 n_patches = 64;
4512 } else if (params.minicpmv_version == 4) {
4513 n_patches = 64;
4514 } else if (params.minicpmv_version == 5) {
4515 // MiniCPM-V 4.0
4516 n_patches = 64;
4517 } else if (params.minicpmv_version == 6) {
4518 // MiniCPM-V 4.5
4519 n_patches = 64;
4520 } else {
4521 GGML_ABORT("Unknown minicpmv version");
4522 }
4523 }
4524 } break;
4525 case PROJECTOR_TYPE_QWEN2VL:
4526 case PROJECTOR_TYPE_QWEN25VL:
4527 case PROJECTOR_TYPE_QWEN3VL:
4528 {
4529 // dynamic size (2 conv, so double patch size)
4530 int x_patch = img->nx / (params.patch_size * 2);
4531 int y_patch = img->ny / (params.patch_size * 2);
4532 n_patches = x_patch * y_patch;
4533 } break;
4534 case PROJECTOR_TYPE_GEMMA3:
4535 case PROJECTOR_TYPE_IDEFICS3:
4536 case PROJECTOR_TYPE_INTERNVL:
4537 case PROJECTOR_TYPE_LLAMA4:
4538 {
4539 // both X and Y are downscaled by the scale factor
4540 int scale_factor = ctx->model.hparams.n_merge;
4541 n_patches /= (scale_factor * scale_factor);
4542 } break;
4543 case PROJECTOR_TYPE_LFM2:
4544 case PROJECTOR_TYPE_KIMIVL:
4545 {
4546 // dynamic size
4547 int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
4548 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
4549 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
4550 n_patches = x_patch * y_patch;
4551 } break;
4552 case PROJECTOR_TYPE_PIXTRAL:
4553 case PROJECTOR_TYPE_LIGHTONOCR:
4554 {
4555 // dynamic size
4556 int n_merge = ctx->model.hparams.n_merge;
4557 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
4558 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
4559 if (ctx->model.token_embd_img_break) {
4560 n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
4561 } else {
4562 n_patches = n_patches_y * n_patches_x;
4563 }
4564 } break;
4565 case PROJECTOR_TYPE_VOXTRAL:
4566 case PROJECTOR_TYPE_ULTRAVOX:
4567 case PROJECTOR_TYPE_QWEN2A:
4568 {
4569 n_patches = img->nx;
4570
4571 const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
4572 if (ctx->model.audio_has_stack_frames()) {
4573 GGML_ASSERT(proj_stack_factor > 0);
4574 const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor);
4575 n_patches = n_len / proj_stack_factor;
4576 }
4577
4578 // whisper downscales input token by half after conv1d
4579 n_patches /= 2;
4580
4581 if (ctx->model.audio_has_avgpool()) {
4582 // divide by 2 because of nn.AvgPool1d(2, stride=2)
4583 n_patches /= 2;
4584 }
4585 } break;
4586 case PROJECTOR_TYPE_COGVLM:
4587 {
4588 n_patches += 2; // for BOI and EOI token embeddings
4589 } break;
4590 default:
4591 GGML_ABORT("unsupported projector type");
4592 }
4593
4594 return n_patches;
4595}
4596
4597bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
4598 clip_image_f32_batch imgs;
4599 clip_image_f32_ptr img_copy(clip_image_f32_init());
4600 *img_copy = *img;
    imgs.entries.push_back(std::move(img_copy));

    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
4604}
4605
4606bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
4607 const clip_image_f32_batch & imgs = *imgs_c_ptr;
4608 int batch_size = imgs.entries.size();
4609
4610 // TODO @ngxson : implement batch size > 1 as a loop
    // we don't need true batching support because the cgraph is going to be big anyway
4612 if (batch_size != 1) {
4613 return false; // only support batch size of 1
4614 }
4615
4616 // build the inference graph
4617 ctx->debug_print_tensors.clear();
    ggml_backend_sched_reset(ctx->sched.get());
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
4621
4622 // set inputs
4623 const auto & model = ctx->model;
4624 const auto & hparams = model.hparams;
4625
4626 const int image_size_width = imgs.entries[0]->nx;
4627 const int image_size_height = imgs.entries[0]->ny;
4628
4629 const int patch_size = hparams.patch_size;
4630 const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
4631 const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
4632 const int pos_w = image_size_width / patch_size;
4633 const int pos_h = image_size_height / patch_size;
4634
4635 const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
4636
4637 auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
4639 if (inp == nullptr) {
4640 GGML_ABORT("Failed to get tensor %s", name);
4641 }
4642 if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
4643 GGML_ABORT("Tensor %s is not an input tensor", name);
4644 }
4645 return inp;
4646 };
4647
4648 auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
4649 ggml_tensor * cur = get_inp_tensor(name);
4650 GGML_ASSERT(cur->type == GGML_TYPE_F32);
4651 GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
4653 };
4654
4655 auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
4656 ggml_tensor * cur = get_inp_tensor(name);
4657 GGML_ASSERT(cur->type == GGML_TYPE_I32);
4658 GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
4660 };
4661
4662 // set input pixel values
4663 if (!imgs.is_audio) {
4664 size_t nelem = 0;
4665 for (const auto & img : imgs.entries) {
4666 nelem += img->nx * img->ny * 3;
4667 }
4668 std::vector<float> inp_raw(nelem);
4669
4670 // layout of data (note: the channel dim is unrolled to better visualize the layout):
4671 //
4672 // ┌──W──┐
4673 // │ H │ channel = R
4674 // ├─────┤ │
4675 // │ H │ channel = G
4676 // ├─────┤ │
4677 // │ H │ channel = B
4678 // └─────┘ │
4679 // ──────┘ x B
4680
4681 for (size_t i = 0; i < imgs.entries.size(); i++) {
4682 const int nx = imgs.entries[i]->nx;
4683 const int ny = imgs.entries[i]->ny;
4684 const int n = nx * ny;
4685
4686 for (int b = 0; b < batch_size; b++) {
4687 float * batch_entry = inp_raw.data() + b * (3*n);
4688 for (int y = 0; y < ny; y++) {
4689 for (int x = 0; x < nx; x++) {
4690 size_t base_src = 3*(y * nx + x); // idx of the first channel
4691 size_t base_dst = y * nx + x; // idx of the first channel
4692 batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
4693 batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
4694 batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
4695 }
4696 }
4697 }
4698 }
4699 set_input_f32("inp_raw", inp_raw);
4700
4701 } else {
4702 // audio input
4703 GGML_ASSERT(imgs.entries.size() == 1);
4704 const auto & mel_inp = imgs.entries[0];
4705 const int n_step = mel_inp->nx;
4706 const int n_mel = mel_inp->ny;
4707 std::vector<float> inp_raw(n_step * n_mel);
        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
4709 set_input_f32("inp_raw", inp_raw);
4710 }
4711
4712 // set input per projector
4713 switch (ctx->model.proj_type) {
4714 case PROJECTOR_TYPE_MINICPMV:
4715 {
4716 // inspired from siglip:
4717 // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
4718 // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
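                // bucket the patch grid coordinates into a fixed 70x70 grid and flatten them into a single position index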
4719 std::vector<int32_t> positions(pos_h * pos_w);
4720 int bucket_coords_h[1024];
4721 int bucket_coords_w[1024];
                for (int i = 0; i < pos_h; i++){
                    bucket_coords_h[i] = std::floor(70.0*i/pos_h);
                }
                for (int i = 0; i < pos_w; i++){
                    bucket_coords_w[i] = std::floor(70.0*i/pos_w);
                }
4728 for (int i = 0, id = 0; i < pos_h; i++){
4729 for (int j = 0; j < pos_w; j++){
4730 positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
4731 }
4732 }
4733 set_input_i32("positions", positions);
4734
4735 // inputs for resampler projector
4736 // set the 2D positions (using float for sinusoidal embedding)
4737 int n_patches_per_col = image_size_width / patch_size;
4738 std::vector<float> pos_data(n_pos);
4739 // dimension H
4740 for (int i = 0; i < n_pos; i++) {
4741 pos_data[i] = static_cast<float>(i / n_patches_per_col);
4742 }
4743 set_input_f32("pos_h", pos_data);
4744 // dimension W
4745 for (int i = 0; i < n_pos; i++) {
4746 pos_data[i] = static_cast<float>(i % n_patches_per_col);
4747 }
4748 set_input_f32("pos_w", pos_data);
4749 // base frequency omega
4750 const float base_freq = 10000.0f;
4751 const int n_embd_proj = clip_n_mmproj_embd(ctx);
4752 std::vector<float> omega(n_embd_proj / 4);
4753 for (int i = 0; i < n_embd_proj / 4; ++i) {
                    omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
4755 }
4756 set_input_f32("omega", omega);
4757 } break;
4758 case PROJECTOR_TYPE_QWEN2VL:
4759 case PROJECTOR_TYPE_QWEN3VL:
4760 {
4761 const int merge_ratio = hparams.n_merge;
4762 const int pw = image_size_width / patch_size;
4763 const int ph = image_size_height / patch_size;
4764 std::vector<int> positions(n_pos * 4);
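                // positions are laid out as 4 contiguous sections of num_patches entries each (y, x, y, x);
                // patches are visited in 2x2 blocks to match the spatial merge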
4765 int ptr = 0;
4766 for (int y = 0; y < ph; y += merge_ratio) {
4767 for (int x = 0; x < pw; x += merge_ratio) {
4768 for (int dy = 0; dy < 2; dy++) {
4769 for (int dx = 0; dx < 2; dx++) {
4770 positions[ ptr] = y + dy;
4771 positions[ num_patches + ptr] = x + dx;
4772 positions[2 * num_patches + ptr] = y + dy;
4773 positions[3 * num_patches + ptr] = x + dx;
4774 ptr++;
4775 }
4776 }
4777 }
4778 }
4779
4780 set_input_i32("positions", positions);
4781 } break;
4782 case PROJECTOR_TYPE_QWEN25VL:
4783 {
                // pw * ph = number of tokens output by the ViT after applying the patch merger
                // ipw * iph = number of vision tokens processed inside the ViT
4786 const int merge_ratio = 2;
4787 const int pw = image_size_width / patch_size / merge_ratio;
4788 const int ph = image_size_height / patch_size / merge_ratio;
4789 const int ipw = image_size_width / patch_size;
4790 const int iph = image_size_height / patch_size;
4791
4792 std::vector<int> idx (ph * pw);
4793 std::vector<int> inv_idx(ph * pw);
4794
4795 if (use_window_attn) {
4796 const int attn_window_size = 112;
4797 const int grid_window = attn_window_size / patch_size / merge_ratio;
4798 int dst = 0;
4799 // [num_vision_tokens, num_vision_tokens] attention mask tensor
                    std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
4801 int mask_row = 0;
4802
4803 for (int y = 0; y < ph; y += grid_window) {
4804 for (int x = 0; x < pw; x += grid_window) {
                            const int win_h = std::min(grid_window, ph - y);
                            const int win_w = std::min(grid_window, pw - x);
                            const int dst_0 = dst;
                            // group all tokens belonging to the same window together (into a contiguous range)
4809 for (int dy = 0; dy < win_h; dy++) {
4810 for (int dx = 0; dx < win_w; dx++) {
4811 const int src = (y + dy) * pw + (x + dx);
4812 GGML_ASSERT(src < (int)idx.size());
4813 GGML_ASSERT(dst < (int)inv_idx.size());
4814 idx [src] = dst;
4815 inv_idx[dst] = src;
4816 dst++;
4817 }
4818 }
4819
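                            // for each vision token of this window, unmask (set to 0) attention to the
                            // tokens of the same window; everything else stays at the lowest float value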
4820 for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
4821 int row_offset = mask_row * (ipw * iph);
                                std::fill(
                                    mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
                                    mask.begin() + row_offset + (dst   * merge_ratio * merge_ratio),
                                    0.0);
4826 mask_row++;
4827 }
4828 }
4829 }
4830
4831 set_input_i32("window_idx", idx);
4832 set_input_i32("inv_window_idx", inv_idx);
4833 set_input_f32("window_mask", mask);
4834 } else {
4835 for (int i = 0; i < ph * pw; i++) {
4836 idx[i] = i;
4837 }
4838 }
4839
4840 const int mpow = merge_ratio * merge_ratio;
4841 std::vector<int> positions(n_pos * 4);
4842
4843 int ptr = 0;
4844 for (int y = 0; y < iph; y += merge_ratio) {
4845 for (int x = 0; x < ipw; x += merge_ratio) {
4846 for (int dy = 0; dy < 2; dy++) {
4847 for (int dx = 0; dx < 2; dx++) {
4848 auto remap = idx[ptr / mpow];
4849 remap = (remap * mpow) + (ptr % mpow);
4850
4851 positions[ remap] = y + dy;
4852 positions[ num_patches + remap] = x + dx;
4853 positions[2 * num_patches + remap] = y + dy;
4854 positions[3 * num_patches + remap] = x + dx;
4855 ptr++;
4856 }
4857 }
4858 }
4859 }
4860
4861 set_input_i32("positions", positions);
4862 } break;
4863 case PROJECTOR_TYPE_PIXTRAL:
4864 case PROJECTOR_TYPE_KIMIVL:
4865 case PROJECTOR_TYPE_LIGHTONOCR:
4866 {
4867 // set the 2D positions
4868 int n_patches_per_col = image_size_width / patch_size;
4869 std::vector<int> pos_data(n_pos);
4870 // dimension H
4871 for (int i = 0; i < n_pos; i++) {
4872 pos_data[i] = i / n_patches_per_col;
4873 }
4874 set_input_i32("pos_h", pos_data);
4875 // dimension W
4876 for (int i = 0; i < n_pos; i++) {
4877 pos_data[i] = i % n_patches_per_col;
4878 }
4879 set_input_i32("pos_w", pos_data);
4880 } break;
4881 case PROJECTOR_TYPE_GLM_EDGE:
4882 {
4883 // llava and other models
4884 std::vector<int32_t> positions(n_pos);
4885 for (int i = 0; i < n_pos; i++) {
4886 positions[i] = i;
4887 }
4888 set_input_i32("positions", positions);
4889 } break;
4890 case PROJECTOR_TYPE_MLP:
4891 case PROJECTOR_TYPE_MLP_NORM:
4892 case PROJECTOR_TYPE_LDP:
4893 case PROJECTOR_TYPE_LDPV2:
4894 {
4895 // llava and other models
4896 std::vector<int32_t> positions(n_pos);
4897 for (int i = 0; i < n_pos; i++) {
4898 positions[i] = i;
4899 }
4900 set_input_i32("positions", positions);
4901
4902 // The patches vector is used to get rows to index into the embeds with;
4903 // we should skip dim 0 only if we have CLS to avoid going out of bounds
4904 // when retrieving the rows.
4905 int patch_offset = model.class_embedding ? 1 : 0;
4906 std::vector<int32_t> patches(num_patches);
4907 for (int i = 0; i < num_patches; i++) {
4908 patches[i] = i + patch_offset;
4909 }
4910 set_input_i32("patches", patches);
4911 } break;
4912 case PROJECTOR_TYPE_GEMMA3:
4913 case PROJECTOR_TYPE_IDEFICS3:
4914 case PROJECTOR_TYPE_INTERNVL:
4915 case PROJECTOR_TYPE_QWEN2A:
4916 case PROJECTOR_TYPE_ULTRAVOX:
4917 case PROJECTOR_TYPE_LFM2:
4918 case PROJECTOR_TYPE_VOXTRAL:
4919 case PROJECTOR_TYPE_JANUS_PRO:
4920 case PROJECTOR_TYPE_COGVLM:
4921 {
4922 // do nothing
4923 } break;
4924 case PROJECTOR_TYPE_LLAMA4:
4925 {
4926 // set the 2D positions
4927 int n_patches_per_col = image_size_width / patch_size;
4928 std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
4929 // last pos is always kept 0, it's for CLS
4930 // dimension H
4931 for (int i = 0; i < num_patches; i++) {
4932 pos_data[i] = (i / n_patches_per_col) + 1;
4933 }
4934 set_input_i32("pos_h", pos_data);
4935 // dimension W
4936 for (int i = 0; i < num_patches; i++) {
4937 pos_data[i] = (i % n_patches_per_col) + 1;
4938 }
4939 set_input_i32("pos_w", pos_data);
4940 } break;
4941 default:
4942 GGML_ABORT("Unknown projector type");
4943 }
4944
4945 // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
    if (reg) {
        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
4950 if (ggml_backend_set_n_threads_fn) {
4951 ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
4952 }
4953 }
4954
    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
4956 if (status != GGML_STATUS_SUCCESS) {
4957 LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
4958 return false;
4959 }
4960
4961 // print debug nodes
4962 if (ctx->debug_graph) {
4963 LOG_INF("\n\n---\n\n");
4964 LOG_INF("\n\nDebug graph:\n\n");
4965 for (ggml_tensor * t : ctx->debug_print_tensors) {
            std::vector<uint8_t> data(ggml_nbytes(t));
            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
            print_tensor_shape(t);
            print_tensor_data(t, data.data(), 3);
4970 }
4971 }
4972
4973 // the last node is the embedding tensor
    ggml_tensor * embeddings = ggml_graph_node(gf, -1);
4975
4976 // sanity check (only support batch size of 1 for now)
4977 const int n_tokens_out = embeddings->ne[1];
    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
4979 if (n_tokens_out != expected_n_tokens_out) {
4980 LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
4981 GGML_ABORT("Invalid number of output tokens");
4982 }
4983
4984 // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
4986
4987 return true;
4988}
4989
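// returns the embedding dimension of each output token produced by the projector,
// i.e. the width of the vectors written out by clip_image_batch_encode()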
4990int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
4991 switch (ctx->model.proj_type) {
4992 case PROJECTOR_TYPE_LDP:
4993 return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
4994 case PROJECTOR_TYPE_LDPV2:
4995 return ctx->model.mm_model_peg_0_b->ne[0];
4996 case PROJECTOR_TYPE_MLP:
4997 case PROJECTOR_TYPE_PIXTRAL:
4998 case PROJECTOR_TYPE_LIGHTONOCR:
4999 return ctx->model.mm_2_w->ne[1];
5000 case PROJECTOR_TYPE_MLP_NORM:
5001 return ctx->model.mm_3_b->ne[0];
5002 case PROJECTOR_TYPE_MINICPMV:
5003 return ctx->model.mm_model_proj->ne[0];
5004 case PROJECTOR_TYPE_GLM_EDGE:
5005 return ctx->model.mm_model_mlp_3_w->ne[1];
5006 case PROJECTOR_TYPE_QWEN2VL:
5007 case PROJECTOR_TYPE_QWEN25VL:
5008 case PROJECTOR_TYPE_JANUS_PRO:
5009 return ctx->model.mm_1_b->ne[0];
5010 case PROJECTOR_TYPE_QWEN3VL:
5011 // main path + deepstack paths
5012 return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
5013 case PROJECTOR_TYPE_GEMMA3:
5014 return ctx->model.mm_input_proj_w->ne[0];
5015 case PROJECTOR_TYPE_IDEFICS3:
5016 return ctx->model.projection->ne[1];
5017 case PROJECTOR_TYPE_ULTRAVOX:
5018 case PROJECTOR_TYPE_VOXTRAL:
5019 return ctx->model.mm_2_w->ne[1];
5020 case PROJECTOR_TYPE_INTERNVL:
5021 return ctx->model.mm_3_w->ne[1];
5022 case PROJECTOR_TYPE_LLAMA4:
5023 return ctx->model.mm_model_proj->ne[1];
5024 case PROJECTOR_TYPE_QWEN2A:
5025 return ctx->model.mm_fc_w->ne[1];
5026 case PROJECTOR_TYPE_LFM2:
5027 case PROJECTOR_TYPE_KIMIVL:
5028 return ctx->model.mm_2_w->ne[1];
5029 case PROJECTOR_TYPE_COGVLM:
5030 return ctx->model.mm_4h_to_h_w->ne[1];
5031 default:
5032 GGML_ABORT("Unknown projector type");
5033 }
5034}
5035
5036int clip_is_minicpmv(const struct clip_ctx * ctx) {
5037 if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
5038 return ctx->model.hparams.minicpmv_version;
5039 }
5040 return 0;
5041}
5042
5043bool clip_is_glm(const struct clip_ctx * ctx) {
5044 return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
5045}
5046
5047bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
5048 return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
5049 || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
5050 || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL;
5051}
5052
5053bool clip_is_llava(const struct clip_ctx * ctx) {
5054 return ctx->model.hparams.has_llava_projector;
5055}
5056
5057bool clip_is_gemma3(const struct clip_ctx * ctx) {
5058 return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
5059}
5060
5061bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
5062 return ctx->model.modality == CLIP_MODALITY_VISION;
5063}
5064
5065bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
5066 return ctx->model.modality == CLIP_MODALITY_AUDIO;
5067}
5068
5069bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
5070 return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
5071 || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
5072 || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
5073}
5074
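// encode a raw float image (w x h x 3) by copying the pixel data directly into a clip_image_f32
// and running the encoder, bypassing clip_image_preprocess (the data is used as-is)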
5075bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
5076 clip_image_f32 clip_img;
    clip_img.buf.resize(h * w * 3);
5078 for (int i = 0; i < h*w*3; i++)
5079 {
5080 clip_img.buf[i] = img[i];
5081 }
5082 clip_img.nx = w;
5083 clip_img.ny = h;
    clip_image_encode(ctx, n_threads, &clip_img, vec);
5085 return true;
5086}
5087
5088//
5089// API used internally with mtmd
5090//
5091
5092projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
5093 return ctx->proj_type();
5094}
5095
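// append a mel spectrogram (n_frames x n_mel floats) to the batch as an audio entry
// (nx = number of frames, ny = number of mel bins) and mark the batch as audio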
5096void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
5097 clip_image_f32 * audio = new clip_image_f32;
5098 audio->nx = n_frames;
5099 audio->ny = n_mel;
    audio->buf.resize(n_frames * n_mel);
    std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));

    batch->entries.push_back(clip_image_f32_ptr(audio));
5104 batch->is_audio = true;
5105}
5106