// NOTE: This is modified from clip.cpp only for LLaVA,
// so there might still be unnecessary artifacts hanging around
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()), we still get a significant difference in the resulting embeddings compared to PyTorch
#include "clip.h"
#include "clip-impl.h"
#include "ggml.h"
#include "ggml-cpp.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include <cinttypes>
#include <limits>
#include <array>
#include <functional>

// TODO: allow passing a callback from user code
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
29
30enum ffn_op_type {
31 FFN_GELU,
32 FFN_GELU_ERF,
33 FFN_SILU,
34 FFN_GELU_QUICK,
35};
36
37enum norm_type {
38 NORM_TYPE_NORMAL,
39 NORM_TYPE_RMS,
40};
41
42//#define CLIP_DEBUG_FUNCTIONS
43
44#ifdef CLIP_DEBUG_FUNCTIONS
45static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
46 std::ofstream file(filename, std::ios::binary);
47 if (!file.is_open()) {
48 LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
49 return;
50 }
51
52 // PPM header: P6 format, width, height, and max color value
53 file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
54
55 // Write pixel data
56 for (size_t i = 0; i < img.buf.size(); i += 3) {
57 // PPM expects binary data in RGB format, which matches our image buffer
58 file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
59 }
60
61 file.close();
62}
63
64static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
65 std::ofstream file(filename, std::ios::binary);
66 if (!file.is_open()) {
67 LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
68 return;
69 }
70
71 int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
72 int bytesPerPixel = 3;
73 int widthInBytes = img.nx * bytesPerPixel;
74 int paddingAmount = (4 - (widthInBytes % 4)) % 4;
75 int stride = widthInBytes + paddingAmount;
76
77 // Bitmap file header
78 unsigned char fileHeader[14] = {
79 'B','M', // Signature
80 0,0,0,0, // Image file size in bytes
81 0,0,0,0, // Reserved
82 54,0,0,0 // Start of pixel array
83 };
84
85 // Total file size
86 fileSize = 54 + (stride * img.ny);
87 fileHeader[2] = (unsigned char)(fileSize);
88 fileHeader[3] = (unsigned char)(fileSize >> 8);
89 fileHeader[4] = (unsigned char)(fileSize >> 16);
90 fileHeader[5] = (unsigned char)(fileSize >> 24);
91
92 // Bitmap information header (BITMAPINFOHEADER)
93 unsigned char infoHeader[40] = {
94 40,0,0,0, // Size of this header (40 bytes)
95 0,0,0,0, // Image width
96 0,0,0,0, // Image height
97 1,0, // Number of color planes
98 24,0, // Bits per pixel
99 0,0,0,0, // No compression
100 0,0,0,0, // Image size (can be 0 for no compression)
101 0,0,0,0, // X pixels per meter (not specified)
102 0,0,0,0, // Y pixels per meter (not specified)
103 0,0,0,0, // Total colors (color table not used)
104 0,0,0,0 // Important colors (all are important)
105 };
106
107 // Width and height in the information header
108 infoHeader[4] = (unsigned char)(img.nx);
109 infoHeader[5] = (unsigned char)(img.nx >> 8);
110 infoHeader[6] = (unsigned char)(img.nx >> 16);
111 infoHeader[7] = (unsigned char)(img.nx >> 24);
112 infoHeader[8] = (unsigned char)(img.ny);
113 infoHeader[9] = (unsigned char)(img.ny >> 8);
114 infoHeader[10] = (unsigned char)(img.ny >> 16);
115 infoHeader[11] = (unsigned char)(img.ny >> 24);
116
117 // Write file headers
118 file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
119 file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
120
121 // Pixel data
122 std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
123 for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
124 for (int x = 0; x < img.nx; ++x) {
125 // Each pixel
126 size_t pixelIndex = (y * img.nx + x) * 3;
127 unsigned char pixel[3] = {
128 img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
129 img.buf[pixelIndex + 1],
130 img.buf[pixelIndex]
131 };
132 file.write(reinterpret_cast<char*>(pixel), 3);
133 }
134 // Write padding for the row
135 file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
136 }
137
138 file.close();
139}
140
141// debug function to convert f32 to u8
142static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
143 dst.nx = src.nx;
144 dst.ny = src.ny;
145 dst.buf.resize(3 * src.nx * src.ny);
146 for (size_t i = 0; i < src.buf.size(); ++i) {
147 dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
148 }
149}
150#endif
151
152
153//
154// clip layers
155//
156
157enum patch_merge_type {
158 PATCH_MERGE_FLAT,
159 PATCH_MERGE_SPATIAL_UNPAD,
160};
161
162struct clip_hparams {
163 int32_t image_size;
164 int32_t patch_size;
165 int32_t n_embd;
166 int32_t n_ff;
167 int32_t projection_dim;
168 int32_t n_head;
169 int32_t n_layer;
170 // idefics3
171 int32_t image_longest_edge = 0;
172 int32_t image_min_pixels = -1;
173 int32_t image_max_pixels = -1;
174 int32_t n_merge = 0; // number of patch merges **per-side**
175
176 float image_mean[3];
177 float image_std[3];
178
    // for models using dynamic image size, we need a smaller image size for warmup,
    // otherwise the user will get an OOM every time they load the model
181 int32_t warmup_image_size = 0;
182 int32_t warmup_audio_size = 3000;
183
184 ffn_op_type ffn_op = FFN_GELU;
185
186 patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
187
188 float eps = 1e-6;
189 float rope_theta = 0.0;
190
191 std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
192 int32_t image_crop_resolution;
193 std::unordered_set<int32_t> vision_feature_layer;
194 int32_t attn_window_size = 0;
195 int32_t n_wa_pattern = 0;
196
197 // audio
198 int32_t n_mel_bins = 0; // whisper preprocessor
199 int32_t proj_stack_factor = 0; // ultravox
200
201 // legacy
202 bool has_llava_projector = false;
203 int minicpmv_version = 0;
204 int32_t minicpmv_query_num = 0; // MiniCPM-V query number
205
    // custom values provided by the user; -1 (the default) means not set
207 int32_t custom_image_min_tokens = -1;
208 int32_t custom_image_max_tokens = -1;
209
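    // convert a [min, max] token budget into a pixel budget: with n_merge patches merged per side,
    // one output token covers a (patch_size * n_merge)^2 pixel area.
    // Illustrative numbers (not taken from any specific model): patch_size = 14 and n_merge = 2 give
    // 28 x 28 = 784 pixels per token, so a 1024-token limit corresponds to 802,816 pixels.
    // warmup_image_size is set to the side length of the largest square image within that budget.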
210 void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
211 const int cur_merge = n_merge == 0 ? 1 : n_merge;
212 const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
213 image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
214 image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
215 warmup_image_size = static_cast<int>(std::sqrt(x: image_max_pixels));
216 }
217
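    // pick a square warmup image whose side yields exactly n_tokens output tokens:
    // side = sqrt(n_tokens) * patch_size * n_merge (n_tokens must therefore be a perfect square)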
218 void set_warmup_n_tokens(int n_tokens) {
219 int n_tok_per_side = static_cast<int>(std::sqrt(x: n_tokens));
220 GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
221 const int cur_merge = n_merge == 0 ? 1 : n_merge;
222 warmup_image_size = n_tok_per_side * patch_size * cur_merge;
223 // TODO: support warmup size for custom token numbers
224 }
225};
226
227struct clip_layer {
228 // attention
229 ggml_tensor * k_w = nullptr;
230 ggml_tensor * k_b = nullptr;
231 ggml_tensor * q_w = nullptr;
232 ggml_tensor * q_b = nullptr;
233 ggml_tensor * v_w = nullptr;
234 ggml_tensor * v_b = nullptr;
235 ggml_tensor * qkv_w = nullptr;
236 ggml_tensor * qkv_b = nullptr;
237
238 ggml_tensor * o_w = nullptr;
239 ggml_tensor * o_b = nullptr;
240
241 ggml_tensor * k_norm = nullptr;
242 ggml_tensor * q_norm = nullptr;
243
244 // layernorm 1
245 ggml_tensor * ln_1_w = nullptr;
246 ggml_tensor * ln_1_b = nullptr;
247
248 ggml_tensor * ff_up_w = nullptr;
249 ggml_tensor * ff_up_b = nullptr;
250 ggml_tensor * ff_gate_w = nullptr;
251 ggml_tensor * ff_gate_b = nullptr;
252 ggml_tensor * ff_down_w = nullptr;
253 ggml_tensor * ff_down_b = nullptr;
254
255 // layernorm 2
256 ggml_tensor * ln_2_w = nullptr;
257 ggml_tensor * ln_2_b = nullptr;
258
259 // layer scale (no bias)
260 ggml_tensor * ls_1_w = nullptr;
261 ggml_tensor * ls_2_w = nullptr;
262
263 // qwen3vl deepstack merger
264 ggml_tensor * deepstack_norm_w = nullptr;
265 ggml_tensor * deepstack_norm_b = nullptr;
266 ggml_tensor * deepstack_fc1_w = nullptr;
267 ggml_tensor * deepstack_fc1_b = nullptr;
268 ggml_tensor * deepstack_fc2_w = nullptr;
269 ggml_tensor * deepstack_fc2_b = nullptr;
270
271 bool has_deepstack() const {
272 return deepstack_fc1_w != nullptr;
273 }
274};
275
276struct clip_model {
277 clip_modality modality = CLIP_MODALITY_VISION;
278 projector_type proj_type = PROJECTOR_TYPE_MLP;
279 clip_hparams hparams;
280
281 // embeddings
282 ggml_tensor * class_embedding = nullptr;
283 ggml_tensor * patch_embeddings_0 = nullptr;
    ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along the temporal dimension (Qwen2VL)
285 ggml_tensor * patch_bias = nullptr;
286 ggml_tensor * position_embeddings = nullptr;
287
288 ggml_tensor * pre_ln_w = nullptr;
289 ggml_tensor * pre_ln_b = nullptr;
290
291 std::vector<clip_layer> layers;
292
293 int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
294
295 ggml_tensor * post_ln_w;
296 ggml_tensor * post_ln_b;
297
298 ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
299 ggml_tensor * mm_fc_w;
300 ggml_tensor * mm_fc_b;
301
302 // LLaVA projection
303 ggml_tensor * mm_input_norm_w = nullptr;
304 ggml_tensor * mm_input_norm_b = nullptr;
305 ggml_tensor * mm_0_w = nullptr;
306 ggml_tensor * mm_0_b = nullptr;
307 ggml_tensor * mm_2_w = nullptr;
308 ggml_tensor * mm_2_b = nullptr;
309
310 ggml_tensor * image_newline = nullptr;
311
312 // Yi type models with mlp+normalization projection
313 ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
314 ggml_tensor * mm_1_b = nullptr;
315 ggml_tensor * mm_3_w = nullptr;
316 ggml_tensor * mm_3_b = nullptr;
317 ggml_tensor * mm_4_w = nullptr;
318 ggml_tensor * mm_4_b = nullptr;
319
320 // GLMV-Edge projection
321 ggml_tensor * mm_model_adapter_conv_w = nullptr;
322 ggml_tensor * mm_model_adapter_conv_b = nullptr;
323
324 // MobileVLM projection
325 ggml_tensor * mm_model_mlp_1_w = nullptr;
326 ggml_tensor * mm_model_mlp_1_b = nullptr;
327 ggml_tensor * mm_model_mlp_3_w = nullptr;
328 ggml_tensor * mm_model_mlp_3_b = nullptr;
329 ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
330 ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
331 ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
332 ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
333 ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
334 ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
335 ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
336 ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
337 ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
338 ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
339 ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
340 ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
341 ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
342 ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
343 ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
344 ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
345 ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
346 ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
347 ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
348 ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
349
350 // MobileVLM_V2 projection
351 ggml_tensor * mm_model_mlp_0_w = nullptr;
352 ggml_tensor * mm_model_mlp_0_b = nullptr;
353 ggml_tensor * mm_model_mlp_2_w = nullptr;
354 ggml_tensor * mm_model_mlp_2_b = nullptr;
355 ggml_tensor * mm_model_peg_0_w = nullptr;
356 ggml_tensor * mm_model_peg_0_b = nullptr;
357
358 // MINICPMV projection
359 ggml_tensor * mm_model_pos_embed_k = nullptr;
360 ggml_tensor * mm_model_query = nullptr;
361 ggml_tensor * mm_model_proj = nullptr;
362 ggml_tensor * mm_model_kv_proj = nullptr;
363 ggml_tensor * mm_model_attn_q_w = nullptr;
364 ggml_tensor * mm_model_attn_q_b = nullptr;
365 ggml_tensor * mm_model_attn_k_w = nullptr;
366 ggml_tensor * mm_model_attn_k_b = nullptr;
367 ggml_tensor * mm_model_attn_v_w = nullptr;
368 ggml_tensor * mm_model_attn_v_b = nullptr;
369 ggml_tensor * mm_model_attn_o_w = nullptr;
370 ggml_tensor * mm_model_attn_o_b = nullptr;
371 ggml_tensor * mm_model_ln_q_w = nullptr;
372 ggml_tensor * mm_model_ln_q_b = nullptr;
373 ggml_tensor * mm_model_ln_kv_w = nullptr;
374 ggml_tensor * mm_model_ln_kv_b = nullptr;
375 ggml_tensor * mm_model_ln_post_w = nullptr;
376 ggml_tensor * mm_model_ln_post_b = nullptr;
377
378 // gemma3
379 ggml_tensor * mm_input_proj_w = nullptr;
380 ggml_tensor * mm_soft_emb_norm_w = nullptr;
381
382 // pixtral
383 ggml_tensor * token_embd_img_break = nullptr;
384 ggml_tensor * mm_patch_merger_w = nullptr;
385
386 // ultravox / whisper encoder
387 ggml_tensor * conv1d_1_w = nullptr;
388 ggml_tensor * conv1d_1_b = nullptr;
389 ggml_tensor * conv1d_2_w = nullptr;
390 ggml_tensor * conv1d_2_b = nullptr;
391 ggml_tensor * mm_norm_pre_w = nullptr;
392 ggml_tensor * mm_norm_mid_w = nullptr;
393
394 // cogvlm
395 ggml_tensor * mm_post_fc_norm_w = nullptr;
396 ggml_tensor * mm_post_fc_norm_b = nullptr;
397 ggml_tensor * mm_h_to_4h_w = nullptr;
398 ggml_tensor * mm_gate_w = nullptr;
399 ggml_tensor * mm_4h_to_h_w = nullptr;
400 ggml_tensor * mm_boi = nullptr;
401 ggml_tensor * mm_eoi = nullptr;
402
403 bool audio_has_avgpool() const {
404 return proj_type == PROJECTOR_TYPE_QWEN2A
405 || proj_type == PROJECTOR_TYPE_VOXTRAL;
406 }
407
408 bool audio_has_stack_frames() const {
409 return proj_type == PROJECTOR_TYPE_ULTRAVOX
410 || proj_type == PROJECTOR_TYPE_VOXTRAL;
411 }
412};
413
414struct clip_ctx {
415 clip_model model;
416
417 gguf_context_ptr ctx_gguf;
418 ggml_context_ptr ctx_data;
419
420 std::vector<uint8_t> buf_compute_meta;
421
422 std::vector<ggml_backend_t> backend_ptrs;
423 std::vector<ggml_backend_buffer_type_t> backend_buft;
424
425 ggml_backend_t backend = nullptr;
426 ggml_backend_t backend_cpu = nullptr;
427 ggml_backend_buffer_ptr buf;
428
429 int max_nodes = 8192;
430 ggml_backend_sched_ptr sched;
431 clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
432
433 // for debugging
434 bool debug_graph = false;
435 std::vector<ggml_tensor *> debug_print_tensors;
436
437 clip_ctx(clip_context_params & ctx_params) {
438 flash_attn_type = ctx_params.flash_attn_type;
439 debug_graph = std::getenv(name: "MTMD_DEBUG_GRAPH") != nullptr;
440 backend_cpu = ggml_backend_init_by_type(type: GGML_BACKEND_DEVICE_TYPE_CPU, params: nullptr);
441 if (!backend_cpu) {
442 throw std::runtime_error("failed to initialize CPU backend");
443 }
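        // backend selection order: a device named via the MTMD_BACKEND_DEVICE env var,
        // then the default GPU backend, then an integrated GPU, and finally the CPU backend as fallback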
444 if (ctx_params.use_gpu) {
445 auto backend_name = std::getenv(name: "MTMD_BACKEND_DEVICE");
446 if (backend_name != nullptr) {
447 backend = ggml_backend_init_by_name(name: backend_name, params: nullptr);
448 if (!backend) {
449 LOG_WRN("%s: Warning: Failed to initialize \"%s\" backend, falling back to default GPU backend\n", __func__, backend_name);
450 }
451 }
452 if (!backend) {
453 backend = ggml_backend_init_by_type(type: GGML_BACKEND_DEVICE_TYPE_GPU, params: nullptr);
454 backend = backend ? backend : ggml_backend_init_by_type(type: GGML_BACKEND_DEVICE_TYPE_IGPU, params: nullptr);
455 }
456 }
457
458 if (backend) {
459 LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
460 backend_ptrs.push_back(x: backend);
461 backend_buft.push_back(x: ggml_backend_get_default_buffer_type(backend));
462 } else {
463 backend = backend_cpu;
464 LOG_INF("%s: CLIP using CPU backend\n", __func__);
465 }
466
467 if (ctx_params.image_min_tokens > 0) {
468 model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens;
469 }
470 if (ctx_params.image_max_tokens > 0) {
471 model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
472 }
473
474 backend_ptrs.push_back(x: backend_cpu);
475 backend_buft.push_back(x: ggml_backend_get_default_buffer_type(backend: backend_cpu));
476
477 sched.reset(
478 p: ggml_backend_sched_new(backends: backend_ptrs.data(), bufts: backend_buft.data(), n_backends: backend_ptrs.size(), graph_size: 8192, parallel: false, op_offload: true)
479 );
480 }
481
482 ~clip_ctx() {
483 ggml_backend_free(backend);
484 if (backend != backend_cpu) {
485 ggml_backend_free(backend: backend_cpu);
486 }
487 }
488
489 // this function is added so that we don't change too much of the existing code
490 projector_type proj_type() const {
491 return model.proj_type;
492 }
493};
494
495struct clip_graph {
496 clip_ctx * ctx;
497 const clip_model & model;
498 const clip_hparams & hparams;
499
    // we only support a single image per batch
501 const clip_image_f32 & img;
502
503 const int patch_size;
504 const int n_patches_x;
505 const int n_patches_y;
506 const int n_patches;
507 const int n_embd;
508 const int n_head;
509 const int d_head;
510 const int n_layer;
511 const float eps;
512 const float kq_scale;
513
514 ggml_context_ptr ctx0_ptr;
515 ggml_context * ctx0;
516 ggml_cgraph * gf;
517
518 clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
519 ctx(ctx),
520 model(ctx->model),
521 hparams(model.hparams),
522 img(img),
523 patch_size(hparams.patch_size),
524 n_patches_x(img.nx / patch_size),
525 n_patches_y(img.ny / patch_size),
526 n_patches(n_patches_x * n_patches_y),
527 n_embd(hparams.n_embd),
528 n_head(hparams.n_head),
529 d_head(n_embd / n_head),
530 n_layer(hparams.n_layer),
531 eps(hparams.eps),
532 kq_scale(1.0f / sqrtf(x: (float)d_head)) {
533 struct ggml_init_params params = {
534 /*.mem_size =*/ ctx->buf_compute_meta.size(),
535 /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
536 /*.no_alloc =*/ true,
537 };
538 ctx0_ptr.reset(p: ggml_init(params));
539 ctx0 = ctx0_ptr.get();
540 gf = ggml_new_graph_custom(ctx: ctx0, size: ctx->max_nodes, grads: false);
541 }
542
543 ggml_cgraph * build_siglip() {
544 ggml_tensor * inp = build_inp();
545
546 ggml_tensor * learned_pos_embd = model.position_embeddings;
547 if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
548 learned_pos_embd = resize_position_embeddings();
549 }
550
551 ggml_tensor * cur = build_vit(
552 inp, n_pos: n_patches,
553 norm_t: NORM_TYPE_NORMAL,
554 ffn_t: hparams.ffn_op,
555 learned_pos_embd,
556 add_pos: nullptr);
557
558 if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
559 const int batch_size = 1;
560 GGML_ASSERT(n_patches_x == n_patches_y);
561 const int patches_per_image = n_patches_x;
562 const int kernel_size = hparams.n_merge;
563
564 cur = ggml_transpose(ctx: ctx0, a: cur);
565 cur = ggml_cont_4d(ctx: ctx0, a: cur, ne0: patches_per_image, ne1: patches_per_image, ne2: n_embd, ne3: batch_size);
566
567 // doing a pool2d to reduce the number of output tokens
568 cur = ggml_pool_2d(ctx: ctx0, a: cur, op: GGML_OP_POOL_AVG, k0: kernel_size, k1: kernel_size, s0: kernel_size, s1: kernel_size, p0: 0, p1: 0);
569 cur = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: cur->ne[0] * cur->ne[0], ne1: n_embd, ne2: batch_size);
570 cur = ggml_cont(ctx: ctx0, a: ggml_transpose(ctx: ctx0, a: cur));
571
572 // apply norm before projection
573 cur = ggml_rms_norm(ctx: ctx0, a: cur, eps);
574 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_soft_emb_norm_w);
575
576 // apply projection
577 cur = ggml_mul_mat(ctx: ctx0,
578 a: ggml_cont(ctx: ctx0, a: ggml_transpose(ctx: ctx0, a: model.mm_input_proj_w)),
579 b: cur);
580
581 } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
582 // pixel_shuffle
583 // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
584 const int scale_factor = model.hparams.n_merge;
585 cur = build_patch_merge_permute(cur, scale_factor);
586 cur = ggml_mul_mat(ctx: ctx0, a: model.projection, b: cur);
587
588 } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
589 // pixel unshuffle block
590 const int scale_factor = model.hparams.n_merge;
591 cur = build_patch_merge_permute(cur, scale_factor);
592
593 // projection
594 cur = ggml_norm(ctx: ctx0, a: cur, eps: 1e-5); // default nn.LayerNorm
595 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_input_norm_w);
596 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_input_norm_b);
597
598 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
599 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_1_b);
600 cur = ggml_gelu(ctx: ctx0, a: cur);
601 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
602 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_2_b);
603
604 } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
605 cur = build_ffn(cur,
606 up: model.mm_0_w, up_b: model.mm_0_b,
607 gate: nullptr, gate_b: nullptr,
608 down: model.mm_1_w, down_b: model.mm_1_b,
609 type_op: hparams.ffn_op,
610 il: -1);
611
612 } else {
613 GGML_ABORT("SigLIP: Unsupported projector type");
614 }
615
616 // build the graph
617 ggml_build_forward_expand(cgraph: gf, tensor: cur);
618
619 return gf;
620 }
621
622 ggml_cgraph * build_pixtral() {
623 const int n_merge = hparams.n_merge;
624
625 // 2D input positions
626 ggml_tensor * pos_h = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
627 ggml_set_name(tensor: pos_h, name: "pos_h");
628 ggml_set_input(tensor: pos_h);
629
630 ggml_tensor * pos_w = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
631 ggml_set_name(tensor: pos_w, name: "pos_w");
632 ggml_set_input(tensor: pos_w);
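        // pos_h / pos_w hold the row / column index of each patch (filled in when the graph inputs are set)
        // and drive the 2D RoPE applied via the add_pos callback below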
633
634 auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
635 return build_rope_2d(ctx0, cur, pos_a: pos_h, pos_b: pos_w, freq_base: hparams.rope_theta, interleave_freq: true);
636 };
637
638 ggml_tensor * inp = build_inp();
639 ggml_tensor * cur = build_vit(
640 inp, n_pos: n_patches,
641 norm_t: NORM_TYPE_RMS,
642 ffn_t: hparams.ffn_op,
643 learned_pos_embd: nullptr, // no learned pos embd
644 add_pos);
645
646 // mistral small 3.1 patch merger
647 // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
648 if (model.mm_patch_merger_w) {
649 GGML_ASSERT(hparams.n_merge > 0);
650
651 cur = ggml_mul(ctx: ctx0, a: ggml_rms_norm(ctx: ctx0, a: cur, eps), b: model.mm_input_norm_w);
652
653 // reshape image tokens to 2D grid
654 cur = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd, ne1: n_patches_x, ne2: n_patches_y);
655 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 2, axis1: 0, axis2: 1, axis3: 3); // [x, y, n_embd]
656 cur = ggml_cont(ctx: ctx0, a: cur);
657
658 // torch.nn.functional.unfold is just an im2col under the hood
659 // we just need a dummy kernel to make it work
660 ggml_tensor * kernel = ggml_view_3d(ctx: ctx0, a: cur, ne0: n_merge, ne1: n_merge, ne2: cur->ne[2], nb1: 0, nb2: 0, offset: 0);
661 cur = ggml_im2col(ctx: ctx0, a: kernel, b: cur, s0: n_merge, s1: n_merge, p0: 0, p1: 0, d0: 1, d1: 1, is_2D: true, dst_type: inp->type);
662
663 // project to n_embd
664 cur = ggml_reshape_2d(ctx: ctx0, a: cur, ne0: cur->ne[0], ne1: cur->ne[1] * cur->ne[2]);
665 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_patch_merger_w, b: cur);
666 }
667
668 // LlavaMultiModalProjector (always using GELU activation)
669 {
670 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
671 if (model.mm_1_b) {
672 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_1_b);
673 }
674
675 cur = ggml_gelu(ctx: ctx0, a: cur);
676 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
677 if (model.mm_2_b) {
678 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_2_b);
679 }
680 }
681
682 // arrangement of the [IMG_BREAK] token
683 if (model.token_embd_img_break) {
684 // not efficient, but works
685 // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
686 // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
687 // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
688
689 const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
690 const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
691 const int p_total = p_x * p_y;
692 const int n_embd_text = cur->ne[0];
693 const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
694
695 ggml_tensor * tmp = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd_text, ne1: p_x, ne2: p_y);
696 ggml_tensor * tok = ggml_new_tensor_3d(ctx: ctx0, type: tmp->type, ne0: n_embd_text, ne1: 1, ne2: p_y);
697 tok = ggml_scale(ctx: ctx0, a: tok, s: 0.0); // clear the tensor
698 tok = ggml_add(ctx: ctx0, a: tok, b: model.token_embd_img_break);
699 tmp = ggml_concat(ctx: ctx0, a: tmp, b: tok, dim: 1);
700 cur = ggml_view_2d(ctx: ctx0, a: tmp,
701 ne0: n_embd_text, ne1: n_tokens_output,
702 nb1: ggml_row_size(type: tmp->type, ne: n_embd_text), offset: 0);
703 }
704
705 // build the graph
706 ggml_build_forward_expand(cgraph: gf, tensor: cur);
707
708 return gf;
709 }
710
711 // Qwen2VL and Qwen2.5VL use M-RoPE
712 ggml_cgraph * build_qwen2vl() {
713 GGML_ASSERT(model.patch_bias == nullptr);
714 GGML_ASSERT(model.class_embedding == nullptr);
715
716 const int batch_size = 1;
717 const bool use_window_attn = hparams.n_wa_pattern > 0;
718 const int n_wa_pattern = hparams.n_wa_pattern;
719 const int n_pos = n_patches;
720 const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
721
722 norm_type norm_t = ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
723 ? NORM_TYPE_RMS // qwen 2.5 vl
724 : NORM_TYPE_NORMAL; // qwen 2 vl
725
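        // M-RoPE: the rotary dimensions are split into 4 equal sections, one per position
        // component packed into the positions tensor (hence num_position_ids = n_pos * 4)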
726 int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
727
728 ggml_tensor * inp_raw = build_inp_raw();
729 ggml_tensor * inp = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_0, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
730
731 GGML_ASSERT(img.nx % (patch_size * 2) == 0);
732 GGML_ASSERT(img.ny % (patch_size * 2) == 0);
733
734 // second conv dimension
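        // patch_embeddings_0/1 are the two temporal halves of the original Conv3D patch embedding;
        // their outputs are summed, then the permute/reshape sequence below reorders the tokens so
        // that each 2x2 block of neighbouring patches ends up in 4 consecutive rows, ready for the
        // 4x spatial merge (n_embd * 4) done later in the projector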
735 {
736 auto inp_1 = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_1, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
737 inp = ggml_add(ctx: ctx0, a: inp, b: inp_1);
738
739 inp = ggml_permute(ctx: ctx0, a: inp, axis0: 1, axis1: 2, axis2: 0, axis3: 3); // [w, h, c, b] -> [c, w, h, b]
740 inp = ggml_cont_4d(
741 ctx: ctx0, a: inp,
742 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: n_patches_y, ne3: batch_size);
743 inp = ggml_reshape_4d(
744 ctx: ctx0, a: inp,
745 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: 2, ne3: batch_size * (n_patches_y / 2));
746 inp = ggml_permute(ctx: ctx0, a: inp, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
747 inp = ggml_cont_3d(
748 ctx: ctx0, a: inp,
749 ne0: n_embd, ne1: n_patches_x * n_patches_y, ne2: batch_size);
750 }
751
752 ggml_tensor * inpL = inp;
753 ggml_tensor * window_mask = nullptr;
754 ggml_tensor * window_idx = nullptr;
755 ggml_tensor * inv_window_idx = nullptr;
756
757 ggml_tensor * positions = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: num_position_ids);
758 ggml_set_name(tensor: positions, name: "positions");
759 ggml_set_input(tensor: positions);
760
761 // pre-layernorm
762 if (model.pre_ln_w) {
763 inpL = build_norm(cur: inpL, mw: model.pre_ln_w, mb: model.pre_ln_b, type: norm_t, norm_eps: eps, il: -1);
764 }
765
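        // Qwen2.5VL window attention: patches are permuted into window-major order (inv_window_idx)
        // and cross-window attention is blocked with window_mask; only every n_wa_pattern-th layer
        // uses full attention. The original patch order is restored with window_idx after the projector.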
766 if (use_window_attn) {
767 // handle window attention inputs
768 inv_window_idx = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos / 4);
769 ggml_set_name(tensor: inv_window_idx, name: "inv_window_idx");
770 ggml_set_input(tensor: inv_window_idx);
771 // mask for window attention
772 window_mask = ggml_new_tensor_2d(ctx: ctx0, type: GGML_TYPE_F32, ne0: n_pos, ne1: n_pos);
773 ggml_set_name(tensor: window_mask, name: "window_mask");
774 ggml_set_input(tensor: window_mask);
775
776 // if flash attn is used, we need to pad the mask and cast to f16
777 if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
778 int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1];
779 if (n_pad > 0) {
780 window_mask = ggml_pad(ctx: ctx0, a: window_mask, p0: 0, p1: n_pad, p2: 0, p3: 0);
781 }
782 window_mask = ggml_cast(ctx: ctx0, a: window_mask, type: GGML_TYPE_F16);
783 }
784
785 // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
786 GGML_ASSERT(batch_size == 1);
787 inpL = ggml_reshape_2d(ctx: ctx0, a: inpL, ne0: n_embd * 4, ne1: n_patches_x * n_patches_y * batch_size / 4);
788 inpL = ggml_get_rows(ctx: ctx0, a: inpL, b: inv_window_idx);
789 inpL = ggml_reshape_3d(ctx: ctx0, a: inpL, ne0: n_embd, ne1: n_patches_x * n_patches_y, ne2: batch_size);
790 }
791
792 // loop over layers
793 for (int il = 0; il < n_layer; il++) {
794 auto & layer = model.layers[il];
795 const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
796
797 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
798
799 // layernorm1
800 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: norm_t, norm_eps: eps, il);
801 cb(cur0: cur, name: "ln1", il);
802
803 // self-attention
804 {
805 ggml_tensor * Qcur = ggml_add(ctx: ctx0,
806 a: ggml_mul_mat(ctx: ctx0, a: layer.q_w, b: cur), b: layer.q_b);
807 ggml_tensor * Kcur = ggml_add(ctx: ctx0,
808 a: ggml_mul_mat(ctx: ctx0, a: layer.k_w, b: cur), b: layer.k_b);
809 ggml_tensor * Vcur = ggml_add(ctx: ctx0,
810 a: ggml_mul_mat(ctx: ctx0, a: layer.v_w, b: cur), b: layer.v_b);
811
812 Qcur = ggml_reshape_3d(ctx: ctx0, a: Qcur, ne0: d_head, ne1: n_head, ne2: n_patches);
813 Kcur = ggml_reshape_3d(ctx: ctx0, a: Kcur, ne0: d_head, ne1: n_head, ne2: n_patches);
814 Vcur = ggml_reshape_3d(ctx: ctx0, a: Vcur, ne0: d_head, ne1: n_head, ne2: n_patches);
815
816 cb(cur0: Qcur, name: "Qcur", il);
817 cb(cur0: Kcur, name: "Kcur", il);
818 cb(cur0: Vcur, name: "Vcur", il);
819
820 // apply M-RoPE
821 Qcur = ggml_rope_multi(
822 ctx: ctx0, a: Qcur, b: positions, c: nullptr,
823 n_dims: d_head/2, sections: mrope_sections, GGML_ROPE_TYPE_VISION, n_ctx_orig: 32768, freq_base: 10000, freq_scale: 1, ext_factor: 0, attn_factor: 1, beta_fast: 32, beta_slow: 1);
824 Kcur = ggml_rope_multi(
825 ctx: ctx0, a: Kcur, b: positions, c: nullptr,
826 n_dims: d_head/2, sections: mrope_sections, GGML_ROPE_TYPE_VISION, n_ctx_orig: 32768, freq_base: 10000, freq_scale: 1, ext_factor: 0, attn_factor: 1, beta_fast: 32, beta_slow: 1);
827
828 cb(cur0: Qcur, name: "Qcur_rope", il);
829 cb(cur0: Kcur, name: "Kcur_rope", il);
830
831 ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
832
833 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
834 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: attn_mask, kq_scale, il);
835 cb(cur0: cur, name: "attn_out", il);
836 }
837
        // re-add the layer input, i.e., the residual connection
839 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
840
841 inpL = cur; // inpL = residual, cur = hidden_states
842
843 cb(cur0: cur, name: "ffn_inp", il);
844
845 // layernorm2
846 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: norm_t, norm_eps: eps, il);
847 cb(cur0: cur, name: "ffn_inp_normed", il);
848
849 // ffn
850 cur = build_ffn(cur,
851 up: layer.ff_up_w, up_b: layer.ff_up_b,
852 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
853 down: layer.ff_down_w, down_b: layer.ff_down_b,
854 type_op: hparams.ffn_op, il);
855
856 cb(cur0: cur, name: "ffn_out", il);
857
858 // residual 2
859 cur = ggml_add(ctx: ctx0, a: inpL, b: cur);
860 cb(cur0: cur, name: "layer_out", il);
861
862 inpL = cur;
863 }
864
865 // post-layernorm
866 if (model.post_ln_w) {
867 inpL = build_norm(cur: inpL, mw: model.post_ln_w, mb: model.post_ln_b, type: norm_t, norm_eps: eps, il: n_layer);
868 }
869
870 // multimodal projection
871 ggml_tensor * embeddings = inpL;
872 embeddings = ggml_reshape_3d(ctx: ctx0, a: embeddings, ne0: n_embd * 4, ne1: n_pos / 4, ne2: batch_size);
873
874 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_0_w, b: embeddings);
875 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_0_b);
876
877 // GELU activation
878 embeddings = ggml_gelu(ctx: ctx0, a: embeddings);
879
880 // Second linear layer
881 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: embeddings);
882 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_1_b);
883
884 if (use_window_attn) {
885 window_idx = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos / 4);
886 ggml_set_name(tensor: window_idx, name: "window_idx");
887 ggml_set_input(tensor: window_idx);
888
889 // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size]
890 GGML_ASSERT(batch_size == 1);
891 embeddings = ggml_reshape_2d(ctx: ctx0, a: embeddings, ne0: hparams.projection_dim, ne1: n_patches_x * n_patches_y / 4);
892 embeddings = ggml_get_rows(ctx: ctx0, a: embeddings, b: window_idx);
893 embeddings = ggml_reshape_3d(ctx: ctx0, a: embeddings, ne0: hparams.projection_dim, ne1: n_patches_x * n_patches_y / 4, ne2: batch_size);
894 }
895
896 // build the graph
897 ggml_build_forward_expand(cgraph: gf, tensor: embeddings);
898
899 return gf;
900 }
901
902 // Qwen3VL
903 ggml_cgraph * build_qwen3vl() {
904 GGML_ASSERT(model.patch_bias != nullptr);
905 GGML_ASSERT(model.position_embeddings != nullptr);
906 GGML_ASSERT(model.class_embedding == nullptr);
907
908 const int batch_size = 1;
909 const int n_pos = n_patches;
910 const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
911
912 norm_type norm_t = NORM_TYPE_NORMAL;
913
914 int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
915
916 ggml_tensor * inp_raw = build_inp_raw();
917 ggml_tensor * inp = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_0, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
918
919 GGML_ASSERT(img.nx % (patch_size * 2) == 0);
920 GGML_ASSERT(img.ny % (patch_size * 2) == 0);
921
922 // second conv dimension
923 {
924 auto inp_1 = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_1, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
925 inp = ggml_add(ctx: ctx0, a: inp, b: inp_1);
926
927 inp = ggml_permute(ctx: ctx0, a: inp, axis0: 1, axis1: 2, axis2: 0, axis3: 3); // [w, h, c, b] -> [c, w, h, b]
928 inp = ggml_cont_4d(
929 ctx: ctx0, a: inp,
930 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: n_patches_y, ne3: batch_size);
931 inp = ggml_reshape_4d(
932 ctx: ctx0, a: inp,
933 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: 2, ne3: batch_size * (n_patches_y / 2));
934 inp = ggml_permute(ctx: ctx0, a: inp, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
935 inp = ggml_cont_3d(
936 ctx: ctx0, a: inp,
937 ne0: n_embd, ne1: n_patches_x * n_patches_y, ne2: batch_size);
938 }
939
940 // add patch bias
941 if (model.patch_bias != nullptr) {
942 inp = ggml_add(ctx: ctx0, a: inp, b: model.patch_bias);
943 cb(cur0: inp, name: "patch_bias", il: -1);
944 }
945
946 // calculate absolute position embedding and apply
947 ggml_tensor * learned_pos_embd = resize_position_embeddings();
948 learned_pos_embd = ggml_cont_4d(
949 ctx: ctx0, a: learned_pos_embd,
950 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: n_patches_y, ne3: batch_size);
951 learned_pos_embd = ggml_reshape_4d(
952 ctx: ctx0, a: learned_pos_embd,
953 ne0: n_embd * 2, ne1: n_patches_x / 2, ne2: 2, ne3: batch_size * (n_patches_y / 2));
954 learned_pos_embd = ggml_permute(ctx: ctx0, a: learned_pos_embd, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
955 learned_pos_embd = ggml_cont_3d(
956 ctx: ctx0, a: learned_pos_embd,
957 ne0: n_embd, ne1: n_patches_x * n_patches_y, ne2: batch_size);
958 inp = ggml_add(ctx: ctx0, a: inp, b: learned_pos_embd);
959 cb(cur0: inp, name: "inp_pos_emb", il: -1);
960
961 ggml_tensor * inpL = inp;
962
963 ggml_tensor * positions = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: num_position_ids);
964 ggml_set_name(tensor: positions, name: "positions");
965 ggml_set_input(tensor: positions);
966
967 // pre-layernorm
968 if (model.pre_ln_w) {
969 inpL = build_norm(cur: inpL, mw: model.pre_ln_w, mb: model.pre_ln_b, type: norm_t, norm_eps: eps, il: -1);
970 }
971
972 // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
973 ggml_tensor * deepstack_features = nullptr;
974 const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
975
976 // loop over layers
977 for (int il = 0; il < n_layer; il++) {
978 auto & layer = model.layers[il];
979
980 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
981
982 // layernorm1
983 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: norm_t, norm_eps: eps, il);
984 cb(cur0: cur, name: "ln1", il);
985
986 // self-attention
987 {
988 cur = ggml_mul_mat(ctx: ctx0, a: layer.qkv_w, b: cur);
989 cur = ggml_add(ctx: ctx0, a: cur, b: layer.qkv_b);
990
991 ggml_tensor * Qcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
992 nb2: cur->nb[1], offset: 0);
993 ggml_tensor * Kcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
994 nb2: cur->nb[1], offset: n_embd * sizeof(float));
995 ggml_tensor * Vcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
996 nb2: cur->nb[1], offset: 2 * n_embd * sizeof(float));
997
998 cb(cur0: Qcur, name: "Qcur", il);
999 cb(cur0: Kcur, name: "Kcur", il);
1000 cb(cur0: Vcur, name: "Vcur", il);
1001
1002 // apply M-RoPE
1003 Qcur = ggml_rope_multi(
1004 ctx: ctx0, a: Qcur, b: positions, c: nullptr,
1005 n_dims: d_head/2, sections: mrope_sections, GGML_ROPE_TYPE_VISION, n_ctx_orig: 32768, freq_base: 10000, freq_scale: 1, ext_factor: 0, attn_factor: 1, beta_fast: 32, beta_slow: 1);
1006 Kcur = ggml_rope_multi(
1007 ctx: ctx0, a: Kcur, b: positions, c: nullptr,
1008 n_dims: d_head/2, sections: mrope_sections, GGML_ROPE_TYPE_VISION, n_ctx_orig: 32768, freq_base: 10000, freq_scale: 1, ext_factor: 0, attn_factor: 1, beta_fast: 32, beta_slow: 1);
1009
1010 cb(cur0: Qcur, name: "Qcur_rope", il);
1011 cb(cur0: Kcur, name: "Kcur_rope", il);
1012
1013 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
1014 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: nullptr, kq_scale, il);
1015 cb(cur0: cur, name: "attn_out", il);
1016 }
1017
        // re-add the layer input, i.e., the residual connection
1019 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
1020
1021 inpL = cur; // inpL = residual, cur = hidden_states
1022
1023 cb(cur0: cur, name: "ffn_inp", il);
1024
1025 // layernorm2
1026 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: norm_t, norm_eps: eps, il);
1027 cb(cur0: cur, name: "ffn_inp_normed", il);
1028
1029 // ffn
1030 cur = build_ffn(cur,
1031 up: layer.ff_up_w, up_b: layer.ff_up_b,
1032 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
1033 down: layer.ff_down_w, down_b: layer.ff_down_b,
1034 type_op: hparams.ffn_op, il);
1035
1036 cb(cur0: cur, name: "ffn_out", il);
1037
1038 // residual 2
1039 cur = ggml_add(ctx: ctx0, a: inpL, b: cur);
1040 cb(cur0: cur, name: "layer_out", il);
1041
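            // deepstack layers emit an auxiliary feature map: the hidden state is spatially merged
            // (merge_factor patches per token), normalized and passed through a small FFN; all such
            // maps are concatenated to the projector output along the feature dimension at the end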
1042 if (layer.has_deepstack()) {
1043 ggml_tensor * feat = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd * merge_factor, ne1: n_pos / merge_factor, ne2: batch_size);
1044 feat = build_norm(cur: feat, mw: layer.deepstack_norm_w, mb: layer.deepstack_norm_b, type: norm_t, norm_eps: eps, il);
1045 feat = build_ffn(cur: feat,
1046 up: layer.deepstack_fc1_w, up_b: layer.deepstack_fc1_b,
1047 gate: nullptr, gate_b: nullptr,
1048 down: layer.deepstack_fc2_w, down_b: layer.deepstack_fc2_b,
1049 type_op: ffn_op_type::FFN_GELU, il);
1050
1051 if(!deepstack_features) {
1052 deepstack_features = feat;
1053 } else {
1054 // concat along the feature dimension
1055 deepstack_features = ggml_concat(ctx: ctx0, a: deepstack_features, b: feat, dim: 0);
1056 }
1057 }
1058
1059 inpL = cur;
1060 }
1061
1062 // post-layernorm
1063 if (model.post_ln_w) {
1064 inpL = build_norm(cur: inpL, mw: model.post_ln_w, mb: model.post_ln_b, type: norm_t, norm_eps: eps, il: n_layer);
1065 }
1066
1067 // multimodal projection
1068 ggml_tensor * embeddings = inpL;
1069 embeddings = ggml_reshape_3d(ctx: ctx0, a: embeddings, ne0: n_embd * 4, ne1: n_pos / 4, ne2: batch_size);
1070
1071 embeddings = build_ffn(cur: embeddings,
1072 up: model.mm_0_w, up_b: model.mm_0_b,
1073 gate: nullptr, gate_b: nullptr,
1074 down: model.mm_1_w, down_b: model.mm_1_b,
1075 type_op: ffn_op_type::FFN_GELU, il: -1);
1076
1077 embeddings = ggml_concat(ctx: ctx0, a: embeddings, b: deepstack_features, dim: 0); // concat along the feature dimension
1078
1079 // build the graph
1080 ggml_build_forward_expand(cgraph: gf, tensor: embeddings);
1081
1082 return gf;
1083 }
1084
1085 ggml_cgraph * build_minicpmv() {
1086 GGML_ASSERT(model.class_embedding == nullptr);
1087 const int n_pos = n_patches;
1088 const int n_embd_proj = clip_n_mmproj_embd(ctx);
1089
1090 // position embeddings for the projector (not for ViT)
1091 // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
1092 // base frequency omega
1093 ggml_tensor * omega = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_F32, ne0: n_embd_proj / 4);
1094 ggml_set_name(tensor: omega, name: "omega");
1095 ggml_set_input(tensor: omega);
1096
1097 // 2D input positions (using float for sinusoidal embeddings)
1098 ggml_tensor * pos_h = ggml_new_tensor_2d(ctx: ctx0, type: GGML_TYPE_F32, ne0: 1, ne1: n_pos);
1099 ggml_set_name(tensor: pos_h, name: "pos_h");
1100 ggml_set_input(tensor: pos_h);
1101 ggml_tensor * pos_w = ggml_new_tensor_2d(ctx: ctx0, type: GGML_TYPE_F32, ne0: 1, ne1: n_pos);
1102 ggml_set_name(tensor: pos_w, name: "pos_w");
1103 ggml_set_input(tensor: pos_w);
1104
1105 // for selecting learned pos embd, used by ViT
1106 struct ggml_tensor * positions = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos);
1107 ggml_set_name(tensor: positions, name: "positions");
1108 ggml_set_input(tensor: positions);
1109
1110 ggml_tensor * learned_pos_embd = ggml_get_rows(ctx: ctx0, a: model.position_embeddings, b: positions);
1111
1112 ggml_tensor * inp = build_inp();
1113 ggml_tensor * embeddings = build_vit(
1114 inp, n_pos,
1115 norm_t: NORM_TYPE_NORMAL,
1116 ffn_t: hparams.ffn_op,
1117 learned_pos_embd,
1118 add_pos: nullptr);
1119
1120 // resampler projector (it is just another transformer)
1121
1122 ggml_tensor * q = model.mm_model_query;
1123 ggml_tensor * v = ggml_mul_mat(ctx: ctx0, a: model.mm_model_kv_proj, b: embeddings);
1124
1125 // norm
1126 q = build_norm(cur: q, mw: model.mm_model_ln_q_w, mb: model.mm_model_ln_q_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1127 v = build_norm(cur: v, mw: model.mm_model_ln_kv_w, mb: model.mm_model_ln_kv_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1128
1129 // calculate sinusoidal pos embd
1130 ggml_tensor * pos_embed = nullptr;
1131 {
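            // build a 2D sin/cos table: for each position, the first half encodes pos_w and the
            // second half encodes pos_h, each as [sin(omega * p), cos(omega * p)] over the omega
            // frequencies, giving an n_embd_proj-dim embedding that is added to the keys below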
1132 // outer product
1133 ggml_tensor * omega_b = ggml_repeat_4d(ctx: ctx0, a: omega, ne0: omega->ne[0], ne1: n_pos, ne2: 1, ne3: 1); // n_pos rows
1134 ggml_tensor * theta_x = ggml_mul(ctx: ctx0, a: omega_b, b: pos_w);
1135 ggml_tensor * theta_y = ggml_mul(ctx: ctx0, a: omega_b, b: pos_h);
1136 // sin and cos
1137 ggml_tensor * pos_embd_x = ggml_concat(
1138 ctx: ctx0,
1139 a: ggml_sin(ctx: ctx0, a: theta_x),
1140 b: ggml_cos(ctx: ctx0, a: theta_x),
1141 dim: 0 // concat on first dim
1142 );
1143 ggml_tensor * pos_embd_y = ggml_concat(
1144 ctx: ctx0,
1145 a: ggml_sin(ctx: ctx0, a: theta_y),
1146 b: ggml_cos(ctx: ctx0, a: theta_y),
1147 dim: 0 // concat on first dim
1148 );
1149 pos_embed = ggml_concat(ctx: ctx0, a: pos_embd_x, b: pos_embd_y, dim: 0);
1150 }
1151
1152 // k = v + pos_embed
1153 ggml_tensor * k = ggml_add(ctx: ctx0, a: v, b: pos_embed);
1154
1155 // attention
1156 {
1157 const int d_head = 128;
1158 int n_head = n_embd_proj/d_head;
            // the number of learned query tokens comes from the model config (minicpmv_query_num)
1160 int num_query = ctx->model.hparams.minicpmv_query_num;
1161 ggml_tensor * Q = ggml_add(ctx: ctx0,
1162 a: ggml_mul_mat(ctx: ctx0, a: model.mm_model_attn_q_w, b: q),
1163 b: model.mm_model_attn_q_b);
1164 ggml_tensor * K = ggml_add(ctx: ctx0,
1165 a: ggml_mul_mat(ctx: ctx0, a: model.mm_model_attn_k_w, b: k),
1166 b: model.mm_model_attn_k_b);
1167 ggml_tensor * V = ggml_add(ctx: ctx0,
1168 a: ggml_mul_mat(ctx: ctx0, a: model.mm_model_attn_v_w, b: v),
1169 b: model.mm_model_attn_v_b);
1170
1171 Q = ggml_reshape_3d(ctx: ctx0, a: Q, ne0: d_head, ne1: n_head, ne2: num_query);
1172 K = ggml_reshape_3d(ctx: ctx0, a: K, ne0: d_head, ne1: n_head, ne2: n_pos);
1173 V = ggml_reshape_3d(ctx: ctx0, a: V, ne0: d_head, ne1: n_head, ne2: n_pos);
1174
1175 cb(cur0: Q, name: "resampler_Q", il: -1);
1176 cb(cur0: K, name: "resampler_K", il: -1);
1177 cb(cur0: V, name: "resampler_V", il: -1);
1178
1179 embeddings = build_attn(
1180 wo: model.mm_model_attn_o_w,
1181 wo_b: model.mm_model_attn_o_b,
1182 q_cur: Q, k_cur: K, v_cur: V, kq_mask: nullptr, kq_scale, il: -1);
1183 cb(cur0: embeddings, name: "resampler_attn_out", il: -1);
1184 }
1185 // layernorm
1186 embeddings = build_norm(cur: embeddings, mw: model.mm_model_ln_post_w, mb: model.mm_model_ln_post_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1187
1188 // projection
1189 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_model_proj, b: embeddings);
1190
1191 // build the graph
1192 ggml_build_forward_expand(cgraph: gf, tensor: embeddings);
1193
1194 return gf;
1195 }
1196
1197 ggml_cgraph * build_internvl() {
1198 GGML_ASSERT(model.class_embedding != nullptr);
1199 GGML_ASSERT(model.position_embeddings != nullptr);
1200
1201 const int n_pos = n_patches + 1;
1202 ggml_tensor * inp = build_inp();
1203
1204 // add CLS token
1205 inp = ggml_concat(ctx: ctx0, a: inp, b: model.class_embedding, dim: 1);
1206
1207 // The larger models use a different ViT, which uses RMS norm instead of layer norm
1208 // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
1209 norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
1210 ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
1211 : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
1212
1213 ggml_tensor * cur = build_vit(
1214 inp, n_pos,
1215 norm_t,
1216 ffn_t: hparams.ffn_op,
1217 learned_pos_embd: model.position_embeddings,
1218 add_pos: nullptr);
1219
1220 // remove CLS token
1221 cur = ggml_view_2d(ctx: ctx0, a: cur,
1222 ne0: n_embd, ne1: n_patches,
1223 nb1: ggml_row_size(type: cur->type, ne: n_embd), offset: 0);
1224
1225 // pixel shuffle
1226 {
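            // pixel shuffle trades spatial resolution for channel depth: each scale_factor x scale_factor
            // group of patches is folded into a single token of n_embd * scale_factor^2 channels,
            // reducing the token count by scale_factor^2 before the MLP projector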
1227 const int scale_factor = model.hparams.n_merge;
1228 const int bsz = 1; // batch size, always 1 for now since we don't support batching
1229 const int height = n_patches_y;
1230 const int width = n_patches_x;
1231 GGML_ASSERT(scale_factor > 0);
1232 cur = ggml_reshape_4d(ctx: ctx0, a: cur, ne0: n_embd * scale_factor, ne1: height / scale_factor, ne2: width, ne3: bsz);
1233 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
1234 cur = ggml_cont_4d(ctx: ctx0, a: cur,
1235 ne0: n_embd * scale_factor * scale_factor,
1236 ne1: height / scale_factor,
1237 ne2: width / scale_factor,
1238 ne3: bsz);
1239 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
1240 // flatten to 2D
1241 cur = ggml_cont_2d(ctx: ctx0, a: cur,
1242 ne0: n_embd * scale_factor * scale_factor,
1243 ne1: cur->ne[1] * cur->ne[2]);
1244 }
1245
1246 // projector (always using GELU activation)
1247 {
1248 // projector LayerNorm uses pytorch's default eps = 1e-5
1249 // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
1250 cur = build_norm(cur, mw: model.mm_0_w, mb: model.mm_0_b, type: NORM_TYPE_NORMAL, norm_eps: 1e-5, il: -1);
1251 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
1252 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_1_b);
1253 cur = ggml_gelu(ctx: ctx0, a: cur);
1254 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_3_w, b: cur);
1255 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_3_b);
1256 }
1257
1258 // build the graph
1259 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1260
1261 return gf;
1262 }
1263
1264 ggml_cgraph * build_llama4() {
1265 GGML_ASSERT(model.class_embedding != nullptr);
1266 GGML_ASSERT(model.position_embeddings != nullptr);
1267
1268 const int n_pos = n_patches + 1; // +1 for [CLS]
1269
1270 // 2D input positions
1271 ggml_tensor * pos_h = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos);
1272 ggml_set_name(tensor: pos_h, name: "pos_h");
1273 ggml_set_input(tensor: pos_h);
1274
1275 ggml_tensor * pos_w = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos);
1276 ggml_set_name(tensor: pos_w, name: "pos_w");
1277 ggml_set_input(tensor: pos_w);
1278
1279 ggml_tensor * inp = build_inp_raw();
1280
1281 // Llama4UnfoldConvolution
1282 {
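            // patch embedding implemented as unfold (im2col) + linear projection: im2col lays out each
            // patch_size x patch_size x 3 patch as a column, and the matmul with patch_embeddings_0
            // projects it to n_embd (equivalent to a Conv2D with stride == kernel_size == patch_size)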
1283 ggml_tensor * kernel = ggml_reshape_4d(ctx: ctx0, a: model.patch_embeddings_0,
1284 ne0: patch_size, ne1: patch_size, ne2: 3, ne3: n_embd);
1285 inp = ggml_im2col(ctx: ctx0, a: kernel, b: inp, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1, is_2D: true, dst_type: inp->type);
1286 inp = ggml_mul_mat(ctx: ctx0, a: model.patch_embeddings_0, b: inp);
1287 inp = ggml_reshape_2d(ctx: ctx0, a: inp, ne0: n_embd, ne1: n_patches);
1288 cb(cur0: inp, name: "patch_conv", il: -1);
1289 }
1290
1291 // add CLS token
1292 inp = ggml_concat(ctx: ctx0, a: inp, b: model.class_embedding, dim: 1);
1293
1294 // build ViT with 2D position embeddings
1295 auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
1296 // first half is X axis and second half is Y axis
1297 // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
1298 // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
1299 return build_rope_2d(ctx0, cur, pos_a: pos_w, pos_b: pos_h, freq_base: hparams.rope_theta, interleave_freq: false);
1300 };
1301 ggml_tensor * cur = build_vit(
1302 inp, n_pos,
1303 norm_t: NORM_TYPE_NORMAL,
1304 ffn_t: hparams.ffn_op,
1305 learned_pos_embd: model.position_embeddings,
1306 add_pos);
1307
1308 // remove CLS token
1309 cur = ggml_view_2d(ctx: ctx0, a: cur,
1310 ne0: n_embd, ne1: n_patches,
1311 nb1: ggml_row_size(type: cur->type, ne: n_embd), offset: 0);
1312
1313 // pixel shuffle
1314 // based on Llama4VisionPixelShuffleMLP
1315 // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
1316 {
1317 const int scale_factor = model.hparams.n_merge;
1318 const int bsz = 1; // batch size, always 1 for now since we don't support batching
1319 GGML_ASSERT(scale_factor > 0);
1320 GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
1321 cur = ggml_reshape_4d(ctx: ctx0, a: cur,
1322 ne0: n_embd * scale_factor,
1323 ne1: n_patches_x / scale_factor,
1324 ne2: n_patches_y,
1325 ne3: bsz);
1326 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
1327 cur = ggml_cont_4d(ctx: ctx0, a: cur,
1328 ne0: n_embd * scale_factor * scale_factor,
1329 ne1: n_patches_x / scale_factor,
1330 ne2: n_patches_y / scale_factor,
1331 ne3: bsz);
1332 //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
1333 // flatten to 2D
1334 cur = ggml_cont_2d(ctx: ctx0, a: cur,
1335 ne0: n_embd * scale_factor * scale_factor,
1336 ne1: n_patches / scale_factor / scale_factor);
1337 cb(cur0: cur, name: "pixel_shuffle", il: -1);
1338 }
1339
1340 // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
1341 {
1342 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_1_w, b: cur);
1343 cur = ggml_gelu(ctx: ctx0, a: cur);
1344 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_2_w, b: cur);
1345 cur = ggml_gelu(ctx: ctx0, a: cur);
1346 cb(cur0: cur, name: "adapter_mlp", il: -1);
1347 }
1348
1349 // Llama4MultiModalProjector
1350 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_model_proj, b: cur);
1351 cb(cur0: cur, name: "projected", il: -1);
1352
1353 // build the graph
1354 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1355
1356 return gf;
1357 }
1358
1359 ggml_cgraph * build_kimivl() {
1360 // 2D input positions
1361 ggml_tensor * pos_h = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
1362 ggml_set_name(tensor: pos_h, name: "pos_h");
1363 ggml_set_input(tensor: pos_h);
1364
1365 ggml_tensor * pos_w = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
1366 ggml_set_name(tensor: pos_w, name: "pos_w");
1367 ggml_set_input(tensor: pos_w);
1368
1369 ggml_tensor * learned_pos_embd = resize_position_embeddings();
1370
1371 // build ViT with 2D position embeddings
1372 auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
1373 // first half is X axis and second half is Y axis
1374 return build_rope_2d(ctx0, cur, pos_a: pos_w, pos_b: pos_h, freq_base: hparams.rope_theta, interleave_freq: false);
1375 };
1376
1377 ggml_tensor * inp = build_inp();
1378 ggml_tensor * cur = build_vit(
1379 inp, n_pos: n_patches,
1380 norm_t: NORM_TYPE_NORMAL,
1381 ffn_t: hparams.ffn_op,
1382 learned_pos_embd,
1383 add_pos);
1384
1385 cb(cur0: cur, name: "vit_out", il: -1);
1386
1387 {
1388 // patch_merger
1389 const int scale_factor = model.hparams.n_merge;
1390 cur = build_patch_merge_permute(cur, scale_factor);
1391
1392 // projection norm
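            // the merged token (n_embd * scale_factor^2 wide) is temporarily viewed as scale_factor^2
            // rows of n_embd so that the LayerNorm is applied over each original n_embd slice,
            // then viewed back to the merged width before the projection MLP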
1393 int proj_inp_dim = cur->ne[0];
1394 cur = ggml_view_2d(ctx: ctx0, a: cur,
1395 ne0: n_embd, ne1: cur->ne[1] * scale_factor * scale_factor,
1396 nb1: ggml_row_size(type: cur->type, ne: n_embd), offset: 0);
1397 cur = ggml_norm(ctx: ctx0, a: cur, eps: 1e-5); // default nn.LayerNorm
1398 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_input_norm_w);
1399 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_input_norm_b);
1400 cur = ggml_view_2d(ctx: ctx0, a: cur,
1401 ne0: proj_inp_dim, ne1: cur->ne[1] / scale_factor / scale_factor,
1402 nb1: ggml_row_size(type: cur->type, ne: proj_inp_dim), offset: 0);
1403 cb(cur0: cur, name: "proj_inp_normed", il: -1);
1404
1405 // projection mlp
1406 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
1407 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_1_b);
1408 cur = ggml_gelu(ctx: ctx0, a: cur);
1409 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
1410 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_2_b);
1411 cb(cur0: cur, name: "proj_out", il: -1);
1412 }
1413
1414 // build the graph
1415 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1416
1417 return gf;
1418 }
1419
1420 // this graph is used by llava, granite and glm
1421 // due to having embedding_stack (used by granite), we cannot reuse build_vit
1422 ggml_cgraph * build_llava() {
1423 const int batch_size = 1;
1424 const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
1425
1426 GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
1427
1428 // Calculate the deepest feature layer based on hparams and projector type
1429 int max_feature_layer = n_layer;
1430 {
1431 // Get the index of the second to last layer; this is the default for models that have a llava projector
1432 int il_last = hparams.n_layer - 1;
1433 int deepest_feature_layer = -1;
1434
1435 if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV || ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
1436 il_last += 1;
1437 }
1438
1439 // If we set explicit vision feature layers, only go up to the deepest one
1440 // NOTE: only used by granite-vision models for now
1441 for (const auto & feature_layer : hparams.vision_feature_layer) {
1442 if (feature_layer > deepest_feature_layer) {
1443 deepest_feature_layer = feature_layer;
1444 }
1445 }
1446 max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
1447 }
1448
1449 ggml_tensor * inp = build_inp();
1450
1451 // concat class_embeddings and patch_embeddings
1452 if (model.class_embedding) {
1453 inp = ggml_concat(ctx: ctx0, a: inp, b: model.class_embedding, dim: 1);
1454 }
1455
1456 ggml_tensor * positions = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_pos);
1457 ggml_set_name(tensor: positions, name: "positions");
1458 ggml_set_input(tensor: positions);
1459
1460 inp = ggml_add(ctx: ctx0, a: inp, b: ggml_get_rows(ctx: ctx0, a: model.position_embeddings, b: positions));
1461
1462 ggml_tensor * inpL = inp;
1463
1464 // pre-layernorm
1465 if (model.pre_ln_w) {
1466 inpL = build_norm(cur: inpL, mw: model.pre_ln_w, mb: model.pre_ln_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1467 cb(cur0: inpL, name: "pre_ln", il: -1);
1468 }
1469
1470 std::vector<ggml_tensor *> embedding_stack;
1471 const auto & vision_feature_layer = hparams.vision_feature_layer;
1472
1473 // loop over layers
1474 for (int il = 0; il < max_feature_layer; il++) {
1475 auto & layer = model.layers[il];
1476 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
1477
1478 // If this is an embedding feature layer, save the output.
1479 // NOTE: 0 index here refers to the input to the encoder.
1480 if (vision_feature_layer.find(x: il) != vision_feature_layer.end()) {
1481 embedding_stack.push_back(x: cur);
1482 }
1483
1484 // layernorm1
1485 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il);
1486 cb(cur0: cur, name: "layer_inp_normed", il);
1487
1488 // self-attention
1489 {
1490 ggml_tensor * Qcur = ggml_mul_mat(ctx: ctx0, a: layer.q_w, b: cur);
1491 if (layer.q_b) {
1492 Qcur = ggml_add(ctx: ctx0, a: Qcur, b: layer.q_b);
1493 }
1494
1495 ggml_tensor * Kcur = ggml_mul_mat(ctx: ctx0, a: layer.k_w, b: cur);
1496 if (layer.k_b) {
1497 Kcur = ggml_add(ctx: ctx0, a: Kcur, b: layer.k_b);
1498 }
1499
1500 ggml_tensor * Vcur = ggml_mul_mat(ctx: ctx0, a: layer.v_w, b: cur);
1501 if (layer.v_b) {
1502 Vcur = ggml_add(ctx: ctx0, a: Vcur, b: layer.v_b);
1503 }
1504
1505 Qcur = ggml_reshape_3d(ctx: ctx0, a: Qcur, ne0: d_head, ne1: n_head, ne2: n_pos);
1506 Kcur = ggml_reshape_3d(ctx: ctx0, a: Kcur, ne0: d_head, ne1: n_head, ne2: n_pos);
1507 Vcur = ggml_reshape_3d(ctx: ctx0, a: Vcur, ne0: d_head, ne1: n_head, ne2: n_pos);
1508
1509 cb(cur0: Qcur, name: "Qcur", il);
1510 cb(cur0: Kcur, name: "Kcur", il);
1511 cb(cur0: Vcur, name: "Vcur", il);
1512
1513 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
1514 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: nullptr, kq_scale, il);
1515 cb(cur0: cur, name: "attn_out", il);
1516 }
1517
1518 // re-add the layer input, i.e., the residual connection
1519 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
1520
1521 inpL = cur; // inpL = residual, cur = hidden_states
1522
1523 cb(cur0: cur, name: "ffn_inp", il);
1524
1525 // layernorm2
1526 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il);
1527 cb(cur0: cur, name: "ffn_inp_normed", il);
1528
1529 // ffn
1530 cur = build_ffn(cur,
1531 up: layer.ff_up_w, up_b: layer.ff_up_b,
1532 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
1533 down: layer.ff_down_w, down_b: layer.ff_down_b,
1534 type_op: hparams.ffn_op, il);
1535
1536 cb(cur0: cur, name: "ffn_out", il);
1537
1538 // residual 2
1539 cur = ggml_add(ctx: ctx0, a: inpL, b: cur);
1540 cb(cur0: cur, name: "layer_out", il);
1541
1542 inpL = cur;
1543 }
1544
1545 // post-layernorm
1546 if (model.post_ln_w) {
1547 inpL = build_norm(cur: inpL, mw: model.post_ln_w, mb: model.post_ln_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il: -1);
1548 }
1549
1550 ggml_tensor * embeddings = inpL;
1551
1552 // process vision feature layers (used by granite)
1553 {
1554 // final layer is a vision feature layer
1555 if (vision_feature_layer.find(x: max_feature_layer) != vision_feature_layer.end()) {
1556 embedding_stack.push_back(x: inpL);
1557 }
1558
1559 // If feature layers are explicitly set, stack them (if we have multiple)
1560 if (!embedding_stack.empty()) {
1561 embeddings = embedding_stack[0];
1562 for (size_t i = 1; i < embedding_stack.size(); i++) {
1563 embeddings = ggml_concat(ctx: ctx0, a: embeddings, b: embedding_stack[i], dim: 0);
1564 }
1565 }
1566 }
1567
1568 // llava projector (also used by granite)
1569 if (ctx->model.hparams.has_llava_projector) {
1570 embeddings = ggml_reshape_2d(ctx: ctx0, a: embeddings, ne0: embeddings->ne[0], ne1: embeddings->ne[1]);
1571
1572 ggml_tensor * patches = ggml_new_tensor_1d(ctx: ctx0, type: GGML_TYPE_I32, ne0: n_patches);
1573 ggml_set_name(tensor: patches, name: "patches");
1574 ggml_set_input(tensor: patches);
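// 'patches' holds the indices of the tokens to keep; for models with a class embedding these indices are
// expected to skip the CLS position so that only patch tokens reach the projector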
1575
1576 // shape [1, 576, 1024]
1577 // ne is whcn, ne = [1024, 576, 1, 1]
1578 embeddings = ggml_get_rows(ctx: ctx0, a: embeddings, b: patches);
1579
1580 // print_tensor_info(embeddings, "embeddings");
1581
1582 // llava projector
1583 if (ctx->proj_type() == PROJECTOR_TYPE_MLP) {
1584 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_0_w, b: embeddings);
1585 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_0_b);
1586
1587 embeddings = ggml_gelu(ctx: ctx0, a: embeddings);
1588 if (model.mm_2_w) {
1589 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: embeddings);
1590 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_2_b);
1591 }
1592 }
1593 else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) {
1594 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_0_w, b: embeddings);
1595 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_0_b);
1596 // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
1597 // First LayerNorm
1598 embeddings = ggml_norm(ctx: ctx0, a: embeddings, eps);
1599 embeddings = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: embeddings, b: model.mm_1_w),
1600 b: model.mm_1_b);
1601
1602 // GELU activation
1603 embeddings = ggml_gelu(ctx: ctx0, a: embeddings);
1604
1605 // Second linear layer
1606 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_3_w, b: embeddings);
1607 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_3_b);
1608
1609 // Second LayerNorm
1610 embeddings = ggml_norm(ctx: ctx0, a: embeddings, eps);
1611 embeddings = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: embeddings, b: model.mm_4_w),
1612 b: model.mm_4_b);
1613 }
1614 else if (ctx->proj_type() == PROJECTOR_TYPE_LDP) {
1615 // MobileVLM projector
1616 int n_patch = 24;
1617 ggml_tensor * mlp_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_1_w, b: embeddings);
1618 mlp_1 = ggml_add(ctx: ctx0, a: mlp_1, b: model.mm_model_mlp_1_b);
1619 mlp_1 = ggml_gelu(ctx: ctx0, a: mlp_1);
1620 ggml_tensor * mlp_3 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_3_w, b: mlp_1);
1621 mlp_3 = ggml_add(ctx: ctx0, a: mlp_3, b: model.mm_model_mlp_3_b);
1622 // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
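// the two blocks below follow the MobileVLM LDP design: depthwise 3x3 conv + layernorm, a squeeze-excitation
// style gate (global avg pool -> fc -> relu -> fc -> hardsigmoid), and a pointwise projection + layernorm;
// the second block additionally downsamples with stride 2. n_patch = 24 assumes a 24x24 patch grid
// (e.g. a 336px input with 14px patches)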
1623
1624 // block 1
1625 ggml_tensor * block_1 = nullptr;
1626 {
1627 // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
1628 mlp_3 = ggml_permute(ctx: ctx0, a: mlp_3, axis0: 1, axis1: 0, axis2: 2, axis3: 3);
1629 mlp_3 = ggml_cont_4d(ctx: ctx0, a: mlp_3, ne0: n_patch, ne1: n_patch, ne2: mlp_3->ne[1], ne3: mlp_3->ne[2]);
1630 // stride = 1, padding = 1, bias is nullptr
1631 block_1 = ggml_conv_2d_dw(ctx: ctx0, a: model.mm_model_block_1_block_0_0_w, b: mlp_3, s0: 1, s1: 1, p0: 1, p1: 1, d0: 1, d1: 1);
1632
1633 // layer norm
1634 // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1635 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 1, axis1: 2, axis2: 0, axis3: 3));
1636 // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1637 block_1 = ggml_norm(ctx: ctx0, a: block_1, eps);
1638 block_1 = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: block_1, b: model.mm_model_block_1_block_0_1_w), b: model.mm_model_block_1_block_0_1_b);
1639 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 2, axis1: 0, axis2: 1, axis3: 3));
1640
1641 // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1642 // hardswish
1643 ggml_tensor * block_1_hw = ggml_hardswish(ctx: ctx0, a: block_1);
1644
1645 block_1 = ggml_pool_2d(ctx: ctx0, a: block_1_hw, op: GGML_OP_POOL_AVG, k0: block_1_hw->ne[0], k1: block_1_hw->ne[1], s0: block_1_hw->ne[0], s1: block_1_hw->ne[1], p0: 0, p1: 0);
1646 // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1647 // pointwise conv
1648 block_1 = ggml_reshape_2d(ctx: ctx0, a: block_1, ne0: block_1->ne[0]*block_1->ne[1]*block_1->ne[2], ne1: block_1->ne[3]);
1649 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_1_block_1_fc1_w, b: block_1);
1650 block_1 = ggml_add(ctx: ctx0, a: block_1, b: model.mm_model_block_1_block_1_fc1_b);
1651 block_1 = ggml_relu(ctx: ctx0, a: block_1);
1652 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_1_block_1_fc2_w, b: block_1);
1653 block_1 = ggml_add(ctx: ctx0, a: block_1, b: model.mm_model_block_1_block_1_fc2_b);
1654 block_1 = ggml_hardsigmoid(ctx: ctx0, a: block_1);
1655 // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
1656 block_1 = ggml_reshape_4d(ctx: ctx0, a: block_1, ne0: 1, ne1: 1, ne2: block_1->ne[0], ne3: block_1->ne[1]);
1657 block_1 = ggml_mul(ctx: ctx0, a: block_1_hw, b: block_1);
1658
1659 int w = block_1->ne[0], h = block_1->ne[1];
1660 block_1 = ggml_reshape_3d(ctx: ctx0, a: block_1, ne0: w*h, ne1: block_1->ne[2], ne2: block_1->ne[3]);
1661 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 1, axis1: 0, axis2: 2, axis3: 3));
1662
1663 // block_1 shape = [1, 24*24, 2048], ne = [2048, 24*24, 1]
1664 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_1_block_2_0_w, b: block_1);
1665 block_1 = ggml_reshape_4d(ctx: ctx0, a: block_1, ne0: block_1->ne[0], ne1: w, ne2: h, ne3: block_1->ne[3]);
1666
1667 // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
1668 block_1 = ggml_norm(ctx: ctx0, a: block_1, eps);
1669 block_1 = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: block_1, b: model.mm_model_block_1_block_2_1_w), b: model.mm_model_block_1_block_2_1_b);
1670 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 2, axis1: 0, axis2: 1, axis3: 3));
1671 // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
1672 // residual
1673 block_1 = ggml_add(ctx: ctx0, a: mlp_3, b: block_1);
1674 }
1675
1676 // block_2
1677 {
1678 // stride = 2
1679 block_1 = ggml_conv_2d_dw(ctx: ctx0, a: model.mm_model_block_2_block_0_0_w, b: block_1, s0: 2, s1: 2, p0: 1, p1: 1, d0: 1, d1: 1);
1680
1681 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1682 // layer norm
1683 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 1, axis1: 2, axis2: 0, axis3: 3));
1684 // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1685 block_1 = ggml_norm(ctx: ctx0, a: block_1, eps);
1686 block_1 = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: block_1, b: model.mm_model_block_2_block_0_1_w), b: model.mm_model_block_2_block_0_1_b);
1687 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 2, axis1: 0, axis2: 1, axis3: 3));
1688 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
1689 // hardswish
1690 ggml_tensor * block_1_hw = ggml_hardswish(ctx: ctx0, a: block_1);
1691
1692 // not sure the parameters are right for global average pooling
1693 block_1 = ggml_pool_2d(ctx: ctx0, a: block_1_hw, op: GGML_OP_POOL_AVG, k0: block_1_hw->ne[0], k1: block_1_hw->ne[1], s0: block_1_hw->ne[0], s1: block_1_hw->ne[1], p0: 0, p1: 0);
1694 // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1695 // pointwise conv
1696 block_1 = ggml_reshape_2d(ctx: ctx0, a: block_1, ne0: block_1->ne[0]*block_1->ne[1]*block_1->ne[2], ne1: block_1->ne[3]);
1697 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_2_block_1_fc1_w, b: block_1);
1698 block_1 = ggml_add(ctx: ctx0, a: block_1, b: model.mm_model_block_2_block_1_fc1_b);
1699 block_1 = ggml_relu(ctx: ctx0, a: block_1);
1700 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_2_block_1_fc2_w, b: block_1);
1701 block_1 = ggml_add(ctx: ctx0, a: block_1, b: model.mm_model_block_2_block_1_fc2_b);
1702 block_1 = ggml_hardsigmoid(ctx: ctx0, a: block_1);
1703
1704 // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
1705 block_1 = ggml_reshape_4d(ctx: ctx0, a: block_1, ne0: 1, ne1: 1, ne2: block_1->ne[0], ne3: block_1->ne[1]);
1706 block_1 = ggml_mul(ctx: ctx0, a: block_1_hw, b: block_1);
1707
1708 int w = block_1->ne[0], h = block_1->ne[1];
1709 block_1 = ggml_reshape_3d(ctx: ctx0, a: block_1, ne0: w*h, ne1: block_1->ne[2], ne2: block_1->ne[3]);
1710 block_1 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: block_1, axis0: 1, axis1: 0, axis2: 2, axis3: 3));
1711 // block_1 shape = [1, 12*12, 2048], ne = [2048, 12*12, 1]
1712 block_1 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_block_2_block_2_0_w, b: block_1);
1713 block_1 = ggml_reshape_4d(ctx: ctx0, a: block_1, ne0: block_1->ne[0], ne1: w, ne2: h, ne3: block_1->ne[3]);
1714
1715
1716 // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
1717 block_1 = ggml_norm(ctx: ctx0, a: block_1, eps);
1718 block_1 = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: block_1, b: model.mm_model_block_2_block_2_1_w), b: model.mm_model_block_2_block_2_1_b);
1719 block_1 = ggml_reshape_3d(ctx: ctx0, a: block_1, ne0: block_1->ne[0], ne1: block_1->ne[1] * block_1->ne[2], ne2: block_1->ne[3]);
1720 // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
1721 }
1722 embeddings = block_1;
1723 }
1724 else if (ctx->proj_type() == PROJECTOR_TYPE_LDPV2)
1725 {
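// LDPv2 projector: a 2-layer MLP, 2x2 average pooling (24x24 patches -> 12x12), then a depthwise-conv
// positional encoding (PEG) added back as a residual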
1726 int n_patch = 24;
1727 ggml_tensor * mlp_0 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_0_w, b: embeddings);
1728 mlp_0 = ggml_add(ctx: ctx0, a: mlp_0, b: model.mm_model_mlp_0_b);
1729 mlp_0 = ggml_gelu(ctx: ctx0, a: mlp_0);
1730 ggml_tensor * mlp_2 = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_2_w, b: mlp_0);
1731 mlp_2 = ggml_add(ctx: ctx0, a: mlp_2, b: model.mm_model_mlp_2_b);
1732 // mlp_2 ne = [2048, 576, 1, 1]
1733 // avg pool layer 2x2, stride 2
1734 mlp_2 = ggml_permute(ctx: ctx0, a: mlp_2, axis0: 1, axis1: 0, axis2: 2, axis3: 3);
1735 // mlp_2 ne = [576, 2048, 1, 1]
1736 mlp_2 = ggml_cont_4d(ctx: ctx0, a: mlp_2, ne0: n_patch, ne1: n_patch, ne2: mlp_2->ne[1], ne3: mlp_2->ne[2]);
1737 // mlp_2 ne = [24, 24, 2048, 1]
1738 mlp_2 = ggml_pool_2d(ctx: ctx0, a: mlp_2, op: GGML_OP_POOL_AVG, k0: 2, k1: 2, s0: 2, s1: 2, p0: 0, p1: 0);
1739 // weight ne = [3, 3, 2048, 1]
1740 ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx: ctx0, a: model.mm_model_peg_0_w, b: mlp_2, s0: 1, s1: 1, p0: 1, p1: 1, d0: 1, d1: 1);
1741 peg_0 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: peg_0, axis0: 1, axis1: 2, axis2: 0, axis3: 3));
1742 peg_0 = ggml_add(ctx: ctx0, a: peg_0, b: model.mm_model_peg_0_b);
1743 mlp_2 = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: mlp_2, axis0: 1, axis1: 2, axis2: 0, axis3: 3));
1744 peg_0 = ggml_add(ctx: ctx0, a: peg_0, b: mlp_2);
1745 peg_0 = ggml_reshape_3d(ctx: ctx0, a: peg_0, ne0: peg_0->ne[0], ne1: peg_0->ne[1] * peg_0->ne[2], ne2: peg_0->ne[3]);
1746 embeddings = peg_0;
1747 }
1748 else {
1749 GGML_ABORT("fatal error");
1750 }
1751 }
1752
1753 // glm projector
1754 else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
1755 size_t gridsz = (size_t)sqrt(x: embeddings->ne[1]);
1756 embeddings = ggml_permute(ctx: ctx0, a: embeddings, axis0: 1, axis1: 0, axis2: 2, axis3: 3);
1757 embeddings = ggml_cont_3d(ctx: ctx0, a: embeddings, ne0: gridsz, ne1: gridsz, ne2: embeddings->ne[1]);
1758 embeddings = ggml_conv_2d(ctx: ctx0, a: model.mm_model_adapter_conv_w, b: embeddings, s0: 2, s1: 2, p0: 0, p1: 0, d0: 1, d1: 1);
1759 embeddings = ggml_reshape_3d(ctx: ctx0, a: embeddings, ne0: embeddings->ne[0]*embeddings->ne[1], ne1: embeddings->ne[2], ne2: batch_size);
1760 embeddings = ggml_cont(ctx: ctx0, a: ggml_permute(ctx: ctx0, a: embeddings, axis0: 1, axis1: 0, axis2: 2, axis3: 3));
1761 embeddings = ggml_add(ctx: ctx0, a: embeddings, b: model.mm_model_adapter_conv_b);
1762 // GLU
1763 {
1764 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_0_w, b: embeddings);
1765 embeddings = ggml_norm(ctx: ctx0, a: embeddings, eps);
1766 embeddings = ggml_add(ctx: ctx0, a: ggml_mul(ctx: ctx0, a: embeddings, b: model.mm_model_ln_q_w), b: model.mm_model_ln_q_b);
1767 embeddings = ggml_gelu_inplace(ctx: ctx0, a: embeddings);
1768 ggml_tensor * x = embeddings;
1769 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_2_w, b: embeddings);
1770 x = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_1_w, b: x);
1771 embeddings = ggml_swiglu_split(ctx: ctx0, a: embeddings, b: x);
1772 embeddings = ggml_mul_mat(ctx: ctx0, a: model.mm_model_mlp_3_w, b: embeddings);
1773 }
1774 // arrangement of BOI/EOI token embeddings
1775 // note: these embeddings are not present in the text model, hence we cannot process them as text tokens
1776 // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
1777 {
1778 embeddings = ggml_concat(ctx: ctx0, a: model.mm_boi, b: embeddings, dim: 1); // BOI
1779 embeddings = ggml_concat(ctx: ctx0, a: embeddings, b: model.mm_eoi, dim: 1); // EOI
1780 }
1781 }
1782
1783 else {
1784 GGML_ABORT("llava: unknown projector type");
1785 }
1786
1787 // build the graph
1788 ggml_build_forward_expand(cgraph: gf, tensor: embeddings);
1789
1790 return gf;
1791 }

1792 // whisper encoder with custom projector
1793 ggml_cgraph * build_whisper_enc() {
1794 const int n_frames = img.nx;
1795 const int n_pos = n_frames / 2;
1796 GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
1797
1798 ggml_tensor * inp = build_inp_raw(channels: 1);
1799
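// the conv1d block downsamples the time dimension by 2 (the second conv uses stride 2),
// which is why n_pos = n_frames / 2 above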
1800 // conv1d block
1801 {
1802 // convolution + gelu
1803 ggml_tensor * cur = ggml_conv_1d_ph(ctx: ctx0, a: model.conv1d_1_w, b: inp, s: 1, d: 1);
1804 cur = ggml_add(ctx: ctx0, a: cur, b: model.conv1d_1_b);
1805
1806 cur = ggml_gelu_erf(ctx: ctx0, a: cur);
1807
1808 cur = ggml_conv_1d_ph(ctx: ctx0, a: model.conv1d_2_w, b: cur, s: 2, d: 1);
1809 cur = ggml_add(ctx: ctx0, a: cur, b: model.conv1d_2_b);
1810
1811 cur = ggml_gelu_erf(ctx: ctx0, a: cur);
1812 // transpose
1813 inp = ggml_cont(ctx: ctx0, a: ggml_transpose(ctx: ctx0, a: cur));
1814 cb(cur0: inp, name: "after_conv1d", il: -1);
1815 }
1816
1817 // sanity check (only check one layer, but it should be the same for all)
1818 GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
1819 GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
1820 GGML_ASSERT(model.layers[0].q_b);
1821 GGML_ASSERT(model.layers[0].v_b);
1822 GGML_ASSERT(!model.layers[0].k_b); // no bias for k
1823 GGML_ASSERT(model.post_ln_w && model.post_ln_b);
1824
1825 ggml_tensor * pos_embd_selected = ggml_view_2d(
1826 ctx: ctx0, a: model.position_embeddings,
1827 ne0: model.position_embeddings->ne[0], ne1: n_pos,
1828 nb1: model.position_embeddings->nb[1], offset: 0
1829 );
1830 ggml_tensor * cur = build_vit(
1831 inp, n_pos,
1832 norm_t: NORM_TYPE_NORMAL,
1833 ffn_t: hparams.ffn_op,
1834 learned_pos_embd: pos_embd_selected,
1835 add_pos: nullptr);
1836
1837 cb(cur0: cur, name: "after_transformer", il: -1);
1838
1839 if (model.audio_has_stack_frames()) {
1840 // StackAudioFrames
1841 // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
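// stack proj_stack_factor consecutive frames along the embedding dimension; the sequence is
// zero-padded first so that its total element count is divisible by the stack stride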
1842 int64_t stride = n_embd * hparams.proj_stack_factor;
1843 int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
1844 int64_t pad = padded_len - ggml_nelements(tensor: cur);
1845 if (pad > 0) {
1846 cur = ggml_view_1d(ctx: ctx0, a: cur, ne0: ggml_nelements(tensor: cur), offset: 0);
1847 cur = ggml_pad(ctx: ctx0, a: cur, p0: pad, p1: 0, p2: 0, p3: 0);
1848 }
1849 cur = ggml_view_2d(ctx: ctx0, a: cur, ne0: stride, ne1: padded_len / stride,
1850 nb1: ggml_row_size(type: cur->type, ne: stride), offset: 0);
1851 cb(cur0: cur, name: "after_stacked", il: -1);
1852 }
1853
1854 if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) {
1855 // UltravoxProjector
1856 // pre-norm
1857 cur = ggml_rms_norm(ctx: ctx0, a: cur, eps: 1e-6);
1858 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_norm_pre_w);
1859
1860 // ffn in
1861 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
1862
1863 // swiglu
1864 // see SwiGLU in ultravox_model.py: the second half of the tensor is passed through silu, not the first half
1865 cur = ggml_swiglu_swapped(ctx: ctx0, a: cur);
1866
1867 // mid-norm
1868 cur = ggml_rms_norm(ctx: ctx0, a: cur, eps: 1e-6);
1869 cur = ggml_mul(ctx: ctx0, a: cur, b: model.mm_norm_mid_w);
1870
1871 // ffn out
1872 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
1873
1874 } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) {
1875 // projector
1876 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_fc_w, b: cur);
1877 cur = ggml_add(ctx: ctx0, a: cur, b: model.mm_fc_b);
1878
1879 } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
1880 // projector
1881 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_1_w, b: cur);
1882 cur = ggml_gelu_erf(ctx: ctx0, a: cur);
1883 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_2_w, b: cur);
1884
1885 } else {
1886 GGML_ABORT("%s: unknown projector type", __func__);
1887 }
1888
1889 cb(cur0: cur, name: "projected", il: -1);
1890
1891 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1892
1893 return gf;
1894 }
1895
1896 // cogvlm vision encoder
1897 ggml_cgraph * build_cogvlm() {
1898 GGML_ASSERT(model.class_embedding != nullptr);
1899 GGML_ASSERT(model.position_embeddings != nullptr);
1900
1901 const int n_pos = n_patches + 1; // +1 for [CLS]
1902
1903 // build input and concatenate class embedding
1904 ggml_tensor * inp = build_inp();
1905 inp = ggml_concat(ctx: ctx0, a: inp, b: model.class_embedding, dim: 1);
1906
1907 inp = ggml_add(ctx: ctx0, a: inp, b: model.position_embeddings);
1908 cb(cur0: inp, name: "inp_pos", il: -1);
1909
1910 ggml_tensor * inpL = inp;
1911
1912 for (int il = 0; il < n_layer; il++) {
1913 auto & layer = model.layers[il];
1914 ggml_tensor * cur = inpL;
1915
1916 cur = ggml_mul_mat(ctx: ctx0, a: layer.qkv_w, b: cur);
1917
1918 cur = ggml_add(ctx: ctx0, a: cur, b: layer.qkv_b);
1919
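// split the fused QKV projection into Q, K and V views along dim 0 (at offsets 0, n_embd and 2*n_embd)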
1920 ggml_tensor * Qcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
1921 nb2: cur->nb[1], offset: 0);
1922 ggml_tensor * Kcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
1923 nb2: cur->nb[1], offset: n_embd * sizeof(float));
1924 ggml_tensor * Vcur = ggml_view_3d(ctx: ctx0, a: cur, ne0: d_head, ne1: n_head, ne2: n_pos, nb1: d_head*sizeof(float),
1925 nb2: cur->nb[1], offset: 2 * n_embd * sizeof(float));
1926
1927 cb(cur0: Qcur, name: "Qcur", il);
1928 cb(cur0: Kcur, name: "Kcur", il);
1929 cb(cur0: Vcur, name: "Vcur", il);
1930
1931 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
1932 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: nullptr, kq_scale, il);
1933 cb(cur0: cur, name: "attn_out", il);
1934
1935 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il);
1936 cb(cur0: cur, name: "attn_post_norm", il);
1937
1938 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
1939 inpL = cur;
1940
1941 cur = build_ffn(cur,
1942 up: layer.ff_up_w, up_b: layer.ff_up_b,
1943 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
1944 down: layer.ff_down_w, down_b: layer.ff_down_b,
1945 type_op: hparams.ffn_op, il);
1946
1947 cb(cur0: cur, name: "ffn_out", il);
1948
1949 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: NORM_TYPE_NORMAL, norm_eps: eps, il);
1950 cb(cur0: cur, name: "ffn_post_norm", il);
1951
1952 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
1953 cb(cur0: cur, name: "layer_out", il);
1954 inpL = cur;
1955
1956 }
1957
1958 // remove CLS token (like build_llama4 does)
1959 ggml_tensor * cur = ggml_view_2d(ctx: ctx0, a: inpL,
1960 ne0: n_embd, ne1: n_patches,
1961 nb1: ggml_row_size(type: inpL->type, ne: n_embd), offset: 0);
1962
1963 // Multiply with mm_model_proj
1964 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_model_proj, b: cur);
1965
1966 // Apply layernorm, weight, bias
1967 cur = build_norm(cur, mw: model.mm_post_fc_norm_w, mb: model.mm_post_fc_norm_b, type: NORM_TYPE_NORMAL, norm_eps: 1e-5, il: -1);
1968
1969 // Apply GELU
1970 cur = ggml_gelu_inplace(ctx: ctx0, a: cur);
1971
1972 // Branch 1: multiply with mm_h_to_4h_w
1973 ggml_tensor * h_to_4h = ggml_mul_mat(ctx: ctx0, a: model.mm_h_to_4h_w, b: cur);
1974
1975 // Branch 2: multiply with mm_gate_w
1976 ggml_tensor * gate = ggml_mul_mat(ctx: ctx0, a: model.mm_gate_w, b: cur);
1977
1978 // Apply silu
1979 gate = ggml_swiglu_split(ctx: ctx0, a: gate, b: h_to_4h);
1980
1981 // Apply mm_4h_to_h_w
1982 cur = ggml_mul_mat(ctx: ctx0, a: model.mm_4h_to_h_w, b: gate);
1983
1984 // Concatenate with boi and eoi
1985 cur = ggml_concat(ctx: ctx0, a: model.mm_boi, b: cur, dim: 1);
1986 cur = ggml_concat(ctx: ctx0, a: cur, b: model.mm_eoi, dim: 1);
1987
1988 // build the graph
1989 ggml_build_forward_expand(cgraph: gf, tensor: cur);
1990
1991 return gf;
1992 }
1993
1994private:
1995 //
1996 // utility functions
1997 //
1998
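// debug callback: when debug_graph is enabled, copy the tensor under a readable name and mark it as a
// graph output so that its values can be inspected after compute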
1999 void cb(ggml_tensor * cur0, const char * name, int il) const {
2000 if (ctx->debug_graph) {
2001 ggml_tensor * cur = ggml_cpy(ctx: ctx0, a: cur0, b: ggml_dup_tensor(ctx: ctx0, src: cur0));
2002 std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(val: il) : name;
2003 ggml_set_name(tensor: cur, name: cur_name.c_str());
2004 ggml_set_output(tensor: cur);
2005 ggml_build_forward_expand(cgraph: gf, tensor: cur);
2006 ctx->debug_print_tensors.push_back(x: cur);
2007 }
2008 }
2009
2010 // siglip2 naflex
2011 ggml_tensor * resize_position_embeddings() {
2012 ggml_tensor * pos_embd = model.position_embeddings;
2013 const int height = img.ny / patch_size;
2014 const int width = img.nx / patch_size;
2015 const uint32_t mode = GGML_SCALE_MODE_BILINEAR;
2016 GGML_ASSERT(pos_embd);
2017 const int n_per_side = (int)std::sqrt(x: pos_embd->ne[1]);
2018 
2020 if (height == n_per_side && width == n_per_side) {
2021 return pos_embd;
2022 }
2023
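// otherwise, bilinearly resize the learned (n_per_side x n_per_side) position-embedding grid to (width x height)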
2024 pos_embd = ggml_reshape_3d(ctx: ctx0, a: pos_embd, ne0: n_embd, ne1: n_per_side, ne2: n_per_side); // -> (n_embd, n_per_side, n_per_side)
2025 pos_embd = ggml_permute(ctx: ctx0, a: pos_embd, axis0: 2, axis1: 0, axis2: 1, axis3: 3); // -> (n_per_side, n_per_side, n_embd)
2026 pos_embd = ggml_interpolate(ctx: ctx0, a: pos_embd, ne0: width, ne1: height, ne2: n_embd, ne3: 1, mode); // -> (width, height, n_embd)
2027 pos_embd = ggml_permute(ctx: ctx0, a: pos_embd, axis0: 1, axis1: 2, axis2: 0, axis3: 3); // -> (n_embd, width, height)
2028 pos_embd = ggml_cont_2d(ctx: ctx0, a: pos_embd, ne0: n_embd, ne1: width * height); // -> (n_embd, width * height)
2029
2030 return pos_embd;
2031 }
2032
2033 // build vision transformer (ViT) cgraph
2034 // this function should cover most of the models
2035 // if your model has specific features, you should probably duplicate this function
2036 ggml_tensor * build_vit(
2037 ggml_tensor * inp,
2038 int64_t n_pos,
2039 norm_type norm_t,
2040 ffn_op_type ffn_t,
2041 ggml_tensor * learned_pos_embd,
2042 std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
2043 ) {
2044 if (learned_pos_embd) {
2045 inp = ggml_add(ctx: ctx0, a: inp, b: learned_pos_embd);
2046 cb(cur0: inp, name: "pos_embed", il: -1);
2047 }
2048
2049 ggml_tensor * inpL = inp;
2050
2051 // pre-layernorm
2052 if (model.pre_ln_w) {
2053 inpL = build_norm(cur: inpL, mw: model.pre_ln_w, mb: model.pre_ln_b, type: norm_t, norm_eps: eps, il: -1);
2054 cb(cur0: inpL, name: "pre_ln", il: -1);
2055 }
2056
2057 // loop over layers
2058 for (int il = 0; il < n_layer; il++) {
2059 auto & layer = model.layers[il];
2060 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
2061
2062 // layernorm1
2063 cur = build_norm(cur, mw: layer.ln_1_w, mb: layer.ln_1_b, type: norm_t, norm_eps: eps, il);
2064 cb(cur0: cur, name: "layer_inp_normed", il);
2065
2066 // self-attention
2067 {
2068 ggml_tensor * Qcur = ggml_mul_mat(ctx: ctx0, a: layer.q_w, b: cur);
2069 if (layer.q_b) {
2070 Qcur = ggml_add(ctx: ctx0, a: Qcur, b: layer.q_b);
2071 }
2072
2073 ggml_tensor * Kcur = ggml_mul_mat(ctx: ctx0, a: layer.k_w, b: cur);
2074 if (layer.k_b) {
2075 Kcur = ggml_add(ctx: ctx0, a: Kcur, b: layer.k_b);
2076 }
2077
2078 ggml_tensor * Vcur = ggml_mul_mat(ctx: ctx0, a: layer.v_w, b: cur);
2079 if (layer.v_b) {
2080 Vcur = ggml_add(ctx: ctx0, a: Vcur, b: layer.v_b);
2081 }
2082
2083 if (layer.q_norm) {
2084 Qcur = build_norm(cur: Qcur, mw: layer.q_norm, NULL, type: norm_t, norm_eps: eps, il);
2085 cb(cur0: Qcur, name: "Qcur_norm", il);
2086 }
2087
2088 if (layer.k_norm) {
2089 Kcur = build_norm(cur: Kcur, mw: layer.k_norm, NULL, type: norm_t, norm_eps: eps, il);
2090 cb(cur0: Kcur, name: "Kcur_norm", il);
2091 }
2092
2093 Qcur = ggml_reshape_3d(ctx: ctx0, a: Qcur, ne0: d_head, ne1: n_head, ne2: n_pos);
2094 Kcur = ggml_reshape_3d(ctx: ctx0, a: Kcur, ne0: d_head, ne1: n_head, ne2: n_pos);
2095 Vcur = ggml_reshape_3d(ctx: ctx0, a: Vcur, ne0: d_head, ne1: n_head, ne2: n_pos);
2096
2097 cb(cur0: Qcur, name: "Qcur", il);
2098 cb(cur0: Kcur, name: "Kcur", il);
2099 cb(cur0: Vcur, name: "Vcur", il);
2100
2101 if (add_pos) {
2102 Qcur = add_pos(Qcur, layer);
2103 Kcur = add_pos(Kcur, layer);
2104 cb(cur0: Qcur, name: "Qcur_pos", il);
2105 cb(cur0: Kcur, name: "Kcur_pos", il);
2106 }
2107
2108 cur = build_attn(wo: layer.o_w, wo_b: layer.o_b,
2109 q_cur: Qcur, k_cur: Kcur, v_cur: Vcur, kq_mask: nullptr, kq_scale, il);
2110 cb(cur0: cur, name: "attn_out", il);
2111 }
2112
2113 if (layer.ls_1_w) {
2114 cur = ggml_mul(ctx: ctx0, a: cur, b: layer.ls_1_w);
2115 cb(cur0: cur, name: "attn_out_scaled", il);
2116 }
2117
2118 // re-add the layer input, i.e., the residual connection
2119 cur = ggml_add(ctx: ctx0, a: cur, b: inpL);
2120
2121 inpL = cur; // inpL = residual, cur = hidden_states
2122
2123 cb(cur0: cur, name: "ffn_inp", il);
2124
2125 // layernorm2
2126 cur = build_norm(cur, mw: layer.ln_2_w, mb: layer.ln_2_b, type: norm_t, norm_eps: eps, il);
2127 cb(cur0: cur, name: "ffn_inp_normed", il);
2128
2129 // ffn
2130 cur = build_ffn(cur,
2131 up: layer.ff_up_w, up_b: layer.ff_up_b,
2132 gate: layer.ff_gate_w, gate_b: layer.ff_gate_b,
2133 down: layer.ff_down_w, down_b: layer.ff_down_b,
2134 type_op: ffn_t, il);
2135
2136 cb(cur0: cur, name: "ffn_out", il);
2137
2138 if (layer.ls_2_w) {
2139 cur = ggml_mul(ctx: ctx0, a: cur, b: layer.ls_2_w);
2140 cb(cur0: cur, name: "ffn_out_scaled", il);
2141 }
2142
2143 // residual 2
2144 cur = ggml_add(ctx: ctx0, a: inpL, b: cur);
2145 cb(cur0: cur, name: "layer_out", il);
2146
2147 inpL = cur;
2148 }
2149
2150 if (ctx->model.audio_has_avgpool()) {
2151 ggml_tensor * cur = inpL;
2152 cur = ggml_transpose(ctx: ctx0, a: cur);
2153 cur = ggml_cont(ctx: ctx0, a: cur);
2154 cur = ggml_pool_1d(ctx: ctx0, a: cur, op: GGML_OP_POOL_AVG, k0: 2, s0: 2, p0: 0);
2155 cur = ggml_transpose(ctx: ctx0, a: cur);
2156 cur = ggml_cont(ctx: ctx0, a: cur);
2157 inpL = cur;
2158 }
2159
2160 // post-layernorm
2161 if (model.post_ln_w) {
2162 inpL = build_norm(cur: inpL, mw: model.post_ln_w, mb: model.post_ln_b, type: norm_t, norm_eps: eps, il: -1);
2163 }
2164 return inpL;
2165 }
2166
2167 // build the input after conv2d (inp_raw --> patches)
2168 // returns tensor with shape [n_embd, n_patches]
2169 ggml_tensor * build_inp() {
2170 ggml_tensor * inp_raw = build_inp_raw();
2171 ggml_tensor * inp = ggml_conv_2d(ctx: ctx0, a: model.patch_embeddings_0, b: inp_raw, s0: patch_size, s1: patch_size, p0: 0, p1: 0, d0: 1, d1: 1);
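// conv output ne = [n_patches_x, n_patches_y, n_embd]; flatten to [n_patches, n_embd], then transpose to [n_embd, n_patches]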
2172 inp = ggml_reshape_2d(ctx: ctx0, a: inp, ne0: n_patches, ne1: n_embd);
2173 inp = ggml_cont(ctx: ctx0, a: ggml_transpose(ctx: ctx0, a: inp));
2174 if (model.patch_bias) {
2175 inp = ggml_add(ctx: ctx0, a: inp, b: model.patch_bias);
2176 cb(cur0: inp, name: "patch_bias", il: -1);
2177 }
2178 return inp;
2179 }
2180
2181 ggml_tensor * build_inp_raw(int channels = 3) {
2182 ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx: ctx0, type: GGML_TYPE_F32, ne0: img.nx, ne1: img.ny, ne2: channels);
2183 ggml_set_name(tensor: inp_raw, name: "inp_raw");
2184 ggml_set_input(tensor: inp_raw);
2185 return inp_raw;
2186 }
2187
2188 ggml_tensor * build_norm(
2189 ggml_tensor * cur,
2190 ggml_tensor * mw,
2191 ggml_tensor * mb,
2192 norm_type type,
2193 float norm_eps,
2194 int il) const {
2195
2196 cur = type == NORM_TYPE_RMS
2197 ? ggml_rms_norm(ctx: ctx0, a: cur, eps: norm_eps)
2198 : ggml_norm(ctx: ctx0, a: cur, eps: norm_eps);
2199
2200 if (mw || mb) {
2201 cb(cur0: cur, name: "norm", il);
2202 }
2203
2204 if (mw) {
2205 cur = ggml_mul(ctx: ctx0, a: cur, b: mw);
2206 if (mb) {
2207 cb(cur0: cur, name: "norm_w", il);
2208 }
2209 }
2210
2211 if (mb) {
2212 cur = ggml_add(ctx: ctx0, a: cur, b: mb);
2213 }
2214
2215 return cur;
2216 }
2217
2218 ggml_tensor * build_ffn(
2219 ggml_tensor * cur,
2220 ggml_tensor * up,
2221 ggml_tensor * up_b,
2222 ggml_tensor * gate,
2223 ggml_tensor * gate_b,
2224 ggml_tensor * down,
2225 ggml_tensor * down_b,
2226 ffn_op_type type_op,
2227 int il) const {
2228
2229 ggml_tensor * tmp = up ? ggml_mul_mat(ctx: ctx0, a: up, b: cur) : cur;
2230 cb(cur0: tmp, name: "ffn_up", il);
2231
2232 if (up_b) {
2233 tmp = ggml_add(ctx: ctx0, a: tmp, b: up_b);
2234 cb(cur0: tmp, name: "ffn_up_b", il);
2235 }
2236
2237 if (gate) {
2238 cur = ggml_mul_mat(ctx: ctx0, a: gate, b: cur);
2239 cb(cur0: cur, name: "ffn_gate", il);
2240
2241 if (gate_b) {
2242 cur = ggml_add(ctx: ctx0, a: cur, b: gate_b);
2243 cb(cur0: cur, name: "ffn_gate_b", il);
2244 }
2245 } else {
2246 cur = tmp;
2247 }
2248
2249 // we only support parallel ffn for now
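// when a gate projection is present, the activation acts as a GLU over the gate and up branches;
// otherwise it is applied directly to the up branch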
2250 switch (type_op) {
2251 case FFN_SILU:
2252 if (gate) {
2253 cur = ggml_swiglu_split(ctx: ctx0, a: cur, b: tmp);
2254 cb(cur0: cur, name: "ffn_swiglu", il);
2255 } else {
2256 cur = ggml_silu(ctx: ctx0, a: cur);
2257 cb(cur0: cur, name: "ffn_silu", il);
2258 } break;
2259 case FFN_GELU:
2260 if (gate) {
2261 cur = ggml_geglu_split(ctx: ctx0, a: cur, b: tmp);
2262 cb(cur0: cur, name: "ffn_geglu", il);
2263 } else {
2264 cur = ggml_gelu(ctx: ctx0, a: cur);
2265 cb(cur0: cur, name: "ffn_gelu", il);
2266 } break;
2267 case FFN_GELU_ERF:
2268 if (gate) {
2269 cur = ggml_geglu_erf_split(ctx: ctx0, a: cur, b: tmp);
2270 cb(cur0: cur, name: "ffn_geglu_erf", il);
2271 } else {
2272 cur = ggml_gelu_erf(ctx: ctx0, a: cur);
2273 cb(cur0: cur, name: "ffn_gelu_erf", il);
2274 } break;
2275 case FFN_GELU_QUICK:
2276 if (gate) {
2277 cur = ggml_geglu_quick_split(ctx: ctx0, a: cur, b: tmp);
2278 cb(cur0: cur, name: "ffn_geglu_quick", il);
2279 } else {
2280 cur = ggml_gelu_quick(ctx: ctx0, a: cur);
2281 cb(cur0: cur, name: "ffn_gelu_quick", il);
2282 } break;
2283 }
2284
2285 if (down) {
2286 cur = ggml_mul_mat(ctx: ctx0, a: down, b: cur);
2287 }
2288
2289 if (down_b) {
2290 cb(cur0: cur, name: "ffn_down", il);
2291 cur = ggml_add(ctx: ctx0, a: cur, b: down_b);
2292 }
2296
2297 return cur;
2298 }
2299
2300 ggml_tensor * build_attn(
2301 ggml_tensor * wo,
2302 ggml_tensor * wo_b,
2303 ggml_tensor * q_cur,
2304 ggml_tensor * k_cur,
2305 ggml_tensor * v_cur,
2306 ggml_tensor * kq_mask,
2307 float kq_scale,
2308 int il) const {
2309 // these nodes are added to the graph together so that they are not reordered
2310 // by doing so, the number of splits in the graph is reduced
2311 ggml_build_forward_expand(cgraph: gf, tensor: q_cur);
2312 ggml_build_forward_expand(cgraph: gf, tensor: k_cur);
2313 ggml_build_forward_expand(cgraph: gf, tensor: v_cur);
2314
2315 ggml_tensor * q = ggml_permute(ctx: ctx0, a: q_cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2316 //cb(q, "q", il);
2317
2318 ggml_tensor * k = ggml_permute(ctx: ctx0, a: k_cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2319 //cb(k, "k", il);
2320
2321 ggml_tensor * cur;
2322
2323 if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
2324 ggml_tensor * v = ggml_permute(ctx: ctx0, a: v_cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2325
2326 k = ggml_cast(ctx: ctx0, a: k, type: GGML_TYPE_F16);
2327 v = ggml_cast(ctx: ctx0, a: v, type: GGML_TYPE_F16);
2328
2329 cur = ggml_flash_attn_ext(ctx: ctx0, q, k, v, mask: kq_mask, scale: kq_scale, max_bias: 0.0f, logit_softcap: 0.0f);
2330 ggml_flash_attn_ext_set_prec(a: cur, prec: GGML_PREC_F32);
2331
2332 cur = ggml_reshape_2d(ctx: ctx0, a: cur, ne0: cur->ne[0]*cur->ne[1], ne1: cur->ne[2]*cur->ne[3]);
2333
2334 } else {
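// fallback path: compute attention explicitly as softmax(scale * QK^T) V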
2335 ggml_tensor * v = ggml_permute(ctx: ctx0, a: v_cur, axis0: 1, axis1: 2, axis2: 0, axis3: 3);
2336 v = ggml_cont(ctx: ctx0, a: v);
2337
2338 const auto n_tokens = q->ne[1];
2339 const auto n_head = q->ne[2];
2340
2341 ggml_tensor * kq = ggml_mul_mat(ctx: ctx0, a: k, b: q);
2342 // F32 may not be needed for vision encoders?
2343 // ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
2344
2345 kq = ggml_soft_max_ext(ctx: ctx0, a: kq, mask: kq_mask, scale: kq_scale, max_bias: 0.0f);
2346
2347 ggml_tensor * kqv = ggml_mul_mat(ctx: ctx0, a: v, b: kq);
2348 cur = ggml_permute(ctx: ctx0, a: kqv, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2349 cur = ggml_cont_2d(ctx: ctx0, a: cur, ne0: cur->ne[0]*n_head, ne1: n_tokens);
2350 }
2351
2352 cb(cur0: cur, name: "kqv_out", il);
2353
2354 if (wo) {
2355 cur = ggml_mul_mat(ctx: ctx0, a: wo, b: cur);
2356 }
2357
2358 if (wo_b) {
2359 cur = ggml_add(ctx: ctx0, a: cur, b: wo_b);
2360 }
2361
2362 return cur;
2363 }
2364
2365 // implementation of the 2D RoPE without adding a new op in ggml
2366 // this is not efficient (it uses double the memory), but it works on all backends
2367 // TODO: there was a more efficient implementation which relied on ggml_view and ggml_rope_ext_inplace, but rope inplace does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
2368 static ggml_tensor * build_rope_2d(
2369 ggml_context * ctx0,
2370 ggml_tensor * cur,
2371 ggml_tensor * pos_a, // first half
2372 ggml_tensor * pos_b, // second half
2373 const float freq_base,
2374 const bool interleave_freq
2375 ) {
2376 const int64_t n_dim = cur->ne[0];
2377 const int64_t n_head = cur->ne[1];
2378 const int64_t n_pos = cur->ne[2];
2379
2380 // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
2381 // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
2382 // first half of cur will use 1e-0, 1e-2 (even)
2383 // second half of cur will use 1e-1, 1e-3 (odd)
2384 // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
2385 // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
2386 // then for the second half, we use freq_scale to shift the inv_freq
2387 // ^ why? replace (2i) with (2i+1) in the above equation
2388 const float freq_scale_odd = interleave_freq
2389 ? std::pow(x: freq_base, y: (float)-2/n_dim)
2390 : 1.0;
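// worked example with n_dim = 8 and freq_base = 10000:
// the full-dim inv_freq list would be 1e0, 1e-1, 1e-2, 1e-3; rotating only n_dim/2 dims yields 1e0, 1e-2 (the even ones),
// and multiplying by freq_scale_odd = 10000^(-2/8) = 1e-1 shifts them to 1e-1, 1e-3 (the odd ones)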
2391
2392 // first half
2393 ggml_tensor * first;
2394 {
2395 first = ggml_view_3d(ctx: ctx0, a: cur,
2396 ne0: n_dim/2, ne1: n_head, ne2: n_pos,
2397 nb1: ggml_row_size(type: cur->type, ne: n_dim),
2398 nb2: ggml_row_size(type: cur->type, ne: n_dim*n_head),
2399 offset: 0);
2400 first = ggml_rope_ext(
2401 ctx: ctx0,
2402 a: first,
2403 b: pos_a, // positions
2404 c: nullptr, // freq factors
2405 n_dims: n_dim/2, // n_dims
2406 mode: 0, n_ctx_orig: 0, freq_base,
2407 freq_scale: 1.0f, ext_factor: 0.0f, attn_factor: 1.0f, beta_fast: 0.0f, beta_slow: 0.0f
2408 );
2409 }
2410
2411 // second half
2412 ggml_tensor * second;
2413 {
2414 second = ggml_view_3d(ctx: ctx0, a: cur,
2415 ne0: n_dim/2, ne1: n_head, ne2: n_pos,
2416 nb1: ggml_row_size(type: cur->type, ne: n_dim),
2417 nb2: ggml_row_size(type: cur->type, ne: n_dim*n_head),
2418 offset: n_dim/2 * ggml_element_size(tensor: cur));
2419 second = ggml_rope_ext(
2420 ctx: ctx0,
2421 a: second,
2422 b: pos_b, // positions
2423 c: nullptr, // freq factors
2424 n_dims: n_dim/2, // n_dims
2425 mode: 0, n_ctx_orig: 0, freq_base,
2426 freq_scale: freq_scale_odd,
2427 ext_factor: 0.0f, attn_factor: 1.0f, beta_fast: 0.0f, beta_slow: 0.0f
2428 );
2429 }
2430
2431 cur = ggml_concat(ctx: ctx0, a: first, b: second, dim: 0);
2432 return cur;
2433 }
2434
2435 // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
2436 // support dynamic resolution
2437 ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
2438 GGML_ASSERT(scale_factor > 1);
2439
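// collapses each scale_factor x scale_factor block of patches into a single token along the embedding dim:
// [n_embd, width*height] -> [n_embd * scale_factor^2, (width/scale_factor) * (height/scale_factor)]
// e.g. with scale_factor = 2 on a 24x24 grid: [n_embd, 576] -> [4*n_embd, 144]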
2440 const int n_embd = cur->ne[0];
2441 int width = img.nx / patch_size;
2442 int height = img.ny / patch_size;
2443
2444 // pad width and height to factor
2445 const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
2446 const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
2447 cur = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd, ne1: width, ne2: height);
2448 if (pad_width || pad_height) {
2449 cur = ggml_pad(ctx: ctx0, a: cur, p0: 0, p1: pad_width, p2: pad_height, p3: 0);
2450 width += pad_width;
2451 height += pad_height;
2452 }
2453
2454 // unshuffle h
2455 cur = ggml_reshape_3d(ctx: ctx0, a: cur, ne0: n_embd * scale_factor, ne1: width / scale_factor, ne2: height);
2456 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2457
2458 // unshuffle w
2459 cur = ggml_cont_3d(ctx: ctx0, a: cur, ne0: n_embd * scale_factor * scale_factor, ne1: height / scale_factor, ne2: width / scale_factor);
2460 cur = ggml_permute(ctx: ctx0, a: cur, axis0: 0, axis1: 2, axis2: 1, axis3: 3);
2461
2462 cur = ggml_cont_2d(ctx: ctx0, a: cur, ne0: cur->ne[0], ne1: cur->ne[1] * cur->ne[2]);
2463 cb(cur0: cur, name: "pixel_shuffle", il: -1);
2464
2465 return cur;
2466 }
2467
2468};
2469
2470static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
2471 GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
2472 clip_graph graph(ctx, *imgs.entries[0]);
2473
2474 ggml_cgraph * res;
2475
2476 switch (ctx->proj_type()) {
2477 case PROJECTOR_TYPE_GEMMA3:
2478 case PROJECTOR_TYPE_IDEFICS3:
2479 case PROJECTOR_TYPE_LFM2:
2480 {
2481 res = graph.build_siglip();
2482 } break;
2483 case PROJECTOR_TYPE_PIXTRAL:
2484 case PROJECTOR_TYPE_LIGHTONOCR:
2485 {
2486 res = graph.build_pixtral();
2487 } break;
2488 case PROJECTOR_TYPE_QWEN2VL:
2489 case PROJECTOR_TYPE_QWEN25VL:
2490 {
2491 res = graph.build_qwen2vl();
2492 } break;
2493 case PROJECTOR_TYPE_QWEN3VL:
2494 {
2495 res = graph.build_qwen3vl();
2496 } break;
2497 case PROJECTOR_TYPE_MINICPMV:
2498 {
2499 res = graph.build_minicpmv();
2500 } break;
2501 case PROJECTOR_TYPE_INTERNVL:
2502 {
2503 res = graph.build_internvl();
2504 } break;
2505 case PROJECTOR_TYPE_LLAMA4:
2506 {
2507 res = graph.build_llama4();
2508 } break;
2509 case PROJECTOR_TYPE_ULTRAVOX:
2510 case PROJECTOR_TYPE_VOXTRAL:
2511 case PROJECTOR_TYPE_QWEN2A:
2512 {
2513 res = graph.build_whisper_enc();
2514 } break;
2515 case PROJECTOR_TYPE_KIMIVL:
2516 {
2517 res = graph.build_kimivl();
2518 } break;
2519 case PROJECTOR_TYPE_JANUS_PRO:
2520 {
2521 res = graph.build_siglip();
2522 } break;
2523 case PROJECTOR_TYPE_COGVLM:
2524 {
2525 res = graph.build_cogvlm();
2526 } break;
2527 default:
2528 {
2529 res = graph.build_llava();
2530 } break;
2531 }
2532 return res;
2533}
2534
2535struct clip_model_loader {
2536 ggml_context_ptr ctx_meta;
2537 gguf_context_ptr ctx_gguf;
2538
2539 std::string fname;
2540
2541 size_t model_size = 0; // in bytes
2542
2543 bool has_vision = false;
2544 bool has_audio = false;
2545
2546 // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
2547 clip_model_loader(const char * fname) : fname(fname) {
2548 struct ggml_context * meta = nullptr;
2549
2550 struct gguf_init_params params = {
2551 /*.no_alloc = */ true,
2552 /*.ctx = */ &meta,
2553 };
2554
2555 ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
2556 if (!ctx_gguf.get()) {
2557 throw std::runtime_error(string_format(fmt: "%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
2558 }
2559
2560 ctx_meta.reset(p: meta);
2561
2562 const int n_tensors = gguf_get_n_tensors(ctx: ctx_gguf.get());
2563
2564 // print gguf info
2565 {
2566 std::string name;
2567 get_string(KEY_NAME, output&: name, required: false);
2568 std::string description;
2569 get_string(KEY_DESCRIPTION, output&: description, required: false);
2570 LOG_INF("%s: model name: %s\n", __func__, name.c_str());
2571 LOG_INF("%s: description: %s\n", __func__, description.c_str());
2572 LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
2573 LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
2574 LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
2575 LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
2576 LOG_INF("\n");
2577 }
2578
2579 // modalities
2580 {
2581 get_bool(KEY_HAS_VISION_ENC, output&: has_vision, required: false);
2582 get_bool(KEY_HAS_AUDIO_ENC, output&: has_audio, required: false);
2583
2584 if (has_vision) {
2585 LOG_INF("%s: has vision encoder\n", __func__);
2586 }
2587 if (has_audio) {
2588 LOG_INF("%s: has audio encoder\n", __func__);
2589 }
2590 }
2591
2592 // tensors
2593 {
2594 for (int i = 0; i < n_tensors; ++i) {
2595 const char * name = gguf_get_tensor_name(ctx: ctx_gguf.get(), tensor_id: i);
2596 const size_t offset = gguf_get_tensor_offset(ctx: ctx_gguf.get(), tensor_id: i);
2597 enum ggml_type type = gguf_get_tensor_type(ctx: ctx_gguf.get(), tensor_id: i);
2598 ggml_tensor * cur = ggml_get_tensor(ctx: meta, name);
2599 size_t tensor_size = ggml_nbytes(tensor: cur);
2600 model_size += tensor_size;
2601 LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
2602 __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
2603 }
2604 }
2605 }
2606
2607 void load_hparams(clip_model & model, clip_modality modality) {
2608 auto & hparams = model.hparams;
2609 std::string log_ffn_op; // for logging
2610
2611 // sanity check
2612 if (modality == CLIP_MODALITY_VISION) {
2613 GGML_ASSERT(has_vision);
2614 } else if (modality == CLIP_MODALITY_AUDIO) {
2615 GGML_ASSERT(has_audio);
2616 }
2617 model.modality = modality;
2618 
2620 // projector type
2621 std::string proj_type;
2622 {
2623 // default key
2624 get_string(KEY_PROJ_TYPE, output&: proj_type, required: false);
2625
2626 // for models with mixed modalities
2627 if (proj_type.empty()) {
2628 if (modality == CLIP_MODALITY_VISION) {
2629 get_string(KEY_VISION_PROJ_TYPE, output&: proj_type, required: false);
2630 } else if (modality == CLIP_MODALITY_AUDIO) {
2631 get_string(KEY_AUDIO_PROJ_TYPE, output&: proj_type, required: false);
2632 } else {
2633 GGML_ABORT("unknown modality");
2634 }
2635 }
2636
2637 model.proj_type = clip_projector_type_from_string(str: proj_type);
2638
2639 if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
2640 throw std::runtime_error(string_format(fmt: "%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
2641 }
2642
2643 // correct arch for multimodal models (legacy method)
2644 if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
2645 model.proj_type = modality == CLIP_MODALITY_VISION
2646 ? PROJECTOR_TYPE_QWEN25VL
2647 : PROJECTOR_TYPE_QWEN2A;
2648 }
2649 }
2650
2651 const bool is_vision = model.modality == CLIP_MODALITY_VISION;
2652 const bool is_audio = model.modality == CLIP_MODALITY_AUDIO;
2653
2654 // other hparams
2655 {
2656 const char * prefix = is_vision ? "vision" : "audio";
2657 get_u32(key: string_format(KEY_N_EMBD, prefix), output&: hparams.n_embd);
2658 get_u32(key: string_format(KEY_N_HEAD, prefix), output&: hparams.n_head);
2659 get_u32(key: string_format(KEY_N_FF, prefix), output&: hparams.n_ff);
2660 get_u32(key: string_format(KEY_N_BLOCK, prefix), output&: hparams.n_layer);
2661 get_u32(key: string_format(KEY_PROJ_DIM, prefix), output&: hparams.projection_dim);
2662 get_f32(key: string_format(KEY_LAYER_NORM_EPS, prefix), output&: hparams.eps);
2663
2664 if (is_vision) {
2665 get_u32(KEY_IMAGE_SIZE, output&: hparams.image_size);
2666 get_u32(KEY_PATCH_SIZE, output&: hparams.patch_size);
2667 get_u32(KEY_IMAGE_CROP_RESOLUTION, output&: hparams.image_crop_resolution, required: false);
2668 get_i32(KEY_MINICPMV_VERSION, output&: hparams.minicpmv_version, required: false); // legacy
2669 get_u32(KEY_MINICPMV_QUERY_NUM, output&: hparams.minicpmv_query_num, required: false);
2670 if (hparams.minicpmv_query_num == 0) {
2671 // Fallback to hardcoded values for legacy models
2672 if (hparams.minicpmv_version == 3) {
2673 hparams.minicpmv_query_num = 64;
2674 } else if (hparams.minicpmv_version == 4) {
2675 hparams.minicpmv_query_num = 64;
2676 } else if (hparams.minicpmv_version == 5) {
2677 hparams.minicpmv_query_num = 64;
2678 } else if (hparams.minicpmv_version == 6) {
2679 hparams.minicpmv_query_num = 64;
2680 } else {
2681 hparams.minicpmv_query_num = 96;
2682 }
2683 }
2684 } else if (is_audio) {
2685 get_u32(KEY_A_NUM_MEL_BINS, output&: hparams.n_mel_bins);
2686
2687 } else {
2688 GGML_ASSERT(false && "unknown modality");
2689 }
2690
2691 // for pinpoints, we need to convert them into a list of resolution candidates
2692 {
2693 std::vector<int> pinpoints;
2694 get_arr_int(KEY_IMAGE_GRID_PINPOINTS, output&: pinpoints, required: false);
2695 if (!pinpoints.empty()) {
2696 for (size_t i = 0; i < pinpoints.size(); i += 2) {
2697 hparams.image_res_candidates.push_back(x: {
2698 .width: pinpoints[i],
2699 .height: pinpoints[i+1],
2700 });
2701 }
2702 }
2703 }
2704
2705 // default warmup value
2706 hparams.warmup_image_size = hparams.image_size;
2707
2708 hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP
2709 || model.proj_type == PROJECTOR_TYPE_MLP_NORM
2710 || model.proj_type == PROJECTOR_TYPE_LDP
2711 || model.proj_type == PROJECTOR_TYPE_LDPV2;
2712
2713 {
2714 bool use_gelu = false;
2715 bool use_silu = false;
2716 get_bool(KEY_USE_GELU, output&: use_gelu, required: false);
2717 get_bool(KEY_USE_SILU, output&: use_silu, required: false);
2718 if (use_gelu && use_silu) {
2719 throw std::runtime_error(string_format(fmt: "%s: both use_gelu and use_silu are set to true\n", __func__));
2720 }
2721 if (use_gelu) {
2722 hparams.ffn_op = FFN_GELU;
2723 log_ffn_op = "gelu";
2724 } else if (use_silu) {
2725 hparams.ffn_op = FFN_SILU;
2726 log_ffn_op = "silu";
2727 } else {
2728 hparams.ffn_op = FFN_GELU_QUICK;
2729 log_ffn_op = "gelu_quick";
2730 }
2731 }
2732
2733 {
2734 std::string mm_patch_merge_type;
2735 get_string(KEY_MM_PATCH_MERGE_TYPE, output&: mm_patch_merge_type, required: false);
2736 if (mm_patch_merge_type == "spatial_unpad") {
2737 hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
2738 }
2739 }
2740
2741 if (is_vision) {
2742 int idx_mean = gguf_find_key(ctx: ctx_gguf.get(), KEY_IMAGE_MEAN);
2743 int idx_std = gguf_find_key(ctx: ctx_gguf.get(), KEY_IMAGE_STD);
2744 GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
2745 GGML_ASSERT(idx_std >= 0 && "image_std not found");
2746 const float * mean_data = (const float *) gguf_get_arr_data(ctx: ctx_gguf.get(), key_id: idx_mean);
2747 const float * std_data = (const float *) gguf_get_arr_data(ctx: ctx_gguf.get(), key_id: idx_std);
2748 for (int i = 0; i < 3; ++i) {
2749 hparams.image_mean[i] = mean_data[i];
2750 hparams.image_std[i] = std_data[i];
2751 }
2752 }
2753
2754 // Load the vision feature layer indices if they are explicitly provided;
2755 // if multiple vision feature layers are present, the values will be concatenated
2756 // to form the final visual features.
2757 // NOTE: gguf conversions should standardize the values of the vision feature layer to
2758 // be non-negative, since we use -1 to mark values as unset here.
2759 std::vector<int> vision_feature_layer;
2760 get_arr_int(KEY_FEATURE_LAYER, output&: vision_feature_layer, required: false);
2761 // convert std::vector to std::unordered_set
2762 for (auto & layer : vision_feature_layer) {
2763 hparams.vision_feature_layer.insert(x: layer);
2764 }
2765
2766 // model-specific params
2767 switch (model.proj_type) {
2768 case PROJECTOR_TYPE_MINICPMV:
2769 {
2770 if (hparams.minicpmv_version == 0) {
2771 hparams.minicpmv_version = 2; // default to 2 if not set
2772 }
2773 } break;
2774 case PROJECTOR_TYPE_INTERNVL:
2775 {
2776 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2777 } break;
2778 case PROJECTOR_TYPE_IDEFICS3:
2779 {
2780 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2781 get_u32(KEY_PREPROC_IMAGE_SIZE, output&: hparams.image_longest_edge, required: false);
2782 } break;
2783 case PROJECTOR_TYPE_LFM2:
2784 {
2785 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2786 // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
2787 hparams.set_limit_image_tokens(n_tokens_min: 64, n_tokens_max: 256);
2788 } break;
2789 case PROJECTOR_TYPE_PIXTRAL:
2790 case PROJECTOR_TYPE_LIGHTONOCR:
2791 {
2792 // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
2793 // TODO: verify the image_min_tokens
2794 hparams.n_merge = 1; // the original pixtral does not use patch merging
2795 hparams.rope_theta = 10000.0f;
2796 get_u32(KEY_SPATIAL_MERGE_SIZE, output&: hparams.n_merge, required: false);
2797 hparams.set_limit_image_tokens(n_tokens_min: 8, n_tokens_max: 1024);
2798 hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
2799 } break;
2800 case PROJECTOR_TYPE_KIMIVL:
2801 {
2802 hparams.rope_theta = 10000.0f;
2803 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2804 // TODO: check kimivl preprocessor for exact values
2805 hparams.set_limit_image_tokens(n_tokens_min: 8, n_tokens_max: 1024);
2806 hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
2807 } break;
2808 case PROJECTOR_TYPE_GEMMA3:
2809 {
2810 // default value (used by all model sizes in gemma 3 family)
2811 // number of patches for each **side** is reduced by a factor of 4
2812 hparams.n_merge = 4;
2813 // the test model (tinygemma3) uses a different value, which we optionally read
2814 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2815 } break;
2816 case PROJECTOR_TYPE_QWEN2VL:
2817 case PROJECTOR_TYPE_QWEN25VL:
2818 case PROJECTOR_TYPE_QWEN3VL:
2819 {
2820 hparams.n_merge = 2; // default value for Qwen 2 and 2.5
2821 get_u32(KEY_SPATIAL_MERGE_SIZE, output&: hparams.n_merge, required: false);
2822 get_u32(KEY_WIN_ATTN_PATTERN, output&: hparams.n_wa_pattern, required: model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
2823 // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
2824 hparams.set_limit_image_tokens(n_tokens_min: 8, n_tokens_max: 4096);
2825 hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
2826 const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
2827 if (hparams.image_min_pixels < warn_min_pixels) {
2828 LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
2829 LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__);
2830 LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
2831 }
2832 } break;
2833 case PROJECTOR_TYPE_LLAMA4:
2834 {
2835 hparams.rope_theta = 10000.0f;
2836 get_u32(KEY_PROJ_SCALE_FACTOR, output&: hparams.n_merge, required: false);
2837 set_llava_uhd_res_candidates(model, max_patches_per_side: 3);
2838 } break;
2839 case PROJECTOR_TYPE_ULTRAVOX:
2840 case PROJECTOR_TYPE_QWEN2A:
2841 case PROJECTOR_TYPE_VOXTRAL:
2842 {
2843 bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
2844 model.proj_type == PROJECTOR_TYPE_VOXTRAL;
2845 get_u32(KEY_A_PROJ_STACK_FACTOR, output&: hparams.proj_stack_factor, required: require_stack);
2846 if (hparams.n_mel_bins != 128) {
2847 throw std::runtime_error(string_format(fmt: "%s: only 128 mel bins are supported for ultravox\n", __func__));
2848 }
2849 hparams.ffn_op = FFN_GELU_ERF;
2850 log_ffn_op = "gelu_erf"; // temporary solution for logging
2851 } break;
2852 default:
2853 break;
2854 }
2855
2856 // sanity check
2857 {
2858 if (hparams.image_max_pixels < hparams.image_min_pixels) {
2859 throw std::runtime_error(string_format(fmt: "%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
2860 }
2861 }
2862
2863 LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
2864 LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
2865 LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
2866 LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
2867 LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
2868 LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
2869 LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
2870 if (is_vision) {
2871 LOG_INF("\n--- vision hparams ---\n");
2872 LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
2873 LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
2874 LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
2875 LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
2876 LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
2877 LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
2878 if (hparams.image_min_pixels > 0) {
2879 LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
2880 }
2881 if (hparams.image_max_pixels > 0) {
2882 LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : "");
2883 }
2884 } else if (is_audio) {
2885 LOG_INF("\n--- audio hparams ---\n");
2886 LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
2887 LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
2888 }
2889 LOG_INF("\n");
2890 LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
2891 LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
2892 }
2893 }
2894
2895 void load_tensors(clip_ctx & ctx_clip) {
2896 auto & model = ctx_clip.model;
2897 auto & hparams = model.hparams;
2898 std::map<std::string, size_t> tensor_offset;
2899 std::vector<ggml_tensor *> tensors_to_load;
2900
2901 // TODO @ngxson : support both audio and video in the future
2902 const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
2903
2904 // get offsets
2905 for (int64_t i = 0; i < gguf_get_n_tensors(ctx: ctx_gguf.get()); ++i) {
2906 const char * name = gguf_get_tensor_name(ctx: ctx_gguf.get(), tensor_id: i);
2907 tensor_offset[name] = gguf_get_data_offset(ctx: ctx_gguf.get()) + gguf_get_tensor_offset(ctx: ctx_gguf.get(), tensor_id: i);
2908 }
2909
2910 // create data context
2911 struct ggml_init_params params = {
2912 /*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(ctx: ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
2913 /*.mem_buffer =*/ NULL,
2914 /*.no_alloc =*/ true,
2915 };
2916 ctx_clip.ctx_data.reset(p: ggml_init(params));
2917 if (!ctx_clip.ctx_data) {
2918 throw std::runtime_error(string_format(fmt: "%s: failed to init ggml context\n", __func__));
2919 }
2920
2921 // helper function
2922 auto get_tensor = [&](const std::string & name, bool required = true) {
2923 ggml_tensor * cur = ggml_get_tensor(ctx: ctx_meta.get(), name: name.c_str());
2924 if (!cur && required) {
2925 throw std::runtime_error(string_format(fmt: "%s: unable to find tensor %s\n", __func__, name.c_str()));
2926 }
2927 if (cur) {
2928 tensors_to_load.push_back(x: cur);
2929 // add tensors to context
2930 ggml_tensor * data_tensor = ggml_dup_tensor(ctx: ctx_clip.ctx_data.get(), src: cur);
2931 ggml_set_name(tensor: data_tensor, name: cur->name);
2932 cur = data_tensor;
2933 }
2934 return cur;
2935 };
2936
2937 model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
2938
2939 model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
2940 model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
2941
2942 model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
2943 model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
2944
2945 model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
2946 model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
2947 model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
2948
2949 model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
2950
2951 // layers
2952 model.layers.resize(new_size: hparams.n_layer);
2953 for (int il = 0; il < hparams.n_layer; ++il) {
2954 auto & layer = model.layers[il];
2955 layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
2956 layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
2957 layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
2958 layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
2959 layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
2960 layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
2961 layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
2962 layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
2963 layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
2964 layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
2965 layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
2966
2967 layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
2968 layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
2969 layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
2970 layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
2971 layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false);
2972 layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
2973 layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
2974
2975 // ffn
2976 layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
2977 layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false);
2978 layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
2979 layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
2980 layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
2981 layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
2982
2983
2984 // qwen3vl deepstack layer
2985 layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
2986 layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
2987 layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
2988 layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
2989 layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
2990 layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
2991 if (layer.has_deepstack()) {
2992 model.n_deepstack_layers++;
2993 }
2994
// some models were exported with legacy (incorrect) naming, which is quite messy; fix it here
2996 // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
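// detection relies on the fact that in ggml the first dim (ne[0]) of a matmul weight is its input size:
// a correct ff_down takes n_ff as input, so ff_down_w->ne[0] == n_embd means up/down were exported swapped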
2997 bool is_ffn_swapped = (
2998 // only old models need this fix
2999 model.proj_type == PROJECTOR_TYPE_MLP
3000 || model.proj_type == PROJECTOR_TYPE_MLP_NORM
3001 || model.proj_type == PROJECTOR_TYPE_LDP
3002 || model.proj_type == PROJECTOR_TYPE_LDPV2
3003 || model.proj_type == PROJECTOR_TYPE_QWEN2VL
3004 || model.proj_type == PROJECTOR_TYPE_QWEN25VL
3005 || model.proj_type == PROJECTOR_TYPE_GLM_EDGE
3006 || model.proj_type == PROJECTOR_TYPE_GEMMA3
3007 || model.proj_type == PROJECTOR_TYPE_IDEFICS3
3008 || model.proj_type == PROJECTOR_TYPE_MINICPMV
3009 ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
3010 if (is_ffn_swapped) {
3011 // swap up and down weights
3012 ggml_tensor * tmp = layer.ff_up_w;
3013 layer.ff_up_w = layer.ff_down_w;
3014 layer.ff_down_w = tmp;
3015 // swap up and down biases
3016 tmp = layer.ff_up_b;
3017 layer.ff_up_b = layer.ff_down_b;
3018 layer.ff_down_b = tmp;
3019 if (il == 0) {
3020 LOG_WRN("%s: ffn up/down are swapped\n", __func__);
3021 }
3022 }
3023 }
3024
3025 switch (model.proj_type) {
3026 case PROJECTOR_TYPE_MLP:
3027 case PROJECTOR_TYPE_MLP_NORM:
3028 {
3029 // LLaVA projection
3030 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
3031 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
3032 // Yi-type llava
3033 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
3034 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
3035 // missing in Yi-type llava
3036 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
3037 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
3038 // Yi-type llava
3039 model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
3040 model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
3041 model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
3042 model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
3043 if (model.mm_3_w) {
3044 // TODO: this is a hack to support Yi-type llava
3045 model.proj_type = PROJECTOR_TYPE_MLP_NORM;
3046 }
3047 model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
3048 } break;
3049 case PROJECTOR_TYPE_LDP:
3050 {
3051 // MobileVLM projection
3052 model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
3053 model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
3054 model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
3055 model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
3056 model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
3057 model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
3058 model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
3059 model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
3060 model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
3061 model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
3062 model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
3063 model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
3064 model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
3065 model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
3066 model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
3067 model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
3068 model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
3069 model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
3070 model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
3071 model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
3072 model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
3073 model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
3074 model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
3075 model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
3076 } break;
3077 case PROJECTOR_TYPE_LDPV2:
3078 {
// MobileVLM_V2 projection
3080 model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
3081 model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
3082 model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
3083 model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
3084 model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
3085 model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
3086 } break;
3087 case PROJECTOR_TYPE_MINICPMV:
3088 {
3089 // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
3090 model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
3091 model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
3092 model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
3093 model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
3094 model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
3095 model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
3096 model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
3097 model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
3098 model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
3099 model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
3100 model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
3101 model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
3102 model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
3103 model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
3104 model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
3105 model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
3106 model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
3107 model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
3108 } break;
3109 case PROJECTOR_TYPE_GLM_EDGE:
3110 {
3111 model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
3112 model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
3113 model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
3114 model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
3115 model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
3116 model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
3117 model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
3118 model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
3119 model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
3120 model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
3121 } break;
3122 case PROJECTOR_TYPE_QWEN2VL:
3123 case PROJECTOR_TYPE_QWEN25VL:
3124 {
3125 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
3126 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
3127 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3128 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
3129 } break;
3130 case PROJECTOR_TYPE_QWEN3VL:
3131 {
3132 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
3133 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
3134 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3135 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
3136 } break;
3137 case PROJECTOR_TYPE_GEMMA3:
3138 {
3139 model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
3140 model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
3141 } break;
3142 case PROJECTOR_TYPE_IDEFICS3:
3143 {
3144 model.projection = get_tensor(TN_MM_PROJECTOR);
3145 } break;
3146 case PROJECTOR_TYPE_LFM2:
3147 case PROJECTOR_TYPE_KIMIVL:
3148 {
3149 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
3150 model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
3151 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
3152 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
3153 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3154 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
3155 } break;
3156 case PROJECTOR_TYPE_PIXTRAL:
3157 {
3158 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
3159 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
3160 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3161 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
3162 // [IMG_BREAK] token embedding
3163 model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
3164 // for mistral small 3.1
3165 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
3166 model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
3167 } break;
3168 case PROJECTOR_TYPE_LIGHTONOCR:
3169 {
3170 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
3171 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
3172 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
3173 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
3174 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
3175 model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
3176 } break;
3177 case PROJECTOR_TYPE_ULTRAVOX:
3178 {
3179 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
3180 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
3181 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
3182 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
3183 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
3184 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
3185 model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
3186 model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
3187 } break;
3188 case PROJECTOR_TYPE_QWEN2A:
3189 {
3190 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
3191 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
3192 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
3193 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
3194 model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
3195 model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
3196 } break;
3197 case PROJECTOR_TYPE_VOXTRAL:
3198 {
3199 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
3200 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
3201 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
3202 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
3203 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
3204 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
3205 } break;
3206 case PROJECTOR_TYPE_INTERNVL:
3207 {
3208 model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
3209 model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
3210 model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
3211 model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
3212 model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
3213 model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
3214 } break;
3215 case PROJECTOR_TYPE_LLAMA4:
3216 {
3217 model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
3218 model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
3219 model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
3220 } break;
3221 case PROJECTOR_TYPE_COGVLM:
3222 {
3223 model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
3224 model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight"));
3225 model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias"));
3226 model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight"));
3227 model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
3228 model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight"));
3229 model.mm_boi = get_tensor(TN_TOK_BOI);
3230 model.mm_eoi = get_tensor(TN_TOK_EOI);
3231 } break;
3232 case PROJECTOR_TYPE_JANUS_PRO:
3233 {
3234 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
3235 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
3236 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
3237 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
3238 } break;
3239 default:
3240 GGML_ASSERT(false && "unknown projector type");
3241 }
3242
3243 // load data
3244 {
3245 std::vector<uint8_t> read_buf;
3246
3247 auto fin = std::ifstream(fname, std::ios::binary);
3248 if (!fin) {
3249 throw std::runtime_error(string_format(fmt: "%s: failed to open %s\n", __func__, fname.c_str()));
3250 }
3251
3252 // alloc memory and offload data
3253 ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend: ctx_clip.backend);
3254 ctx_clip.buf.reset(p: ggml_backend_alloc_ctx_tensors_from_buft(ctx: ctx_clip.ctx_data.get(), buft));
3255 ggml_backend_buffer_set_usage(buffer: ctx_clip.buf.get(), usage: GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
3256 for (auto & t : tensors_to_load) {
3257 ggml_tensor * cur = ggml_get_tensor(ctx: ctx_clip.ctx_data.get(), name: t->name);
3258 const size_t offset = tensor_offset[t->name];
3259 fin.seekg(offset, std::ios::beg);
3260 if (!fin) {
3261 throw std::runtime_error(string_format(fmt: "%s: failed to seek for tensor %s\n", __func__, t->name));
3262 }
3263 size_t num_bytes = ggml_nbytes(tensor: cur);
3264 if (ggml_backend_buft_is_host(buft)) {
3265 // for the CPU and Metal backend, we can read directly into the tensor
3266 fin.read(s: reinterpret_cast<char *>(cur->data), n: num_bytes);
3267 } else {
3268 // read into a temporary buffer first, then copy to device memory
3269 read_buf.resize(new_size: num_bytes);
3270 fin.read(s: reinterpret_cast<char *>(read_buf.data()), n: num_bytes);
3271 ggml_backend_tensor_set(tensor: cur, data: read_buf.data(), offset: 0, size: num_bytes);
3272 }
3273 }
3274 fin.close();
3275
3276 LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
3277 }
3278 }
3279
3280 struct support_info_op {
3281 ggml_tensor * op;
3282
3283 // true if the op runs on the accelerated ctx_clip.backend
3284 bool is_accel = true;
3285 };
3286
3287 struct support_info_graph {
3288 // whether the clip_ctx.backend supports flash attention
3289 bool fattn = true;
3290 ggml_tensor * fattn_op = nullptr; // for debugging
3291
3292 std::vector<support_info_op> ops;
3293 };
3294
3295 static void warmup(clip_ctx & ctx_clip) {
3296 support_info_graph info;
3297
3298 if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
3299 // try to enable flash attention to see if it's supported
3300 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
3301 info = alloc_compute_meta(ctx_clip);
3302 if (!info.fattn && info.fattn_op) {
3303 auto op = info.fattn_op;
3304 LOG_WRN("%s: *****************************************************************\n", __func__);
3305 LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend));
LOG_WRN("%s: op params:\n", __func__);
static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) {
    LOG_WRN("%s: %s: type = %s, ne = [%lld %lld %lld %lld], nb = [%zu %zu %zu %zu]\n", fn,
        name, ggml_type_name(t->type),
        (long long) t->ne[0], (long long) t->ne[1], (long long) t->ne[2], (long long) t->ne[3],
        t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
3312 };
3313 print_shape(__func__, " dst", op);
3314 print_shape(__func__, "src0", op->src[0]);
3315 print_shape(__func__, "src1", op->src[1]);
3316 print_shape(__func__, "src2", op->src[2]);
3317 LOG_WRN("%s: please report this on github as an issue\n", __func__);
3318 LOG_WRN("%s: *****************************************************************\n", __func__);
3319 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
3320 alloc_compute_meta(ctx_clip);
3321 }
3322 } else {
3323 info = alloc_compute_meta(ctx_clip);
3324 if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
3325 LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
3326 }
3327 }
3328
3329 LOG_INF("%s: flash attention is %s\n", __func__,
3330 (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
3331
3332 // print ops that are not supported by the GPU backend (if there is one)
3333 if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
3334 std::vector<support_info_op> unsupported_ops;
3335 for (const auto & op : info.ops) {
3336 if (!op.is_accel) {
3337 unsupported_ops.push_back(x: op);
3338 }
3339 }
3340 if (!unsupported_ops.empty()) {
3341 LOG_WRN("%s: *****************************************************************\n", __func__);
LOG_WRN("%s: WARNING: the CLIP graph uses operators that are not supported by the backend\n", __func__);
LOG_WRN("%s: performance will be suboptimal\n", __func__);
3344 LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend));
3345 for (const auto & op : unsupported_ops) {
LOG_WRN("%s: %16s: type = %s, ne = [%lld %lld %lld %lld]\n", __func__,
    ggml_op_name(op.op->op),
    ggml_type_name(op.op->type),
    (long long) op.op->ne[0], (long long) op.op->ne[1], (long long) op.op->ne[2], (long long) op.op->ne[3]);
3350 }
3351 LOG_WRN("%s: flash attention is %s\n", __func__,
3352 (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
3353 LOG_WRN("%s: please report this on github as an issue\n", __func__);
3354 LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
3355 LOG_WRN("%s: *****************************************************************\n", __func__);
3356 }
3357 }
3358 }
3359
3360 static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip) {
3361 const auto & hparams = ctx_clip.model.hparams;
3362 ctx_clip.buf_compute_meta.resize(new_size: ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
3363
3364 // create a fake batch
3365 clip_image_f32_batch batch;
3366 clip_image_f32_ptr img(clip_image_f32_init());
3367 if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
3368 img->nx = hparams.warmup_image_size;
3369 img->ny = hparams.warmup_image_size;
3370 LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
3371 } else {
3372 img->nx = hparams.warmup_audio_size;
3373 img->ny = hparams.n_mel_bins;
3374 LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
3375 }
3376 batch.entries.push_back(x: std::move(img));
3377
3378 ggml_cgraph * gf = clip_image_build_graph(ctx: &ctx_clip, imgs: batch);
3379 ggml_backend_sched_reserve(sched: ctx_clip.sched.get(), measure_graph: gf);
3380
3381 for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
3382 ggml_backend_t backend = ctx_clip.backend_ptrs[i];
3383 ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
3384 size_t size = ggml_backend_sched_get_buffer_size(sched: ctx_clip.sched.get(), backend);
3385 if (size > 1) {
3386 LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
3387 ggml_backend_buft_name(buft),
3388 size / 1024.0 / 1024.0);
3389 }
3390 }
3391
3392 const int n_splits = ggml_backend_sched_get_n_splits(sched: ctx_clip.sched.get());
3393 const int n_nodes = ggml_graph_n_nodes(cgraph: gf);
3394
3395 LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);
3396
3397 support_info_graph res {
3398 /*.fattn = */ true,
3399 /*.fattn_op = */ nullptr,
3400 /*.ops = */ {},
3401 };
3402
3403 // check op support
3404 for (int i = 0; i < ggml_graph_n_nodes(cgraph: gf); i++) {
3405 ggml_tensor * node = ggml_graph_node(cgraph: gf, i);
3406 res.ops.push_back(x: {.op: node, .is_accel: true});
3407 if (!ggml_backend_supports_op(backend: ctx_clip.backend, op: node)) {
3408 res.ops.back().is_accel = false;
3409 if (node->op == GGML_OP_FLASH_ATTN_EXT) {
3410 res.fattn = false;
3411 res.fattn_op = node;
3412 }
3413 }
3414 }
3415
3416 return res;
3417 }
3418
3419 void get_bool(const std::string & key, bool & output, bool required = true) const {
3420 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3421 if (i < 0) {
3422 if (required) {
3423 throw std::runtime_error("Key not found: " + key);
3424 }
3425 return;
3426 }
3427 output = gguf_get_val_bool(ctx: ctx_gguf.get(), key_id: i);
3428 }
3429
3430 void get_i32(const std::string & key, int & output, bool required = true) const {
3431 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3432 if (i < 0) {
3433 if (required) {
3434 throw std::runtime_error("Key not found: " + key);
3435 }
3436 return;
3437 }
3438 output = gguf_get_val_i32(ctx: ctx_gguf.get(), key_id: i);
3439 }
3440
3441 void get_u32(const std::string & key, int & output, bool required = true) const {
3442 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3443 if (i < 0) {
3444 if (required) {
3445 throw std::runtime_error("Key not found: " + key);
3446 }
3447 return;
3448 }
3449 output = gguf_get_val_u32(ctx: ctx_gguf.get(), key_id: i);
3450 }
3451
3452 void get_f32(const std::string & key, float & output, bool required = true) const {
3453 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3454 if (i < 0) {
3455 if (required) {
3456 throw std::runtime_error("Key not found: " + key);
3457 }
3458 return;
3459 }
3460 output = gguf_get_val_f32(ctx: ctx_gguf.get(), key_id: i);
3461 }
3462
3463 void get_string(const std::string & key, std::string & output, bool required = true) const {
3464 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3465 if (i < 0) {
3466 if (required) {
3467 throw std::runtime_error("Key not found: " + key);
3468 }
3469 return;
3470 }
3471 output = std::string(gguf_get_val_str(ctx: ctx_gguf.get(), key_id: i));
3472 }
3473
3474 void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
3475 const int i = gguf_find_key(ctx: ctx_gguf.get(), key: key.c_str());
3476 if (i < 0) {
3477 if (required) {
3478 throw std::runtime_error("Key not found: " + key);
3479 }
3480 return;
3481 }
3482 int n = gguf_get_arr_n(ctx: ctx_gguf.get(), key_id: i);
3483 output.resize(new_size: n);
3484 const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx: ctx_gguf.get(), key_id: i);
3485 for (int i = 0; i < n; ++i) {
3486 output[i] = values[i];
3487 }
3488 }
3489
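// builds the list of resolution candidates used by LLaVA-UHD style slicing (see llava_uhd below);
// e.g. (illustrative values) with max_patches_per_side = 3 and image_size = 336, this produces the
// 8 candidates (x*336, y*336) for x, y in 1..3, excluding the 1x1 base resolution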
3490 static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
3491 auto & hparams = model.hparams;
3492 for (int x = 1; x <= max_patches_per_side; x++) {
3493 for (int y = 1; y <= max_patches_per_side; y++) {
3494 if (x == 1 && y == 1) {
continue; // skip 1x1, which is just the base image size
3496 }
3497 hparams.image_res_candidates.push_back(x: clip_image_size{
3498 .width: x*hparams.image_size,
3499 .height: y*hparams.image_size,
3500 });
3501 }
3502 }
3503 }
3504};
3505
3506struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
3507 g_logger_state.verbosity_thold = ctx_params.verbosity;
3508 clip_ctx * ctx_vision = nullptr;
3509 clip_ctx * ctx_audio = nullptr;
3510
3511 try {
3512 clip_model_loader loader(fname);
3513
3514 if (loader.has_vision) {
3515 ctx_vision = new clip_ctx(ctx_params);
3516 loader.load_hparams(model&: ctx_vision->model, modality: CLIP_MODALITY_VISION);
3517 loader.load_tensors(ctx_clip&: *ctx_vision);
3518 loader.warmup(ctx_clip&: *ctx_vision);
3519 }
3520
3521 if (loader.has_audio) {
3522 ctx_audio = new clip_ctx(ctx_params);
3523 loader.load_hparams(model&: ctx_audio->model, modality: CLIP_MODALITY_AUDIO);
3524 loader.load_tensors(ctx_clip&: *ctx_audio);
3525 loader.warmup(ctx_clip&: *ctx_audio);
3526 }
3527
3528 } catch (const std::exception & e) {
3529 LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
3530
3531 delete ctx_vision;
3532 delete ctx_audio;
3533
3534 return {.ctx_v: nullptr, .ctx_a: nullptr};
3535 }
3536
3537 return {.ctx_v: ctx_vision, .ctx_a: ctx_audio};
3538}
3539
3540struct clip_image_size * clip_image_size_init() {
3541 struct clip_image_size * load_image_size = new struct clip_image_size();
3542 load_image_size->width = 448;
3543 load_image_size->height = 448;
3544 return load_image_size;
3545}
3546
3547struct clip_image_u8 * clip_image_u8_init() {
3548 return new clip_image_u8();
3549}
3550
3551struct clip_image_f32 * clip_image_f32_init() {
3552 return new clip_image_f32();
3553}
3554
3555struct clip_image_f32_batch * clip_image_f32_batch_init() {
3556 return new clip_image_f32_batch();
3557}
3558
3559unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
3560 if (nx) *nx = img->nx;
3561 if (ny) *ny = img->ny;
3562 return img->buf.data();
3563}
3564
3565void clip_image_size_free(struct clip_image_size * load_image_size) {
3566 if (load_image_size == nullptr) {
3567 return;
3568 }
3569 delete load_image_size;
3570}
3571void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
3572void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
3573void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
3574void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }
3575
3576size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
3577 return batch->entries.size();
3578}
3579
3580size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
3581 if (idx < 0 || idx >= (int)batch->entries.size()) {
3582 LOG_ERR("%s: invalid index %d\n", __func__, idx);
3583 return 0;
3584 }
3585 return batch->entries[idx]->nx;
3586}
3587
3588size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
3589 if (idx < 0 || idx >= (int)batch->entries.size()) {
3590 LOG_ERR("%s: invalid index %d\n", __func__, idx);
3591 return 0;
3592 }
3593 return batch->entries[idx]->ny;
3594}
3595
3596clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
3597 if (idx < 0 || idx >= (int)batch->entries.size()) {
3598 LOG_ERR("%s: invalid index %d\n", __func__, idx);
3599 return nullptr;
3600 }
3601 return batch->entries[idx].get();
3602}
3603
3604void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
3605 img->nx = nx;
3606 img->ny = ny;
3607 img->buf.resize(new_size: 3 * nx * ny);
3608 memcpy(dest: img->buf.data(), src: rgb_pixels, n: img->buf.size());
3609}
3610
// Normalize image to float32 - be careful with pytorch's .to(model.device, dtype=torch.float16): it sometimes reduces precision (32->16->32), sometimes not
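// sanity-check example (mean/std values here are illustrative, the real ones come from hparams.image_mean/image_std):
// a pixel value of 255 with mean 0.5 and std 0.5 maps to (255/255 - 0.5) / 0.5 = 1.0, and a value of 0 maps to -1.0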
3612static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
3613 dst.nx = src.nx;
3614 dst.ny = src.ny;
3615 dst.buf.resize(new_size: src.buf.size());
3616
3617 // TODO @ngxson : seems like this could be done more efficiently on cgraph
3618 for (size_t i = 0; i < src.buf.size(); ++i) {
3619 int c = i % 3; // rgb
3620 dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
3621 }
3622}
3623
// set of tools to manipulate images
// in the future, we can add HW acceleration by allowing this struct to access 3rd-party libraries like ImageMagick or OpenCV
3626struct img_tool {
3627 enum resize_algo {
3628 RESIZE_ALGO_BILINEAR,
3629 RESIZE_ALGO_BICUBIC,
3630 // RESIZE_ALGO_LANCZOS, // TODO
3631 };
3632
3633 static void resize(
3634 const clip_image_u8 & src,
3635 clip_image_u8 & dst,
3636 const clip_image_size & target_resolution,
3637 resize_algo algo,
3638 bool add_padding = true, // TODO: define the behavior for add_padding = false
3639 std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
3640 dst.nx = target_resolution.width;
3641 dst.ny = target_resolution.height;
3642 dst.buf.resize(new_size: 3 * dst.nx * dst.ny);
3643
3644 if (dst.nx == src.nx && dst.ny == src.ny) {
3645 // no resize needed, simple copy
3646 dst.buf = src.buf;
3647 return;
3648 }
3649
3650 if (!add_padding) {
3651 // direct resize
3652 switch (algo) {
3653 case RESIZE_ALGO_BILINEAR:
3654 resize_bilinear(src, dst, target_width: target_resolution.width, target_height: target_resolution.height);
3655 break;
3656 case RESIZE_ALGO_BICUBIC:
3657 resize_bicubic(img: src, dst, target_width: target_resolution.width, target_height: target_resolution.height);
3658 break;
3659 default:
3660 throw std::runtime_error("Unsupported resize algorithm");
3661 }
3662 } else {
3663 // resize with padding
3664 clip_image_u8 resized_image;
3665 float scale_w = static_cast<float>(target_resolution.width) / src.nx;
3666 float scale_h = static_cast<float>(target_resolution.height) / src.ny;
3667 float scale = std::min(a: scale_w, b: scale_h);
3668 int new_width = std::min(a: static_cast<int>(std::ceil(x: src.nx * scale)), b: target_resolution.width);
3669 int new_height = std::min(a: static_cast<int>(std::ceil(x: src.ny * scale)), b: target_resolution.height);
3670
3671 switch (algo) {
3672 case RESIZE_ALGO_BILINEAR:
3673 resize_bilinear(src, dst&: resized_image, target_width: new_width, target_height: new_height);
3674 break;
3675 case RESIZE_ALGO_BICUBIC:
3676 resize_bicubic(img: src, dst&: resized_image, target_width: new_width, target_height: new_height);
3677 break;
3678 default:
3679 throw std::runtime_error("Unsupported resize algorithm");
3680 }
3681
3682 // fill dst with pad_color
3683 fill(img&: dst, color: pad_color);
3684
3685 int offset_x = (target_resolution.width - new_width) / 2;
3686 int offset_y = (target_resolution.height - new_height) / 2;
3687
3688 composite(dst, src: resized_image, offset_x, offset_y);
3689 }
3690 }
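
// illustrative usage (the numbers are made up): letterbox an arbitrary image into a square canvas,
// filling the borders with a neutral gray:
//
//   clip_image_u8 canvas;
//   img_tool::resize(src, canvas, /*target_resolution=*/{448, 448},
//                    img_tool::RESIZE_ALGO_BILINEAR,
//                    /*add_padding=*/true, /*pad_color=*/{122, 116, 104});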
3691
3692 static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
3693 dst.nx = w;
3694 dst.ny = h;
3695 dst.buf.resize(new_size: 3 * w * h);
3696
3697 for (int i = 0; i < h; ++i) {
3698 for (int j = 0; j < w; ++j) {
3699 int src_idx = 3 * ((y + i)*image.nx + (x + j));
3700 int dst_idx = 3 * (i*w + j);
3701 dst.buf[dst_idx] = image.buf[src_idx];
3702 dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
3703 dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
3704 }
3705 }
3706 }
3707
3708 // calculate the size of the **resized** image, while preserving the aspect ratio
// the calculated size is rounded up to a multiple of align_size
// the longer edge is scaled to (approximately) longest_edge; after alignment the result can slightly exceed it
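// worked example (illustrative numbers): align_size = 28, longest_edge = 1024, input 3000 x 1500:
//   scale = min(1024/3000, 1024/1500) = 0.3413 -> 1024.0 x 512.0 -> aligned up to 1036 x 532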
3711 static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
3712 GGML_ASSERT(align_size > 0);
3713 if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
3714 return {.width: 0, .height: 0};
3715 }
3716
3717 float scale = std::min(a: static_cast<float>(longest_edge) / inp_size.width,
3718 b: static_cast<float>(longest_edge) / inp_size.height);
3719
3720 float target_width_f = static_cast<float>(inp_size.width) * scale;
3721 float target_height_f = static_cast<float>(inp_size.height) * scale;
3722
3723 auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x: x / static_cast<float>(f))) * f; };
3724 int aligned_width = ceil_by_factor(target_width_f);
3725 int aligned_height = ceil_by_factor(target_height_f);
3726
3727 return {.width: aligned_width, .height: aligned_height};
3728 }
3729
3730 // calculate the size of the **resized** image, while preserving the aspect ratio
3731 // the calculated size will have min_pixels <= W*H <= max_pixels
// this is referred to as "smart_resize" in the transformers code
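// worked example (illustrative numbers): align_size = 28, max_pixels = 200000, input 1000 x 500:
//   aligning up gives 1008 x 504 = 508032 pixels > max_pixels, so beta = sqrt(1000*500/200000) ~= 1.58;
//   flooring 1000/1.58 and 500/1.58 to multiples of 28 gives 616 x 308 = 189728 pixels, within the limit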
3733 static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
3734 GGML_ASSERT(align_size > 0);
3735 const int width = inp_size.width;
3736 const int height = inp_size.height;
3737
3738 auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x: x / static_cast<float>(f))) * f; };
3739 auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x: x / static_cast<float>(f))) * f; };
3740
3741 // always align up first
3742 int h_bar = std::max(a: align_size, b: ceil_by_factor(height));
3743 int w_bar = std::max(a: align_size, b: ceil_by_factor(width));
3744
3745 if (h_bar * w_bar > max_pixels) {
3746 const auto beta = std::sqrt(x: static_cast<float>(height * width) / max_pixels);
3747 h_bar = std::max(a: align_size, b: floor_by_factor(height / beta));
3748 w_bar = std::max(a: align_size, b: floor_by_factor(width / beta));
3749 } else if (h_bar * w_bar < min_pixels) {
3750 const auto beta = std::sqrt(x: static_cast<float>(min_pixels) / (height * width));
3751 h_bar = ceil_by_factor(height * beta);
3752 w_bar = ceil_by_factor(width * beta);
3753 }
3754
3755 return {.width: w_bar, .height: h_bar};
3756 }
3757
3758 // draw src image into dst image at offset (offset_x, offset_y)
3759 static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
3760 for (int y = 0; y < src.ny; ++y) {
3761 for (int x = 0; x < src.nx; ++x) {
3762 int dx = x + offset_x;
3763 int dy = y + offset_y;
3764 // skip pixels that would be out of bounds in the destination
3765 if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
3766 continue;
3767 }
3768 size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
3769 size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
3770 dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
3771 dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
3772 dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
3773 }
3774 }
3775 }
3776
3777 // fill the image with a solid color
3778 static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
3779 for (size_t i = 0; i < img.buf.size(); i += 3) {
3780 img.buf[i] = color[0];
3781 img.buf[i + 1] = color[1];
3782 img.buf[i + 2] = color[2];
3783 }
3784 }
3785
3786private:
3787 // Bilinear resize function
3788 static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
3789 dst.nx = target_width;
3790 dst.ny = target_height;
3791 dst.buf.resize(new_size: 3 * target_width * target_height);
3792
3793 float x_ratio = static_cast<float>(src.nx - 1) / target_width;
3794 float y_ratio = static_cast<float>(src.ny - 1) / target_height;
3795
3796 for (int y = 0; y < target_height; y++) {
3797 for (int x = 0; x < target_width; x++) {
3798 float px = x_ratio * x;
3799 float py = y_ratio * y;
3800 int x_floor = static_cast<int>(px);
3801 int y_floor = static_cast<int>(py);
3802 float x_lerp = px - x_floor;
3803 float y_lerp = py - y_floor;
3804
3805 for (int c = 0; c < 3; c++) {
3806 float top = lerp(
3807 s: static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
3808 e: static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
3809 t: x_lerp
3810 );
3811 float bottom = lerp(
3812 s: static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
3813 e: static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
3814 t: x_lerp
3815 );
3816 dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(s: top, e: bottom, t: y_lerp));
3817 }
3818 }
3819 }
3820 }
3821
3822 // Bicubic resize function
3823 // part of image will be cropped if the aspect ratio is different
3824 static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
3825 const int nx = img.nx;
3826 const int ny = img.ny;
3827
3828 dst.nx = target_width;
3829 dst.ny = target_height;
3830 dst.buf.resize(new_size: 3 * target_width * target_height);
3831
3832 float Cc;
3833 float C[5] = {};
3834 float d0, d2, d3, a0, a1, a2, a3;
3835 int i, j, k, jj;
3836 int x, y;
3837 float dx, dy;
3838 float tx, ty;
3839
3840 tx = (float)nx / (float)target_width;
3841 ty = (float)ny / (float)target_height;
3842
// Bicubic interpolation; adapted from ViT.cpp, inspired by:
3844 // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
3845 // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
3846
3847 for (i = 0; i < target_height; i++) {
3848 for (j = 0; j < target_width; j++) {
3849 x = (int)(tx * j);
3850 y = (int)(ty * i);
3851
3852 dx = tx * j - x;
3853 dy = ty * i - y;
3854
3855 for (k = 0; k < 3; k++) {
3856 for (jj = 0; jj <= 3; jj++) {
3857 d0 = img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x: x - 1, lower: 0, upper: nx - 1)) * 3 + k] - img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x, lower: 0, upper: nx - 1)) * 3 + k];
3858 d2 = img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x: x + 1, lower: 0, upper: nx - 1)) * 3 + k] - img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x, lower: 0, upper: nx - 1)) * 3 + k];
3859 d3 = img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x: x + 2, lower: 0, upper: nx - 1)) * 3 + k] - img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x, lower: 0, upper: nx - 1)) * 3 + k];
3860 a0 = img.buf[(clip(x: y - 1 + jj, lower: 0, upper: ny - 1) * nx + clip(x, lower: 0, upper: nx - 1)) * 3 + k];
3861
3862 a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
3863 a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
3864 a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
3865
3866 C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
3867
3868 d0 = C[0] - C[1];
3869 d2 = C[2] - C[1];
3870 d3 = C[3] - C[1];
3871 a0 = C[1];
3872 a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
3873 a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
3874 a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
3875 Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
3876
3877 const uint8_t Cc2 = std::min(a: std::max(a: std::round(x: Cc), b: 0.0f), b: 255.0f);
3878 dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
3879 }
3880 }
3881 }
3882 }
3883
3884 return true;
3885 }
3886
3887 static inline int clip(int x, int lower, int upper) {
3888 return std::max(a: lower, b: std::min(a: x, b: upper));
3889 }
3890
3891 // Linear interpolation between two points
3892 static inline float lerp(float s, float e, float t) {
3893 return s + (e - s) * t;
3894 }
3895};
3896
3897/**
3898 * implementation of LLaVA-UHD:
3899 * - https://arxiv.org/pdf/2403.11703
3900 * - https://github.com/thunlp/LLaVA-UHD
3901 * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
3902 *
3903 * overview:
 * - an image always has a single overview (downscaled image)
3905 * - an image can have 0 or multiple slices, depending on the image size
3906 * - each slice can then be considered as a separate image
3907 *
3908 * for example:
3909 *
3910 * [overview] --> [slice 1] --> [slice 2]
3911 * | |
3912 * +--> [slice 3] --> [slice 4]
3913 */
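// worked example (hypothetical numbers): with slice_size = 448, a refined size of 896 x 1344 gives a
// 2 x 3 grid, i.e. 6 slices plus the single overview image, so 7 images are fed to the encoder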
3914struct llava_uhd {
3915 struct slice_coordinates {
3916 int x;
3917 int y;
3918 clip_image_size size;
3919 };
3920
3921 struct slice_instructions {
3922 clip_image_size overview_size; // size of downscaled image
3923 clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
3924 clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
3925 std::vector<slice_coordinates> slices;
bool padding_refined = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6)
3927 };
3928
3929 static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
3930 slice_instructions res;
3931 const int patch_size = clip_get_patch_size(ctx);
3932 const int slice_size = clip_get_image_size(ctx);
3933 const int original_width = original_size.width;
3934 const int original_height = original_size.height;
3935
3936 const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
3937 const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
3938
3939 if (!has_slices) {
3940 // skip slicing logic
3941 res.overview_size = clip_image_size{.width: slice_size, .height: slice_size};
3942 res.refined_size = clip_image_size{.width: 0, .height: 0};
3943 res.grid_size = clip_image_size{.width: 0, .height: 0};
3944
3945 return res;
3946 }
3947
3948 if (has_pinpoints) {
3949 // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
3950 auto refine_size = llava_uhd::select_best_resolution(
3951 original_size,
3952 possible_resolutions: ctx->model.hparams.image_res_candidates);
3953 res.overview_size = clip_image_size{.width: slice_size, .height: slice_size};
3954 res.refined_size = refine_size;
3955 res.grid_size = clip_image_size{.width: 0, .height: 0};
3956 res.padding_refined = true;
3957
3958 LOG_DBG("%s: using pinpoints for slicing\n", __func__);
3959 LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
3960 __func__, original_width, original_height,
3961 res.overview_size.width, res.overview_size.height,
3962 res.refined_size.width, res.refined_size.height);
3963
3964 for (int y = 0; y < refine_size.height; y += slice_size) {
3965 for (int x = 0; x < refine_size.width; x += slice_size) {
3966 slice_coordinates slice;
3967 slice.x = x;
3968 slice.y = y;
3969 slice.size.width = std::min(a: slice_size, b: refine_size.width - x);
3970 slice.size.height = std::min(a: slice_size, b: refine_size.height - y);
3971 res.slices.push_back(x: slice);
3972 LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
3973 __func__, (int)res.slices.size() - 1,
3974 slice.x, slice.y, slice.size.width, slice.size.height);
3975 }
3976 }
3977
3978 res.grid_size.height = refine_size.height / slice_size;
3979 res.grid_size.width = refine_size.width / slice_size;
3980 LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
3981
3982 return res;
3983 }
3984
3985 // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
3986
3987 auto best_size = get_best_resize(original_size, scale_resolution: slice_size, patch_size, allow_upscale: !has_slices);
3988 res.overview_size = best_size;
3989
3990 {
3991 const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
3992 const float log_ratio = log(x: (float)original_width / original_height);
3993 const float ratio = (float)original_width * original_height / (slice_size * slice_size);
3994 const int multiple = fmin(x: ceil(x: ratio), y: max_slice_nums);
3995
3996 auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
3997 auto refine_size = get_refine_size(original_size, grid: best_grid, scale_resolution: slice_size, patch_size, allow_upscale: true);
3998 res.grid_size = best_grid;
3999 res.refined_size = refine_size;
4000
4001 LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
4002 __func__, original_width, original_height,
4003 res.overview_size.width, res.overview_size.height,
4004 res.refined_size.width, res.refined_size.height,
4005 res.grid_size.width, res.grid_size.height);
4006
4007 int width = refine_size.width;
4008 int height = refine_size.height;
4009 int grid_x = int(width / best_grid.width);
4010 int grid_y = int(height / best_grid.height);
4011 for (int patches_y = 0, ic = 0;
4012 patches_y < refine_size.height && ic < best_grid.height;
4013 patches_y += grid_y, ic += 1) {
4014 for (int patches_x = 0, jc = 0;
4015 patches_x < refine_size.width && jc < best_grid.width;
4016 patches_x += grid_x, jc += 1) {
4017 slice_coordinates slice;
4018 slice.x = patches_x;
4019 slice.y = patches_y;
4020 slice.size.width = grid_x;
4021 slice.size.height = grid_y;
4022 res.slices.push_back(x: slice);
4023 LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
4024 __func__, (int)res.slices.size() - 1,
4025 slice.x, slice.y, slice.size.width, slice.size.height);
4026 }
4027 }
4028 }
4029
4030 return res;
4031 }
4032
4033 static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
4034 std::vector<clip_image_u8_ptr> output;
4035 img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable
4036
4037 // resize to overview size
4038 clip_image_u8_ptr resized_img(clip_image_u8_init());
4039 img_tool::resize(src: *img, dst&: *resized_img, target_resolution: inst.overview_size, algo: interpolation);
4040 output.push_back(x: std::move(resized_img));
4041 if (inst.slices.empty()) {
4042 // no slices, just return the resized image
4043 return output;
4044 }
4045
4046 // resize to refined size
4047 clip_image_u8_ptr refined_img(clip_image_u8_init());
4048 if (inst.padding_refined) {
4049 img_tool::resize(src: *img, dst&: *refined_img, target_resolution: inst.refined_size, algo: interpolation);
4050 } else {
4051 // only algo bicubic preserves the ratio; old models rely on this behavior
4052 // TODO: do we need to support other algos here?
4053 img_tool::resize(src: *img, dst&: *refined_img, target_resolution: inst.refined_size, algo: img_tool::RESIZE_ALGO_BICUBIC, add_padding: false);
4054 }
4055
4056 // create slices
4057 for (const auto & slice : inst.slices) {
4058 int x = slice.x;
4059 int y = slice.y;
4060 int w = slice.size.width;
4061 int h = slice.size.height;
4062
4063 clip_image_u8_ptr img_slice(clip_image_u8_init());
4064 img_tool::crop(image: *refined_img, dst&: *img_slice, x, y, w, h);
4065 output.push_back(x: std::move(img_slice));
4066 }
4067
4068 return output;
4069 }
4070
4071private:
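// when the image area exceeds scale_resolution^2 (or allow_upscale is set), rescale so the area is roughly
// scale_resolution^2 while keeping the aspect ratio, then round each side to the nearest multiple of patch_size;
// e.g. (illustrative) 1000 x 2000 with scale_resolution = 448 and patch_size = 14 -> 316 x 633 -> 322 x 630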
4072 static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
4073 int width = original_size.width;
4074 int height = original_size.height;
4075 if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
4076 float r = static_cast<float>(width) / height;
4077 height = static_cast<int>(scale_resolution / std::sqrt(x: r));
4078 width = static_cast<int>(height * r);
4079 }
4080 clip_image_size res;
4081 res.width = ensure_divide(length: width, patch_size);
4082 res.height = ensure_divide(length: height, patch_size);
4083 return res;
4084 }
4085
4086 static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
4087 float scale_width = static_cast<float>(target_max.width) / orig.width;
4088 float scale_height = static_cast<float>(target_max.height) / orig.height;
4089 float scale = std::min(a: scale_width, b: scale_height);
4090 return clip_image_size{
4091 .width: static_cast<int>(orig.width * scale),
4092 .height: static_cast<int>(orig.height * scale),
4093 };
4094 }
4095
4096 /**
4097 * Selects the best resolution from a list of possible resolutions based on the original size.
4098 *
4099 * For example, when given a list of resolutions:
4100 * - 100x100
4101 * - 200x100
4102 * - 100x200
4103 * - 200x200
4104 *
 * And an input image of size 111x200, the candidate keeping the highest effective resolution is chosen (ties broken by least wasted area); here that is 200x200, which preserves the full 111x200.
4106 *
4107 * @param original_size The original size of the image
4108 * @param possible_resolutions A list of possible resolutions
4109 * @return The best fit resolution
4110 */
4111 static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
4112 clip_image_size best_fit;
4113 int min_wasted_area = std::numeric_limits<int>::max();
4114 int max_effective_resolution = 0;
4115
4116 for (const clip_image_size & candidate : possible_resolutions) {
4117 auto target_size = resize_maintain_aspect_ratio(orig: original_size, target_max: candidate);
4118 int effective_resolution = std::min(
4119 a: target_size.width * target_size.height,
4120 b: original_size.width * original_size.height);
4121 int wasted_area = (candidate.width * candidate.height) - effective_resolution;
4122
4123 if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
4124 max_effective_resolution = effective_resolution;
4125 min_wasted_area = wasted_area;
4126 best_fit = candidate;
4127 }
4128
4129 LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
4130 }
4131
4132 return best_fit;
4133 }
4134
4135 static int ensure_divide(int length, int patch_size) {
4136 return std::max(a: static_cast<int>(std::round(x: static_cast<float>(length) / patch_size) * patch_size), b: patch_size);
4137 }
4138
4139 static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
4140 int width = original_size.width;
4141 int height = original_size.height;
4142 int grid_x = grid.width;
4143 int grid_y = grid.height;
4144
        int refine_width  = ensure_divide(width, grid_x);
        int refine_height = ensure_divide(height, grid_y);
4147
4148 clip_image_size grid_size;
4149 grid_size.width = refine_width / grid_x;
4150 grid_size.height = refine_height / grid_y;
4151
        auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
4153 int best_grid_width = best_grid_size.width;
4154 int best_grid_height = best_grid_size.height;
4155
4156 clip_image_size refine_size;
4157 refine_size.width = best_grid_width * grid_x;
4158 refine_size.height = best_grid_height * grid_y;
4159 return refine_size;
4160 }
4161
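    // pick the grid layout (columns x rows) whose aspect ratio is closest, in log space,
    // to the original image aspect ratio, considering grids with about `multiple` cells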
4162 static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
4163 std::vector<int> candidate_split_grids_nums;
4164 for (int i : {multiple - 1, multiple, multiple + 1}) {
4165 if (i == 1 || i > max_slice_nums) {
4166 continue;
4167 }
            candidate_split_grids_nums.push_back(i);
4169 }
4170
4171 std::vector<clip_image_size> candidate_grids;
4172 for (int split_grids_nums : candidate_split_grids_nums) {
4173 int m = 1;
4174 while (m <= split_grids_nums) {
4175 if (split_grids_nums % m == 0) {
                candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
4177 }
4178 ++m;
4179 }
4180 }
4181
        clip_image_size best_grid{1, 1};
        float min_error = std::numeric_limits<float>::infinity();
        for (const auto & grid : candidate_grids) {
            float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
4186 if (error < min_error) {
4187 best_grid = grid;
4188 min_error = error;
4189 }
4190 }
4191 return best_grid;
4192 }
4193};
4194
// returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6) it returns the normalized image patch tensors as a vector
// memory for res_imgs is allocated here; previous allocations will be freed if found
4197bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
    clip_image_size original_size{img->nx, img->ny};
4199 auto & params = ctx->model.hparams;
4200
4201 switch (ctx->proj_type()) {
4202 case PROJECTOR_TYPE_MINICPMV:
4203 {
4204 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
4205 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
4206
4207 for (size_t i = 0; i < imgs.size(); ++i) {
4208 // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
4209 clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
4212 }
4213
4214 res_imgs->grid_x = inst.grid_size.width;
4215 res_imgs->grid_y = inst.grid_size.height;
4216 } break;
4217
4218 case PROJECTOR_TYPE_QWEN2VL:
4219 case PROJECTOR_TYPE_QWEN25VL:
4220 case PROJECTOR_TYPE_QWEN3VL:
4221 {
4222 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
4223 clip_image_u8 resized;
                const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * 2,
                    params.image_min_pixels,
                    params.image_max_pixels);
                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
                // clip_image_save_to_bmp(resized, "preproc.bmp");
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                // clip_image_f32_ptr res(clip_image_f32_init());
                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
                // res_imgs->data[0] = *res;
                res_imgs->entries.push_back(std::move(img_f32));
4236 } break;
4237
4238 case PROJECTOR_TYPE_IDEFICS3:
4239 {
4240 // The refined size has two steps:
4241 // 1. Resize w/ aspect-ratio preserving such that the longer side is
4242 // the preprocessor longest size
4243 // 2. Resize w/out preserving aspect ratio such that both sides are
4244 // multiples of image_size (always rounding up)
4245 //
4246 // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
                const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
                    original_size, params.image_size, params.image_longest_edge);
                // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
                //     __func__, original_size.width, original_size.height,
                //     refined_size.width, refined_size.height);

                llava_uhd::slice_instructions instructions;
                instructions.overview_size = clip_image_size{params.image_size, params.image_size};
                instructions.refined_size = refined_size;
                instructions.grid_size = clip_image_size{
                    static_cast<int>(std::ceil(static_cast<float>(refined_size.width)  / params.image_size)),
                    static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
                };
                for (int y = 0; y < refined_size.height; y += params.image_size) {
                    for (int x = 0; x < refined_size.width; x += params.image_size) {
                        // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
                        instructions.slices.push_back(llava_uhd::slice_coordinates{
                            /* x    */ x,
                            /* y    */ y,
                            /* size */ clip_image_size{
                                std::min(params.image_size, refined_size.width  - x),
                                std::min(params.image_size, refined_size.height - y)
                            }
                        });
                    }
                }
                auto imgs = llava_uhd::slice_image(img, instructions);
4274
4275 // cast and normalize to f32
4276 for (size_t i = 0; i < imgs.size(); ++i) {
4277 // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
4278 clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
4281 }
4282
4283 res_imgs->grid_x = instructions.grid_size.width;
4284 res_imgs->grid_y = instructions.grid_size.height;
4285 } break;
4286
4287 case PROJECTOR_TYPE_GLM_EDGE:
4288 case PROJECTOR_TYPE_GEMMA3:
4289 case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
4290 {
4291 clip_image_u8 resized_image;
4292 int sz = params.image_size;
                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                //clip_image_save_to_bmp(resized_image, "resized.bmp");
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
4298 } break;
4299
4300 case PROJECTOR_TYPE_JANUS_PRO:
4301 {
4302 // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
4303 const std::array<uint8_t, 3> pad_color = {127, 127, 127};
4304 clip_image_u8 resized_image;
4305 int sz = params.image_size;
                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
4310 } break;
4311
4312 case PROJECTOR_TYPE_PIXTRAL:
4313 case PROJECTOR_TYPE_LIGHTONOCR:
4314 {
4315 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
4316 clip_image_u8 resized_image;
4317 // the original pixtral model doesn't have n_merge
4318 const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * cur_merge,
                    params.image_min_pixels,
                    params.image_max_pixels);
                img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
                clip_image_f32_ptr img_f32(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(img_f32));
4328 } break;
4329
4330 case PROJECTOR_TYPE_LLAMA4:
4331 {
4332 GGML_ASSERT(!params.image_res_candidates.empty());
4333 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
4334 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
4335
4336 for (size_t i = 0; i < imgs.size(); ++i) {
4337 clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
4340 }
4341
4342 res_imgs->grid_x = inst.grid_size.width;
4343 res_imgs->grid_y = inst.grid_size.height;
4344 } break;
4345
4346 case PROJECTOR_TYPE_LFM2:
4347 case PROJECTOR_TYPE_KIMIVL:
4348 {
4349 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                    original_size,
                    params.patch_size * params.n_merge,
                    params.image_min_pixels,
                    params.image_max_pixels);
                const std::array<uint8_t, 3> pad_color = {122, 116, 104};

                clip_image_u8 resized_img;
                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
                clip_image_f32_ptr res(clip_image_f32_init());
                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
                res_imgs->entries.push_back(std::move(res));
4362 } break;
4363
4364 case PROJECTOR_TYPE_MLP:
4365 case PROJECTOR_TYPE_MLP_NORM:
4366 case PROJECTOR_TYPE_LDP:
4367 case PROJECTOR_TYPE_LDPV2:
4368 case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
4369 {
4370 // TODO @ngxson : refactor the code below to avoid duplicated logic
4371
4372 // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
4373 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
4374
4375 clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
4376
                // The model config contains everything we need to decide how to preprocess; here we automatically switch to the new llava-1.6 preprocessing
4378 if (params.image_res_candidates.empty()) { // pad_to_square
4379 // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
4380 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
                    const int longer_side = std::max(img->nx, img->ny);
                    temp->nx = longer_side;
                    temp->ny = longer_side;
                    temp->buf.resize(3 * longer_side * longer_side);

                    // background color in RGB from LLaVA (this is the mean rgb color * 255)
                    const std::array<uint8_t, 3> pad_color = {122, 116, 104};

                    // resize the image to the target_size
                    img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);

                    clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));
4395
4396 } else {
4397 // "spatial_unpad" with "anyres" processing for llava-1.6
4398 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
4399 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
4400
4401 for (size_t i = 0; i < imgs.size(); ++i) {
4402 // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
4403 clip_image_f32_ptr res(clip_image_f32_init());
                        normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
                        res_imgs->entries.push_back(std::move(res));
4406 }
4407 }
4408 } break;
4409
4410 default:
4411 LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
4412 return false;
4413 }
4414
4415 return true;
4416}
4417
4418ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
4419 return ctx->model.image_newline;
4420}
4421
4422void clip_free(clip_ctx * ctx) {
4423 if (ctx == nullptr) {
4424 return;
4425 }
4426 delete ctx;
4427}
4428
4429// deprecated
4430size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
4431 const int32_t nx = ctx->model.hparams.image_size;
4432 const int32_t ny = ctx->model.hparams.image_size;
    return clip_embd_nbytes_by_img(ctx, nx, ny);
4434}
4435
4436size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
4437 clip_image_f32 img;
4438 img.nx = img_w;
4439 img.ny = img_h;
    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
4441}
4442
4443int32_t clip_get_image_size(const struct clip_ctx * ctx) {
4444 return ctx->model.hparams.image_size;
4445}
4446
4447int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
4448 return ctx->model.hparams.patch_size;
4449}
4450
4451int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
4452 return ctx->model.hparams.n_embd;
4453}
4454
4455const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
4456 return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
4457}
4458
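// number of output token columns; for Qwen2/2.5/3-VL this is the 2D token grid width,
// for all other projectors it equals the total token count (with clip_n_output_tokens_y() == 1)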
4459int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
4460 const auto & params = ctx->model.hparams;
4461 const int n_total = clip_n_output_tokens(ctx, img);
4462 if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
4463 return img->nx / (params.patch_size * 2);
4464 }
4465 return n_total;
4466}
4467
4468int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
4469 const auto & params = ctx->model.hparams;
4470 if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
4471 return img->ny / (params.patch_size * 2);
4472 }
4473 return 1;
4474}
4475
4476int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
4477 const auto & params = ctx->model.hparams;
4478
    // for models with a fixed image size, the input image has already been pre-processed and resized to a square
4480 int patch_size = params.patch_size;
4481 int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
4482
4483 projector_type proj = ctx->proj_type();
4484
4485 switch (proj) {
4486 case PROJECTOR_TYPE_MLP:
4487 case PROJECTOR_TYPE_MLP_NORM:
4488 case PROJECTOR_TYPE_JANUS_PRO:
4489 {
4490 // do nothing
4491 } break;
4492 case PROJECTOR_TYPE_LDP:
4493 case PROJECTOR_TYPE_LDPV2:
4494 case PROJECTOR_TYPE_GLM_EDGE:
4495 {
4496 n_patches /= 4;
4497 if (ctx->model.mm_boi) {
4498 n_patches += 2; // for BOI and EOI token embeddings
4499 }
4500 } break;
4501 case PROJECTOR_TYPE_MINICPMV:
4502 {
4503 // Use actual config value if available, otherwise fall back to hardcoded values
4504 if (params.minicpmv_query_num > 0) {
4505 n_patches = params.minicpmv_query_num;
4506 } else {
4507 // Fallback to hardcoded values for legacy models
4508 if (params.minicpmv_version == 2) {
4509 n_patches = 96;
4510 } else if (params.minicpmv_version == 3) {
4511 n_patches = 64;
4512 } else if (params.minicpmv_version == 4) {
4513 n_patches = 64;
4514 } else if (params.minicpmv_version == 5) {
4515 // MiniCPM-V 4.0
4516 n_patches = 64;
4517 } else if (params.minicpmv_version == 6) {
4518 // MiniCPM-V 4.5
4519 n_patches = 64;
4520 } else {
4521 GGML_ABORT("Unknown minicpmv version");
4522 }
4523 }
4524 } break;
4525 case PROJECTOR_TYPE_QWEN2VL:
4526 case PROJECTOR_TYPE_QWEN25VL:
4527 case PROJECTOR_TYPE_QWEN3VL:
4528 {
4529 // dynamic size (2 conv, so double patch size)
4530 int x_patch = img->nx / (params.patch_size * 2);
4531 int y_patch = img->ny / (params.patch_size * 2);
4532 n_patches = x_patch * y_patch;
4533 } break;
4534 case PROJECTOR_TYPE_GEMMA3:
4535 case PROJECTOR_TYPE_IDEFICS3:
4536 case PROJECTOR_TYPE_INTERNVL:
4537 case PROJECTOR_TYPE_LLAMA4:
4538 {
4539 // both X and Y are downscaled by the scale factor
4540 int scale_factor = ctx->model.hparams.n_merge;
4541 n_patches /= (scale_factor * scale_factor);
4542 } break;
4543 case PROJECTOR_TYPE_LFM2:
4544 case PROJECTOR_TYPE_KIMIVL:
4545 {
4546 // dynamic size
4547 int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
4548 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
4549 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
4550 n_patches = x_patch * y_patch;
4551 } break;
4552 case PROJECTOR_TYPE_PIXTRAL:
4553 case PROJECTOR_TYPE_LIGHTONOCR:
4554 {
4555 // dynamic size
4556 int n_merge = ctx->model.hparams.n_merge;
4557 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
4558 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
4559 if (ctx->model.token_embd_img_break) {
4560 n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
4561 } else {
4562 n_patches = n_patches_y * n_patches_x;
4563 }
4564 } break;
4565 case PROJECTOR_TYPE_VOXTRAL:
4566 case PROJECTOR_TYPE_ULTRAVOX:
4567 case PROJECTOR_TYPE_QWEN2A:
4568 {
4569 n_patches = img->nx;
4570
4571 const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
4572 if (ctx->model.audio_has_stack_frames()) {
4573 GGML_ASSERT(proj_stack_factor > 0);
4574 const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor);
4575 n_patches = n_len / proj_stack_factor;
4576 }
4577
4578 // whisper downscales input token by half after conv1d
4579 n_patches /= 2;
4580
4581 if (ctx->model.audio_has_avgpool()) {
4582 // divide by 2 because of nn.AvgPool1d(2, stride=2)
4583 n_patches /= 2;
4584 }
4585 } break;
4586 case PROJECTOR_TYPE_COGVLM:
4587 {
4588 n_patches += 2; // for BOI and EOI token embeddings
4589 } break;
4590 default:
4591 GGML_ABORT("unsupported projector type");
4592 }
4593
4594 return n_patches;
4595}
4596
4597bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
4598 clip_image_f32_batch imgs;
4599 clip_image_f32_ptr img_copy(clip_image_f32_init());
4600 *img_copy = *img;
    imgs.entries.push_back(std::move(img_copy));

    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
4604}
4605
4606bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
4607 const clip_image_f32_batch & imgs = *imgs_c_ptr;
4608 int batch_size = imgs.entries.size();
4609
4610 // TODO @ngxson : implement batch size > 1 as a loop
    // we don't need true batching support because the cgraph is going to be big anyway
4612 if (batch_size != 1) {
4613 return false; // only support batch size of 1
4614 }
4615
4616 // build the inference graph
4617 ctx->debug_print_tensors.clear();
    ggml_backend_sched_reset(ctx->sched.get());
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
4621
4622 // set inputs
4623 const auto & model = ctx->model;
4624 const auto & hparams = model.hparams;
4625
4626 const int image_size_width = imgs.entries[0]->nx;
4627 const int image_size_height = imgs.entries[0]->ny;
4628
4629 const int patch_size = hparams.patch_size;
4630 const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
4631 const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
4632 const int pos_w = image_size_width / patch_size;
4633 const int pos_h = image_size_height / patch_size;
4634
4635 const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
4636
4637 auto get_inp_tensor = [&gf](const char * name) {
        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
4639 if (inp == nullptr) {
4640 GGML_ABORT("Failed to get tensor %s", name);
4641 }
4642 if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
4643 GGML_ABORT("Tensor %s is not an input tensor", name);
4644 }
4645 return inp;
4646 };
4647
4648 auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
4649 ggml_tensor * cur = get_inp_tensor(name);
4650 GGML_ASSERT(cur->type == GGML_TYPE_F32);
4651 GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
4653 };
4654
4655 auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
4656 ggml_tensor * cur = get_inp_tensor(name);
4657 GGML_ASSERT(cur->type == GGML_TYPE_I32);
4658 GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
4660 };
4661
4662 // set input pixel values
4663 if (!imgs.is_audio) {
4664 size_t nelem = 0;
4665 for (const auto & img : imgs.entries) {
4666 nelem += img->nx * img->ny * 3;
4667 }
4668 std::vector<float> inp_raw(nelem);
4669
4670 // layout of data (note: the channel dim is unrolled to better visualize the layout):
4671 //
4672 // ┌──W──┐
4673 // │ H │ channel = R
4674 // ├─────┤ │
4675 // │ H │ channel = G
4676 // ├─────┤ │
4677 // │ H │ channel = B
4678 // └─────┘ │
4679 // ──────┘ x B
4680
4681 for (size_t i = 0; i < imgs.entries.size(); i++) {
4682 const int nx = imgs.entries[i]->nx;
4683 const int ny = imgs.entries[i]->ny;
4684 const int n = nx * ny;
4685
4686 for (int b = 0; b < batch_size; b++) {
4687 float * batch_entry = inp_raw.data() + b * (3*n);
4688 for (int y = 0; y < ny; y++) {
4689 for (int x = 0; x < nx; x++) {
4690 size_t base_src = 3*(y * nx + x); // idx of the first channel
4691 size_t base_dst = y * nx + x; // idx of the first channel
4692 batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
4693 batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
4694 batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
4695 }
4696 }
4697 }
4698 }
4699 set_input_f32("inp_raw", inp_raw);
4700
4701 } else {
4702 // audio input
4703 GGML_ASSERT(imgs.entries.size() == 1);
4704 const auto & mel_inp = imgs.entries[0];
4705 const int n_step = mel_inp->nx;
4706 const int n_mel = mel_inp->ny;
4707 std::vector<float> inp_raw(n_step * n_mel);
        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
4709 set_input_f32("inp_raw", inp_raw);
4710 }
4711
4712 // set input per projector
4713 switch (ctx->model.proj_type) {
4714 case PROJECTOR_TYPE_MINICPMV:
4715 {
4716 // inspired from siglip:
4717 // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
4718 // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
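                // bucket the patch grid coordinates into a fixed 70x70 grid and flatten them into a single position index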
4719 std::vector<int32_t> positions(pos_h * pos_w);
4720 int bucket_coords_h[1024];
4721 int bucket_coords_w[1024];
                for (int i = 0; i < pos_h; i++){
                    bucket_coords_h[i] = std::floor(70.0*i/pos_h);
                }
                for (int i = 0; i < pos_w; i++){
                    bucket_coords_w[i] = std::floor(70.0*i/pos_w);
                }
4728 for (int i = 0, id = 0; i < pos_h; i++){
4729 for (int j = 0; j < pos_w; j++){
4730 positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
4731 }
4732 }
4733 set_input_i32("positions", positions);
4734
4735 // inputs for resampler projector
4736 // set the 2D positions (using float for sinusoidal embedding)
4737 int n_patches_per_col = image_size_width / patch_size;
4738 std::vector<float> pos_data(n_pos);
4739 // dimension H
4740 for (int i = 0; i < n_pos; i++) {
4741 pos_data[i] = static_cast<float>(i / n_patches_per_col);
4742 }
4743 set_input_f32("pos_h", pos_data);
4744 // dimension W
4745 for (int i = 0; i < n_pos; i++) {
4746 pos_data[i] = static_cast<float>(i % n_patches_per_col);
4747 }
4748 set_input_f32("pos_w", pos_data);
4749 // base frequency omega
4750 const float base_freq = 10000.0f;
4751 const int n_embd_proj = clip_n_mmproj_embd(ctx);
4752 std::vector<float> omega(n_embd_proj / 4);
4753 for (int i = 0; i < n_embd_proj / 4; ++i) {
                    omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
4755 }
4756 set_input_f32("omega", omega);
4757 } break;
4758 case PROJECTOR_TYPE_QWEN2VL:
4759 case PROJECTOR_TYPE_QWEN3VL:
4760 {
4761 const int merge_ratio = hparams.n_merge;
4762 const int pw = image_size_width / patch_size;
4763 const int ph = image_size_height / patch_size;
4764 std::vector<int> positions(n_pos * 4);
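                // positions are laid out as 4 contiguous sections of num_patches entries each (y, x, y, x);
                // patches are visited in 2x2 blocks to match the spatial merge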
4765 int ptr = 0;
4766 for (int y = 0; y < ph; y += merge_ratio) {
4767 for (int x = 0; x < pw; x += merge_ratio) {
4768 for (int dy = 0; dy < 2; dy++) {
4769 for (int dx = 0; dx < 2; dx++) {
4770 positions[ ptr] = y + dy;
4771 positions[ num_patches + ptr] = x + dx;
4772 positions[2 * num_patches + ptr] = y + dy;
4773 positions[3 * num_patches + ptr] = x + dx;
4774 ptr++;
4775 }
4776 }
4777 }
4778 }
4779
4780 set_input_i32("positions", positions);
4781 } break;
4782 case PROJECTOR_TYPE_QWEN25VL:
4783 {
                // pw * ph = number of tokens output by the ViT after applying the patch merger
                // ipw * iph = number of vision tokens processed inside the ViT
4786 const int merge_ratio = 2;
4787 const int pw = image_size_width / patch_size / merge_ratio;
4788 const int ph = image_size_height / patch_size / merge_ratio;
4789 const int ipw = image_size_width / patch_size;
4790 const int iph = image_size_height / patch_size;
4791
4792 std::vector<int> idx (ph * pw);
4793 std::vector<int> inv_idx(ph * pw);
4794
4795 if (use_window_attn) {
4796 const int attn_window_size = 112;
4797 const int grid_window = attn_window_size / patch_size / merge_ratio;
4798 int dst = 0;
4799 // [num_vision_tokens, num_vision_tokens] attention mask tensor
                    std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
4801 int mask_row = 0;
4802
4803 for (int y = 0; y < ph; y += grid_window) {
4804 for (int x = 0; x < pw; x += grid_window) {
                            const int win_h = std::min(grid_window, ph - y);
                            const int win_w = std::min(grid_window, pw - x);
                            const int dst_0 = dst;
                            // group all tokens belonging to the same window together (into a contiguous range)
4809 for (int dy = 0; dy < win_h; dy++) {
4810 for (int dx = 0; dx < win_w; dx++) {
4811 const int src = (y + dy) * pw + (x + dx);
4812 GGML_ASSERT(src < (int)idx.size());
4813 GGML_ASSERT(dst < (int)inv_idx.size());
4814 idx [src] = dst;
4815 inv_idx[dst] = src;
4816 dst++;
4817 }
4818 }
4819
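                            // for each vision token of this window, unmask (set to 0) attention to the
                            // tokens of the same window; everything else stays at the lowest float value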
4820 for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
4821 int row_offset = mask_row * (ipw * iph);
                                std::fill(
                                    mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
                                    mask.begin() + row_offset + (dst   * merge_ratio * merge_ratio),
                                    0.0);
4826 mask_row++;
4827 }
4828 }
4829 }
4830
4831 set_input_i32("window_idx", idx);
4832 set_input_i32("inv_window_idx", inv_idx);
4833 set_input_f32("window_mask", mask);
4834 } else {
4835 for (int i = 0; i < ph * pw; i++) {
4836 idx[i] = i;
4837 }
4838 }
4839
4840 const int mpow = merge_ratio * merge_ratio;
4841 std::vector<int> positions(n_pos * 4);
4842
4843 int ptr = 0;
4844 for (int y = 0; y < iph; y += merge_ratio) {
4845 for (int x = 0; x < ipw; x += merge_ratio) {
4846 for (int dy = 0; dy < 2; dy++) {
4847 for (int dx = 0; dx < 2; dx++) {
4848 auto remap = idx[ptr / mpow];
4849 remap = (remap * mpow) + (ptr % mpow);
4850
4851 positions[ remap] = y + dy;
4852 positions[ num_patches + remap] = x + dx;
4853 positions[2 * num_patches + remap] = y + dy;
4854 positions[3 * num_patches + remap] = x + dx;
4855 ptr++;
4856 }
4857 }
4858 }
4859 }
4860
4861 set_input_i32("positions", positions);
4862 } break;
4863 case PROJECTOR_TYPE_PIXTRAL:
4864 case PROJECTOR_TYPE_KIMIVL:
4865 case PROJECTOR_TYPE_LIGHTONOCR:
4866 {
4867 // set the 2D positions
4868 int n_patches_per_col = image_size_width / patch_size;
4869 std::vector<int> pos_data(n_pos);
4870 // dimension H
4871 for (int i = 0; i < n_pos; i++) {
4872 pos_data[i] = i / n_patches_per_col;
4873 }
4874 set_input_i32("pos_h", pos_data);
4875 // dimension W
4876 for (int i = 0; i < n_pos; i++) {
4877 pos_data[i] = i % n_patches_per_col;
4878 }
4879 set_input_i32("pos_w", pos_data);
4880 } break;
4881 case PROJECTOR_TYPE_GLM_EDGE:
4882 {
4883 // llava and other models
4884 std::vector<int32_t> positions(n_pos);
4885 for (int i = 0; i < n_pos; i++) {
4886 positions[i] = i;
4887 }
4888 set_input_i32("positions", positions);
4889 } break;
4890 case PROJECTOR_TYPE_MLP:
4891 case PROJECTOR_TYPE_MLP_NORM:
4892 case PROJECTOR_TYPE_LDP:
4893 case PROJECTOR_TYPE_LDPV2:
4894 {
4895 // llava and other models
4896 std::vector<int32_t> positions(n_pos);
4897 for (int i = 0; i < n_pos; i++) {
4898 positions[i] = i;
4899 }
4900 set_input_i32("positions", positions);
4901
4902 // The patches vector is used to get rows to index into the embeds with;
4903 // we should skip dim 0 only if we have CLS to avoid going out of bounds
4904 // when retrieving the rows.
4905 int patch_offset = model.class_embedding ? 1 : 0;
4906 std::vector<int32_t> patches(num_patches);
4907 for (int i = 0; i < num_patches; i++) {
4908 patches[i] = i + patch_offset;
4909 }
4910 set_input_i32("patches", patches);
4911 } break;
4912 case PROJECTOR_TYPE_GEMMA3:
4913 case PROJECTOR_TYPE_IDEFICS3:
4914 case PROJECTOR_TYPE_INTERNVL:
4915 case PROJECTOR_TYPE_QWEN2A:
4916 case PROJECTOR_TYPE_ULTRAVOX:
4917 case PROJECTOR_TYPE_LFM2:
4918 case PROJECTOR_TYPE_VOXTRAL:
4919 case PROJECTOR_TYPE_JANUS_PRO:
4920 case PROJECTOR_TYPE_COGVLM:
4921 {
4922 // do nothing
4923 } break;
4924 case PROJECTOR_TYPE_LLAMA4:
4925 {
4926 // set the 2D positions
4927 int n_patches_per_col = image_size_width / patch_size;
4928 std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
4929 // last pos is always kept 0, it's for CLS
4930 // dimension H
4931 for (int i = 0; i < num_patches; i++) {
4932 pos_data[i] = (i / n_patches_per_col) + 1;
4933 }
4934 set_input_i32("pos_h", pos_data);
4935 // dimension W
4936 for (int i = 0; i < num_patches; i++) {
4937 pos_data[i] = (i % n_patches_per_col) + 1;
4938 }
4939 set_input_i32("pos_w", pos_data);
4940 } break;
4941 default:
4942 GGML_ABORT("Unknown projector type");
4943 }
4944
4945 // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
    if (reg) {
        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
4950 if (ggml_backend_set_n_threads_fn) {
4951 ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
4952 }
4953 }
4954
    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
4956 if (status != GGML_STATUS_SUCCESS) {
4957 LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
4958 return false;
4959 }
4960
4961 // print debug nodes
4962 if (ctx->debug_graph) {
4963 LOG_INF("\n\n---\n\n");
4964 LOG_INF("\n\nDebug graph:\n\n");
4965 for (ggml_tensor * t : ctx->debug_print_tensors) {
            std::vector<uint8_t> data(ggml_nbytes(t));
            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
            print_tensor_shape(t);
            print_tensor_data(t, data.data(), 3);
4970 }
4971 }
4972
4973 // the last node is the embedding tensor
    ggml_tensor * embeddings = ggml_graph_node(gf, -1);
4975
4976 // sanity check (only support batch size of 1 for now)
4977 const int n_tokens_out = embeddings->ne[1];
    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
4979 if (n_tokens_out != expected_n_tokens_out) {
4980 LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
4981 GGML_ABORT("Invalid number of output tokens");
4982 }
4983
4984 // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
4986
4987 return true;
4988}
4989
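// returns the embedding dimension of each output token produced by the projector,
// i.e. the width of the vectors written out by clip_image_batch_encode()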
4990int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
4991 switch (ctx->model.proj_type) {
4992 case PROJECTOR_TYPE_LDP:
4993 return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
4994 case PROJECTOR_TYPE_LDPV2:
4995 return ctx->model.mm_model_peg_0_b->ne[0];
4996 case PROJECTOR_TYPE_MLP:
4997 case PROJECTOR_TYPE_PIXTRAL:
4998 case PROJECTOR_TYPE_LIGHTONOCR:
4999 return ctx->model.mm_2_w->ne[1];
5000 case PROJECTOR_TYPE_MLP_NORM:
5001 return ctx->model.mm_3_b->ne[0];
5002 case PROJECTOR_TYPE_MINICPMV:
5003 return ctx->model.mm_model_proj->ne[0];
5004 case PROJECTOR_TYPE_GLM_EDGE:
5005 return ctx->model.mm_model_mlp_3_w->ne[1];
5006 case PROJECTOR_TYPE_QWEN2VL:
5007 case PROJECTOR_TYPE_QWEN25VL:
5008 case PROJECTOR_TYPE_JANUS_PRO:
5009 return ctx->model.mm_1_b->ne[0];
5010 case PROJECTOR_TYPE_QWEN3VL:
5011 // main path + deepstack paths
5012 return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
5013 case PROJECTOR_TYPE_GEMMA3:
5014 return ctx->model.mm_input_proj_w->ne[0];
5015 case PROJECTOR_TYPE_IDEFICS3:
5016 return ctx->model.projection->ne[1];
5017 case PROJECTOR_TYPE_ULTRAVOX:
5018 case PROJECTOR_TYPE_VOXTRAL:
5019 return ctx->model.mm_2_w->ne[1];
5020 case PROJECTOR_TYPE_INTERNVL:
5021 return ctx->model.mm_3_w->ne[1];
5022 case PROJECTOR_TYPE_LLAMA4:
5023 return ctx->model.mm_model_proj->ne[1];
5024 case PROJECTOR_TYPE_QWEN2A:
5025 return ctx->model.mm_fc_w->ne[1];
5026 case PROJECTOR_TYPE_LFM2:
5027 case PROJECTOR_TYPE_KIMIVL:
5028 return ctx->model.mm_2_w->ne[1];
5029 case PROJECTOR_TYPE_COGVLM:
5030 return ctx->model.mm_4h_to_h_w->ne[1];
5031 default:
5032 GGML_ABORT("Unknown projector type");
5033 }
5034}
5035
5036int clip_is_minicpmv(const struct clip_ctx * ctx) {
5037 if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
5038 return ctx->model.hparams.minicpmv_version;
5039 }
5040 return 0;
5041}
5042
5043bool clip_is_glm(const struct clip_ctx * ctx) {
5044 return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
5045}
5046
5047bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
5048 return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
5049 || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
5050 || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL;
5051}
5052
5053bool clip_is_llava(const struct clip_ctx * ctx) {
5054 return ctx->model.hparams.has_llava_projector;
5055}
5056
5057bool clip_is_gemma3(const struct clip_ctx * ctx) {
5058 return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
5059}
5060
5061bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
5062 return ctx->model.modality == CLIP_MODALITY_VISION;
5063}
5064
5065bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
5066 return ctx->model.modality == CLIP_MODALITY_AUDIO;
5067}
5068
5069bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
5070 return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
5071 || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
5072 || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
5073}
5074
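// encode a raw float image (w x h x 3) by copying the pixel data directly into a clip_image_f32
// and running the encoder, bypassing clip_image_preprocess (the data is used as-is)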
5075bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
5076 clip_image_f32 clip_img;
    clip_img.buf.resize(h * w * 3);
5078 for (int i = 0; i < h*w*3; i++)
5079 {
5080 clip_img.buf[i] = img[i];
5081 }
5082 clip_img.nx = w;
5083 clip_img.ny = h;
    clip_image_encode(ctx, n_threads, &clip_img, vec);
5085 return true;
5086}
5087
5088//
5089// API used internally with mtmd
5090//
5091
5092projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
5093 return ctx->proj_type();
5094}
5095
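// append a mel spectrogram (n_frames x n_mel floats) to the batch as an audio entry
// (nx = number of frames, ny = number of mel bins) and mark the batch as audio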
5096void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
5097 clip_image_f32 * audio = new clip_image_f32;
5098 audio->nx = n_frames;
5099 audio->ny = n_mel;
    audio->buf.resize(n_frames * n_mel);
    std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));

    batch->entries.push_back(clip_image_f32_ptr(audio));
5104 batch->is_audio = true;
5105}
5106