| 1 | #include "ggml.h" |
| 2 | #include "gguf.h" |
| 3 | #include "clip.h" |
| 4 | |
| 5 | #include <climits> |
| 6 | #include <cstdarg> |
| 7 | #include <cinttypes> |
| 8 | #include <string> |
| 9 | #include <map> |
| 10 | #include <sstream> |
| 11 | #include <vector> |
| 12 | #include <memory> |
| 13 | |
| 14 | // Internal header for clip.cpp |
| 15 | |
// Metadata key names used by this header's users. Keys that contain "%s" are
// printf-style templates and must be formatted before use
// (NOTE(review): the placeholder is presumably the modality, e.g. "vision" or
// "audio" — confirm against the loader).

// general model information
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
// top-level clip.* keys
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder"
#define KEY_HAS_VISION_ENC "clip.has_vision_encoder"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"

// templated keys: one "%s" placeholder each
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"

// vision-specific
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"

#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"

// audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
| 56 | |
| 57 | |
| 58 | // |
| 59 | // tensor name constants |
| 60 | // |
| 61 | |
// Tensor name templates. Names containing "%s" / "%d" are printf-style
// templates that must be formatted before use.
// NOTE(review): in the "%s.blk.%d.<name>.%s" family the placeholders are
// presumably <encoder prefix>, <block index>, <"weight"|"bias"> — confirm
// against the call sites.
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // tensor is not renamed with a ".0" postfix, for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_NORM "mm.input_norm.weight"
#define TN_MM_INP_NORM_B "mm.input_norm.bias"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack
#define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack
#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack

// minicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query"
#define TN_MINICPMV_PROJ "resampler.proj.weight"
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"

// NOTE: "ADAPER" in the next macro name is a misspelling of "ADAPTER"; the
// name is kept unchanged because code outside this header may reference it.
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"

// ultravox
#define TN_CONV1D "a.conv1d.%d.%s"
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"

// cogvlm
#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s"
#define TN_MM_H_TO_4H "mm.up.%s"
#define TN_MM_GATE "mm.gate.%s"
#define TN_MM_4H_TO_H "mm.down.%s"
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"
| 131 | |
// Round x up to the next multiple of n using integer arithmetic.
// n must be non-zero (it is used as a divisor). Both arguments are expanded
// more than once, so do not pass expressions with side effects.
#define CLIP_ALIGN(x, n) (((x) + (n) - 1) / (n) * (n))
| 134 | |
// Identifies which projector implementation a model uses.
// Enumerators have implicit, sequential values starting at 0, so inserting or
// reordering entries changes the numeric value of everything after them.
// PROJECTOR_TYPE_UNKNOWN is the value clip_projector_type_from_string() returns
// when a name is not recognised.
// Note: PROJECTOR_TYPE_MLP_NORM has no entry in PROJECTOR_TYPE_NAMES, so it can
// never be produced from a string
// (NOTE(review): presumably it is selected programmatically — confirm).
enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_MINICPMV,
    PROJECTOR_TYPE_GLM_EDGE,
    PROJECTOR_TYPE_QWEN2VL,
    PROJECTOR_TYPE_QWEN3VL,
    PROJECTOR_TYPE_GEMMA3,
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
    PROJECTOR_TYPE_ULTRAVOX,
    PROJECTOR_TYPE_INTERNVL,
    PROJECTOR_TYPE_LLAMA4,
    PROJECTOR_TYPE_QWEN2A,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
    PROJECTOR_TYPE_LFM2,
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_LIGHTONOCR,
    PROJECTOR_TYPE_COGVLM,
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_UNKNOWN,
};
| 161 | |
| 162 | static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = { |
| 163 | { PROJECTOR_TYPE_MLP, "mlp" }, |
| 164 | { PROJECTOR_TYPE_LDP, "ldp" }, |
| 165 | { PROJECTOR_TYPE_LDPV2, "ldpv2" }, |
| 166 | { PROJECTOR_TYPE_MINICPMV, "resampler" }, |
| 167 | { PROJECTOR_TYPE_GLM_EDGE, "adapter" }, |
| 168 | { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger" }, |
| 169 | { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger" }, |
| 170 | { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger" }, |
| 171 | { PROJECTOR_TYPE_GEMMA3, "gemma3" }, |
| 172 | { PROJECTOR_TYPE_IDEFICS3, "idefics3" }, |
| 173 | { PROJECTOR_TYPE_PIXTRAL, "pixtral" }, |
| 174 | { PROJECTOR_TYPE_ULTRAVOX, "ultravox" }, |
| 175 | { PROJECTOR_TYPE_INTERNVL, "internvl" }, |
| 176 | { PROJECTOR_TYPE_LLAMA4, "llama4" }, |
| 177 | { PROJECTOR_TYPE_QWEN2A, "qwen2a" }, |
| 178 | { PROJECTOR_TYPE_QWEN25O, "qwen2.5o" }, |
| 179 | { PROJECTOR_TYPE_VOXTRAL, "voxtral" }, |
| 180 | { PROJECTOR_TYPE_LFM2, "lfm2" }, |
| 181 | { PROJECTOR_TYPE_KIMIVL, "kimivl" }, |
| 182 | { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr" }, |
| 183 | { PROJECTOR_TYPE_COGVLM, "cogvlm" }, |
| 184 | { PROJECTOR_TYPE_JANUS_PRO, "janus_pro" }, |
| 185 | }; |
| 186 | |
| 187 | static projector_type clip_projector_type_from_string(const std::string & str) { |
| 188 | for (const auto & pair : PROJECTOR_TYPE_NAMES) { |
| 189 | if (pair.second == str) { |
| 190 | return pair.first; |
| 191 | } |
| 192 | } |
| 193 | return PROJECTOR_TYPE_UNKNOWN; |
| 194 | } |
| 195 | |
// RGB uint8 image
// NOTE(review): nx / ny are presumably width / height in pixels and buf
// presumably holds nx*ny*3 interleaved bytes (mirroring clip_image_f32) —
// confirm against the code that fills this struct.
// nx and ny have no default initializer; callers must set them.
struct clip_image_u8 {
    int nx;
    int ny;

    std::vector<uint8_t> buf;
};
| 203 | |
// Float buffer shared by the image and audio paths.
// For images, buf.size() == nx*ny*3
// Memory layout: RGBRGBRGB...
// For audio, only one channel is used, buf.size() == nx*ny
// nx will be n_frames and ny will be n_mel
// nx and ny have no default initializer; callers must set them.
struct clip_image_f32 {
    int nx;
    int ny;

    std::vector<float> buf;
};
| 214 | |
| 215 | // |
| 216 | // logging |
| 217 | // |
| 218 | |
| 219 | static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) { |
| 220 | (void) level; |
| 221 | (void) user_data; |
| 222 | fputs(s: text, stderr); |
| 223 | fflush(stderr); |
| 224 | } |
| 225 | |
// Logger configuration read by the LOG_* macros and clip_log_internal_v().
struct clip_logger_state {
    ggml_log_level verbosity_thold;      // LOG_TMPL only forwards messages with level >= this value
    ggml_log_callback log_callback;      // sink that receives each formatted message
    void * log_callback_user_data;       // passed unchanged as the last argument of log_callback
};

// Declaration only; the object itself is defined outside this header.
extern struct clip_logger_state g_logger_state;
| 233 | |
| 234 | static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) { |
| 235 | if (format == NULL) { |
| 236 | return; |
| 237 | } |
| 238 | va_list args_copy; |
| 239 | va_copy(args_copy, args); |
| 240 | char buffer[128]; |
| 241 | int len = vsnprintf(s: buffer, maxlen: 128, format: format, arg: args); |
| 242 | if (len < 128) { |
| 243 | g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data); |
| 244 | } else { |
| 245 | char * buffer2 = (char *) calloc(nmemb: len + 1, size: sizeof(char)); |
| 246 | vsnprintf(s: buffer2, maxlen: len + 1, format: format, arg: args_copy); |
| 247 | buffer2[len] = 0; |
| 248 | g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data); |
| 249 | free(ptr: buffer2); |
| 250 | } |
| 251 | va_end(args_copy); |
| 252 | } |
| 253 | |
| 254 | static void clip_log_internal(enum ggml_log_level level, const char * format, ...) { |
| 255 | va_list args; |
| 256 | va_start(args, format); |
| 257 | clip_log_internal_v(level, format, args); |
| 258 | va_end(args); |
| 259 | } |
| 260 | |
// LOG_TMPL(level, fmt, ...): forwards a printf-style message to
// clip_log_internal() when `level` is at or above
// g_logger_state.verbosity_thold; otherwise the arguments are not evaluated.
// `level` is expanded twice. The do { } while (0) wrapper makes the macro
// behave as a single statement.
#define LOG_TMPL(level, ...) \
    do { \
        if ((level) >= g_logger_state.verbosity_thold) { \
            clip_log_internal((level), __VA_ARGS__); \
        } \
    } while (0)
// Per-severity shorthands for LOG_TMPL.
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
| 272 | |
| 273 | // |
| 274 | // cpp wrappers |
| 275 | // |
| 276 | |
| 277 | // wrapper for clip_image_size |
| 278 | struct clip_image_size_deleter { |
| 279 | void operator()(clip_image_size * val) { clip_image_size_free(img_size: val); } |
| 280 | }; |
| 281 | typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr; |
| 282 | |
| 283 | // wrapper for clip_image_u8 |
| 284 | struct clip_image_u8_deleter { |
| 285 | void operator()(clip_image_u8 * val) { clip_image_u8_free(img: val); } |
| 286 | }; |
| 287 | typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr; |
| 288 | |
| 289 | // wrapper for clip_image_f32 |
| 290 | struct clip_image_f32_deleter { |
| 291 | void operator()(clip_image_f32 * val) { clip_image_f32_free(img: val); } |
| 292 | }; |
| 293 | typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr; |
| 294 | |
// Owning list of uint8 images; each entry is released through
// clip_image_u8_ptr's deleter when the batch is destroyed.
struct clip_image_u8_batch {
    std::vector<clip_image_u8_ptr> entries;
};
| 298 | |
| 299 | struct clip_image_f32_batch { |
| 300 | std::vector<clip_image_f32_ptr> entries; |
| 301 | bool is_audio = false; |
| 302 | |
| 303 | // for llava-uhd style models, we need to know the grid size |
| 304 | // note: entries.size() == grid_x * grid_y + 1 (one overview image) |
| 305 | int grid_x = 0; |
| 306 | int grid_y = 0; |
| 307 | |
| 308 | clip_image_f32_batch clone() const { |
| 309 | clip_image_f32_batch new_batch{ |
| 310 | /* entries */ .entries: {}, |
| 311 | /* is_audio */ .is_audio: is_audio, |
| 312 | /* grid_x */ .grid_x: grid_x, |
| 313 | /* grid_y */ .grid_y: grid_y, |
| 314 | }; |
| 315 | new_batch.entries.reserve(n: entries.size()); |
| 316 | for (const auto & entry : entries) { |
| 317 | new_batch.entries.emplace_back(args: new clip_image_f32(*entry)); |
| 318 | } |
| 319 | return new_batch; |
| 320 | } |
| 321 | }; |
| 322 | |
| 323 | // |
| 324 | // common utils |
| 325 | // |
| 326 | |
// printf-style formatting into a std::string.
//
// fmt - printf format string, followed by its arguments
// Returns the formatted text; the string's size() equals the number of
// characters produced (no trailing NUL is included in the contents).
// Aborts via GGML_ASSERT if vsnprintf fails or the two passes disagree.
static std::string string_format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap); // a va_list can only be traversed once; keep a copy for pass two
    // pass one: measure the output without writing anything
    const int size = vsnprintf(NULL, 0, fmt, ap);
    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
    // pass two: format into a buffer with room for the terminating NUL
    std::vector<char> buf(size + 1);
    const int size2 = vsnprintf(buf.data(), buf.size(), fmt, ap2);
    GGML_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    // use `size`, not buf.size(): the latter would copy the terminator into the string
    return std::string(buf.data(), size);
}
| 341 | |
// Replaces every non-overlapping occurrence of `search` in `s` with `replace`,
// scanning left to right. Replaced text is never rescanned, so `replace` may
// itself contain `search`. An empty `search` leaves `s` untouched.
static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
    size_t last_pos = 0;
    while ((pos = s.find(search, last_pos)) != std::string::npos) {
        // copy the untouched text before the match, then the replacement
        builder.append(s, last_pos, pos - last_pos);
        builder.append(replace);
        last_pos = pos + search.length();
    }
    // tail after the last match (or the whole string when nothing matched)
    builder.append(s, last_pos, std::string::npos);
    s = std::move(builder);
}
| 358 | |
// split string by a `std::string delim` instead of `char delim`
//
// Returns the pieces of `s` between occurrences of `delimiter`, in order.
// Adjacent delimiters and a leading/trailing delimiter yield empty pieces, so
// the result always has (number of matches + 1) elements.
// An empty `delimiter` matches nothing: the result is the single element `s`
// (searching for "" would otherwise match at every position and never advance).
static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
    std::vector<std::string> tokens;
    if (delimiter.empty()) {
        tokens.push_back(s);
        return tokens;
    }
    size_t start = 0;
    size_t pos   = 0;
    // walk forward with an offset instead of erasing the front of `s` each time
    while ((pos = s.find(delimiter, start)) != std::string::npos) {
        tokens.push_back(s.substr(start, pos - start));
        start = pos + delimiter.length();
    }
    tokens.push_back(s.substr(start));
    return tokens;
}
| 372 | |
| 373 | // |
| 374 | // gguf utils |
| 375 | // |
| 376 | |
| 377 | static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { |
| 378 | switch (type) { |
| 379 | case GGUF_TYPE_UINT8: return std::to_string(val: ((const uint8_t *)data)[i]); |
| 380 | case GGUF_TYPE_INT8: return std::to_string(val: ((const int8_t *)data)[i]); |
| 381 | case GGUF_TYPE_UINT16: return std::to_string(val: ((const uint16_t *)data)[i]); |
| 382 | case GGUF_TYPE_INT16: return std::to_string(val: ((const int16_t *)data)[i]); |
| 383 | case GGUF_TYPE_UINT32: return std::to_string(val: ((const uint32_t *)data)[i]); |
| 384 | case GGUF_TYPE_INT32: return std::to_string(val: ((const int32_t *)data)[i]); |
| 385 | case GGUF_TYPE_UINT64: return std::to_string(val: ((const uint64_t *)data)[i]); |
| 386 | case GGUF_TYPE_INT64: return std::to_string(val: ((const int64_t *)data)[i]); |
| 387 | case GGUF_TYPE_FLOAT32: return std::to_string(val: ((const float *)data)[i]); |
| 388 | case GGUF_TYPE_FLOAT64: return std::to_string(val: ((const double *)data)[i]); |
| 389 | case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false" ; |
| 390 | default: return string_format(fmt: "unknown type %d" , type); |
| 391 | } |
| 392 | } |
| 393 | |
| 394 | static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { |
| 395 | const enum gguf_type type = gguf_get_kv_type(ctx: ctx_gguf, key_id: i); |
| 396 | |
| 397 | switch (type) { |
| 398 | case GGUF_TYPE_STRING: |
| 399 | return gguf_get_val_str(ctx: ctx_gguf, key_id: i); |
| 400 | case GGUF_TYPE_ARRAY: |
| 401 | { |
| 402 | const enum gguf_type arr_type = gguf_get_arr_type(ctx: ctx_gguf, key_id: i); |
| 403 | int arr_n = gguf_get_arr_n(ctx: ctx_gguf, key_id: i); |
| 404 | const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx: ctx_gguf, key_id: i); |
| 405 | std::stringstream ss; |
| 406 | ss << "[" ; |
| 407 | for (int j = 0; j < arr_n; j++) { |
| 408 | if (arr_type == GGUF_TYPE_STRING) { |
| 409 | std::string val = gguf_get_arr_str(ctx: ctx_gguf, key_id: i, i: j); |
| 410 | // escape quotes |
| 411 | string_replace_all(s&: val, search: "\\" , replace: "\\\\" ); |
| 412 | string_replace_all(s&: val, search: "\"" , replace: "\\\"" ); |
| 413 | ss << '"' << val << '"'; |
| 414 | } else if (arr_type == GGUF_TYPE_ARRAY) { |
| 415 | ss << "???" ; |
| 416 | } else { |
| 417 | ss << gguf_data_to_str(type: arr_type, data, i: j); |
| 418 | } |
| 419 | if (j < arr_n - 1) { |
| 420 | ss << ", " ; |
| 421 | } |
| 422 | } |
| 423 | ss << "]" ; |
| 424 | return ss.str(); |
| 425 | } |
| 426 | default: |
| 427 | return gguf_data_to_str(type, data: gguf_get_val_data(ctx: ctx_gguf, key_id: i), i: 0); |
| 428 | } |
| 429 | } |
| 430 | |
| 431 | // |
| 432 | // debugging |
| 433 | // |
| 434 | |
| 435 | static void print_tensor_shape(ggml_tensor * t) { |
| 436 | printf(format: "%s.shape = [" , t->name); |
| 437 | for (int i = 0; i < ggml_n_dims(tensor: t); ++i) { |
| 438 | printf(format: "%" PRId64, t->ne[i]); |
| 439 | if (i < ggml_n_dims(tensor: t) - 1) { |
| 440 | printf(format: ", " ); |
| 441 | } |
| 442 | } |
| 443 | printf(format: "]\n" ); |
| 444 | } |
| 445 | |
| 446 | static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { |
| 447 | ggml_type type = t->type; |
| 448 | int64_t * ne = t->ne; |
| 449 | size_t * nb = t->nb; |
| 450 | for (int64_t i3 = 0; i3 < ne[3]; i3++) { |
| 451 | printf(format: "%s.data: [\n" , t->name); |
| 452 | for (int64_t i2 = 0; i2 < ne[2]; i2++) { |
| 453 | if (i2 == n && ne[2] > 2*n) { |
| 454 | printf(format: " ..., \n" ); |
| 455 | i2 = ne[2] - n; |
| 456 | } |
| 457 | printf(format: " [\n" ); |
| 458 | for (int64_t i1 = 0; i1 < ne[1]; i1++) { |
| 459 | if (i1 == n && ne[1] > 2*n) { |
| 460 | printf(format: " ..., \n" ); |
| 461 | i1 = ne[1] - n; |
| 462 | } |
| 463 | printf(format: " [" ); |
| 464 | for (int64_t i0 = 0; i0 < ne[0]; i0++) { |
| 465 | if (i0 == n && ne[0] > 2*n) { |
| 466 | printf(format: "..., " ); |
| 467 | i0 = ne[0] - n; |
| 468 | } |
| 469 | size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; |
| 470 | float v; |
| 471 | if (type == GGML_TYPE_F16) { |
| 472 | v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); |
| 473 | } else if (type == GGML_TYPE_F32) { |
| 474 | v = *(float *) &data[i]; |
| 475 | } else if (type == GGML_TYPE_I32) { |
| 476 | v = (float) *(int32_t *) &data[i]; |
| 477 | } else if (type == GGML_TYPE_I16) { |
| 478 | v = (float) *(int16_t *) &data[i]; |
| 479 | } else if (type == GGML_TYPE_I8) { |
| 480 | v = (float) *(int8_t *) &data[i]; |
| 481 | } else { |
| 482 | GGML_ABORT("fatal error" ); |
| 483 | } |
| 484 | printf(format: "%8.4f" , v); |
| 485 | if (i0 < ne[0] - 1) printf(format: ", " ); |
| 486 | } |
| 487 | printf(format: "],\n" ); |
| 488 | } |
| 489 | printf(format: " ],\n" ); |
| 490 | } |
| 491 | printf(format: " ]\n" ); |
| 492 | } |
| 493 | } |
| 494 | |
| 495 | // |
| 496 | // API used internally with mtmd |
| 497 | // |
| 498 | |
| 499 | projector_type clip_get_projector_type(const struct clip_ctx * ctx); |
| 500 | |