| 1 | #pragma once |
| 2 | |
| 3 | // GGML internal header |
| 4 | |
| 5 | #include "ggml.h" |
| 6 | #include "gguf.h" |
| 7 | |
| 8 | #include <assert.h> |
| 9 | #include <math.h> |
| 10 | #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ |
| 11 | #include <stdbool.h> |
| 12 | #include <stdint.h> |
| 13 | #include <string.h> |
| 14 | |
| 15 | #ifdef __ARM_FEATURE_SVE |
| 16 | #include <arm_sve.h> |
| 17 | #endif // __ARM_FEATURE_SVE |
| 18 | |
| 19 | #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) |
| 20 | // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: |
| 21 | // |
| 22 | // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ |
| 23 | // |
| 24 | #include <arm_neon.h> |
| 25 | #endif |
| 26 | |
| 27 | #if defined(__F16C__) |
| 28 | #include <immintrin.h> |
| 29 | #endif |
| 30 | |
| 31 | #ifdef __cplusplus |
| 32 | extern "C" { |
| 33 | #endif |
| 34 | |
| 35 | void ggml_print_backtrace(void); |
| 36 | |
| 37 | #ifndef MIN |
| 38 | # define MIN(a, b) ((a) < (b) ? (a) : (b)) |
| 39 | #endif |
| 40 | |
| 41 | #ifndef MAX |
| 42 | # define MAX(a, b) ((a) > (b) ? (a) : (b)) |
| 43 | #endif |
| 44 | |
| 45 | // required for mmap as gguf only guarantees 32-byte alignment |
| 46 | #define TENSOR_ALIGNMENT 32 |
| 47 | |
| 48 | // static_assert should be a #define, but if it's not, |
| 49 | // fall back to the _Static_assert C11 keyword. |
| 50 | // if C99 - static_assert is noop |
| 51 | // ref: https://stackoverflow.com/a/53923785/4039976 |
| 52 | #ifndef __cplusplus |
| 53 | #ifndef static_assert |
| 54 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) |
| 55 | #define static_assert(cond, msg) _Static_assert(cond, msg) |
| 56 | #else |
| 57 | #define static_assert(cond, msg) struct global_scope_noop_trick |
| 58 | #endif |
| 59 | #endif |
| 60 | #endif |
| 61 | |
// Rounds n up to the next multiple of 32 (n itself if it is already a multiple).
static inline int ggml_up32(int n) {
    const int mask = 31;
    return (n + mask) & ~mask;
}
| 65 | |
| 66 | //static inline int ggml_up64(int n) { |
| 67 | // return (n + 63) & ~63; |
| 68 | //} |
| 69 | |
// Rounds n up to the next multiple of m.
// m has to be a power of 2; this is enforced with GGML_ASSERT.
static inline int ggml_up(int n, int m) {
    GGML_ASSERT((m & (m - 1)) == 0);

    const int mask = m - 1;
    return (n + mask) & ~mask;
}
| 75 | |
| 76 | // TODO: move to ggml.h? (won't be able to inline) |
| 77 | static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { |
| 78 | if (a->type != b->type) { |
| 79 | return false; |
| 80 | } |
| 81 | for (int i = 0; i < GGML_MAX_DIMS; i++) { |
| 82 | if (a->ne[i] != b->ne[i]) { |
| 83 | return false; |
| 84 | } |
| 85 | if (a->nb[i] != b->nb[i]) { |
| 86 | return false; |
| 87 | } |
| 88 | } |
| 89 | return true; |
| 90 | } |
| 91 | |
| 92 | static bool ggml_op_is_empty(enum ggml_op op) { |
| 93 | switch (op) { |
| 94 | case GGML_OP_NONE: |
| 95 | case GGML_OP_RESHAPE: |
| 96 | case GGML_OP_TRANSPOSE: |
| 97 | case GGML_OP_VIEW: |
| 98 | case GGML_OP_PERMUTE: |
| 99 | return true; |
| 100 | default: |
| 101 | return false; |
| 102 | } |
| 103 | } |
| 104 | |
| 105 | static inline float ggml_softplus(float input) { |
| 106 | return (input > 20.0f) ? input : logf(x: 1 + expf(x: input)); |
| 107 | } |
| 108 | // |
| 109 | // logging |
| 110 | // |
| 111 | |
| 112 | GGML_ATTRIBUTE_FORMAT(2, 3) |
| 113 | GGML_API void ggml_log_internal (enum ggml_log_level level, const char * format, ...); |
| 114 | GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data); |
| 115 | |
| 116 | #define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) |
| 117 | #define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) |
| 118 | #define GGML_LOG_WARN(...) ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) |
| 119 | #define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) |
| 120 | #define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) |
| 121 | #define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) |
| 122 | |
| 123 | #define GGML_DEBUG 0 |
| 124 | |
| 125 | #if (GGML_DEBUG >= 1) |
| 126 | #define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__) |
| 127 | #else |
| 128 | #define GGML_PRINT_DEBUG(...) |
| 129 | #endif |
| 130 | |
| 131 | #if (GGML_DEBUG >= 5) |
| 132 | #define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__) |
| 133 | #else |
| 134 | #define GGML_PRINT_DEBUG_5(...) |
| 135 | #endif |
| 136 | |
| 137 | #if (GGML_DEBUG >= 10) |
| 138 | #define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__) |
| 139 | #else |
| 140 | #define GGML_PRINT_DEBUG_10(...) |
| 141 | #endif |
| 142 | |
| 143 | // tensor params |
| 144 | |
| 145 | static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { |
| 146 | GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings |
| 147 | assert(params_size <= GGML_MAX_OP_PARAMS); |
| 148 | memcpy(dest: tensor->op_params, src: params, n: params_size); |
| 149 | } |
| 150 | |
| 151 | static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { |
| 152 | assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); |
| 153 | return ((const int32_t *)(tensor->op_params))[i]; |
| 154 | } |
| 155 | |
| 156 | static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) { |
| 157 | assert(i < GGML_MAX_OP_PARAMS / sizeof(float)); |
| 158 | return ((const float *)(tensor->op_params))[i]; |
| 159 | } |
| 160 | |
| 161 | static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { |
| 162 | assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); |
| 163 | ((int32_t *)(tensor->op_params))[i] = value; |
| 164 | } |
| 165 | |
| 166 | static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) { |
| 167 | assert(i < GGML_MAX_OP_PARAMS / sizeof(float)); |
| 168 | ((float *)(tensor->op_params))[i] = value; |
| 169 | } |
| 170 | |
| 171 | struct ggml_map_custom1_op_params { |
| 172 | ggml_custom1_op_t fun; |
| 173 | int n_tasks; |
| 174 | void * userdata; |
| 175 | }; |
| 176 | |
| 177 | struct ggml_map_custom2_op_params { |
| 178 | ggml_custom2_op_t fun; |
| 179 | int n_tasks; |
| 180 | void * userdata; |
| 181 | }; |
| 182 | |
| 183 | struct ggml_map_custom3_op_params { |
| 184 | ggml_custom3_op_t fun; |
| 185 | int n_tasks; |
| 186 | void * userdata; |
| 187 | }; |
| 188 | |
| 189 | struct ggml_custom_op_params { |
| 190 | ggml_custom_op_t fun; |
| 191 | int n_tasks; |
| 192 | void * userdata; |
| 193 | }; |
| 194 | |
| 195 | // bitset |
| 196 | |
// A bitset is an array of 32-bit words; bit i lives in word (i >> BITSET_SHR)
// at bit position (i & BITSET_MASK).
typedef uint32_t ggml_bitset_t;

static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)

// Number of ggml_bitset_t words needed to hold n bits.
static size_t ggml_bitset_size(size_t n) {
    const size_t padded = n + BITSET_MASK;
    return padded >> BITSET_SHR;
}

// Returns true if bit i is set.
static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
    const ggml_bitset_t word = bitset[i >> BITSET_SHR];
    const ggml_bitset_t bit  = 1u << (i & BITSET_MASK);
    return (word & bit) != 0;
}

// Sets bit i.
static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
    ggml_bitset_t * word = &bitset[i >> BITSET_SHR];
    *word |= (1u << (i & BITSET_MASK));
}

// Clears bit i.
static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
    ggml_bitset_t * word = &bitset[i >> BITSET_SHR];
    *word &= ~(1u << (i & BITSET_MASK));
}
| 218 | |
| 219 | // hash set |
| 220 | |
| 221 | #define GGML_HASHSET_FULL ((size_t)-1) |
| 222 | #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) |
| 223 | |
| 224 | struct ggml_hash_set { |
| 225 | size_t size; |
| 226 | ggml_bitset_t * used; // whether or not the keys are in use i.e. set |
| 227 | struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) |
| 228 | }; |
| 229 | |
| 230 | struct ggml_hash_set ggml_hash_set_new(size_t size); |
| 231 | void ggml_hash_set_free(struct ggml_hash_set * hash_set); |
| 232 | |
| 233 | // returns the minimum size for a hash set that can hold min_sz elements |
| 234 | size_t ggml_hash_size(size_t min_sz); |
| 235 | |
| 236 | // remove all elements from the hash set |
| 237 | void ggml_hash_set_reset(struct ggml_hash_set * hash_set); |
| 238 | |
| 239 | // returns true if key is in the hash set |
| 240 | static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key); |
| 241 | |
| 242 | // returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted |
| 243 | static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key); |
| 244 | |
| 245 | // returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full |
| 246 | static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); |
| 247 | |
| 248 | // return index, asserts if table is full |
| 249 | static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); |
| 250 | |
// Hash function for ggml_tensor pointers: the address shifted right by 4 bits.
static inline size_t ggml_hash(const struct ggml_tensor * p) {
    // the last 4 bits are always zero due to alignment
    const uintptr_t addr = (uintptr_t) p;
    return (size_t) addr >> 4;
}
| 256 | |
| 257 | static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key) { |
| 258 | size_t h = ggml_hash(p: key) % hash_set->size; |
| 259 | |
| 260 | // linear probing |
| 261 | size_t i = h; |
| 262 | while (ggml_bitset_get(bitset: hash_set->used, i) && hash_set->keys[i] != key) { |
| 263 | i = (i + 1) % hash_set->size; |
| 264 | if (i == h) { |
| 265 | // visited all hash table entries -> not found |
| 266 | return GGML_HASHSET_FULL; |
| 267 | } |
| 268 | } |
| 269 | return i; |
| 270 | } |
| 271 | |
| 272 | static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) { |
| 273 | size_t i = ggml_hash_find(hash_set, key); |
| 274 | return i != GGML_HASHSET_FULL && ggml_bitset_get(bitset: hash_set->used, i); |
| 275 | } |
| 276 | |
| 277 | static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) { |
| 278 | size_t h = ggml_hash(p: key) % hash_set->size; |
| 279 | |
| 280 | // linear probing |
| 281 | size_t i = h; |
| 282 | do { |
| 283 | if (!ggml_bitset_get(bitset: hash_set->used, i)) { |
| 284 | ggml_bitset_set(bitset: hash_set->used, i); |
| 285 | hash_set->keys[i] = key; |
| 286 | return i; |
| 287 | } |
| 288 | if (hash_set->keys[i] == key) { |
| 289 | return GGML_HASHSET_ALREADY_EXISTS; |
| 290 | } |
| 291 | i = (i + 1) % hash_set->size; |
| 292 | } while (i != h); |
| 293 | |
| 294 | // visited all hash table entries -> not found |
| 295 | GGML_ABORT("fatal error" ); |
| 296 | } |
| 297 | |
| 298 | static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) { |
| 299 | size_t h = ggml_hash(p: key) % hash_set->size; |
| 300 | |
| 301 | // linear probing |
| 302 | size_t i = h; |
| 303 | do { |
| 304 | if (!ggml_bitset_get(bitset: hash_set->used, i)) { |
| 305 | ggml_bitset_set(bitset: hash_set->used, i); |
| 306 | hash_set->keys[i] = key; |
| 307 | return i; |
| 308 | } |
| 309 | if (hash_set->keys[i] == key) { |
| 310 | return i; |
| 311 | } |
| 312 | i = (i + 1) % hash_set->size; |
| 313 | } while (i != h); |
| 314 | |
| 315 | // visited all hash table entries -> not found |
| 316 | GGML_ABORT("fatal error" ); |
| 317 | } |
| 318 | |
| 319 | // computation graph |
| 320 | |
// Evaluation order stored in ggml_cgraph::order.
// GGML_CGRAPH_EVAL_ORDER_COUNT is the number of real orders, not an order itself.
enum ggml_cgraph_eval_order {
    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT = 1,
    GGML_CGRAPH_EVAL_ORDER_COUNT         = 2
};
| 326 | |
| 327 | struct ggml_cgraph { |
| 328 | int size; // maximum number of nodes/leafs/grads/grad_accs |
| 329 | int n_nodes; // number of nodes currently in use |
| 330 | int n_leafs; // number of leafs currently in use |
| 331 | |
| 332 | struct ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated |
| 333 | struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes |
| 334 | struct ggml_tensor ** grad_accs; // accumulators for node gradients |
| 335 | struct ggml_tensor ** leafs; // tensors with constant data |
| 336 | int32_t * use_counts;// number of uses of each tensor, indexed by hash table slot |
| 337 | |
| 338 | struct ggml_hash_set visited_hash_set; |
| 339 | |
| 340 | enum ggml_cgraph_eval_order order; |
| 341 | }; |
| 342 | |
| 343 | // returns a slice of cgraph with nodes [i0, i1) |
| 344 | // the slice does not have leafs or gradients |
| 345 | // if you need the gradients, get them from the original graph |
| 346 | struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); |
| 347 | |
| 348 | // ggml-alloc.c: true if the operation can reuse memory from its sources |
| 349 | GGML_API bool ggml_op_can_inplace(enum ggml_op op); |
| 350 | |
| 351 | |
| 352 | // Memory allocation |
| 353 | |
| 354 | GGML_API void * ggml_aligned_malloc(size_t size); |
| 355 | GGML_API void ggml_aligned_free(void * ptr, size_t size); |
| 356 | |
| 357 | // FP16 <-> FP32 |
| 358 | // ref: https://github.com/Maratyszcza/FP16 |
| 359 | |
// Reinterprets the 32 bits of w as a float (no numeric conversion).
static inline float fp32_from_bits(uint32_t w) {
    float value;
    memcpy(&value, &w, sizeof(value));
    return value;
}
| 368 | |
// Returns the raw 32-bit pattern of f (no numeric conversion).
static inline uint32_t fp32_to_bits(float f) {
    uint32_t raw;
    memcpy(&raw, &f, sizeof(raw));
    return raw;
}
| 377 | |
| 378 | static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { |
| 379 | const uint32_t w = (uint32_t) h << 16; |
| 380 | const uint32_t sign = w & UINT32_C(0x80000000); |
| 381 | const uint32_t two_w = w + w; |
| 382 | |
| 383 | const uint32_t exp_offset = UINT32_C(0xE0) << 23; |
| 384 | #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) |
| 385 | const float exp_scale = 0x1.0p-112f; |
| 386 | #else |
| 387 | const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); |
| 388 | #endif |
| 389 | const float normalized_value = fp32_from_bits(w: (two_w >> 4) + exp_offset) * exp_scale; |
| 390 | |
| 391 | const uint32_t magic_mask = UINT32_C(126) << 23; |
| 392 | const float magic_bias = 0.5f; |
| 393 | const float denormalized_value = fp32_from_bits(w: (two_w >> 17) | magic_mask) - magic_bias; |
| 394 | |
| 395 | const uint32_t denormalized_cutoff = UINT32_C(1) << 27; |
| 396 | const uint32_t result = sign | |
| 397 | (two_w < denormalized_cutoff ? fp32_to_bits(f: denormalized_value) : fp32_to_bits(f: normalized_value)); |
| 398 | return fp32_from_bits(w: result); |
| 399 | } |
| 400 | |
| 401 | static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { |
| 402 | #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) |
| 403 | const float scale_to_inf = 0x1.0p+112f; |
| 404 | const float scale_to_zero = 0x1.0p-110f; |
| 405 | #else |
| 406 | const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); |
| 407 | const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); |
| 408 | #endif |
| 409 | float base = (fabsf(x: f) * scale_to_inf) * scale_to_zero; |
| 410 | |
| 411 | const uint32_t w = fp32_to_bits(f); |
| 412 | const uint32_t shl1_w = w + w; |
| 413 | const uint32_t sign = w & UINT32_C(0x80000000); |
| 414 | uint32_t bias = shl1_w & UINT32_C(0xFF000000); |
| 415 | if (bias < UINT32_C(0x71000000)) { |
| 416 | bias = UINT32_C(0x71000000); |
| 417 | } |
| 418 | |
| 419 | base = fp32_from_bits(w: (bias >> 1) + UINT32_C(0x07800000)) + base; |
| 420 | const uint32_t bits = fp32_to_bits(f: base); |
| 421 | const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); |
| 422 | const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); |
| 423 | const uint32_t nonsign = exp_bits + mantissa_bits; |
| 424 | return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); |
| 425 | } |
| 426 | |
| 427 | #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) |
| 428 | #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) |
| 429 | |
| 430 | #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) |
| 431 | #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) |
| 432 | |
// Converts an E8M0 exponent byte to float: the result is 2^(x - 127).
// x == 0xFF is not treated as NaN; it goes through the normalized path.
static inline float ggml_e8m0_to_fp32(uint8_t x) {
    uint32_t bits; // Stores the raw bit representation of the float

    // Handle special case for minimum exponent (denormalized float)
    if (x == 0) {
        // Bit pattern for 2^(-127):
        // - Sign bit: 0 (positive)
        // - Exponent: 0 (denormalized number)
        // - Mantissa: 0x400000 (0.5 in fractional form)
        // Value = 0.5 * 2^(-126) = 2^(-127)
        bits = 0x00400000;
    }
    // note: NaN (x == 0xFF) is deliberately not special-cased, as we don't need to handle NaNs
    // Normalized values (most common case)
    else {
        // Construct normalized float by shifting exponent into position:
        // - Exponent field: 8 bits (positions 30-23)
        // - Mantissa: 0 (implicit leading 1)
        // Value = 2^(x - 127)
        bits = (uint32_t) x << 23;
    }

    float result; // Final float value
    // Safely reinterpret bit pattern as float without type-punning issues
    memcpy(&result, &bits, sizeof(float));
    return result;
}
| 468 | |
// Equal to ggml_e8m0_to_fp32/2, i.e. 2^(x - 128)
// Useful with MXFP4 quantization since the E2M1 values are doubled
static inline float ggml_e8m0_to_fp32_half(uint8_t x) {
    uint32_t bits;

    // For x < 2: the result is a denormal, use precomputed patterns
    if (x < 2) {
        // 0x00200000 = 2^(-128), 0x00400000 = 2^(-127)
        bits = UINT32_C(0x00200000) << x;
    }
    // For x >= 2: normalized exponent adjustment
    else {
        // 0.5 * 2^(x-127) = 2^(x-128) = normalized with exponent (x-1)
        bits = (uint32_t)(x - 1) << 23;
    }
    // Note: NaNs are not handled here

    float result;
    memcpy(&result, &bits, sizeof(float));
    return result;
}
| 490 | |
| 491 | #define GGML_E8M0_TO_FP32(x) ggml_e8m0_to_fp32(x) |
| 492 | #define GGML_E8M0_TO_FP32_HALF(x) ggml_e8m0_to_fp32_half(x) |
| 493 | |
| 494 | /** |
| 495 | * Converts brain16 to float32. |
| 496 | * |
| 497 | * The bfloat16 floating point format has the following structure: |
| 498 | * |
| 499 | * ┌sign |
| 500 | * │ |
| 501 | * │ ┌exponent |
| 502 | * │ │ |
| 503 | * │ │ ┌mantissa |
| 504 | * │ │ │ |
| 505 | * │┌──┴───┐┌─┴───┐ |
| 506 | * 0b0000000000000000 brain16 |
| 507 | * |
| 508 | * Since bf16 has the same number of exponent bits as a 32bit float, |
| 509 | * encoding and decoding numbers becomes relatively straightforward. |
| 510 | * |
| 511 | * ┌sign |
| 512 | * │ |
| 513 | * │ ┌exponent |
| 514 | * │ │ |
| 515 | * │ │ ┌mantissa |
| 516 | * │ │ │ |
| 517 | * │┌──┴───┐┌─┴───────────────────┐ |
| 518 | * 0b00000000000000000000000000000000 IEEE binary32 |
| 519 | * |
| 520 | * For comparison, the standard fp16 format has fewer exponent bits. |
| 521 | * |
| 522 | * ┌sign |
| 523 | * │ |
| 524 | * │ ┌exponent |
| 525 | * │ │ |
| 526 | * │ │ ┌mantissa |
| 527 | * │ │ │ |
| 528 | * │┌─┴─┐┌─┴──────┐ |
| 529 | * 0b0000000000000000 IEEE binary16 |
| 530 | * |
| 531 | * @see IEEE 754-2008 |
| 532 | */ |
| 533 | static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { |
| 534 | union { |
| 535 | float f; |
| 536 | uint32_t i; |
| 537 | } u; |
| 538 | u.i = (uint32_t)h.bits << 16; |
| 539 | return u.f; |
| 540 | } |
| 541 | |
| 542 | /** |
| 543 | * Converts float32 to brain16. |
| 544 | * |
| 545 | * This is binary identical with Google Brain float conversion. |
| 546 | * Floats shall round to nearest even, and NANs shall be quiet. |
| 547 | * Subnormals aren't flushed to zero, except perhaps when used. |
| 548 | * This code should vectorize nicely if using modern compilers. |
| 549 | */ |
| 550 | static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { |
| 551 | ggml_bf16_t h; |
| 552 | union { |
| 553 | float f; |
| 554 | uint32_t i; |
| 555 | } u; |
| 556 | u.f = s; |
| 557 | if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ |
| 558 | h.bits = (u.i >> 16) | 64; /* force to quiet */ |
| 559 | return h; |
| 560 | } |
| 561 | h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; |
| 562 | return h; |
| 563 | } |
| 564 | |
| 565 | #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) |
| 566 | #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) |
| 567 | |
| 568 | static inline int32_t ggml_node_get_use_count(const struct ggml_cgraph * cgraph, int node_idx) { |
| 569 | const struct ggml_tensor * node = cgraph->nodes[node_idx]; |
| 570 | |
| 571 | size_t hash_pos = ggml_hash_find(hash_set: &cgraph->visited_hash_set, key: node); |
| 572 | if (!ggml_bitset_get(bitset: cgraph->visited_hash_set.used, i: hash_pos)) { |
| 573 | return 0; |
| 574 | } |
| 575 | return cgraph->use_counts[hash_pos]; |
| 576 | } |
| 577 | |
| 578 | // return true if the node's results are only used by N other nodes |
| 579 | // and can be fused into their calculations. |
| 580 | static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) { |
| 581 | const struct ggml_tensor * node = cgraph->nodes[node_idx]; |
| 582 | |
| 583 | // check the use count against how many we're replacing |
| 584 | if (ggml_node_get_use_count(cgraph, node_idx) != n_uses) { |
| 585 | return false; |
| 586 | } |
| 587 | |
| 588 | // if node is a view, some other node might be using the intermediate result |
| 589 | // via the view source. |
| 590 | if (node->view_src) { |
| 591 | return false; |
| 592 | } |
| 593 | |
| 594 | // If the user requested output for the node, can't fuse |
| 595 | if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { |
| 596 | return false; |
| 597 | } |
| 598 | |
| 599 | return true; |
| 600 | } |
| 601 | |
| 602 | // Returns true if nodes with indices { node_idxs } are the sequence of ggml_ops in ops[] |
| 603 | // and are fusable. Nodes are considered fusable according to this function if: |
| 604 | // - all nodes except the last have only one use and are not views/outputs (see ggml_node_has_N_uses). |
| 605 | // - all nodes except the last are a src of the following node. |
| 606 | // - all nodes are the same shape. |
| 607 | // TODO: Consider allowing GGML_OP_NONE nodes in between |
| 608 | static inline bool ggml_can_fuse_ext(const struct ggml_cgraph * cgraph, const int * node_idxs, const enum ggml_op * ops, int num_ops) { |
| 609 | for (int i = 0; i < num_ops; ++i) { |
| 610 | if (node_idxs[i] >= cgraph->n_nodes) { |
| 611 | return false; |
| 612 | } |
| 613 | |
| 614 | struct ggml_tensor * node = cgraph->nodes[node_idxs[i]]; |
| 615 | if (node->op != ops[i]) { |
| 616 | return false; |
| 617 | } |
| 618 | if (i < num_ops - 1 && !ggml_node_has_n_uses(cgraph, node_idx: node_idxs[i], n_uses: 1)) { |
| 619 | return false; |
| 620 | } |
| 621 | if (i > 0) { |
| 622 | struct ggml_tensor * prev = cgraph->nodes[node_idxs[i - 1]]; |
| 623 | if (node->src[0] != prev && node->src[1] != prev) { |
| 624 | return false; |
| 625 | } |
| 626 | if (!ggml_are_same_shape(t0: node, t1: prev)) { |
| 627 | return false; |
| 628 | } |
| 629 | } |
| 630 | } |
| 631 | return true; |
| 632 | } |
| 633 | |
| 634 | // same as above, for sequential indices starting at node_idx |
| 635 | static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, const enum ggml_op * ops, int num_ops) { |
| 636 | assert(num_ops < 32); |
| 637 | |
| 638 | if (node_idx + num_ops > cgraph->n_nodes) { |
| 639 | return false; |
| 640 | } |
| 641 | |
| 642 | int idxs[32]; |
| 643 | for (int i = 0; i < num_ops; ++i) { |
| 644 | idxs[i] = node_idx + i; |
| 645 | } |
| 646 | |
| 647 | return ggml_can_fuse_ext(cgraph, node_idxs: idxs, ops, num_ops); |
| 648 | } |
| 649 | |
| 650 | GGML_API bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph, |
| 651 | const int * node_idxs, |
| 652 | int count, |
| 653 | const enum ggml_op * ops, |
| 654 | const int * outputs, |
| 655 | int num_outputs); |
| 656 | |
| 657 | // Returns true if the subgraph formed by {node_idxs} can be fused |
| 658 | // checks whethers all nodes which are not part of outputs can be elided |
| 659 | // by checking if their num_uses are confined to the subgraph |
| 660 | static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph, |
| 661 | int node_idx, |
| 662 | int count, |
| 663 | const enum ggml_op * ops, |
| 664 | const int * outputs, |
| 665 | int num_outputs) { |
| 666 | GGML_ASSERT(count < 32); |
| 667 | if (node_idx + count > cgraph->n_nodes) { |
| 668 | return false; |
| 669 | } |
| 670 | |
| 671 | int idxs[32]; |
| 672 | |
| 673 | for (int i = 0; i < count; ++i) { |
| 674 | idxs[i] = node_idx + i; |
| 675 | } |
| 676 | |
| 677 | return ggml_can_fuse_subgraph_ext(cgraph, node_idxs: idxs, count, ops, outputs, num_outputs); |
| 678 | } |
| 679 | |
| 680 | #ifdef __cplusplus |
| 681 | } |
| 682 | #endif |
| 683 | |
| 684 | #ifdef __cplusplus |
| 685 | #include <array> |
| 686 | #include <initializer_list> |
| 687 | #include <vector> |
| 688 | |
| 689 | // nicer C++ syntax for ggml_can_fuse |
| 690 | inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) { |
| 691 | return ggml_can_fuse(cgraph, node_idx, ops.begin(), (int)ops.size()); |
| 692 | } |
| 693 | |
| 694 | inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph, |
| 695 | int start_idx, |
| 696 | std::initializer_list<enum ggml_op> ops, |
| 697 | std::initializer_list<int> outputs = {}) { |
| 698 | return ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size()); |
| 699 | } |
| 700 | |
| 701 | // Return true if the edges in the graph match expectations. |
| 702 | inline bool ggml_check_edges(const struct ggml_cgraph * cgraph, |
| 703 | int start_idx, |
| 704 | std::initializer_list<std::array<int, 3>> edges) { |
| 705 | for (const auto & edge : edges) { |
| 706 | int dst_node = edge[0]; |
| 707 | int src_idx = edge[1]; |
| 708 | int src_node = edge[2]; |
| 709 | if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) { |
| 710 | return false; |
| 711 | } |
| 712 | } |
| 713 | return true; |
| 714 | } |
| 715 | |
| 716 | // expose GGUF internals for test code |
| 717 | GGML_API size_t gguf_type_size(enum gguf_type type); |
| 718 | GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); |
| 719 | GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta); |
| 720 | #endif // __cplusplus |
| 721 | |