| 1 | #pragma once |
| 2 | |
| 3 | #define GGML_COMMON_DECL_CPP |
| 4 | #include "ggml-common.h" |
| 5 | |
| 6 | #include "traits.h" |
| 7 | #include "ggml.h" |
| 8 | |
| 9 | // GGML internal header |
| 10 | |
// Declares the accessor for the CPU "repack" backend buffer type: it takes no arguments
// and returns a ggml_backend_buffer_type_t.
// NOTE(review): the definition is not part of this header - confirm ownership and lifetime
// of the returned value against the implementation.
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void);
| 12 | |
| 13 | template <int K> constexpr int QK_0() { |
| 14 | if constexpr (K == 4) { |
| 15 | return QK4_0; |
| 16 | } |
| 17 | if constexpr (K == 8) { |
| 18 | return QK8_0; |
| 19 | } |
| 20 | return -1; |
| 21 | } |
| 22 | |
| 23 | template <int K, int N> struct block { |
| 24 | ggml_half d[N]; // deltas for N qK_0 blocks |
| 25 | int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks |
| 26 | }; |
| 27 | |
| 28 | // control size |
| 29 | static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding" ); |
| 30 | static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding" ); |
| 31 | static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding" ); |
| 32 | static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding" ); |
| 33 | |
| 34 | using block_q4_0x4 = block<4, 4>; |
| 35 | using block_q4_0x8 = block<4, 8>; |
| 36 | using block_q8_0x4 = block<8, 4>; |
| 37 | using block_q8_0x8 = block<8, 8>; |
| 38 | |
| 39 | struct block_q4_Kx8 { |
| 40 | ggml_half d[8]; // super-block scale for quantized scales |
| 41 | ggml_half dmin[8]; // super-block scale for quantized mins |
| 42 | uint8_t scales[96]; // scales and mins, quantized with 6 bits |
| 43 | uint8_t qs[1024]; // 4--bit quants |
| 44 | }; |
| 45 | |
| 46 | static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding" ); |
| 47 | struct block_q2_Kx8 { |
| 48 | ggml_half d[8]; // super-block scale for quantized scales |
| 49 | ggml_half dmin[8]; // super-block scale for quantized mins |
| 50 | uint8_t scales[128]; // scales and mins, quantized with 4 bits |
| 51 | uint8_t qs[512]; // 2--bit quants |
| 52 | }; |
| 53 | |
| 54 | static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding" ); |
| 55 | struct block_q8_Kx4 { |
| 56 | float d[4]; // delta |
| 57 | int8_t qs[QK_K * 4]; // quants |
| 58 | int16_t bsums[QK_K / 4]; // sum of quants in groups of 16 |
| 59 | }; |
| 60 | |
| 61 | static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding" ); |
| 62 | |
| 63 | struct block_iq4_nlx4 { |
| 64 | ggml_half d[4]; // deltas for 4 iq4_nl blocks |
| 65 | uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks |
| 66 | }; |
| 67 | |
| 68 | static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding" ); |
| 69 | |
| 70 | struct block_iq4_nlx8 { |
| 71 | ggml_half d[8]; // deltas for 8 iq4_nl blocks |
| 72 | uint8_t qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks |
| 73 | }; |
| 74 | |
| 75 | static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding" ); |
| 76 | |
| 77 | #if defined(__cplusplus) |
| 78 | extern "C" { |
| 79 | #endif |
| 80 | |
| 81 | void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
| 82 | void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
| 83 | void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
| 84 | void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 85 | void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 86 | void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 87 | void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 88 | void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 89 | void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 90 | void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 91 | void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 92 | void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 93 | void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 94 | void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 95 | void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 96 | void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 97 | void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 98 | |
| 99 | // Native implementations |
| 100 | void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
| 101 | void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
| 102 | void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
| 103 | void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 104 | void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 105 | void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 106 | void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 107 | void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 108 | void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 109 | void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 110 | void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 111 | void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 112 | void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 113 | void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 114 | void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 115 | void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 116 | void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); |
| 117 | |
| 118 | #if defined(__cplusplus) |
| 119 | } // extern "C" |
| 120 | #endif |
| 121 | |