repack.h source code [llama.cpp/ggml/src/ggml-cpu/repack.h]

1	#pragma once
2
3	#define GGML_COMMON_DECL_CPP
4	#include "ggml-common.h"
5
6	#include "traits.h"
7	#include "ggml.h"
8
9	// GGML internal header
10
11	ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void);
12
13	template <int K> constexpr int QK_0() {
14	if constexpr (K == `4`) {
15	return QK4_0;
16	}
17	if constexpr (K == `8`) {
18	return QK8_0;
19	}
20	return -`1`;
21	}
22
23	template <int K, int N> struct block {
24	ggml_half d[N]; // deltas for N qK_0 blocks
25	int8_t qs[(QK_0<K>() * N * K) / `8`]; // quants for N qK_0 blocks
26	};
27
28	// control size
29	static_assert(sizeof(block<`4`, `4`>) == `4` * sizeof(ggml_half) + QK8_0 * `2`, "wrong block<4,4> size/padding");
30	static_assert(sizeof(block<`4`, `8`>) == `8` * sizeof(ggml_half) + QK8_0 * `4`, "wrong block<4,8> size/padding");
31	static_assert(sizeof(block<`8`, `4`>) == `4` * sizeof(ggml_half) + QK8_0 * `4`, "wrong block<8,4> size/padding");
32	static_assert(sizeof(block<`8`, `8`>) == `8` * sizeof(ggml_half) + QK8_0 * `8`, "wrong block<8,8> size/padding");
33
34	using block_q4_0x4 = block<`4`, `4`>;
35	using block_q4_0x8 = block<`4`, `8`>;
36	using block_q8_0x4 = block<`8`, `4`>;
37	using block_q8_0x8 = block<`8`, `8`>;
38
39	struct block_q4_Kx8 {
40	ggml_half d[`8`]; // super-block scale for quantized scales
41	ggml_half dmin[`8`]; // super-block scale for quantized mins
42	uint8_t scales[`96`]; // scales and mins, quantized with 6 bits
43	uint8_t qs[`1024`]; // 4--bit quants
44	};
45
46	static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * `16` + K_SCALE_SIZE * `8` + QK_K * `4`, "wrong q4_K block size/padding");
47	struct block_q2_Kx8 {
48	ggml_half d[`8`]; // super-block scale for quantized scales
49	ggml_half dmin[`8`]; // super-block scale for quantized mins
50	uint8_t scales[`128`]; // scales and mins, quantized with 4 bits
51	uint8_t qs[`512`]; // 2--bit quants
52	};
53
54	static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * `16` + QK_K/`2` + QK_K * `2`, "wrong q2_K block size/padding");
55	struct block_q8_Kx4 {
56	float d[`4`]; // delta
57	int8_t qs[QK_K * `4`]; // quants
58	int16_t bsums[QK_K / `4`]; // sum of quants in groups of 16
59	};
60
61	static_assert(sizeof(block_q8_Kx4) == sizeof(float) * `4` + QK_K * `4` + (QK_K / `4`) * sizeof(int16_t), "wrong q8_K block size/padding");
62
63	struct block_iq4_nlx4 {
64	ggml_half d[`4`]; // deltas for 4 iq4_nl blocks
65	uint8_t qs[QK4_NL * `2`]; // nibbles / quants for 4 iq4_nl blocks
66	};
67
68	static_assert(sizeof(block_iq4_nlx4) == `4` * sizeof(ggml_half) + QK4_NL * `2`, "wrong iq4_nlx4 block size/padding");
69
70	struct block_iq4_nlx8 {
71	ggml_half d[`8`]; // deltas for 8 iq4_nl blocks
72	uint8_t qs[QK4_NL * `4`]; // nibbles / quants for 8 iq4_nl blocks
73	};
74
75	static_assert(sizeof(block_iq4_nlx8) == `8` * sizeof(ggml_half) + QK4_NL * `4`, "wrong iq4_nlx8 block size/padding");
76
77	#if defined(__cplusplus)
78	extern "C" {
79	#endif
80
81	void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
82	void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
83	void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
84	void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
85	void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
86	void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
87	void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
88	void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
89	void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
90	void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
91	void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
92	void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
93	void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
94	void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
95	void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
96	void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
97	void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
98
99	// Native implementations
100	void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
101	void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
102	void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
103	void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
104	void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
105	void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
106	void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
107	void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
108	void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
109	void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
110	void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
111	void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
112	void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
113	void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
114	void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
115	void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
116	void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
117
118	#if defined(__cplusplus)
119	} // extern "C"
120	#endif
121

Browse the source code of llama.cpp/ggml/src/ggml-cpu/repack.h