// cpy-utils.cuh source code [llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh]

1	#pragma once
2
3	#include "ggml-common.h"
4	#include "convert.cuh"
5
// Returns the index of the entry of val[0..n-1] that is closest to x.
//
// val is searched with a bisection, so it must be sorted in ascending order
// for the result to be meaningful. Inputs at or beyond either end of the
// table clamp to index 0 or n-1. When x is exactly halfway between two
// neighbouring entries the upper index is returned.
static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
    if (x <= val[0])   return 0;
    if (x >= val[n-1]) return n-1;

    // Invariant: val[ml] <= x < val[mu]; shrink until the two are adjacent.
    int ml = 0, mu = n-1;
    while (mu-ml > 1) {
        int mav = (ml+mu)/2;
        if (x < val[mav]) mu = mav; else ml = mav;
    }

    // Pick whichever of the two bracketing entries is nearer to x.
    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
}
16
// Quantizes QK4_0 consecutive floats starting at x into the single block y.
//
// The scale d is the signed value of largest magnitude divided by -8 and is
// stored in y->d. Every input is multiplied by 1/d (or by 0 when d == 0),
// shifted by 8.5, truncated, and clamped to at most 15. Element j is written
// to the low nibble of y->qs[j] and element j + QK4_0/2 to the high nibble.
static __device__ void quantize_f32_q4_0_block(const float * __restrict__ x, block_q4_0 * __restrict__ y) {
    float amax = 0.0f; // largest magnitude seen so far
    float vmax = 0.0f; // signed value carrying that magnitude

    for (int j = 0; j < QK4_0; ++j) {
        const float v = x[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    const float d  = vmax / -8;
    const float id = d ? 1.0f/d : 0.0f; // guard against division by zero for an all-zero block

    y->d = d;

    for (int j = 0; j < QK4_0/2; ++j) {
        const float x0 = x[0       + j]*id;
        const float x1 = x[QK4_0/2 + j]*id;

        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));

        y->qs[j]  = xi0;
        y->qs[j] |= xi1 << 4;
    }
}
45
// Quantizes QK4_1 consecutive floats starting at x into the single block y.
//
// The block stores a scale d = (max - min) / 15 in y->dm.x and the minimum in
// y->dm.y. Every input has the minimum subtracted, is multiplied by 1/d (or
// by 0 when d == 0), rounded by adding 0.5 and truncating, and clamped to at
// most 15. Element j is written to the low nibble of y->qs[j] and element
// j + QK4_1/2 to the high nibble.
static __device__ void quantize_f32_q4_1_block(const float * __restrict__ x, block_q4_1 * __restrict__ y) {
    float vmin =  FLT_MAX;
    float vmax = -FLT_MAX;

    for (int j = 0; j < QK4_1; ++j) {
        const float v = x[j];
        if (v < vmin) vmin = v;
        if (v > vmax) vmax = v;
    }

    const float d  = (vmax - vmin) / ((1 << 4) - 1);
    const float id = d ? 1.0f/d : 0.0f; // d == 0 when every input is equal

    y->dm.x = d;
    y->dm.y = vmin;

    for (int j = 0; j < QK4_1/2; ++j) {
        const float x0 = (x[0       + j] - vmin)*id;
        const float x1 = (x[QK4_1/2 + j] - vmin)*id;

        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));

        y->qs[j]  = xi0;
        y->qs[j] |= xi1 << 4;
    }
}
73
// Quantizes QK5_0 consecutive floats starting at x into the single block y.
//
// The scale d is the signed value of largest magnitude divided by -16 and is
// stored in y->d. Every input is multiplied by 1/d (or by 0 when d == 0),
// shifted by 16.5, truncated, and clamped to at most 31, giving a 5-bit code.
// The low 4 bits of elements j and j + QK5_0/2 share byte y->qs[j] (low and
// high nibble respectively); the fifth bits are collected into a 32-bit mask
// at bit positions j and j + QK5_0/2, which is then copied into y->qh.
static __device__ void quantize_f32_q5_0_block(const float * __restrict__ x, block_q5_0 * __restrict__ y) {
    float amax = 0.0f; // largest magnitude seen so far
    float vmax = 0.0f; // signed value carrying that magnitude

    for (int j = 0; j < QK5_0; ++j) {
        const float v = x[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    const float d  = vmax / -16;
    const float id = d ? 1.0f/d : 0.0f; // guard against division by zero for an all-zero block

    y->d = d;

    uint32_t qh = 0;
    for (int j = 0; j < QK5_0/2; ++j) {
        const float x0 = x[0       + j]*id;
        const float x1 = x[QK5_0/2 + j]*id;

        const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
        const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));

        y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
    }
    // memcpy rather than a pointer cast: copies the mask bytewise into y->qh.
    memcpy(y->qh, &qh, sizeof(qh));
}
105
// Quantizes QK5_1 consecutive floats starting at x into the single block y.
//
// The block stores a scale d = (max - min) / 31 in y->dm.x and the minimum in
// y->dm.y. Every input has the minimum subtracted, is multiplied by 1/d (or
// by 0 when d == 0), and is rounded by adding 0.5 and truncating, giving a
// 5-bit code. The low 4 bits of elements j and j + QK5_1/2 share byte
// y->qs[j] (low and high nibble respectively); the fifth bits are collected
// into a 32-bit mask at bit positions j and j + QK5_1/2, which is then copied
// into y->qh.
static __device__ void quantize_f32_q5_1_block(const float * __restrict__ x, block_q5_1 * __restrict__ y) {
    // Named vmin/vmax (not min/max) so the min()/max() functions are not shadowed.
    float vmin = x[0];
    float vmax = x[0];

    for (int j = 1; j < QK5_1; ++j) {
        const float v = x[j];
        vmin = v < vmin ? v : vmin;
        vmax = v > vmax ? v : vmax;
    }

    const float d  = (vmax - vmin) / 31;
    const float id = d ? 1.0f/d : 0.0f; // d == 0 when every input is equal

    y->dm.x = d;
    y->dm.y = vmin;

    uint32_t qh = 0;
    for (int j = 0; j < QK5_1/2; ++j) {
        const float x0 = (x[0       + j] - vmin)*id;
        const float x1 = (x[QK5_1/2 + j] - vmin)*id;

        const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
        const uint8_t xi1 = (uint8_t)(x1 + 0.5f);

        y->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
    }
    // memcpy rather than a pointer cast: copies the mask bytewise into y->qh.
    memcpy(y->qh, &qh, sizeof(qh));
}
136
// Quantizes QK8_0 consecutive floats starting at x into the single block y.
//
// The scale d is the largest magnitude divided by 127 and is stored in y->d.
// Every input is multiplied by 1/d (or by 0 when d == 0), rounded with
// roundf, and stored in y->qs[j].
static __device__ void quantize_f32_q8_0_block(const float * __restrict__ x, block_q8_0 * __restrict__ y) {
    float amax = 0.0f; // absolute max

    for (int j = 0; j < QK8_0; j++) {
        const float v = x[j];
        amax = fmaxf(amax, fabsf(v));
    }

    const float d  = amax / ((1 << 7) - 1);
    const float id = d ? 1.0f/d : 0.0f; // guard against division by zero for an all-zero block

    y->d = d;

    for (int j = 0; j < QK8_0; ++j) {
        const float x0 = x[j]*id;
        y->qs[j] = roundf(x0);
    }
}
155
// Quantizes QK4_NL consecutive floats starting at x into the single block y
// using the kvalues_iq4nl lookup table.
//
// A first-guess scale d maps the signed value of largest magnitude onto
// kvalues_iq4nl[0]. Each input is multiplied by 1/d (or by 0 when d == 0) and
// replaced by the index of the nearest of the 16 table entries; element j
// goes to the low nibble of y->qs[j] and element j + QK4_NL/2 to the high
// nibble. The stored scale is then refined to sumqx/sumq2, which is the value
// of s minimising sum(w * (x - s*v)^2) with weights w = x^2 and v the chosen
// table entries. The first-guess d is kept when sumq2 is not positive.
static __device__ void quantize_f32_iq4_nl_block(const float * __restrict__ x, block_iq4_nl * __restrict__ y) {
    float amax = 0.0f; // largest magnitude seen so far
    float vmax = 0.0f; // signed value carrying that magnitude

    for (int j = 0; j < QK4_NL; ++j) {
        const float v = x[j];
        if (amax < fabsf(v)) {
            amax = fabsf(v);
            vmax = v;
        }
    }

    const float d  = vmax / kvalues_iq4nl[0];
    const float id = d ? 1.0f/d : 0.0f; // guard against division by zero for an all-zero block

    float sumqx = 0, sumq2 = 0;
    for (int j = 0; j < QK4_NL/2; ++j) {
        const float x0 = x[0        + j]*id;
        const float x1 = x[QK4_NL/2 + j]*id;
        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
        y->qs[j] = xi0 | (xi1 << 4);

        // Accumulate the weighted least-squares terms for the scale refinement.
        const float v0 = kvalues_iq4nl[xi0];
        const float v1 = kvalues_iq4nl[xi1];
        const float w0 = x[0        + j]*x[0        + j];
        const float w1 = x[QK4_NL/2 + j]*x[QK4_NL/2 + j];
        sumqx += w0*v0*x[j] + w1*v1*x[QK4_NL/2 + j];
        sumq2 += w0*v0*v0 + w1*v1*v1;
    }

    y->d = sumq2 > 0 ? sumqx/sumq2 : d;
}
188
189	// Wrapper functions for cpy.cu compatibility
// Byte-pointer adapter: treats cxi as const float * and cdsti as block_q4_0 *
// and forwards to quantize_f32_q4_0_block.
static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
    quantize_f32_q4_0_block((const float *)cxi, (block_q4_0 *)cdsti);
}
193
// Byte-pointer adapter: treats cxi as const float * and cdsti as block_q4_1 *
// and forwards to quantize_f32_q4_1_block.
static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
    quantize_f32_q4_1_block((const float *)cxi, (block_q4_1 *)cdsti);
}
197
// Byte-pointer adapter: treats cxi as const float * and cdsti as block_q5_0 *
// and forwards to quantize_f32_q5_0_block.
static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
    quantize_f32_q5_0_block((const float *)cxi, (block_q5_0 *)cdsti);
}
201
// Byte-pointer adapter: treats cxi as const float * and cdsti as block_q5_1 *
// and forwards to quantize_f32_q5_1_block.
static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
    quantize_f32_q5_1_block((const float *)cxi, (block_q5_1 *)cdsti);
}
205
// Byte-pointer adapter: treats cxi as const float * and cdsti as block_q8_0 *
// and forwards to quantize_f32_q8_0_block.
static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
    quantize_f32_q8_0_block((const float *)cxi, (block_q8_0 *)cdsti);
}
209
// Byte-pointer adapter: treats cxi as const float * and cdsti as
// block_iq4_nl * and forwards to quantize_f32_iq4_nl_block.
static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
    quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
}
213
// Copies a single element with type conversion: reads one src_t from cxi,
// converts it to dst_t with ggml_cuda_cast, and stores the result at cdsti.
template<typename src_t, typename dst_t>
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
    *(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
}
218

// Browse the source code of llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh