quants.c source code [llama.cpp/ggml/src/ggml-cpu/quants.c]

1	#define GGML_COMMON_IMPL_C
2	#include "ggml-common.h"
3
4	#include "ggml-cpu-impl.h"
5	#include "simd-mappings.h"
6	#include "ggml-quants.h"
7	#include "quants.h"
8
9	#include "arch-fallback.h"
10
11	#include <string.h>
12	#include <assert.h>
13	#include <float.h>
14	#include <stdlib.h> // for qsort
15	#include <stdio.h> // for GGML_ASSERT
16
17	#define GROUP_MAX_EPS 1e-15f
18	#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
19	#define GROUP_MAX_EPS_IQ2_S 1e-8f
20	#define GROUP_MAX_EPS_IQ1_M 1e-7f
21	#define GROUP_MAX_EPS_IQ1_S 1e-12f
22
23	#define UNUSED GGML_UNUSED
24
25	void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
26	quantize_row_q4_0_ref(x, y, k);
27	}
28
29	void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
30	quantize_row_q4_1_ref(x, y, k);
31	}
32
33	void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
34	quantize_row_q5_0_ref(x, y, k);
35	}
36
37	void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
38	quantize_row_q5_1_ref(x, y, k);
39	}
40
41	void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
42	quantize_row_q8_0_ref(x, y, k);
43	}
44
45	void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
46	quantize_row_q8_1_ref(x, y, k);
47	}
48
49	void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
50	quantize_row_mxfp4_ref(x, y, k);
51	}
52
53	//
54	// 2-6 bit quantization in super-blocks
55	//
56
57	//========================- 2-bit (de)-quantization
58
59	void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
60	quantize_row_q2_K_ref(x, y: vy, k);
61	}
62
63	//========================= 3-bit (de)-quantization
64
65	void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
66	quantize_row_q3_K_ref(x, y: vy, k);
67	}
68
69	// ====================== 4-bit (de)-quantization
70
71	void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
72	assert(k % QK_K == `0`);
73	block_q4_K * GGML_RESTRICT y = vy;
74	quantize_row_q4_K_ref(x, y, k);
75	}
76
77	// ====================== 5-bit (de)-quantization
78
79	void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
80	assert(k % QK_K == `0`);
81	block_q5_K * GGML_RESTRICT y = vy;
82	quantize_row_q5_K_ref(x, y, k);
83	}
84
85	// ====================== 6-bit (de)-quantization
86
87	void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
88	assert(k % QK_K == `0`);
89	block_q6_K * GGML_RESTRICT y = vy;
90	quantize_row_q6_K_ref(x, y, k);
91	}
92
93	// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
94
95	void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
96	assert(k % QK_K == `0`);
97	block_tq1_0 * GGML_RESTRICT y = vy;
98	quantize_row_tq1_0_ref(x, y, k);
99	}
100
101	void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
102	assert(k % QK_K == `0`);
103	block_tq2_0 * GGML_RESTRICT y = vy;
104	quantize_row_tq2_0_ref(x, y, k);
105	}
106
107	//===================================== Q8_K ==============================================
108
109	void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
110	quantize_row_q8_K_ref(x, y, k);
111	}
112
113	//===================================== Dot products =================================
114
115	void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
116	const int qk = QK8_0;
117	const int nb = n / qk;
118
119	assert(n % qk == `0`);
120	assert(nrc == `1`);
121	UNUSED(nrc);
122	UNUSED(bx);
123	UNUSED(by);
124	UNUSED(bs);
125
126	const block_q4_0 * GGML_RESTRICT x = vx;
127	const block_q8_0 * GGML_RESTRICT y = vy;
128
129	int ib = `0`;
130	float sumf = `0`;
131
132	for (; ib < nb; ++ib) {
133	int sumi0 = `0`;
134	int sumi1 = `0`;
135
136	for (int j = `0`; j < qk/`2`; ++j) {
137	const int v0 = (x[ib].qs[j] & `0x0F`) - `8`;
138	const int v1 = (x[ib].qs[j] >> `4`) - `8`;
139
140	sumi0 += (v0 * y[ib].qs[j]);
141	sumi1 += (v1 * y[ib].qs[j + qk/`2`]);
142	}
143
144	int sumi = sumi0 + sumi1;
145	sumf += sumiGGML_CPU_FP16_TO_FP32(x[ib].d)GGML_CPU_FP16_TO_FP32(y[ib].d);
146	}
147
148	*s = sumf;
149	}
150
151	// TODO: add WASM SIMD
152	void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
153	const int qk = QK8_1;
154	const int nb = n / qk;
155
156	assert(n % qk == `0`);
157	assert(nrc == `1`);
158	UNUSED(nrc);
159	UNUSED(bx);
160	UNUSED(by);
161	UNUSED(bs);
162
163	const block_q4_1 * GGML_RESTRICT x = vx;
164	const block_q8_1 * GGML_RESTRICT y = vy;
165
166	int ib = `0`;
167	float sumf = `0`;
168
169	for (; ib < nb; ++ib) {
170	int sumi0 = `0`;
171	int sumi1 = `0`;
172
173	for (int j = `0`; j < qk/`2`; ++j) {
174	const int v0 = (x[ib].qs[j] & `0x0F`);
175	const int v1 = (x[ib].qs[j] >> `4`);
176
177	sumi0 += (v0 * y[ib].qs[j]);
178	sumi1 += (v1 * y[ib].qs[j + qk/`2`]);
179	}
180
181	int sumi = sumi0 + sumi1;
182	sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)GGML_CPU_FP16_TO_FP32(y[ib].d))sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
183	}
184
185	*s = sumf;
186	}
187
188	void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
189	assert(nrc == `1`);
190	UNUSED(nrc);
191	UNUSED(bx);
192	UNUSED(by);
193	UNUSED(bs);
194	assert(n % QK_MXFP4 == `0`);
195	static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
196
197	const block_mxfp4 * GGML_RESTRICT x = vx;
198	const block_q8_0 * GGML_RESTRICT y = vy;
199
200	const int nb = n / QK_MXFP4;
201
202	int ib = `0`;
203	float sumf = `0`;
204
205	for (; ib < nb; ++ib) {
206	const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
207
208	int sumi1 = `0`;
209	int sumi2 = `0`;
210	for (int j = `0`; j < QK_MXFP4/`2`; ++j) {
211	sumi1 += y[ib].qs[j + `0`] * kvalues_mxfp4[x[ib].qs[j] & `0xf`];
212	sumi2 += y[ib].qs[j + QK_MXFP4/`2`] * kvalues_mxfp4[x[ib].qs[j] >> `4`];
213	}
214	sumf += d * (sumi1 + sumi2);
215	}
216	*s = sumf;
217	}
218
219	void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
220	const int qk = QK8_0;
221	const int nb = n / qk;
222
223	int ib = `0`;
224	float sumf = `0`;
225
226	assert(n % qk == `0`);
227	assert(qk == QK5_0);
228	assert(nrc == `1`);
229	UNUSED(nrc);
230	UNUSED(bx);
231	UNUSED(by);
232	UNUSED(bs);
233
234	const block_q5_0 * GGML_RESTRICT x = vx;
235	const block_q8_0 * GGML_RESTRICT y = vy;
236
237	for (; ib < nb; ++ib) {
238	uint32_t qh;
239	memcpy(dest: &qh, src: x[ib].qh, n: sizeof(qh));
240
241	int sumi0 = `0`;
242	int sumi1 = `0`;
243
244	for (int j = `0`; j < qk/`2`; ++j) {
245	const uint8_t xh_0 = ((qh & (`1u` << (j + `0` ))) >> (j + `0` )) << `4`;
246	const uint8_t xh_1 = ((qh & (`1u` << (j + `16`))) >> (j + `12`));
247
248	const int32_t x0 = (int8_t)(((x[ib].qs[j] & `0x0F`) \| xh_0) - `16`);
249	const int32_t x1 = (int8_t)(((x[ib].qs[j] >> `4`) \| xh_1) - `16`);
250
251	sumi0 += (x0 * y[ib].qs[j]);
252	sumi1 += (x1 * y[ib].qs[j + qk/`2`]);
253	}
254
255	int sumi = sumi0 + sumi1;
256	sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)GGML_CPU_FP16_TO_FP32(y[ib].d)) sumi;
257	}
258
259	*s = sumf;
260	}
261
262	void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
263	const int qk = QK8_1;
264	const int nb = n / qk;
265
266	int ib = `0`;
267	float sumf = `0`;
268
269	assert(n % qk == `0`);
270	assert(qk == QK5_1);
271	assert(nrc == `1`);
272	UNUSED(nrc);
273	UNUSED(bx);
274	UNUSED(by);
275	UNUSED(bs);
276
277	const block_q5_1 * GGML_RESTRICT x = vx;
278	const block_q8_1 * GGML_RESTRICT y = vy;
279
280	for (; ib < nb; ++ib) {
281	uint32_t qh;
282	memcpy(dest: &qh, src: x[ib].qh, n: sizeof(qh));
283
284	int sumi0 = `0`;
285	int sumi1 = `0`;
286
287	for (int j = `0`; j < qk/`2`; ++j) {
288	const uint8_t xh_0 = ((qh >> (j + `0`)) << `4`) & `0x10`;
289	const uint8_t xh_1 = ((qh >> (j + `12`)) ) & `0x10`;
290
291	const int32_t x0 = (x[ib].qs[j] & `0xF`) \| xh_0;
292	const int32_t x1 = (x[ib].qs[j] >> `4`) \| xh_1;
293
294	sumi0 += (x0 * y[ib].qs[j]);
295	sumi1 += (x1 * y[ib].qs[j + qk/`2`]);
296	}
297
298	int sumi = sumi0 + sumi1;
299	sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)GGML_CPU_FP16_TO_FP32(y[ib].d))sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
300	}
301
302	*s = sumf;
303	}
304
305	void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
306	const int qk = QK8_0;
307	const int nb = n / qk;
308
309	assert(n % qk == `0`);
310	assert(nrc == `1`);
311	UNUSED(nrc);
312	UNUSED(bx);
313	UNUSED(by);
314	UNUSED(bs);
315
316	const block_q8_0 * GGML_RESTRICT x = vx;
317	const block_q8_0 * GGML_RESTRICT y = vy;
318
319	int ib = `0`;
320	float sumf = `0`;
321
322	for (; ib < nb; ++ib) {
323	int sumi = `0`;
324
325	for (int j = `0`; j < qk; j++) {
326	sumi += x[ib].qs[j]*y[ib].qs[j];
327	}
328
329	sumf += sumi(GGML_CPU_FP16_TO_FP32(x[ib].d)GGML_CPU_FP16_TO_FP32(y[ib].d));
330	}
331
332	*s = sumf;
333	}
334
335	void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
336	assert(nrc == `1`);
337	UNUSED(nrc);
338	UNUSED(bx);
339	UNUSED(by);
340	UNUSED(bs);
341
342	const block_tq1_0 * GGML_RESTRICT x = vx;
343	const block_q8_K * GGML_RESTRICT y = vy;
344
345	const int nb = n / QK_K;
346
347	const uint8_t pow3[`6`] = {`1`, `3`, `9`, `27`, `81`, `243`};
348
349	float sumf = `0.0f`;
350
351	for (int i = `0`; i < nb; ++i) {
352	int sum = `0`;
353
354	for (size_t j = `0`; j < sizeof(x->qs) - sizeof(x->qs) % `32`; j += `32`) {
355	for (size_t l = `0`; l < `5`; ++l) {
356	for (size_t m = `0`; m < `32`; ++m) {
357	uint8_t q = x[i].qs[j + m] * pow3[l];
358	uint16_t xi = ((uint16_t) q * `3`) >> `8`;
359	sum += (xi - `1`) * y[i].qs[j`5` + l`32` + m];
360	}
361	}
362	}
363	for (size_t j = sizeof(x->qs) - sizeof(x->qs) % `32`; j < sizeof(x->qs); j += `16`) {
364	for (size_t l = `0`; l < `5`; ++l) {
365	for (size_t m = `0`; m < `16`; ++m) {
366	uint8_t q = x[i].qs[j + m] * pow3[l];
367	uint16_t xi = ((uint16_t) q * `3`) >> `8`;
368	sum += (xi - `1`) * y[i].qs[j`5` + l`16` + m];
369	}
370	}
371	}
372
373	for (size_t l = `0`; l < `4`; ++l) {
374	for (size_t j = `0`; j < sizeof(x->qh); ++j) {
375	uint8_t q = x[i].qh[j] * pow3[l];
376	uint16_t xi = ((uint16_t) q * `3`) >> `8`;
377	sum += (xi - `1`) * y[i].qs[sizeof(x->qs)`5` + lsizeof(x->qh) + j];
378	}
379	}
380
381	sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
382	}
383
384	*s = sumf;
385	}
386
387	void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
388	assert(nrc == `1`);
389	UNUSED(nrc);
390	UNUSED(bx);
391	UNUSED(by);
392	UNUSED(bs);
393
394	const block_tq2_0 * GGML_RESTRICT x = vx;
395	const block_q8_K * GGML_RESTRICT y = vy;
396
397	const int nb = n / QK_K;
398	float sumf = `0.0f`;
399
400	for (int i = `0`; i < nb; ++i) {
401	int32_t sumi = `0`;
402
403	for (size_t j = `0`; j < sizeof(x->qs); j += `32`) {
404	for (size_t l = `0`; l < `4`; ++l) {
405	for (size_t k = `0`; k < `32`; ++k) {
406	sumi += y[i].qs[j`4` + l`32` + k] * (((x[i].qs[j + k] >> (l*`2`)) & `3`) - `1`);
407	}
408	}
409	}
410
411	const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
412
413	sumf += (float) sumi * d;
414	}
415
416	*s = sumf;
417	}
418
419	void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
420	assert(nrc == `1`);
421	UNUSED(nrc);
422	UNUSED(bx);
423	UNUSED(by);
424	UNUSED(bs);
425
426	const block_q2_K * GGML_RESTRICT x = vx;
427	const block_q8_K * GGML_RESTRICT y = vy;
428
429	const int nb = n / QK_K;
430
431	float sumf = `0`;
432
433	for (int i = `0`; i < nb; ++i) {
434
435	const uint8_t * q2 = x[i].qs;
436	const int8_t * q8 = y[i].qs;
437	const uint8_t * sc = x[i].scales;
438
439	int summs = `0`;
440	for (int j = `0`; j < `16`; ++j) {
441	summs += y[i].bsums[j] * (sc[j] >> `4`);
442	}
443
444	const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
445	const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
446
447	int isum = `0`;
448	int is = `0`;
449	int d;
450	for (int k = `0`; k < QK_K/`128`; ++k) {
451	int shift = `0`;
452	for (int j = `0`; j < `4`; ++j) {
453	d = sc[is++] & `0xF`;
454	int isuml = `0`;
455	for (int l = `0`; l < `16`; ++l) isuml += q8[l] * ((q2[l] >> shift) & `3`);
456	isum += d * isuml;
457	d = sc[is++] & `0xF`;
458	isuml = `0`;
459	for (int l = `16`; l < `32`; ++l) isuml += q8[l] * ((q2[l] >> shift) & `3`);
460	isum += d * isuml;
461	shift += `2`;
462	q8 += `32`;
463	}
464	q2 += `32`;
465	}
466	sumf += dall * isum - dmin * summs;
467	}
468	*s = sumf;
469	}
470
471	void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
472	assert(n % QK_K == `0`);
473	assert(nrc == `1`);
474	UNUSED(nrc);
475	UNUSED(bx);
476	UNUSED(by);
477	UNUSED(bs);
478
479	const uint32_t kmask1 = `0x03030303`;
480	const uint32_t kmask2 = `0x0f0f0f0f`;
481
482	const block_q3_K * GGML_RESTRICT x = vx;
483	const block_q8_K * GGML_RESTRICT y = vy;
484
485	const int nb = n / QK_K;
486
487	// scalar version
488	// This function is written like this so the compiler can manage to vectorize most of it
489	// Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
490	// manually vectorized version above. Every other version I tried would run at least 4 times slower.
491	// The ideal situation would be if we could just write the code once, and the compiler would
492	// automatically produce the best possible set of machine instructions, instead of us having to manually
493	// write vectorized versions for AVX, ARM_NEON, etc.
494
495	int8_t aux8[QK_K];
496	int16_t aux16[`8`];
497	float sums [`8`];
498	int32_t aux32[`8`];
499	memset(s: sums, c: `0`, n: `8`*sizeof(float));
500
501	uint32_t auxs[`4`];
502	const int8_t * scales = (const int8_t*)auxs;
503
504	float sumf = `0`;
505	for (int i = `0`; i < nb; ++i) {
506	const uint8_t * GGML_RESTRICT q3 = x[i].qs;
507	const uint8_t * GGML_RESTRICT hm = x[i].hmask;
508	const int8_t * GGML_RESTRICT q8 = y[i].qs;
509	memset(s: aux32, c: `0`, n: `8`*sizeof(int32_t));
510	int8_t * GGML_RESTRICT a = aux8;
511	uint8_t m = `1`;
512	for (int j = `0`; j < QK_K; j += `128`) {
513	for (int l = `0`; l < `32`; ++l) a[l] = q3[l] & `3`;
514	for (int l = `0`; l < `32`; ++l) a[l] -= (hm[l] & m ? `0` : `4`);
515	a += `32`; m <<= `1`;
516	for (int l = `0`; l < `32`; ++l) a[l] = (q3[l] >> `2`) & `3`;
517	for (int l = `0`; l < `32`; ++l) a[l] -= (hm[l] & m ? `0` : `4`);
518	a += `32`; m <<= `1`;
519	for (int l = `0`; l < `32`; ++l) a[l] = (q3[l] >> `4`) & `3`;
520	for (int l = `0`; l < `32`; ++l) a[l] -= (hm[l] & m ? `0` : `4`);
521	a += `32`; m <<= `1`;
522	for (int l = `0`; l < `32`; ++l) a[l] = (q3[l] >> `6`) & `3`;
523	for (int l = `0`; l < `32`; ++l) a[l] -= (hm[l] & m ? `0` : `4`);
524	a += `32`; m <<= `1`;
525	q3 += `32`;
526	}
527	a = aux8;
528
529	memcpy(dest: auxs, src: x[i].scales, n: `12`);
530	uint32_t tmp = auxs[`2`];
531	auxs[`2`] = ((auxs[`0`] >> `4`) & kmask2) \| (((tmp >> `4`) & kmask1) << `4`);
532	auxs[`3`] = ((auxs[`1`] >> `4`) & kmask2) \| (((tmp >> `6`) & kmask1) << `4`);
533	auxs[`0`] = (auxs[`0`] & kmask2) \| (((tmp >> `0`) & kmask1) << `4`);
534	auxs[`1`] = (auxs[`1`] & kmask2) \| (((tmp >> `2`) & kmask1) << `4`);
535	for (int j = `0`; j < QK_K/`16`; ++j) {
536	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
537	for (int l = `0`; l < `8`; ++l) aux32[l] += (scales[j] - `32`) * aux16[l];
538	q8 += `8`; a += `8`;
539	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
540	for (int l = `0`; l < `8`; ++l) aux32[l] += (scales[j] - `32`) * aux16[l];
541	q8 += `8`; a += `8`;
542	}
543	const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
544	for (int l = `0`; l < `8`; ++l) sums[l] += d * aux32[l];
545	}
546	for (int l = `0`; l < `8`; ++l) sumf += sums[l];
547	*s = sumf;
548	}
549
550	void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
551	assert(n % QK_K == `0`);
552	assert(nrc == `1`);
553	UNUSED(nrc);
554	UNUSED(bx);
555	UNUSED(by);
556	UNUSED(bs);
557
558	const block_q4_K * GGML_RESTRICT x = vx;
559	const block_q8_K * GGML_RESTRICT y = vy;
560
561	const int nb = n / QK_K;
562
563	static const uint32_t kmask1 = `0x3f3f3f3f`;
564	static const uint32_t kmask2 = `0x0f0f0f0f`;
565	static const uint32_t kmask3 = `0x03030303`;
566
567	uint32_t utmp[`4`];
568
569	const uint8_t * scales = (const uint8_t*)&utmp[`0`];
570	const uint8_t * mins = (const uint8_t*)&utmp[`2`];
571
572	int8_t aux8[QK_K];
573	int16_t aux16[`8`];
574	float sums [`8`];
575	int32_t aux32[`8`];
576	memset(s: sums, c: `0`, n: `8`*sizeof(float));
577
578	float sumf = `0`;
579	for (int i = `0`; i < nb; ++i) {
580	const uint8_t * GGML_RESTRICT q4 = x[i].qs;
581	const int8_t * GGML_RESTRICT q8 = y[i].qs;
582	memset(s: aux32, c: `0`, n: `8`*sizeof(int32_t));
583	int8_t * GGML_RESTRICT a = aux8;
584	for (int j = `0`; j < QK_K/`64`; ++j) {
585	for (int l = `0`; l < `32`; ++l) a[l] = (int8_t)(q4[l] & `0xF`);
586	a += `32`;
587	for (int l = `0`; l < `32`; ++l) a[l] = (int8_t)(q4[l] >> `4`);
588	a += `32`; q4 += `32`;
589	}
590	memcpy(dest: utmp, src: x[i].scales, n: `12`);
591	utmp[`3`] = ((utmp[`2`] >> `4`) & kmask2) \| (((utmp[`1`] >> `6`) & kmask3) << `4`);
592	const uint32_t uaux = utmp[`1`] & kmask1;
593	utmp[`1`] = (utmp[`2`] & kmask2) \| (((utmp[`0`] >> `6`) & kmask3) << `4`);
594	utmp[`2`] = uaux;
595	utmp[`0`] &= kmask1;
596
597	int sumi = `0`;
598	for (int j = `0`; j < QK_K/`16`; ++j) sumi += y[i].bsums[j] * mins[j/`2`];
599	a = aux8;
600	int is = `0`;
601	for (int j = `0`; j < QK_K/`32`; ++j) {
602	int32_t scale = scales[is++];
603	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
604	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
605	q8 += `8`; a += `8`;
606	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
607	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
608	q8 += `8`; a += `8`;
609	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
610	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
611	q8 += `8`; a += `8`;
612	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
613	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
614	q8 += `8`; a += `8`;
615	}
616	const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
617	for (int l = `0`; l < `8`; ++l) sums[l] += d * aux32[l];
618	const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
619	sumf -= dmin * sumi;
620	}
621	for (int l = `0`; l < `8`; ++l) sumf += sums[l];
622	*s = sumf;
623	}
624
625	void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
626	assert(n % QK_K == `0`);
627	assert(nrc == `1`);
628	UNUSED(nrc);
629	UNUSED(bx);
630	UNUSED(by);
631	UNUSED(bs);
632
633	const block_q5_K * GGML_RESTRICT x = vx;
634	const block_q8_K * GGML_RESTRICT y = vy;
635
636	const int nb = n / QK_K;
637
638	static const uint32_t kmask1 = `0x3f3f3f3f`;
639	static const uint32_t kmask2 = `0x0f0f0f0f`;
640	static const uint32_t kmask3 = `0x03030303`;
641
642	uint32_t utmp[`4`];
643
644	const uint8_t * scales = (const uint8_t*)&utmp[`0`];
645	const uint8_t * mins = (const uint8_t*)&utmp[`2`];
646
647	int8_t aux8[QK_K];
648	int16_t aux16[`8`];
649	float sums [`8`];
650	int32_t aux32[`8`];
651	memset(s: sums, c: `0`, n: `8`*sizeof(float));
652
653	float sumf = `0`;
654	for (int i = `0`; i < nb; ++i) {
655	const uint8_t * GGML_RESTRICT q4 = x[i].qs;
656	const uint8_t * GGML_RESTRICT hm = x[i].qh;
657	const int8_t * GGML_RESTRICT q8 = y[i].qs;
658	memset(s: aux32, c: `0`, n: `8`*sizeof(int32_t));
659	int8_t * GGML_RESTRICT a = aux8;
660	uint8_t m = `1`;
661	for (int j = `0`; j < QK_K/`64`; ++j) {
662	for (int l = `0`; l < `32`; ++l) a[l] = (int8_t)(q4[l] & `0xF`);
663	for (int l = `0`; l < `32`; ++l) a[l] += (hm[l] & m ? `16` : `0`);
664	a += `32`; m <<= `1`;
665	for (int l = `0`; l < `32`; ++l) a[l] = (int8_t)(q4[l] >> `4`);
666	for (int l = `0`; l < `32`; ++l) a[l] += (hm[l] & m ? `16` : `0`);
667	a += `32`; m <<= `1`;
668	q4 += `32`;
669	}
670	memcpy(dest: utmp, src: x[i].scales, n: `12`);
671	utmp[`3`] = ((utmp[`2`] >> `4`) & kmask2) \| (((utmp[`1`] >> `6`) & kmask3) << `4`);
672	const uint32_t uaux = utmp[`1`] & kmask1;
673	utmp[`1`] = (utmp[`2`] & kmask2) \| (((utmp[`0`] >> `6`) & kmask3) << `4`);
674	utmp[`2`] = uaux;
675	utmp[`0`] &= kmask1;
676
677	int sumi = `0`;
678	for (int j = `0`; j < QK_K/`16`; ++j) sumi += y[i].bsums[j] * mins[j/`2`];
679	a = aux8;
680	int is = `0`;
681	for (int j = `0`; j < QK_K/`32`; ++j) {
682	int32_t scale = scales[is++];
683	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
684	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
685	q8 += `8`; a += `8`;
686	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
687	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
688	q8 += `8`; a += `8`;
689	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
690	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
691	q8 += `8`; a += `8`;
692	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
693	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
694	q8 += `8`; a += `8`;
695	}
696	const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
697	for (int l = `0`; l < `8`; ++l) sums[l] += d * aux32[l];
698	const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
699	sumf -= dmin * sumi;
700	}
701	for (int l = `0`; l < `8`; ++l) sumf += sums[l];
702	*s = sumf;
703	}
704
705	void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
706	assert(n % QK_K == `0`);
707	assert(nrc == `1`);
708	UNUSED(nrc);
709	UNUSED(bx);
710	UNUSED(by);
711	UNUSED(bs);
712
713	const block_q6_K * GGML_RESTRICT x = vx;
714	const block_q8_K * GGML_RESTRICT y = vy;
715
716	const int nb = n / QK_K;
717
718	int8_t aux8[QK_K];
719	int16_t aux16[`8`];
720	float sums [`8`];
721	int32_t aux32[`8`];
722	memset(s: sums, c: `0`, n: `8`*sizeof(float));
723
724	float sumf = `0`;
725	for (int i = `0`; i < nb; ++i) {
726	const uint8_t * GGML_RESTRICT q4 = x[i].ql;
727	const uint8_t * GGML_RESTRICT qh = x[i].qh;
728	const int8_t * GGML_RESTRICT q8 = y[i].qs;
729	memset(s: aux32, c: `0`, n: `8`*sizeof(int32_t));
730	int8_t * GGML_RESTRICT a = aux8;
731	for (int j = `0`; j < QK_K; j += `128`) {
732	for (int l = `0`; l < `32`; ++l) {
733	a[l + `0`] = (int8_t)((q4[l + `0`] & `0xF`) \| (((qh[l] >> `0`) & `3`) << `4`)) - `32`;
734	a[l + `32`] = (int8_t)((q4[l + `32`] & `0xF`) \| (((qh[l] >> `2`) & `3`) << `4`)) - `32`;
735	a[l + `64`] = (int8_t)((q4[l + `0`] >> `4`) \| (((qh[l] >> `4`) & `3`) << `4`)) - `32`;
736	a[l + `96`] = (int8_t)((q4[l + `32`] >> `4`) \| (((qh[l] >> `6`) & `3`) << `4`)) - `32`;
737	}
738	a += `128`;
739	q4 += `64`;
740	qh += `32`;
741	}
742	a = aux8;
743	int is = `0`;
744	for (int j = `0`; j < QK_K/`16`; ++j) {
745	int scale = x[i].scales[is++];
746	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
747	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
748	q8 += `8`; a += `8`;
749	for (int l = `0`; l < `8`; ++l) aux16[l] = q8[l] * a[l];
750	for (int l = `0`; l < `8`; ++l) aux32[l] += scale * aux16[l];
751	q8 += `8`; a += `8`;
752	}
753	const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
754	for (int l = `0`; l < `8`; ++l) sums[l] += d * aux32[l];
755	}
756	for (int l = `0`; l < `8`; ++l) sumf += sums[l];
757	*s = sumf;
758	}
759
760	void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
761	assert(n % QK_K == `0`);
762	assert(nrc == `1`);
763	UNUSED(nrc);
764	UNUSED(bx);
765	UNUSED(by);
766	UNUSED(bs);
767
768	const block_iq2_xxs * GGML_RESTRICT x = vx;
769	const block_q8_K * GGML_RESTRICT y = vy;
770
771	const int nb = n / QK_K;
772
773	uint32_t aux32[`2`];
774	const uint8_t * aux8 = (const uint8_t *)aux32;
775
776	float sumf = `0.f`;
777	for (int i = `0`; i < nb; ++i) {
778	const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
779	const uint16_t * GGML_RESTRICT q2 = x[i].qs;
780	const int8_t * GGML_RESTRICT q8 = y[i].qs;
781	int32_t bsum = `0`;
782	for (int ib32 = `0`; ib32 < QK_K/`32`; ++ib32) {
783	memcpy(dest: aux32, src: q2, n: `2`*sizeof(uint32_t));
784	q2 += `4`;
785	const uint32_t ls = `2`*(aux32[`1`] >> `28`) + `1`;
786	int32_t sumi = `0`;
787	for (int l = `0`; l < `4`; ++l) {
788	const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
789	const uint8_t signs = ksigns_iq2xs[(aux32[`1`] >> `7`*l) & `127`];
790	for (int j = `0`; j < `8`; ++j) {
791	sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -`1` : `1`);
792	}
793	q8 += `8`;
794	}
795	bsum += sumi * ls;
796	}
797	sumf += d * bsum;
798	}
799	s = `0.125f` sumf;
800	}
801
802	void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
803	assert(n % QK_K == `0`);
804	assert(nrc == `1`);
805	UNUSED(nrc);
806	UNUSED(bx);
807	UNUSED(by);
808	UNUSED(bs);
809
810	const block_iq2_xs * GGML_RESTRICT x = vx;
811	const block_q8_K * GGML_RESTRICT y = vy;
812
813	const int nb = n / QK_K;
814
815	float sumf = `0.f`;
816	for (int i = `0`; i < nb; ++i) {
817	const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
818	const uint16_t * GGML_RESTRICT q2 = x[i].qs;
819	const uint8_t * GGML_RESTRICT sc = x[i].scales;
820	const int8_t * GGML_RESTRICT q8 = y[i].qs;
821	int32_t bsum = `0`;
822	for (int ib32 = `0`; ib32 < QK_K/`32`; ++ib32) {
823	const uint16_t ls1 = `2`*(sc[ib32] & `0xf`) + `1`;
824	const uint16_t ls2 = `2`*(sc[ib32] >> `4`) + `1`;
825	int32_t sumi = `0`;
826	for (int l = `0`; l < `2`; ++l) {
827	const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & `511`));
828	const uint8_t signs = ksigns_iq2xs[q2[l] >> `9`];
829	for (int j = `0`; j < `8`; ++j) {
830	sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -`1` : `1`);
831	}
832	q8 += `8`;
833	}
834	bsum += sumi * ls1;
835	sumi = `0`;
836	for (int l = `2`; l < `4`; ++l) {
837	const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & `511`));
838	const uint8_t signs = ksigns_iq2xs[q2[l] >> `9`];
839	for (int j = `0`; j < `8`; ++j) {
840	sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -`1` : `1`);
841	}
842	q8 += `8`;
843	}
844	bsum += sumi * ls2;
845	q2 += `4`;
846	}
847	sumf += d * bsum;
848	}
849	s = `0.125f` sumf;
850	}
851
852	void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
853	assert(n % QK_K == `0`);
854	assert(nrc == `1`);
855	UNUSED(nrc);
856	UNUSED(bx);
857	UNUSED(by);
858	UNUSED(bs);
859
860	const block_iq2_s * GGML_RESTRICT x = vx;
861	const block_q8_K * GGML_RESTRICT y = vy;
862
863	const int nb = n / QK_K;
864
865	float sumf = `0`;
866	for (int i = `0`; i < nb; i++) {
867
868	const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
869	const int8_t * q8 = y[i].qs;
870	const uint8_t * qs = x[i].qs;
871	const uint8_t * qh = x[i].qh;
872	const uint8_t * signs = qs + QK_K/`8`;
873
874	int bsum = `0`;
875	for (int ib32 = `0`; ib32 < QK_K/`32`; ++ib32) {
876	int ls1 = `1` + `2`*(x[i].scales[ib32] & `0xf`);
877	int ls2 = `1` + `2`*(x[i].scales[ib32] >> `4`);
878	int sumi1 = `0`, sumi2 = `0`;
879	for (int l = `0`; l < `2`; ++l) {
880	const uint8_t * grid = (const uint8_t )(iq2s_grid + (qs[l] \| (qh[ib32] << (`8`-`2`l) & `0x300`)));
881	for (int j = `0`; j < `8`; ++j) {
882	sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -`1` : `1`);
883	}
884	q8 += `8`;
885	}
886	for (int l = `2`; l < `4`; ++l) {
887	const uint8_t * grid = (const uint8_t )(iq2s_grid + (qs[l] \| (qh[ib32] << (`8`-`2`l) & `0x300`)));
888	for (int j = `0`; j < `8`; ++j) {
889	sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -`1` : `1`);
890	}
891	q8 += `8`;
892	}
893	bsum += ls1 * sumi1 + ls2 * sumi2;
894	qs += `4`;
895	signs += `4`;
896	}
897
898	sumf += d * bsum;
899	}
900
901	s = `0.125f` sumf;
902	}
903
904	void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
905	assert(n % QK_K == `0`);
906	assert(nrc == `1`);
907	UNUSED(nrc);
908	UNUSED(bx);
909	UNUSED(by);
910	UNUSED(bs);
911
912	const block_iq3_xxs * GGML_RESTRICT x = vx;
913	const block_q8_K * GGML_RESTRICT y = vy;
914
915	const int nb = n / QK_K;
916
917	uint32_t aux32;
918
919	float sumf = `0.f`;
920	for (int i = `0`; i < nb; ++i) {
921	const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
922	const uint8_t * GGML_RESTRICT q3 = x[i].qs;
923	const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/`4`;
924	const int8_t * GGML_RESTRICT q8 = y[i].qs;
925	int32_t bsum = `0`;
926	for (int ib32 = `0`; ib32 < QK_K/`32`; ++ib32) {
927	memcpy(dest: &aux32, src: gas, n: sizeof(uint32_t)); gas += sizeof(uint32_t);
928	const uint32_t ls = `2`*(aux32 >> `28`) + `1`;
929	int32_t sumi = `0`;
930	for (int l = `0`; l < `4`; ++l) {
931	const uint8_t * grid1 = (const uint8_t )(iq3xxs_grid + q3[`2`l+`0`]);
932	const uint8_t * grid2 = (const uint8_t )(iq3xxs_grid + q3[`2`l+`1`]);
933	const uint8_t signs = ksigns_iq2xs[(aux32 >> `7`*l) & `127`];
934	for (int j = `0`; j < `4`; ++j) {
935	sumi += grid1[j] * q8[j+`0`] * (signs & kmask_iq2xs[j+`0`] ? -`1` : `1`);
936	sumi += grid2[j] * q8[j+`4`] * (signs & kmask_iq2xs[j+`4`] ? -`1` : `1`);
937	}
938	q8 += `8`;
939	}
940	q3 += `8`;
941	bsum += sumi * ls;
942	}
943	sumf += d * bsum;
944	}
945	s = `0.25f` sumf;
946	}
947
948	void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
949	assert(n % QK_K == `0`);
950	assert(nrc == `1`);
951	UNUSED(nrc);
952	UNUSED(bx);
953	UNUSED(by);
954	UNUSED(bs);
955
956	const block_iq3_s * GGML_RESTRICT x = vx;
957	const block_q8_K * GGML_RESTRICT y = vy;
958
959	const int nb = n / QK_K;
960
961	float sumf = `0.f`;
962	for (int i = `0`; i < nb; ++i) {
963	const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
964	const uint8_t * GGML_RESTRICT qs = x[i].qs;
965	const uint8_t * GGML_RESTRICT qh = x[i].qh;
966	const uint8_t * GGML_RESTRICT signs = x[i].signs;
967	const int8_t * GGML_RESTRICT q8 = y[i].qs;
968	int32_t bsum = `0`;
969	for (int ib32 = `0`; ib32 < QK_K/`32`; ib32 += `2`) {
970	const uint32_t ls1 = `2`*(x[i].scales[ib32/`2`] & `0xf`) + `1`;
971	const uint32_t ls2 = `2`*(x[i].scales[ib32/`2`] >> `4`) + `1`;
972	int32_t sumi = `0`;
973	for (int l = `0`; l < `4`; ++l) {
974	const uint8_t * grid1 = (const uint8_t )(iq3s_grid + (qs[`2`l+`0`] \| ((qh[ib32+`0`] << (`8`-`2`*l)) & `256`)));
975	const uint8_t * grid2 = (const uint8_t )(iq3s_grid + (qs[`2`l+`1`] \| ((qh[ib32+`0`] << (`7`-`2`*l)) & `256`)));
976	for (int j = `0`; j < `4`; ++j) {
977	sumi += grid1[j] * q8[j+`0`] * (signs[l] & kmask_iq2xs[j+`0`] ? -`1` : `1`);
978	sumi += grid2[j] * q8[j+`4`] * (signs[l] & kmask_iq2xs[j+`4`] ? -`1` : `1`);
979	}
980	q8 += `8`;
981	}
982	qs += `8`;
983	signs += `4`;
984	bsum += sumi * ls1;
985	sumi = `0`;
986	for (int l = `0`; l < `4`; ++l) {
987	const uint8_t * grid1 = (const uint8_t )(iq3s_grid + (qs[`2`l+`0`] \| ((qh[ib32+`1`] << (`8`-`2`*l)) & `256`)));
988	const uint8_t * grid2 = (const uint8_t )(iq3s_grid + (qs[`2`l+`1`] \| ((qh[ib32+`1`] << (`7`-`2`*l)) & `256`)));
989	for (int j = `0`; j < `4`; ++j) {
990	sumi += grid1[j] * q8[j+`0`] * (signs[l] & kmask_iq2xs[j+`0`] ? -`1` : `1`);
991	sumi += grid2[j] * q8[j+`4`] * (signs[l] & kmask_iq2xs[j+`4`] ? -`1` : `1`);
992	}
993	q8 += `8`;
994	}
995	qs += `8`;
996	signs += `4`;
997	bsum += sumi * ls2;
998	}
999	sumf += d * bsum;
1000	}
1001	*s = sumf;
1002	}
1003
1004	void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1005	assert(n % QK_K == `0`);
1006	assert(nrc == `1`);
1007	UNUSED(nrc);
1008	UNUSED(bx);
1009	UNUSED(by);
1010	UNUSED(bs);
1011
1012	const block_iq1_s * GGML_RESTRICT x = vx;
1013	const block_q8_K * GGML_RESTRICT y = vy;
1014
1015	const int nb = n / QK_K;
1016
1017	float sumf = `0`;
1018	for (int i = `0`; i < nb; i++) {
1019
1020	const int8_t * q8 = y[i].qs;
1021	const uint8_t * qs = x[i].qs;
1022	const uint16_t * qh = x[i].qh;
1023
1024	int sumi = `0`, sumi1 = `0`;
1025	for (int ib = `0`; ib < QK_K/`32`; ++ib) {
1026	const int ls = `2`*((qh[ib] >> `12`) & `7`) + `1`;
1027	const int delta = qh[ib] & `0x8000` ? -`1` : `1`;
1028	int lsum = `0`;
1029	for (int l = `0`; l < `4`; ++l) {
1030	const int8_t * grid = (const int8_t )(iq1s_grid + (qs[l] \| (((qh[ib] >> `3`l) & `7`) << `8`)));
1031	for (int j = `0`; j < `8`; ++j) {
1032	lsum += q8[j] * grid[j];
1033	}
1034	q8 += `8`;
1035	}
1036	sumi += ls * lsum;
1037	sumi1 += ls * delta * (y[i].bsums[`2`ib+`0`] + y[i].bsums[`2`ib+`1`]);
1038	qs += `4`;
1039	}
1040
1041	sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
1042	}
1043
1044	*s = sumf;
1045	}
1046
1047	void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1048	assert(n % QK_K == `0`);
1049	assert(nrc == `1`);
1050	UNUSED(nrc);
1051	UNUSED(bx);
1052	UNUSED(by);
1053	UNUSED(bs);
1054
1055	const block_iq1_m * GGML_RESTRICT x = vx;
1056	const block_q8_K * GGML_RESTRICT y = vy;
1057
1058	const int nb = n / QK_K;
1059
1060	iq1m_scale_t scale;
1061
1062	int sum1[`2`], sum2[`2`], delta[`4`];
1063
1064	float sumf = `0`;
1065	for (int i = `0`; i < nb; i++) {
1066
1067	const int8_t * q8 = y[i].qs;
1068	const uint8_t * qs = x[i].qs;
1069	const uint8_t * qh = x[i].qh;
1070	const uint16_t * sc = (const uint16_t *)x[i].scales;
1071
1072	scale.u16 = (sc[`0`] >> `12`) \| ((sc[`1`] >> `8`) & `0x00f0`) \| ((sc[`2`] >> `4`) & `0x0f00`) \| (sc[`3`] & `0xf000`);
1073
1074	int sumi1 = `0`, sumi2 = `0`;
1075	for (int ib = `0`; ib < QK_K/`32`; ++ib) {
1076	delta[`0`] = qh[`0`] & `0x08` ? -`1` : `1`;
1077	delta[`1`] = qh[`0`] & `0x80` ? -`1` : `1`;
1078	delta[`2`] = qh[`1`] & `0x08` ? -`1` : `1`;
1079	delta[`3`] = qh[`1`] & `0x80` ? -`1` : `1`;
1080	sum1[`0`] = sum1[`1`] = sum2[`0`] = sum2[`1`] = `0`;
1081	for (int l = `0`; l < `4`; ++l) {
1082	const int8_t * grid = (const int8_t )(iq1s_grid + (qs[l] \| (((uint16_t)qh[l/`2`] << (`8` - `4`(l%`2`))) & `0x700`)));
1083	int lsum1 = `0`, lsum2 = `0`;
1084	for (int j = `0`; j < `8`; ++j) {
1085	lsum1 += q8[j] * grid[j];
1086	lsum2 += q8[j];
1087	}
1088	q8 += `8`;
1089	sum1[l/`2`] += lsum1;
1090	sum2[l/`2`] += lsum2*delta[l];
1091	}
1092
1093	const int ls1 = `2`((sc[ib/`2`] >> (`6`(ib%`2`)+`0`)) & `0x7`) + `1`;
1094	const int ls2 = `2`((sc[ib/`2`] >> (`6`(ib%`2`)+`3`)) & `0x7`) + `1`;
1095
1096	sumi1 += sum1[`0`] * ls1 + sum1[`1`] * ls2;
1097	sumi2 += sum2[`0`] * ls1 + sum2[`1`] * ls2;
1098	qs += `4`;
1099	qh += `2`;
1100	}
1101
1102	sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
1103	}
1104
1105	*s = sumf;
1106	}
1107
1108	void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1109	assert(nrc == `1`);
1110	UNUSED(nrc);
1111	UNUSED(bx);
1112	UNUSED(by);
1113	UNUSED(bs);
1114	assert(n % QK4_NL == `0`);
1115	static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
1116
1117	const block_iq4_nl * GGML_RESTRICT x = vx;
1118	const block_q8_0 * GGML_RESTRICT y = vy;
1119
1120	const int nb = n / QK4_NL;
1121
1122	int ib = `0`;
1123	float sumf = `0`;
1124
1125	for (; ib < nb; ++ib) {
1126	const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
1127	int sumi1 = `0`, sumi2 = `0`;
1128	for (int j = `0`; j < QK4_NL/`2`; ++j) {
1129	sumi1 += y[ib].qs[j+ `0`] * kvalues_iq4nl[x[ib].qs[j] & `0xf`];
1130	sumi2 += y[ib].qs[j+QK4_NL/`2`] * kvalues_iq4nl[x[ib].qs[j] >> `4`];
1131	}
1132	sumf += d * (sumi1 + sumi2);
1133	}
1134	*s = sumf;
1135	}
1136
1137	void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1138	assert(nrc == `1`);
1139	UNUSED(nrc);
1140	UNUSED(bx);
1141	UNUSED(by);
1142	UNUSED(bs);
1143	assert(n % QK_K == `0`);
1144
1145	const block_iq4_xs * GGML_RESTRICT x = vx;
1146	const block_q8_K * GGML_RESTRICT y = vy;
1147
1148	const int nb = n / QK_K;
1149
1150	float sumf = `0`;
1151	for (int ibl = `0`; ibl < nb; ++ibl) {
1152	const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
1153	uint16_t h = x[ibl].scales_h;
1154	const uint8_t * qs = x[ibl].qs;
1155	const int8_t * q8 = y[ibl].qs;
1156	for (int ib = `0`; ib < QK_K/`32`; ib += `2`) {
1157	const uint8_t ls1 = (x[ibl].scales_l[ib/`2`] & `0xf`) \| ((h << `4`) & `0x30`);
1158	const uint8_t ls2 = (x[ibl].scales_l[ib/`2`] >> `4`) \| ((h << `2`) & `0x30`);
1159	h >>= `4`;
1160	const float d1 = d4d8*(ls1 - `32`);
1161	const float d2 = d4d8*(ls2 - `32`);
1162	int sumi1 = `0`, sumi2 = `0`;
1163	for (int j = `0`; j < `16`; ++j) {
1164	sumi1 += q8[j+ `0`] * kvalues_iq4nl[qs[j] & `0xf`];
1165	sumi2 += q8[j+`16`] * kvalues_iq4nl[qs[j] >> `4`];
1166	}
1167	sumf += d1 * (sumi1 + sumi2);
1168	qs += `16`;
1169	q8 += `32`;
1170	sumi1 = sumi2 = `0`;
1171	for (int j = `0`; j < `16`; ++j) {
1172	sumi1 += q8[j+ `0`] * kvalues_iq4nl[qs[j] & `0xf`];
1173	sumi2 += q8[j+`16`] * kvalues_iq4nl[qs[j] >> `4`];
1174	}
1175	sumf += d2 * (sumi1 + sumi2);
1176	qs += `16`;
1177	q8 += `32`;
1178	}
1179	}
1180	*s = sumf;
1181	}
1182
1183	// ============================ 4-bit non-linear quants
1184
1185	void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
1186	assert(k % QK4_NL == `0`);
1187	quantize_row_iq4_nl_ref(x, y, k);
1188	}
1189
1190	void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
1191	assert(k % QK_K == `0`);
1192	quantize_iq4_xs(src: x, dst: y, nrows: `1`, n_per_row: k, NULL);
1193	}
1194

Browse the source code of llama.cpp/ggml/src/ggml-cpu/quants.c