#include <cstdio>
#include <vector>
#include <random>
#include <chrono>
#include <cstdlib>
#include <cmath>
#include <cassert>
#include <cstring>
#include <array>
#include <utility> // std::pair (used by the lookup table in dot3)

#include <ggml.h>
#include <ggml-cpu.h>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

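// Quick benchmark of quantized dot products: the hand-written "scalar" Q4 x float
// kernels below are timed against ggml's quantize-then-multiply (Q4 x Q8_0) path.

// Number of elements in each test vector (1 << 18 = 262144).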
constexpr int kVecSize = 1 << 18;

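// Draws a sample from a standard normal distribution using the Box-Muller transform.
// Each pair of uniform draws yields two samples; the second one is cached in
// lastX/haveX and returned by the next call.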
static float drawFromGaussianPdf(std::mt19937& rndm) {
    constexpr double kScale = 1./(1. + std::mt19937::max());
    constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
    static float lastX;
    static bool haveX = false;
    if (haveX) { haveX = false; return lastX; }
    auto r = sqrt(-2*log(1 - kScale*rndm()));
    auto phi = kTwoPiTimesScale * rndm();
    lastX = r*sin(phi);
    haveX = true;
    return r*cos(phi);
}

static void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
    for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
}

// Copy-pasted from ggml.c
#define QK4_0 32
typedef struct {
    float d;               // delta
    uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
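
// Illustrative sketch only (not used by the benchmark): dequantize a single Q4_0
// block into QK4_0 floats, following the nibble -> [-8..7] mapping and byte order
// assumed by dot() below. The helper name is ours, not part of ggml.
static inline void dequantize_block_q4_0_example(const block_q4_0& b, float* out) {
    for (int j = 0; j < QK4_0/2; ++j) {
        out[2*j + 0] = b.d * (int(b.qs[j] & 0x0f) - 8); // low nibble
        out[2*j + 1] = b.d * (int(b.qs[j] >>   4) - 8); // high nibble
    }
}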

#define QK4_1 32
typedef struct {
    float d;               // delta
    float m;               // min
    uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
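
// In Q4_1 a 4-bit quant q reconstructs to d*q + m (see dot41() below), so the 16
// representable values span [m, m + 15*d] instead of being centred on zero.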

// Copy-pasted from ggml.c
#define QK8_0 32
typedef struct {
    float d;          // delta
    int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
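
// In Q8_0 an int8 quant q reconstructs to d*q; it serves as the intermediate format
// for the second operand in the quantized dot products below.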

// "Scalar" dot product between the quantized vector x and float vector y
inline double dot(int n, const block_q4_0* x, const float* y) {
    const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
    constexpr uint32_t kMask1 = 0x0f0f0f0f;
    uint32_t u1, u2;
    auto q1 = (const uint8_t*)&u1;
    auto q2 = (const uint8_t*)&u2;
    double sum = 0;
    for (int i=0; i<n; ++i) {
        float d = x->d;
        auto u = (const uint32_t*)x->qs;
        float s = 0;
        for (int k=0; k<4; ++k) {
            u1 = u[k] & kMask1;
            u2 = (u[k] >> 4) & kMask1;
            s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
                 y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
                 y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
                 y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
            y += 8;
        }
        sum += s*d;
        ++x;
    }
    return sum;
}
// Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product),
// but about the same on X86_64 (Ryzen 7950X CPU).
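// It decodes both nibbles of a byte at once via a 256-entry table of (low, high)
// value pairs, so a single lookup yields two dequantized values.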
inline double dot3(int n, const block_q4_0* x, const float* y) {
    const static std::pair<float,float> kValues[256] = {
        {-8.f, -8.f}, {-7.f, -8.f}, {-6.f, -8.f}, {-5.f, -8.f}, {-4.f, -8.f}, {-3.f, -8.f}, {-2.f, -8.f}, {-1.f, -8.f},
        { 0.f, -8.f}, { 1.f, -8.f}, { 2.f, -8.f}, { 3.f, -8.f}, { 4.f, -8.f}, { 5.f, -8.f}, { 6.f, -8.f}, { 7.f, -8.f},
        {-8.f, -7.f}, {-7.f, -7.f}, {-6.f, -7.f}, {-5.f, -7.f}, {-4.f, -7.f}, {-3.f, -7.f}, {-2.f, -7.f}, {-1.f, -7.f},
        { 0.f, -7.f}, { 1.f, -7.f}, { 2.f, -7.f}, { 3.f, -7.f}, { 4.f, -7.f}, { 5.f, -7.f}, { 6.f, -7.f}, { 7.f, -7.f},
        {-8.f, -6.f}, {-7.f, -6.f}, {-6.f, -6.f}, {-5.f, -6.f}, {-4.f, -6.f}, {-3.f, -6.f}, {-2.f, -6.f}, {-1.f, -6.f},
        { 0.f, -6.f}, { 1.f, -6.f}, { 2.f, -6.f}, { 3.f, -6.f}, { 4.f, -6.f}, { 5.f, -6.f}, { 6.f, -6.f}, { 7.f, -6.f},
        {-8.f, -5.f}, {-7.f, -5.f}, {-6.f, -5.f}, {-5.f, -5.f}, {-4.f, -5.f}, {-3.f, -5.f}, {-2.f, -5.f}, {-1.f, -5.f},
        { 0.f, -5.f}, { 1.f, -5.f}, { 2.f, -5.f}, { 3.f, -5.f}, { 4.f, -5.f}, { 5.f, -5.f}, { 6.f, -5.f}, { 7.f, -5.f},
        {-8.f, -4.f}, {-7.f, -4.f}, {-6.f, -4.f}, {-5.f, -4.f}, {-4.f, -4.f}, {-3.f, -4.f}, {-2.f, -4.f}, {-1.f, -4.f},
        { 0.f, -4.f}, { 1.f, -4.f}, { 2.f, -4.f}, { 3.f, -4.f}, { 4.f, -4.f}, { 5.f, -4.f}, { 6.f, -4.f}, { 7.f, -4.f},
        {-8.f, -3.f}, {-7.f, -3.f}, {-6.f, -3.f}, {-5.f, -3.f}, {-4.f, -3.f}, {-3.f, -3.f}, {-2.f, -3.f}, {-1.f, -3.f},
        { 0.f, -3.f}, { 1.f, -3.f}, { 2.f, -3.f}, { 3.f, -3.f}, { 4.f, -3.f}, { 5.f, -3.f}, { 6.f, -3.f}, { 7.f, -3.f},
        {-8.f, -2.f}, {-7.f, -2.f}, {-6.f, -2.f}, {-5.f, -2.f}, {-4.f, -2.f}, {-3.f, -2.f}, {-2.f, -2.f}, {-1.f, -2.f},
        { 0.f, -2.f}, { 1.f, -2.f}, { 2.f, -2.f}, { 3.f, -2.f}, { 4.f, -2.f}, { 5.f, -2.f}, { 6.f, -2.f}, { 7.f, -2.f},
        {-8.f, -1.f}, {-7.f, -1.f}, {-6.f, -1.f}, {-5.f, -1.f}, {-4.f, -1.f}, {-3.f, -1.f}, {-2.f, -1.f}, {-1.f, -1.f},
        { 0.f, -1.f}, { 1.f, -1.f}, { 2.f, -1.f}, { 3.f, -1.f}, { 4.f, -1.f}, { 5.f, -1.f}, { 6.f, -1.f}, { 7.f, -1.f},
        {-8.f, 0.f}, {-7.f, 0.f}, {-6.f, 0.f}, {-5.f, 0.f}, {-4.f, 0.f}, {-3.f, 0.f}, {-2.f, 0.f}, {-1.f, 0.f},
        { 0.f, 0.f}, { 1.f, 0.f}, { 2.f, 0.f}, { 3.f, 0.f}, { 4.f, 0.f}, { 5.f, 0.f}, { 6.f, 0.f}, { 7.f, 0.f},
        {-8.f, 1.f}, {-7.f, 1.f}, {-6.f, 1.f}, {-5.f, 1.f}, {-4.f, 1.f}, {-3.f, 1.f}, {-2.f, 1.f}, {-1.f, 1.f},
        { 0.f, 1.f}, { 1.f, 1.f}, { 2.f, 1.f}, { 3.f, 1.f}, { 4.f, 1.f}, { 5.f, 1.f}, { 6.f, 1.f}, { 7.f, 1.f},
        {-8.f, 2.f}, {-7.f, 2.f}, {-6.f, 2.f}, {-5.f, 2.f}, {-4.f, 2.f}, {-3.f, 2.f}, {-2.f, 2.f}, {-1.f, 2.f},
        { 0.f, 2.f}, { 1.f, 2.f}, { 2.f, 2.f}, { 3.f, 2.f}, { 4.f, 2.f}, { 5.f, 2.f}, { 6.f, 2.f}, { 7.f, 2.f},
        {-8.f, 3.f}, {-7.f, 3.f}, {-6.f, 3.f}, {-5.f, 3.f}, {-4.f, 3.f}, {-3.f, 3.f}, {-2.f, 3.f}, {-1.f, 3.f},
        { 0.f, 3.f}, { 1.f, 3.f}, { 2.f, 3.f}, { 3.f, 3.f}, { 4.f, 3.f}, { 5.f, 3.f}, { 6.f, 3.f}, { 7.f, 3.f},
        {-8.f, 4.f}, {-7.f, 4.f}, {-6.f, 4.f}, {-5.f, 4.f}, {-4.f, 4.f}, {-3.f, 4.f}, {-2.f, 4.f}, {-1.f, 4.f},
        { 0.f, 4.f}, { 1.f, 4.f}, { 2.f, 4.f}, { 3.f, 4.f}, { 4.f, 4.f}, { 5.f, 4.f}, { 6.f, 4.f}, { 7.f, 4.f},
        {-8.f, 5.f}, {-7.f, 5.f}, {-6.f, 5.f}, {-5.f, 5.f}, {-4.f, 5.f}, {-3.f, 5.f}, {-2.f, 5.f}, {-1.f, 5.f},
        { 0.f, 5.f}, { 1.f, 5.f}, { 2.f, 5.f}, { 3.f, 5.f}, { 4.f, 5.f}, { 5.f, 5.f}, { 6.f, 5.f}, { 7.f, 5.f},
        {-8.f, 6.f}, {-7.f, 6.f}, {-6.f, 6.f}, {-5.f, 6.f}, {-4.f, 6.f}, {-3.f, 6.f}, {-2.f, 6.f}, {-1.f, 6.f},
        { 0.f, 6.f}, { 1.f, 6.f}, { 2.f, 6.f}, { 3.f, 6.f}, { 4.f, 6.f}, { 5.f, 6.f}, { 6.f, 6.f}, { 7.f, 6.f},
        {-8.f, 7.f}, {-7.f, 7.f}, {-6.f, 7.f}, {-5.f, 7.f}, {-4.f, 7.f}, {-3.f, 7.f}, {-2.f, 7.f}, {-1.f, 7.f},
        { 0.f, 7.f}, { 1.f, 7.f}, { 2.f, 7.f}, { 3.f, 7.f}, { 4.f, 7.f}, { 5.f, 7.f}, { 6.f, 7.f}, { 7.f, 7.f}
    };
    double sum = 0;
    for (int i=0; i<n; ++i) {
        float d = x->d;
        auto q = x->qs;
        float s = 0;
        for (int k=0; k<4; ++k) {
            s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second +
                 y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second +
                 y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second +
                 y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second;
            y += 8; q += 4;
        }
        sum += s*d;
        ++x;
    }
    return sum;
}

inline double dot41(int n, const block_q4_1* x, const float* y) {
    const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f};
    constexpr uint32_t kMask1 = 0x0f0f0f0f;
    uint32_t u1, u2;
    auto q1 = (const uint8_t*)&u1;
    auto q2 = (const uint8_t*)&u2;
    double sum = 0;
    for (int i=0; i<n; ++i) {
        auto u = (const uint32_t*)x->qs;
        float s = 0, s1 = 0;
        for (int k=0; k<4; ++k) {
            u1 = u[k] & kMask1;
            u2 = (u[k] >> 4) & kMask1;
            s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
                 y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
                 y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
                 y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
            s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7];
            y += 8;
        }
        sum += s*x->d + s1*x->m;
        ++x;
    }
    return sum;
}

// Copy-pasted from ggml.c
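// Reference Q8_0 quantization: for each block of QK8_0 values the scale is
// d = max|v| / 127, and each value is stored as round(v/d) in [-127, 127].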
static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) {
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max

        for (int l = 0; l < QK8_0; l++) {
            const float v = x[i*QK8_0 + l];
            amax = std::max(amax, fabsf(v));
        }

        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f/d : 0.0f;

        y[i].d = d;

        for (int l = 0; l < QK8_0; ++l) {
            const float v = x[i*QK8_0 + l]*id;
            y[i].qs[l] = roundf(v);
        }
    }
}

// Copy-pasted from ggml.c
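// Scalar reference for the Q4_0 x Q8_0 dot product: per block it accumulates the
// integer products (q4 - 8)*q8 and scales the block total by d0*d1.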
static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) {
    const int nb = n / QK8_0;
    const block_q4_0* x = (const block_q4_0*)vx;
    const block_q8_0* y = (const block_q8_0*)vy;
    float sumf = 0;
    for (int i = 0; i < nb; i++) {
        const float d0 = x[i].d;
        const float d1 = y[i].d;

        const uint8_t * p0 = x[i].qs;
        const  int8_t * p1 = y[i].qs;

        int sumi = 0;
        for (int j = 0; j < QK8_0/2; j++) {
            const uint8_t v0 = p0[j];

            const int i0 = (int8_t) (v0 & 0xf) - 8;
            const int i1 = (int8_t) (v0 >> 4) - 8;

            const int i2 = p1[2*j + 0];
            const int i3 = p1[2*j + 1];

            sumi += i0*i2 + i1*i3;
        }
        sumf += d0*d1*sumi;
    }
    *s = sumf;
}

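// Command line: [nloop] [scalar] [useQ4_1]
//   nloop   - number of benchmark iterations (default 10)
//   scalar  - non-zero selects the scalar Q4_0 x Q8_0 reference path above
//   useQ4_1 - non-zero benchmarks Q4_1 instead of Q4_0 (cannot be combined with scalar)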
int main(int argc, char** argv) {

    int nloop = argc > 1 ? atoi(argv[1]) : 10;
    bool scalar = argc > 2 ? atoi(argv[2]) : false;
    bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false;

    if (scalar && useQ4_1) {
        printf("Q4_1 quantization cannot be combined with the scalar implementation\n");
        return 1;
    }

    std::mt19937 rndm(1234);

    std::vector<float> x1(kVecSize), y1(kVecSize);
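
    // Number of quantized blocks needed for kVecSize elements; both counts are rounded
    // up to a multiple of 64 blocks, so the buffers are slightly over-allocated.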
    int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
    int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);

    const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);

    std::vector<block_q4_0> q40;
    std::vector<block_q4_1> q41;
    if (useQ4_1) q41.resize(n4);
    else q40.resize(n4);
    std::vector<block_q8_0> q8(n8);
    double sumt = 0, sumt2 = 0, maxt = 0;
    double sumqt = 0, sumqt2 = 0, maxqt = 0;
    double sum = 0, sumq = 0, exactSum = 0;
    for (int iloop=0; iloop<nloop; ++iloop) {

        // Fill vector x with random numbers
        fillRandomGaussianFloats(x1, rndm);

        // Fill vector y with random numbers
        fillRandomGaussianFloats(y1, rndm);

        // Compute the exact dot product
        for (int k=0; k<kVecSize; ++k) exactSum += x1[k]*y1[k];

        // Quantize x.
        // Note: this step is not included in the timing, because in a practical application
        // the quantized model weights are already available.
        if (useQ4_1) {
            funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
        } else {
            funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
        }

        // Now measure the time the dot product takes using the "scalar" version above
        auto t1 = std::chrono::high_resolution_clock::now();
        if (useQ4_1) sum += dot41(kVecSize / QK4_1, q41.data(), y1.data());
        else sum += dot(kVecSize / QK4_0, q40.data(), y1.data());
        auto t2 = std::chrono::high_resolution_clock::now();
        auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
        sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);

        // And now measure the time needed to quantize y and perform the dot product with the quantized y
        t1 = std::chrono::high_resolution_clock::now();
        float result;
        if (scalar) {
            quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize);
            dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
        }
        else {
            const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
            vdot->from_float(y1.data(), q8.data(), kVecSize);
            if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
            else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
        }
        sumq += result;
        t2 = std::chrono::high_resolution_clock::now();
        t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
        sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t);

    }

    // Report the time (and the average of the dot products, so the compiler cannot simply
    // optimize away the function calls after noticing that the result is never used).
    sum /= nloop; sumq /= nloop;
    exactSum /= nloop;
    printf("Exact result: <dot> = %g\n", exactSum);
    printf("<dot> = %g, %g\n", sum, sumq);
    sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt;
    if (sumt2 > 0) sumt2 = sqrt(sumt2);
    printf("time = %g +/- %g us. maxt = %g us\n", sumt, sumt2, maxt);
    sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt;
    if (sumqt2 > 0) sumqt2 = sqrt(sumqt2);
    printf("timeq = %g +/- %g us. maxqt = %g us\n", sumqt, sumqt2, maxqt);
    return 0;
}