#pragma once

// GGML CPU internal header

#include "ggml.h"
#include "ggml-impl.h"

#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
//#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h>   // fabsf

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_compute_params {
    // ith = thread index, nth = number of threads
    int ith, nth;

    // work buffer for all threads
    size_t wsize;
    void * wdata;

    struct ggml_threadpool * threadpool;
};
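
// Illustrative sketch of how a kernel typically splits its rows across
// threads using these fields (`nrows` is a placeholder, not part of this header):
//
//   for (int64_t ir = params->ith; ir < nrows; ir += params->nth) {
//       // process row ir
//   }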
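// on MSVC these wrappers are plain pass-throughs, since MSVC does not accept
// the GCC/Clang-style vector casts used in the other branch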
#if defined(_MSC_VER)

#define m512bh(p) p
#define m512i(p) p

#else

#define m512bh(p) (__m512bh)(p)
#define m512i(p) (__m512i)(p)

#endif

// __FMA__ and __F16C__ are not defined in MSVC, but they are implied by AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#endif

// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif

#if defined(__s390x__) && defined(__VEC__)
#ifndef __VXE__
#define __VXE__
#endif // __VXE__
#ifndef __VXE2__
#define __VXE2__
#endif // __VXE2__
#endif // __s390x__ && __VEC__

#if defined(__ARM_FEATURE_SVE) && defined(__linux__)
#include <sys/prctl.h>
#endif

#if defined(__ARM_NEON)

// ref: https://github.com/ggml-org/llama.cpp/pull/5404
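// MSVC initializes NEON vectors from 64-bit lanes, so each pair of 32-bit
// values is packed into one 64-bit element there; other compilers accept a
// plain four-element initializer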
#ifdef _MSC_VER
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif // _MSC_VER

#if !defined(__aarch64__)

// 32-bit ARM compatibility

// vaddlvq_s16
// vpaddq_s16
// vpaddq_s32
// vaddvq_s32
// vaddvq_f32
// vmaxvq_f32
// vcvtnq_s32_f32
// vzip1_u8
// vzip2_u8

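// pairwise-widen twice (s16 -> s32 -> s64); each 64-bit partial sum fits in
// its low 32-bit half (lanes 0 and 2 after the reinterpret), so adding those
// two lanes yields the total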
inline static int32_t vaddlvq_s16(int16x8_t v) {
    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
}

inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
    return vcombine_s16(a0, b0);
}

inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
    return vcombine_s32(a0, b0);
}

inline static int32_t vaddvq_s32(int32x4_t v) {
    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}

inline static float vaddvq_f32(float32x4_t v) {
    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}

inline static float vmaxvq_f32(float32x4_t v) {
    return
        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}

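// note: roundf() rounds halfway cases away from zero, while the AArch64
// FCVTNS instruction behind the real intrinsic rounds them to nearest even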
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
    int32x4_t res;

    res[0] = roundf(vgetq_lane_f32(v, 0));
    res[1] = roundf(vgetq_lane_f32(v, 1));
    res[2] = roundf(vgetq_lane_f32(v, 2));
    res[3] = roundf(vgetq_lane_f32(v, 3));

    return res;
}

inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    res[0] = a[0]; res[1] = b[0];
    res[2] = a[1]; res[3] = b[1];
    res[4] = a[2]; res[5] = b[2];
    res[6] = a[3]; res[7] = b[3];

    return res;
}

inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    res[0] = a[4]; res[1] = b[4];
    res[2] = a[5]; res[3] = b[5];
    res[4] = a[6]; res[5] = b[6];
    res[6] = a[7]; res[7] = b[7];

    return res;
}

// vld1q_s16_x2
// vld1q_u8_x2
// vld1q_u8_x4
// vld1q_s8_x2
// vld1q_s8_x4
// TODO: double-check these work correctly

typedef struct ggml_int16x8x2_t {
    int16x8_t val[2];
} ggml_int16x8x2_t;

inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
    ggml_int16x8x2_t res;

    res.val[0] = vld1q_s16(ptr + 0);
    res.val[1] = vld1q_s16(ptr + 8);

    return res;
}

typedef struct ggml_uint8x16x2_t {
    uint8x16_t val[2];
} ggml_uint8x16x2_t;

inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
    ggml_uint8x16x2_t res;

    res.val[0] = vld1q_u8(ptr + 0);
    res.val[1] = vld1q_u8(ptr + 16);

    return res;
}

typedef struct ggml_uint8x16x4_t {
    uint8x16_t val[4];
} ggml_uint8x16x4_t;

inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
    ggml_uint8x16x4_t res;

    res.val[0] = vld1q_u8(ptr + 0);
    res.val[1] = vld1q_u8(ptr + 16);
    res.val[2] = vld1q_u8(ptr + 32);
    res.val[3] = vld1q_u8(ptr + 48);

    return res;
}

typedef struct ggml_int8x16x2_t {
    int8x16_t val[2];
} ggml_int8x16x2_t;

inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
    ggml_int8x16x2_t res;

    res.val[0] = vld1q_s8(ptr + 0);
    res.val[1] = vld1q_s8(ptr + 16);

    return res;
}

typedef struct ggml_int8x16x4_t {
    int8x16_t val[4];
} ggml_int8x16x4_t;

inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
    ggml_int8x16x4_t res;

    res.val[0] = vld1q_s8(ptr + 0);
    res.val[1] = vld1q_s8(ptr + 16);
    res.val[2] = vld1q_s8(ptr + 32);
    res.val[3] = vld1q_s8(ptr + 48);

    return res;
}

// NOTE: not tested
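// scalar emulation of the NEON table lookup; it assumes every index in b is
// < 16, whereas the real instruction returns 0 for out-of-range indices
// (the same caveat applies to the u8 variant below)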
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
    int8x16_t res;

    res[ 0] = a[b[ 0]];
    res[ 1] = a[b[ 1]];
    res[ 2] = a[b[ 2]];
    res[ 3] = a[b[ 3]];
    res[ 4] = a[b[ 4]];
    res[ 5] = a[b[ 5]];
    res[ 6] = a[b[ 6]];
    res[ 7] = a[b[ 7]];
    res[ 8] = a[b[ 8]];
    res[ 9] = a[b[ 9]];
    res[10] = a[b[10]];
    res[11] = a[b[11]];
    res[12] = a[b[12]];
    res[13] = a[b[13]];
    res[14] = a[b[14]];
    res[15] = a[b[15]];

    return res;
}

// NOTE: not tested
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
    uint8x16_t res;

    res[ 0] = a[b[ 0]];
    res[ 1] = a[b[ 1]];
    res[ 2] = a[b[ 2]];
    res[ 3] = a[b[ 3]];
    res[ 4] = a[b[ 4]];
    res[ 5] = a[b[ 5]];
    res[ 6] = a[b[ 6]];
    res[ 7] = a[b[ 7]];
    res[ 8] = a[b[ 8]];
    res[ 9] = a[b[ 9]];
    res[10] = a[b[10]];
    res[11] = a[b[11]];
    res[12] = a[b[12]];
    res[13] = a[b[13]];
    res[14] = a[b[14]];
    res[15] = a[b[15]];

    return res;
}

#else

#define ggml_int16x8x2_t  int16x8x2_t
#define ggml_uint8x16x2_t uint8x16x2_t
#define ggml_uint8x16x4_t uint8x16x4_t
#define ggml_int8x16x2_t  int8x16x2_t
#define ggml_int8x16x4_t  int8x16x4_t

#define ggml_vld1q_s16_x2 vld1q_s16_x2
#define ggml_vld1q_u8_x2  vld1q_u8_x2
#define ggml_vld1q_u8_x4  vld1q_u8_x4
#define ggml_vld1q_s8_x2  vld1q_s8_x2
#define ggml_vld1q_s8_x4  vld1q_s8_x4
#define ggml_vqtbl1q_s8   vqtbl1q_s8
#define ggml_vqtbl1q_u8   vqtbl1q_u8

#endif // !defined(__aarch64__)

#if !defined(__ARM_FEATURE_DOTPROD)

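// emulate vdotq_s32 with widening multiplies and pairwise adds; the per-lane
// grouping differs from a real sdot, but the horizontal sum over all lanes is
// identical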
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));

    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}

#else

#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)

#endif // !defined(__ARM_FEATURE_DOTPROD)

#endif // defined(__ARM_NEON)

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#endif

#ifdef __POWER9_VECTOR__
#include <altivec.h>
#endif

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#include <immintrin.h>
#endif

#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif

#if defined(__loongarch64)
#if defined(__loongarch_asx)
#include <lasxintrin.h>
#endif
#if defined(__loongarch_sx)
#include <lsxintrin.h>
#endif
#endif

#if defined(__VXE__) || defined(__VXE2__)
#include <vecintrin.h>

#define vec_neg(a)    (-(a))                // Vector Negate
#define vec_add(a, b) ((a) + (b))           // Vector Add
#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
#define vec_div(a, b) ((a) / (b))           // Vector Divide
#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet

#ifndef vec_and
#define vec_and(a, b) ((a) & (b))           // Vector AND
#endif

#ifndef vec_or
#define vec_or(a, b)  ((a) | (b))           // Vector OR
#endif

#ifndef vec_xor
#define vec_xor(a, b) ((a) ^ (b))           // Vector XOR
#endif

typedef signed   char char8x16_t  __attribute__((vector_size(16)));
typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));

typedef int8_t  int8x16_t __attribute__((vector_size(16)));
typedef int16_t int16x8_t __attribute__((vector_size(16)));
typedef int32_t int32x4_t __attribute__((vector_size(16)));

typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));

typedef float  float32x4_t  __attribute__((vector_size(16)));
typedef double double64x2_t __attribute__((vector_size(16)));

typedef signed   long long long64x2_t  __attribute__((vector_size(16)));
typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));

typedef struct ggml_uint8x16x2_t {
    uint8x16_t val[2];
} ggml_uint8x16x2_t;

inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) {
    ggml_uint8x16x2_t res;

    res.val[0] = vec_xl( 0, ptr);
    res.val[1] = vec_xl(16, ptr);

    return res;
}

typedef struct ggml_uint8x16x4_t {
    uint8x16_t val[4];
} ggml_uint8x16x4_t;

inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) {
    ggml_uint8x16x4_t res;

    res.val[0] = vec_xl( 0, ptr);
    res.val[1] = vec_xl(16, ptr);
    res.val[2] = vec_xl(32, ptr);
    res.val[3] = vec_xl(48, ptr);

    return res;
}

typedef struct ggml_int8x16x4_t {
    int8x16_t val[4];
} ggml_int8x16x4_t;

inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) {
    ggml_int8x16x4_t res;

    res.val[0] = vec_xl( 0, ptr);
    res.val[1] = vec_xl(16, ptr);
    res.val[2] = vec_xl(32, ptr);
    res.val[3] = vec_xl(48, ptr);

    return res;
}

typedef struct ggml_int16x8x2_t {
    int16x8_t val[2];
} ggml_int16x8x2_t;

inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) {
    ggml_int16x8x2_t res;

    res.val[0] = vec_xl( 0, ptr);
    res.val[1] = vec_xl(16, ptr);

    return res;
}

/*
    ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
    ! or iq4_nl for an example implementation.
*/
inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
    int8x16_t res;

    res[ 0] = a[b[ 0]];
    res[ 1] = a[b[ 1]];
    res[ 2] = a[b[ 2]];
    res[ 3] = a[b[ 3]];
    res[ 4] = a[b[ 4]];
    res[ 5] = a[b[ 5]];
    res[ 6] = a[b[ 6]];
    res[ 7] = a[b[ 7]];
    res[ 8] = a[b[ 8]];
    res[ 9] = a[b[ 9]];
    res[10] = a[b[10]];
    res[11] = a[b[11]];
    res[12] = a[b[12]];
    res[13] = a[b[13]];
    res[14] = a[b[14]];
    res[15] = a[b[15]];

    return res;
}

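// pairwise add of s16 lanes: vec_perm gathers the even-indexed lanes,
// vec_pack keeps the odd (low-half) lanes, and their sum is the pairwise result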
inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
    const uchar8x16_t v_maske = { 0, 1, 4, 5, 8, 9, 12, 13,
                                  16, 17, 20, 21, 24, 25, 28, 29 };

    const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b);
    const int16x8_t v_abe = vec_perm(a, b, v_maske);
    return v_abo + v_abe;
}

/**
 * @see https://github.com/ggml-org/llama.cpp/pull/14037
 */
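// adding the reversed vector puts v0+v3 and v1+v2 into the first two lanes,
// so their sum is the full horizontal sum; the i32 variant uses the same trick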
inline static float vec_hsum_f32x4(float32x4_t v) {
    float32x4_t v_temp = v + vec_reve(v);
    return v_temp[0] + v_temp[1];
}

inline static int32_t vec_hsum_i32x4(int32x4_t v) {
    int32x4_t v_temp = v + vec_reve(v);
    return v_temp[0] + v_temp[1];
}

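// vec_mule/vec_mulo multiply the even/odd byte pairs, and unpackh/unpackl
// widen the 16-bit partial sums to 32 bits before accumulating; as with the
// NEON fallback, lane grouping differs from sdot but the horizontal sum matches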
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
    return acc + (vec_unpackh(p) + vec_unpackl(p));
}

#endif

#if defined(__loongarch_sx)
// broadcast a scalar float into all four 32-bit lanes
static __m128 __lsx_vreplfr2vr_s(const float val) {
    v4f32 res = {val, val, val, val};
    return (__m128)res;
}
#endif

#if defined(__loongarch_asx)
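// broadcast a scalar float into all eight 32-bit lanes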
static __m256 __lasx_xvreplfr2vr_s(const float val) {
    v8f32 res = {val, val, val, val, val, val, val, val};
    return (__m256)res;
}
#endif

// TODO: move to ggml-threading
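// synchronization point: blocks until every thread in the threadpool reaches it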
void ggml_barrier(struct ggml_threadpool * tp);

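// shared chunk counter used to hand out ranges of work to threads dynamically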
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
int  ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);

#ifdef __cplusplus
}
#endif