// Vectorized functions for fundamental operations

#pragma once

#include "ggml-impl.h"
#include "simd-mappings.h"
#include "ggml.h"
#include "ggml-cpu.h"

#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif

// floating point type used to accumulate sums
typedef double ggml_float;

#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16

#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL  2
#define GGML_VEC_MAD_UNROLL  32

#ifdef __cplusplus
extern "C" {
#endif

//
// global data
//

// precomputed gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];

// precomputed quick gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
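
// The two tables above are filled once at startup by the CPU backend init code
// (outside this header). A minimal, hypothetical sketch of how such a table can
// be populated -- every possible f16 bit pattern is decoded, run through the
// activation, and stored back as f16 (ggml_gelu_f32 is defined further down in
// this file):
//
//   for (uint32_t u = 0; u < (1u << 16); ++u) {
//       uint16_t    bits = (uint16_t) u;
//       ggml_fp16_t h;
//       memcpy(&h, &bits, sizeof(h));                // reinterpret the index as an f16 bit pattern
//       const float f = GGML_CPU_FP16_TO_FP32(h);
//       ggml_table_gelu_f16[u] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
//   }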

//
// fundamental operations
//

void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);

void ggml_vec_silu_f32(const int n, float * y, const float * x);
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); // also centers y (y[i] -= mean)
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
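
// Usage sketch (illustrative only, hypothetical buffers): for a plain dot
// product of two rows the stride arguments are unused and nrc is 1, which is
// how ggml_vec_norm_f32 below calls it:
//
//   float x[64], y[64], dot;
//   ggml_vec_dot_f32(64, &dot, 0, x, 0, y, 0, 1);   // dot = sum_i x[i]*y[i]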

inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }

inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    int i = 0;
#if defined(__AVX2__)
    for (; i + 7 < n; i += 8) {
        __m256 vx = _mm256_loadu_ps(x + i);
        __m256 vy = _mm256_loadu_ps(y + i);
        __m256 vz = _mm256_add_ps(vx, vy);
        _mm256_storeu_ps(z + i, vz);
    }
#endif
    for (; i < n; ++i) {
        z[i] = x[i] + y[i];
    }
}

inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
    }
}
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
    }
}
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
    }
}

inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
    }
}
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };

    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
    }

#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)

    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16; // f16 elements per SVE register
    const int ggml_f16_step = 8 * ggml_f16_epr;        // unroll over 8 SVE registers

    const int np = (n & ~(ggml_f16_step - 1));

    svfloat16_t sum_00 = svdup_n_f16(0.0f);
    svfloat16_t sum_01 = svdup_n_f16(0.0f);
    svfloat16_t sum_02 = svdup_n_f16(0.0f);
    svfloat16_t sum_03 = svdup_n_f16(0.0f);

    svfloat16_t sum_10 = svdup_n_f16(0.0f);
    svfloat16_t sum_11 = svdup_n_f16(0.0f);
    svfloat16_t sum_12 = svdup_n_f16(0.0f);
    svfloat16_t sum_13 = svdup_n_f16(0.0f);

    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
    for (int i = 0; i < np; i += ggml_f16_step) {
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);  // first ggml_f16_epr elements

        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0);
        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1);           // sum_00 = sum_00 + ax1*ay1
        ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0);
        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);

        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);  // next ggml_f16_epr elements

        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1);
        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
        ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);

        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);

        ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
        ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);

        ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
        ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);

        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);

        ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
        ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);

        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);

        ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
        ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);

        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);

        ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
        ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);

        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);

        ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
        ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
    }

    const int np2 = (n & ~(ggml_f16_epr - 1));
    for (int k = np; k < np2; k += ggml_f16_epr) {
        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);

        svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
        sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
        rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
        sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
    }

    if (np2 < n) {
        svbool_t pg = svwhilelt_b16(np2, n);
        svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
        svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
        svfloat16_t hy   = svld1_f16(pg, (const __fp16 *)(y + np2));

        sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
        sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
    }
    GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
    GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
    #elif defined(__riscv_v_intrinsic)
    // todo: RVV impl
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
    #else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);

            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);

                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
            }
        }
    }

    // reduce sum0..sum3 to sum0
    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
    #endif
#else
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
#endif

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        s[i] = (float)sumf[i];
    }
}
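
// Illustrative use of the unrolled dot product (hypothetical buffers): xs is
// the byte stride between consecutive x rows, and GGML_VEC_DOT_UNROLL (= 2)
// results are written to s:
//
//   ggml_fp16_t xrows[2][64], y[64];
//   float       s[GGML_VEC_DOT_UNROLL];
//   ggml_vec_dot_f16_unroll(64, (int) sizeof(xrows[0]), s, xrows, y);
//   // s[0] = dot(xrows[0], y), s[1] = dot(xrows[1], y)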

inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)

    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32; // f32 elements per register: SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 8 * ggml_f32_epr;        // unroll over 8 SVE registers
    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
    for (int i = 0; i < np; i += ggml_f32_step) {

        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);

        GGML_F32_VEC_STORE(y + i, ay1);

        ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);

        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);

        ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);

        GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);

        ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);

        GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);

        ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);

        GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);

        ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);

        GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);

        ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);

        GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);

        ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);

        GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
    }
    // leftovers
    // since the loop above unrolls by 8 registers, the leftovers lie in the range [0, ggml_f32_step) and are handled by the loop below
    const int np2 = (n & ~(ggml_f32_epr - 1));
    for (int i = np; i < np2; i += ggml_f32_epr) {
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);

        GGML_F32_VEC_STORE(y + i, ay1);
    }
    // at most ggml_f32_epr - 1 elements remain; apply a predicated svmad to the available elements only
    if (np2 < n) {
        svbool_t pg = svwhilelt_b32(np2, n);
        ax1 = svld1_f32(pg, x + np2);
        ay1 = svld1_f32(pg, y + np2);
        ay1 = svmad_f32_m(pg, ax1, vx, ay1);

        svst1_f32(pg, y + np2, ay1);
    }
    #elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
    #else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
}
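
// ggml_vec_mad_f32 is an axpy-style update, y[i] += x[i]*v. A short usage
// sketch with hypothetical buffers:
//
//   float acc[128] = {0}, row[128];
//   ggml_vec_mad_f32(128, acc, row, 0.5f);   // acc += 0.5f * row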

inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16;
    const int ggml_f16_step = 8 * ggml_f16_epr;

    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);

    const int np = (n & ~(ggml_f16_step - 1));

    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
    for (int i = 0; i < np; i += ggml_f16_step) {
        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);

        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);

        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);

        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);

        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);

        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);

        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);

        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);

        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);

        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);

        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);

        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);

        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);

        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);

        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);

        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
    }
    const int np2 = (n & ~(ggml_f16_epr - 1));
    for (int k = np; k < np2; k += ggml_f16_epr) {
        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
        ry = GGML_F16x_VEC_FMA(ry, rx, vx);

        GGML_F16x_VEC_STORE(y + k, ry, 0);
    }

    if (np2 < n) {
        svbool_t pg = svwhilelt_b16(np2, n);
        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
        hy = svmad_f16_x(pg, hx, vx, hy);
        svst1_f16(pg, (__fp16 *)(y + np2), hy);
    }

    #elif defined(__riscv_v_intrinsic)
    // todo: RVV impl
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
    #else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);

            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
#endif
}

// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {

    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];

    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
        x[i] = (const float *) ((const char *) xv + i*xs);
        v[i] = (const float *) ((const char *) vv + i*vs);
    }

#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    // fall back to the scalar implementation; TODO: write SVE code
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
    #elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
        }
        __riscv_vse32_v_f32m8(&y[i], ay, avl);
    }
    #else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
    }

    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
            }

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = np; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
    #endif
#else
    // scalar
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#endif
}
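
// Sketch of the unrolled variant (hypothetical buffers): xv points at
// GGML_VEC_MAD_UNROLL rows of x spaced xs bytes apart, vv at the matching
// scale factors spaced vs bytes apart, and everything is accumulated into y:
//
//   float y[256], x[GGML_VEC_MAD_UNROLL][256], v[GGML_VEC_MAD_UNROLL];
//   ggml_vec_mad_f32_unroll(256, (int) sizeof(x[0]), (int) sizeof(float),
//                           y, (const float *) x, (const float *) v);
//   // y[i] += sum_k x[k][i]*v[k]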

inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    // scalar; TODO: write SVE code
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
    #elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
    #else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
    GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#endif
}

//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32; // f32 elements per register: SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 2 * ggml_f32_epr;

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t ay1;
    svfloat32_t ay2;
    for (int i = 0; i < np; i += ggml_f32_step) {
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_MUL(ay1, vx);
        GGML_F32_VEC_STORE(y + i, ay1);

        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_MUL(ay2, vx);
        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
    }
    // leftovers
    // at most ggml_f32_step - 1 elements remain; apply a predicated svmul to the available elements only
    for (int i = np; i < n; i += ggml_f32_epr) {
        svbool_t pg = svwhilelt_b32(i, n);
        ay1 = svld1_f32(pg, y + i);
        ay1 = svmul_f32_m(pg, ay1, vx);
        svst1_f32(pg, y + i, ay1);
    }
    #elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
    #else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}

inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16;
    const int ggml_f16_step = 2 * ggml_f16_epr;

    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
    const int np = (n & ~(ggml_f16_step - 1));
    svfloat16_t ay1, ay2;

    for (int i = 0; i < np; i += ggml_f16_step) {
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_MUL(ay1, vx);
        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);

        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_MUL(ay2, vx);
        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
    }
    // leftovers
    // at most ggml_f16_step - 1 elements remain; apply a predicated svmul to the available elements only
    if (np < n) {
        svbool_t pg = svwhilelt_b16(np, n);
        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
        svfloat16_t out = svmul_f16_m(pg, hy, vx);
        svst1_f16(pg, (__fp16 *)(y + np), out);
    }
    #elif defined(__riscv_v_intrinsic)
    // todo: RVV impl
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
    #else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);

            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
#endif
}

inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v*v);
    }
}
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
    }
}
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
    }
}
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        const float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
    }
}
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
    }
}
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
    }
}
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
    }
}
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
    }
}
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
    }
}
inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

static const float GELU_COEF_A     = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;

inline static float ggml_gelu_f32(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
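
// This is the usual tanh approximation of GELU,
//   gelu(x) ~= 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))),
// with SQRT_2_OVER_PI = sqrt(2/pi) and GELU_COEF_A = 0.044715; the argument
// SQRT_2_OVER_PI*x*(1 + GELU_COEF_A*x*x) is that inner term factored as x*(1 + a*x^2).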

inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_table_gelu_f16[i16[i]];
    }
}

inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
        y[i] = GGML_CPU_FP32_TO_FP16(res);
    }
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i];
        } else {
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
        }
    }
}
#else
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]);
    }
}
#endif
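
// The GGML_GELU_FP16 path trades accuracy for speed: the input is rounded to
// f16, its 16-bit pattern indexes ggml_table_gelu_f16, and only the asymptotic
// regions (x <= -10 -> 0, x >= 10 -> x) bypass the table. The same bit-pattern
// trick, written out as a one-element sketch:
//
//   ggml_fp16_t h = GGML_CPU_FP32_TO_FP16(xi);
//   uint16_t    idx;
//   memcpy(&idx, &h, sizeof(idx));
//   float       yi = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[idx]);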

inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
    }
}

inline static float ggml_gelu_quick_f32(float x) {
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}

//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
//    const uint16_t * i16 = (const uint16_t *) x;
//    for (int i = 0; i < n; ++i) {
//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
//    }
//}

#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
    }
}
#else
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]);
    }
}
#endif

inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
    }
}

// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
    return x/(1.0f + expf(-x));
}
inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
    float v = GGML_CPU_FP16_TO_FP32(x);
    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
}

#if __FINITE_MATH_ONLY__
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif

/* Below function was borrowed from the GitHub repository:
https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
    inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
        // Constants
        const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
        const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
        const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
        const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
        const svfloat32_t one = svdup_n_f32(1.0f);
        const svfloat32_t inactive1 = svdup_n_f32(0.0f);
        const svint32_t inactive2 = svdup_n_s32(0);

        // Algorithm starts here
        svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);  // y = x * log2(e)
        svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // round to int (float)
        svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n

        t1 = svsub_f32_m(pg, t0, t1);   // a = y - floor(y)
        t1 = svadd_f32_m(pg, t1, one);  // b = a + 1

        svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
        svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
        t4 = svscale_f32_m(pg, t4, t2);  // fexpa(v) * 2^(n)

        // and_(t2.d, t1.d, not_mask17.d)
        svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
        t5 = svsub_f32_m(pg, t1, t5);               // z
        t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
        t0 = svmla_f32_m(pg, one, t5, t0);          // 1 + (ln2 * z) + (half_ln2_sq * z * z)
        t0 = svmul_f32_m(pg, t0, t4);               // Final result

        return t0;
    }
#endif

#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)

inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
    const svfloat32_t n = svsub_f32_x(pg, z, r);
    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
    const svbool_t c = svacgt_n_f32(pg, n, 126);
    const svfloat32_t u = svmul_f32_x(pg, b, b);
    const svfloat32_t j = svmla_f32_x(pg,
                                      svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
                                      svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
                                                  svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
                     svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
    const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
    const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
    const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
    const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
    const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
    return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
}

#elif defined(__ARM_NEON) && defined(__aarch64__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static float32x4_t ggml_v_expf(float32x4_t x) {
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
    const float32x4_t n = vsubq_f32(z, r);
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
    const float32x4_t u = vmulq_f32(b, b);
    const float32x4_t j = vfmaq_f32(
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
        return vfmaq_f32(k, j, k);
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
}
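
// Rough idea behind ggml_v_expf (shared by all the SIMD variants in this file):
// write x = n*ln(2) + b with n an integer, so exp(x) = 2^n * exp(b). n is
// obtained with the 0x1.8p23f round-to-int trick, exp(b) is approximated by a
// small polynomial (j), and 2^n is built by adding n into the float exponent
// bits (e/k); the s1/s2 path only handles the overflow/underflow tails.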

// computes silu x/(1+exp(-x)) in single precision vector
inline static float32x4_t ggml_v_silu(float32x4_t x) {
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const float32x4_t neg_x = vsubq_f32(zero, x);
    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
    return vdivq_f32(x, one_plus_exp_neg_x);
}

#elif defined(__AVX512F__) && defined(__AVX512DQ__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m512 ggml_v_expf(__m512 x) {
    const __m512 r = _mm512_set1_ps(0x1.8p23f);
    const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
    const __m512 n = _mm512_sub_ps(z, r);
    const __m512 b =
        _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                         _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
    const __mmask16 d =
        _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
    const __m512 u = _mm512_mul_ps(b, b);
    const __m512 j = _mm512_fmadd_ps(
        _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                        _mm512_set1_ps(0x1.573e2ep-5f)),
                        u,
                        _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                        _mm512_set1_ps(0x1.fffdb6p-2f))),
        u,
        _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
    const __m512 res = _mm512_scalef_ps(j, n);
    if (_mm512_kortestz(d, d))
        return res;
    const __m512 zero = _mm512_setzero_ps();
    const __m512 alt = _mm512_mask_blend_ps(
        _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
    return _mm512_mask_blend_ps(d, res, alt);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m512 ggml_v_silu(__m512 x) {
    const __m512 one = _mm512_set1_ps(1);
    const __m512 zero = _mm512_setzero_ps();
    const __m512 neg_x = _mm512_sub_ps(zero, x);
    const __m512 exp_neg_x = ggml_v_expf(neg_x);
    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
    return _mm512_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__AVX2__) && defined(__FMA__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m256 ggml_v_expf(__m256 x) {
    const __m256 r = _mm256_set1_ps(0x1.8p23f);
    const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
    const __m256 n = _mm256_sub_ps(z, r);
    const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                      _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
    const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
    const __m256 k = _mm256_castsi256_ps(
        _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
    const __m256i c = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(126), _CMP_GT_OQ));
    const __m256 u = _mm256_mul_ps(b, b);
    const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                     _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                     _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                     _mm256_set1_ps(0x1.fffdb6p-2f))),
                                     u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
        return _mm256_fmadd_ps(j, k, k);
    const __m256i g = _mm256_and_si256(
        _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
        _mm256_set1_epi32(0x82000000u));
    const __m256 s1 =
        _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
    const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
    const __m256i d = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(192), _CMP_GT_OQ));
    return _mm256_or_ps(
        _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
        _mm256_andnot_ps(
            _mm256_castsi256_ps(d),
            _mm256_or_ps(
                _mm256_and_ps(_mm256_castsi256_ps(c),
                              _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
                _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m256 ggml_v_silu(__m256 x) {
    const __m256 one = _mm256_set1_ps(1);
    const __m256 zero = _mm256_setzero_ps();
    const __m256 neg_x = _mm256_sub_ps(zero, x);
    const __m256 exp_neg_x = ggml_v_expf(neg_x);
    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
    return _mm256_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON

#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m128 ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
    const __m128 n = _mm_sub_ps(z, r);
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
    const __m128 u = _mm_mul_ps(b, b);
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k);
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m128 ggml_v_silu(__m128 x) {
    const __m128 one = _mm_set1_ps(1);
    const __m128 zero = _mm_setzero_ps();
    const __m128 neg_x = _mm_sub_ps(zero, x);
    const __m128 exp_neg_x = ggml_v_expf(neg_x);
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
    return _mm_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__riscv_v_intrinsic)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
#ifdef __riscv_xtheadvector
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
#else
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
#endif
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
                                                    0x1.7f7d1cp-20f, n, vl);
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
        __riscv_vfmacc_vv_f32m2(
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
            u, vl), u, vl);
    if (!__riscv_vcpop_m_b16(c, vl))
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
        c, vl);
    return __riscv_vmerge_vvm_f32m2(
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
        vl);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
    const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
    const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
    return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
}

#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic

inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_silu_f16(x[i]);
    }
}

inline static float ggml_silu_backward_f32(float x, float dy) {
    const float s = 1.0f/(1.0f + expf(-x));
    return dy*s*(1.0f + x*(1.0f - s));
}
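
// This matches the analytic derivative of silu: with s = sigmoid(x),
//   d/dx [x*s] = s + x*s*(1 - s) = s*(1 + x*(1 - s)),
// and the backward pass scales the upstream gradient dy by it.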

inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
    const float v = GGML_CPU_FP16_TO_FP32(x);
    const float s = 1.0f/(1.0f + expf(-v));
    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
}

inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
    }
}

inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
    }
}

inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
    }
}

inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
    }
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i] * g[i];
        } else {
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
        }
    }
}
#else
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
    }
}

void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);

inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
    }
}

inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
    }
}

inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
    }
}

#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
    }
}
#else
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
    }
}

inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = (float)sum;
#else
    vDSP_sve(x, 1, s, n);
#endif
}

inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = sum;
}

inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_CPU_FP16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_BF16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    *s = max;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}

inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    ggml_vec_norm_f32(n, s, x);
    *s = 1.f/(*s);
}

inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float max = -INFINITY;
    int idx = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
        if (max == x[i]) { idx = i; }
    }
    *s = idx;
}

#ifdef __cplusplus
}
#endif