// Vectorized functions for fundamental operations

#pragma once

#include "ggml-impl.h"
#include "simd-mappings.h"
#include "ggml.h"
#include "ggml-cpu.h"

#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif

// floating point type used to accumulate sums
typedef double ggml_float;

#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16

#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL  2
#define GGML_VEC_MAD_UNROLL  32

#ifdef __cplusplus
extern "C" {
#endif

//
// global data
//

// precomputed gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];

// precomputed quick gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
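
// The two tables above are filled once at startup by the CPU backend init code
// (outside this header). A minimal, hypothetical sketch of how such a table can
// be populated -- every possible f16 bit pattern is decoded, run through the
// activation, and stored back as f16 (ggml_gelu_f32 is defined further down in
// this file):
//
//   for (uint32_t u = 0; u < (1u << 16); ++u) {
//       uint16_t    bits = (uint16_t) u;
//       ggml_fp16_t h;
//       memcpy(&h, &bits, sizeof(h));                // reinterpret the index as an f16 bit pattern
//       const float f = GGML_CPU_FP16_TO_FP32(h);
//       ggml_table_gelu_f16[u] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
//   }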

//
// fundamental operations
//

void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);

void ggml_vec_silu_f32(const int n, float * y, const float * x);
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); // also centers y (y[i] -= mean)
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
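
// Usage sketch (illustrative only, hypothetical buffers): for a plain dot
// product of two rows the stride arguments are unused and nrc is 1, which is
// how ggml_vec_norm_f32 below calls it:
//
//   float x[64], y[64], dot;
//   ggml_vec_dot_f32(64, &dot, 0, x, 0, y, 0, 1);   // dot = sum_i x[i]*y[i]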

inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }

inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    int i = 0;
#if defined(__AVX2__)
    for (; i + 7 < n; i += 8) {
        __m256 vx = _mm256_loadu_ps(x + i);
        __m256 vy = _mm256_loadu_ps(y + i);
        __m256 vz = _mm256_add_ps(vx, vy);
        _mm256_storeu_ps(z + i, vz);
    }
#endif
    for (; i < n; ++i) {
        z[i] = x[i] + y[i];
    }
}

inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
    }
}
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
    }
}
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
    }
}

inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
    }
}
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };

    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
    }

#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)

    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16; // f16 elements per SVE register
    const int ggml_f16_step = 8 * ggml_f16_epr;        // unroll over 8 SVE registers

    const int np = (n & ~(ggml_f16_step - 1));

    svfloat16_t sum_00 = svdup_n_f16(0.0f);
    svfloat16_t sum_01 = svdup_n_f16(0.0f);
    svfloat16_t sum_02 = svdup_n_f16(0.0f);
    svfloat16_t sum_03 = svdup_n_f16(0.0f);

    svfloat16_t sum_10 = svdup_n_f16(0.0f);
    svfloat16_t sum_11 = svdup_n_f16(0.0f);
    svfloat16_t sum_12 = svdup_n_f16(0.0f);
    svfloat16_t sum_13 = svdup_n_f16(0.0f);

    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
    for (int i = 0; i < np; i += ggml_f16_step) {
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);  // first ggml_f16_epr elements

        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0);
        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1);           // sum_00 = sum_00 + ax1*ay1
        ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0);
        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);

        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);  // next ggml_f16_epr elements

        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1);
        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
        ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);

        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);

        ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
        ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);

        ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
        ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);

        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);

        ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
        ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);

        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);

        ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
        ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);

        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);

        ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
        ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);

        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);

        ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
        ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
    }

    const int np2 = (n & ~(ggml_f16_epr - 1));
    for (int k = np; k < np2; k += ggml_f16_epr) {
        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);

        svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
        sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
        rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
        sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
    }

    if (np2 < n) {
        svbool_t pg = svwhilelt_b16(np2, n);
        svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
        svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
        svfloat16_t hy   = svld1_f16(pg, (const __fp16 *)(y + np2));

        sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
        sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
    }
    GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
    GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
    #elif defined(__riscv_v_intrinsic)
    // todo: RVV impl
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
    #else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);

            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);

                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
            }
        }
    }

    // reduce sum0..sum3 to sum0
    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
    #endif
#else
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
#endif

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        s[i] = (float)sumf[i];
    }
}
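
// Illustrative use of the unrolled dot product (hypothetical buffers): xs is
// the byte stride between consecutive x rows, and GGML_VEC_DOT_UNROLL (= 2)
// results are written to s:
//
//   ggml_fp16_t xrows[2][64], y[64];
//   float       s[GGML_VEC_DOT_UNROLL];
//   ggml_vec_dot_f16_unroll(64, (int) sizeof(xrows[0]), s, xrows, y);
//   // s[0] = dot(xrows[0], y), s[1] = dot(xrows[1], y)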

inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)

    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32; // f32 elements per register: SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 8 * ggml_f32_epr;        // unroll over 8 SVE registers
    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
    for (int i = 0; i < np; i += ggml_f32_step) {

        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);

        GGML_F32_VEC_STORE(y + i, ay1);

        ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);

        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);

        ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);

        GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);

        ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);

        GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);

        ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);

        GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);

        ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);

        GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);

        ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);

        GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);

        ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);

        GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
    }
    // leftovers
    // since the loop above unrolls by 8 registers, the leftovers lie in the range [0, ggml_f32_step) and are handled by the loop below
    const int np2 = (n & ~(ggml_f32_epr - 1));
    for (int i = np; i < np2; i += ggml_f32_epr) {
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);

        GGML_F32_VEC_STORE(y + i, ay1);
    }
    // at most ggml_f32_epr - 1 elements remain; apply a predicated svmad to the available elements only
    if (np2 < n) {
        svbool_t pg = svwhilelt_b32(np2, n);
        ax1 = svld1_f32(pg, x + np2);
        ay1 = svld1_f32(pg, y + np2);
        ay1 = svmad_f32_m(pg, ax1, vx, ay1);

        svst1_f32(pg, y + np2, ay1);
    }
    #elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
    #else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
}
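
// ggml_vec_mad_f32 is an axpy-style update, y[i] += x[i]*v. A short usage
// sketch with hypothetical buffers:
//
//   float acc[128] = {0}, row[128];
//   ggml_vec_mad_f32(128, acc, row, 0.5f);   // acc += 0.5f * row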

inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16;
    const int ggml_f16_step = 8 * ggml_f16_epr;

    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);

    const int np = (n & ~(ggml_f16_step - 1));

    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
    for (int i = 0; i < np; i += ggml_f16_step) {
        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);

        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);

        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);

        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);

        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);

        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);

        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);

        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);

        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);

        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);

        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);

        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);

        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);

        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);

        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);

        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
    }
    const int np2 = (n & ~(ggml_f16_epr - 1));
    for (int k = np; k < np2; k += ggml_f16_epr) {
        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
        ry = GGML_F16x_VEC_FMA(ry, rx, vx);

        GGML_F16x_VEC_STORE(y + k, ry, 0);
    }

    if (np2 < n) {
        svbool_t pg = svwhilelt_b16(np2, n);
        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
        hy = svmad_f16_x(pg, hx, vx, hy);
        svst1_f16(pg, (__fp16 *)(y + np2), hy);
    }

    #elif defined(__riscv_v_intrinsic)
    // todo: RVV impl
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
    #else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);

            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
#endif
}

// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {

    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];

    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
        x[i] = (const float *) ((const char *) xv + i*xs);
        v[i] = (const float *) ((const char *) vv + i*vs);
    }

#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    // fall back to the scalar implementation; TODO: write SVE code
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
    #elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
        }
        __riscv_vse32_v_f32m8(&y[i], ay, avl);
    }
    #else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
    }

    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
            }

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = np; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
    #endif
#else
    // scalar
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#endif
}
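
// Sketch of the unrolled variant (hypothetical buffers): xv points at
// GGML_VEC_MAD_UNROLL rows of x spaced xs bytes apart, vv at the matching
// scale factors spaced vs bytes apart, and everything is accumulated into y:
//
//   float y[256], x[GGML_VEC_MAD_UNROLL][256], v[GGML_VEC_MAD_UNROLL];
//   ggml_vec_mad_f32_unroll(256, (int) sizeof(x[0]), (int) sizeof(float),
//                           y, (const float *) x, (const float *) v);
//   // y[i] += sum_k x[k][i]*v[k]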

inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    // scalar; TODO: write SVE code
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
    #elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
    #else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
    GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#endif
}

//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32; // f32 elements per register: SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 2 * ggml_f32_epr;

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t ay1;
    svfloat32_t ay2;
    for (int i = 0; i < np; i += ggml_f32_step) {
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_MUL(ay1, vx);
        GGML_F32_VEC_STORE(y + i, ay1);

        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_MUL(ay2, vx);
        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
    }
    // leftovers
    // at most ggml_f32_step - 1 elements remain; apply a predicated svmul to the available elements only
    for (int i = np; i < n; i += ggml_f32_epr) {
        svbool_t pg = svwhilelt_b32(i, n);
        ay1 = svld1_f32(pg, y + i);
        ay1 = svmul_f32_m(pg, ay1, vx);
        svst1_f32(pg, y + i, ay1);
    }
    #elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
    #else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}

inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
#if defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16;
    const int ggml_f16_step = 2 * ggml_f16_epr;

    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
    const int np = (n & ~(ggml_f16_step - 1));
    svfloat16_t ay1, ay2;

    for (int i = 0; i < np; i += ggml_f16_step) {
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_MUL(ay1, vx);
        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);

        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_MUL(ay2, vx);
        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
    }
    // leftovers
    // at most ggml_f16_step - 1 elements remain; apply a predicated svmul to the available elements only
    if (np < n) {
        svbool_t pg = svwhilelt_b16(np, n);
        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
        svfloat16_t out = svmul_f16_m(pg, hy, vx);
        svst1_f16(pg, (__fp16 *)(y + np), out);
    }
    #elif defined(__riscv_v_intrinsic)
    // todo: RVV impl
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
    #else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);

            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
    #endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
#endif
}

inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v*v);
    }
}
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
    }
}
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
    }
}
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        const float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
    }
}
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
    }
}
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
    }
}
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
    }
}
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
    }
}
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
    }
}
inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

static const float GELU_COEF_A     = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;

inline static float ggml_gelu_f32(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
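
// This is the usual tanh approximation of GELU,
//   gelu(x) ~= 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))),
// with SQRT_2_OVER_PI = sqrt(2/pi) and GELU_COEF_A = 0.044715; the argument
// SQRT_2_OVER_PI*x*(1 + GELU_COEF_A*x*x) is that inner term factored as x*(1 + a*x^2).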

inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_table_gelu_f16[i16[i]];
    }
}

inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
        y[i] = GGML_CPU_FP32_TO_FP16(res);
    }
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i];
        } else {
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
        }
    }
}
#else
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]);
    }
}
#endif
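
// The GGML_GELU_FP16 path trades accuracy for speed: the input is rounded to
// f16, its 16-bit pattern indexes ggml_table_gelu_f16, and only the asymptotic
// regions (x <= -10 -> 0, x >= 10 -> x) bypass the table. The same bit-pattern
// trick, written out as a one-element sketch:
//
//   ggml_fp16_t h = GGML_CPU_FP32_TO_FP16(xi);
//   uint16_t    idx;
//   memcpy(&idx, &h, sizeof(idx));
//   float       yi = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[idx]);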

inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
    }
}

inline static float ggml_gelu_quick_f32(float x) {
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}

//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
//    const uint16_t * i16 = (const uint16_t *) x;
//    for (int i = 0; i < n; ++i) {
//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
//    }
//}

#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
    }
}
#else
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]);
    }
}
#endif

inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
    }
}

// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
    return x/(1.0f + expf(-x));
}
inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
    float v = GGML_CPU_FP16_TO_FP32(x);
    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
}

#if __FINITE_MATH_ONLY__
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif

/* Below function was borrowed from the GitHub repository:
https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
    inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
        // Constants
        const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
        const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
        const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
        const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
        const svfloat32_t one = svdup_n_f32(1.0f);
        const svfloat32_t inactive1 = svdup_n_f32(0.0f);
        const svint32_t inactive2 = svdup_n_s32(0);

        // Algorithm starts here
        svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);  // y = x * log2(e)
        svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // round to int (float)
        svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n

        t1 = svsub_f32_m(pg, t0, t1);   // a = y - floor(y)
        t1 = svadd_f32_m(pg, t1, one);  // b = a + 1

        svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
        svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
        t4 = svscale_f32_m(pg, t4, t2);  // fexpa(v) * 2^(n)

        // and_(t2.d, t1.d, not_mask17.d)
        svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
        t5 = svsub_f32_m(pg, t1, t5);               // z
        t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
        t0 = svmla_f32_m(pg, one, t5, t0);          // 1 + (ln2 * z) + (half_ln2_sq * z * z)
        t0 = svmul_f32_m(pg, t0, t4);               // Final result

        return t0;
    }
#endif

#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)

inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
    const svfloat32_t n = svsub_f32_x(pg, z, r);
    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
    const svbool_t c = svacgt_n_f32(pg, n, 126);
    const svfloat32_t u = svmul_f32_x(pg, b, b);
    const svfloat32_t j = svmla_f32_x(pg,
                                      svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
                                      svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
                                                  svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
                     svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
    const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
    const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
    const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
    const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
    const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
    return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
}

#elif defined(__ARM_NEON) && defined(__aarch64__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static float32x4_t ggml_v_expf(float32x4_t x) {
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
    const float32x4_t n = vsubq_f32(z, r);
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
    const float32x4_t u = vmulq_f32(b, b);
    const float32x4_t j = vfmaq_f32(
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
        return vfmaq_f32(k, j, k);
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
}
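
// Rough idea behind ggml_v_expf (shared by all the SIMD variants in this file):
// write x = n*ln(2) + b with n an integer, so exp(x) = 2^n * exp(b). n is
// obtained with the 0x1.8p23f round-to-int trick, exp(b) is approximated by a
// small polynomial (j), and 2^n is built by adding n into the float exponent
// bits (e/k); the s1/s2 path only handles the overflow/underflow tails.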

// computes silu x/(1+exp(-x)) in single precision vector
inline static float32x4_t ggml_v_silu(float32x4_t x) {
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const float32x4_t neg_x = vsubq_f32(zero, x);
    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
    return vdivq_f32(x, one_plus_exp_neg_x);
}

#elif defined(__AVX512F__) && defined(__AVX512DQ__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m512 ggml_v_expf(__m512 x) {
    const __m512 r = _mm512_set1_ps(0x1.8p23f);
    const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
    const __m512 n = _mm512_sub_ps(z, r);
    const __m512 b =
        _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                         _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
    const __mmask16 d =
        _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
    const __m512 u = _mm512_mul_ps(b, b);
    const __m512 j = _mm512_fmadd_ps(
        _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                        _mm512_set1_ps(0x1.573e2ep-5f)),
                        u,
                        _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                        _mm512_set1_ps(0x1.fffdb6p-2f))),
        u,
        _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
    const __m512 res = _mm512_scalef_ps(j, n);
    if (_mm512_kortestz(d, d))
        return res;
    const __m512 zero = _mm512_setzero_ps();
    const __m512 alt = _mm512_mask_blend_ps(
        _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
    return _mm512_mask_blend_ps(d, res, alt);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m512 ggml_v_silu(__m512 x) {
    const __m512 one = _mm512_set1_ps(1);
    const __m512 zero = _mm512_setzero_ps();
    const __m512 neg_x = _mm512_sub_ps(zero, x);
    const __m512 exp_neg_x = ggml_v_expf(neg_x);
    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
    return _mm512_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__AVX2__) && defined(__FMA__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m256 ggml_v_expf(__m256 x) {
    const __m256 r = _mm256_set1_ps(0x1.8p23f);
    const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
    const __m256 n = _mm256_sub_ps(z, r);
    const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                      _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
    const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
    const __m256 k = _mm256_castsi256_ps(
        _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
    const __m256i c = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(126), _CMP_GT_OQ));
    const __m256 u = _mm256_mul_ps(b, b);
    const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                     _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                     _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                     _mm256_set1_ps(0x1.fffdb6p-2f))),
                                     u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
        return _mm256_fmadd_ps(j, k, k);
    const __m256i g = _mm256_and_si256(
        _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
        _mm256_set1_epi32(0x82000000u));
    const __m256 s1 =
        _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
    const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
    const __m256i d = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(192), _CMP_GT_OQ));
    return _mm256_or_ps(
        _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
        _mm256_andnot_ps(
            _mm256_castsi256_ps(d),
            _mm256_or_ps(
                _mm256_and_ps(_mm256_castsi256_ps(c),
                              _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
                _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m256 ggml_v_silu(__m256 x) {
    const __m256 one = _mm256_set1_ps(1);
    const __m256 zero = _mm256_setzero_ps();
    const __m256 neg_x = _mm256_sub_ps(zero, x);
    const __m256 exp_neg_x = ggml_v_expf(neg_x);
    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
    return _mm256_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON

#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m128 ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
    const __m128 n = _mm_sub_ps(z, r);
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
    const __m128 u = _mm_mul_ps(b, b);
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k);
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m128 ggml_v_silu(__m128 x) {
    const __m128 one = _mm_set1_ps(1);
    const __m128 zero = _mm_setzero_ps();
    const __m128 neg_x = _mm_sub_ps(zero, x);
    const __m128 exp_neg_x = ggml_v_expf(neg_x);
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
    return _mm_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__riscv_v_intrinsic)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
#ifdef __riscv_xtheadvector
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
#else
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
#endif
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
                                                    0x1.7f7d1cp-20f, n, vl);
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
        __riscv_vfmacc_vv_f32m2(
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
            u, vl), u, vl);
    if (!__riscv_vcpop_m_b16(c, vl))
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
        c, vl);
    return __riscv_vmerge_vvm_f32m2(
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
        vl);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
    const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
    const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
    return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
}

#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic

inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_silu_f16(x[i]);
    }
}

inline static float ggml_silu_backward_f32(float x, float dy) {
    const float s = 1.0f/(1.0f + expf(-x));
    return dy*s*(1.0f + x*(1.0f - s));
}
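
// This matches the analytic derivative of silu: with s = sigmoid(x),
//   d/dx [x*s] = s + x*s*(1 - s) = s*(1 + x*(1 - s)),
// and the backward pass scales the upstream gradient dy by it.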

inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
    const float v = GGML_CPU_FP16_TO_FP32(x);
    const float s = 1.0f/(1.0f + expf(-v));
    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
}

inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
    }
}

inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
    }
}

inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
    }
}

inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
    }
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i] * g[i];
        } else {
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
        }
    }
}
#else
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
    }
}

void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);

inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
    }
}

inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
    }
}

inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
    }
}

#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
    }
}
#else
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
    }
}

inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = (float)sum;
#else
    vDSP_sve(x, 1, s, n);
#endif
}

inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = sum;
}

inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_CPU_FP16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_BF16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    *s = max;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}

inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    ggml_vec_norm_f32(n, s, x);
    *s = 1.f/(*s);
}

inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float max = -INFINITY;
    int idx = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
        if (max == x[i]) { idx = i; }
    }
    *s = idx;
}

#ifdef __cplusplus
}
#endif