1#pragma once
2
3#include "ggml-cpu-impl.h"
4
5#ifdef __ARM_FEATURE_SVE
6#include <arm_sve.h>
7#endif // __ARM_FEATURE_SVE
8
9#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
10// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
11//
12// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
13//
14#include <arm_neon.h>
15#endif
16
17#if defined(__F16C__)
18#include <immintrin.h>
19#endif
20
21#if defined(__riscv_v_intrinsic)
22#include <riscv_vector.h>
23#endif
24
25#ifdef __cplusplus
26extern "C" {
27#endif
28
//
// simd mappings
//

// FP16 <-> FP32 conversion

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
//
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
//
42#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
43 #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
44 #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
45
46 #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
47
48 static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
49 __fp16 tmp;
50 memcpy(&tmp, &h, sizeof(ggml_fp16_t));
51 return (float)tmp;
52 }
53
54 static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
55 ggml_fp16_t res;
56 __fp16 tmp = f;
57 memcpy(&res, &tmp, sizeof(ggml_fp16_t));
58 return res;
59 }
60#elif defined(__F16C__)
61 #ifdef _MSC_VER
62 #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
63 #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
64 #else
65 #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
66 #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
67 #endif
68#elif defined(__POWER9_VECTOR__)
69 #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
70 #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
71 /* the inline asm below is about 12% faster than the lookup method */
72 #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
73 #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
74
75 static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
76 float f;
77 double d;
78 __asm__(
79 "mtfprd %0,%2\n"
80 "xscvhpdp %0,%0\n"
81 "frsp %1,%0\n" :
82 /* temp */ "=d"(d),
83 /* out */ "=f"(f):
84 /* in */ "r"(h));
85 return f;
86 }
87
88 static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
89 double d;
90 ggml_fp16_t r;
91 __asm__( /* xscvdphp can work on double or single precision */
92 "xscvdphp %0,%2\n"
93 "mffprd %1,%0\n" :
94 /* temp */ "=d"(d),
95 /* out */ "=r"(r):
96 /* in */ "f"(f));
97 return r;
98 }
99#elif defined(__riscv) && defined(__riscv_zfhmin)
100 static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
101 _Float16 hf;
102 memcpy(&hf, &h, sizeof(ggml_fp16_t));
103 return hf;
104 }
105
106 static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
107 ggml_fp16_t res;
108 _Float16 hf = (_Float16)f;
109 memcpy(&res, &hf, sizeof(ggml_fp16_t));
110 return res;
111 }
112
113 #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
114 #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
115 #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
116 #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
117#endif
118
119// precomputed f32 table for f16 (256 KB)
120// defined in ggml-cpu.c, initialized in ggml_cpu_init()
121extern float ggml_table_f32_f16[1 << 16];
122
// On ARM NEON, it is faster to convert directly than to call into ggml_lookup_fp16_to_fp32,
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_CPU_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}
132
133#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
134#endif
135
136#if !defined(GGML_CPU_FP32_TO_FP16)
137#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
138#endif
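
// Illustrative example (an editorial addition, not used elsewhere in this header):
// round-trip a buffer of FP16 values through FP32 using the macros defined above.
// The function name is hypothetical and only demonstrates the intended usage.
static inline void ggml_cpu_fp16_roundtrip_example(ggml_fp16_t * dst, const ggml_fp16_t * src, int n) {
    for (int i = 0; i < n; ++i) {
        const float f = GGML_CPU_FP16_TO_FP32(src[i]); // FP16 -> FP32 (native convert or table lookup)
        dst[i]        = GGML_CPU_FP32_TO_FP16(f);      // FP32 -> FP16
    }
}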
139
140
// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for a new architecture requires defining the corresponding SIMD macros
// (an illustrative usage sketch appears after the GGML_F32_ARR / GGML_F16_ARR definitions at the end of this header)
//
// GGML_F32_STEP / GGML_F16_STEP
//   number of elements to process in a single step
//
// GGML_F32_EPR / GGML_F16_EPR
//   number of elements to fit in a single register
//
151
152#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
153
154#define GGML_SIMD
155
156// F32 SVE
157#define GGML_F32_EPR 8
158#define DEFAULT_PG svptrue_b32()
159
160#define GGML_F32xt svfloat32_t
161#define GGML_F32xt_ZERO svdup_n_f32(0.0f)
162#define GGML_F32xt_SET1(x) svdup_n_f32(x)
163#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
164#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
165#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
166#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
167#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
168#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
169#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
170#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
171#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
172#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
173#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
174#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
175#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
176{ \
177 sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
178 sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
179 sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
180 sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
181 sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
182 sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
183 sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
184 (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
185}
186#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
187
188#define GGML_F32_VEC GGML_F32xt
189#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
190#define GGML_F32_VEC_SET1 GGML_F32xt_SET1
191#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
192#define GGML_F32_VEC_STORE GGML_F32xt_STORE
193#define GGML_F32_VEC_FMA GGML_F32xt_FMA
194#define GGML_F32_VEC_ADD GGML_F32xt_ADD
195#define GGML_F32_VEC_MUL GGML_F32xt_MUL
196#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
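
// note: unlike the array-based reduce used by the other architectures, the SVE reduce above
// takes eight separate accumulators (res, sum1..sum8); also, GGML_F32_STEP is not defined for
// SVE, so the generic GGML_F32_ARR pattern does not apply and SVE callers are special-cased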
197
198// F16 SVE
199#define DEFAULT_PG32 svptrue_b32()
200#define DEFAULT_PG16 svptrue_b16()
201
202#define GGML_F32Cxt svfloat16_t
203#define GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
204#define GGML_F32Cxt_SET1(x) svdup_n_f16(x)
205#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
206#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
207
208#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
209#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
210#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
211#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
212#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
213#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
214#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
215
216#define GGML_F16x_VEC GGML_F32Cxt
217#define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO
218#define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1
219#define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p)
220#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
221#define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA
222#define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD
223#define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL
224#define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE
225
226#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
227#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
228
229#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
230{ \
231 sum1 = svadd_f16_x(pg16, sum1, sum2); \
232 sum3 = svadd_f16_x(pg16, sum3, sum4); \
233 sum1 = svadd_f16_x(pg16, sum1, sum3); \
234 __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
235 (res) = (ggml_float) sum_f16; \
236}
237#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
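
// note: the mixed reduce above accumulates in FP16 precision; only the final
// horizontal sum is widened to ggml_float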
238
239// F16 NEON
240
241#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
242 #define GGML_F16_STEP 32
243 #define GGML_F16_EPR 8
244
245 #define GGML_F16x8 float16x8_t
246 #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
247 #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
248 #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
249 #define GGML_F16x8_STORE vst1q_f16
250 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
251 #define GGML_F16x8_ADD vaddq_f16
252 #define GGML_F16x8_MUL vmulq_f16
253 #define GGML_F16x8_REDUCE(res, x) \
254 do { \
255 int offset = GGML_F16_ARR >> 1; \
256 for (int i = 0; i < offset; ++i) { \
257 (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
258 } \
259 offset >>= 1; \
260 for (int i = 0; i < offset; ++i) { \
261 (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
262 } \
263 offset >>= 1; \
264 for (int i = 0; i < offset; ++i) { \
265 (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
266 } \
267 const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
268 const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
269 (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
270 } while (0)
271
272 #define GGML_F16_VEC GGML_F16x8
273 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
274 #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
275 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
276 #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
277 #define GGML_F16_VEC_FMA GGML_F16x8_FMA
278 #define GGML_F16_VEC_ADD GGML_F16x8_ADD
279 #define GGML_F16_VEC_MUL GGML_F16x8_MUL
280 #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
281#else
282 // if FP16 vector arithmetic is not supported, we use FP32 instead
283 // and take advantage of the vcvt_ functions to convert to/from FP16
284
285 #define GGML_F16_STEP 16
286 #define GGML_F16_EPR 4
287
288 #define GGML_F32Cx4 float32x4_t
289 #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
290 #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
291 #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
292 #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
293 #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
294 #define GGML_F32Cx4_ADD vaddq_f32
295 #define GGML_F32Cx4_MUL vmulq_f32
296 #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
297
298 #define GGML_F16_VEC GGML_F32Cx4
299 #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
300 #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
301 #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
302 #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
303 #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
304 #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
305 #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
306 #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
307#endif
308
309#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
310
311#define GGML_SIMD
312
313// F32 NEON
314
315#define GGML_F32_STEP 16
316#define GGML_F32_EPR 4
317
318#define GGML_F32x4 float32x4_t
319#define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
320#define GGML_F32x4_SET1(x) vdupq_n_f32(x)
321#define GGML_F32x4_LOAD vld1q_f32
322#define GGML_F32x4_STORE vst1q_f32
323#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
324#define GGML_F32x4_ADD vaddq_f32
325#define GGML_F32x4_MUL vmulq_f32
326#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
327#define GGML_F32x4_REDUCE(res, x) \
328{ \
329 int offset = GGML_F32_ARR >> 1; \
330 for (int i = 0; i < offset; ++i) { \
331 (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
332 } \
333 offset >>= 1; \
334 for (int i = 0; i < offset; ++i) { \
335 (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
336 } \
337 offset >>= 1; \
338 for (int i = 0; i < offset; ++i) { \
339 (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
340 } \
341 (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
342}
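// note: with GGML_F32_ARR == 4 (GGML_F32_STEP/GGML_F32_EPR = 16/4), the third halving pass
// above is a no-op; it is kept so the same pattern also works for larger STEP/EPR ratios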
343
344#define GGML_F32_VEC GGML_F32x4
345#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
346#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
347#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
348#define GGML_F32_VEC_STORE GGML_F32x4_STORE
349#define GGML_F32_VEC_FMA GGML_F32x4_FMA
350#define GGML_F32_VEC_ADD GGML_F32x4_ADD
351#define GGML_F32_VEC_MUL GGML_F32x4_MUL
352#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
353
354// F16 NEON
355
356#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
357 #define GGML_F16_STEP 32
358 #define GGML_F16_EPR 8
359
360 #define GGML_F16x8 float16x8_t
361 #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
362 #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
363 #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
364 #define GGML_F16x8_STORE vst1q_f16
365 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
366 #define GGML_F16x8_ADD vaddq_f16
367 #define GGML_F16x8_MUL vmulq_f16
368 #define GGML_F16x8_REDUCE(res, x) \
369 do { \
370 int offset = GGML_F16_ARR >> 1; \
371 for (int i = 0; i < offset; ++i) { \
372 (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
373 } \
374 offset >>= 1; \
375 for (int i = 0; i < offset; ++i) { \
376 (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
377 } \
378 offset >>= 1; \
379 for (int i = 0; i < offset; ++i) { \
380 (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
381 } \
382 const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
383 const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
384 (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
385 } while (0)
386
387 #define GGML_F16_VEC GGML_F16x8
388 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
389 #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
390 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
391 #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
392 #define GGML_F16_VEC_FMA GGML_F16x8_FMA
393 #define GGML_F16_VEC_ADD GGML_F16x8_ADD
394 #define GGML_F16_VEC_MUL GGML_F16x8_MUL
395 #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
396#else
397 // if FP16 vector arithmetic is not supported, we use FP32 instead
398 // and take advantage of the vcvt_ functions to convert to/from FP16
399
400 #define GGML_F16_STEP 16
401 #define GGML_F16_EPR 4
402
403 #define GGML_F32Cx4 float32x4_t
404 #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
405 #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
406 #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
407 #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
408 #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
409 #define GGML_F32Cx4_ADD vaddq_f32
410 #define GGML_F32Cx4_MUL vmulq_f32
411 #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
412
413 #define GGML_F16_VEC GGML_F32Cx4
414 #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
415 #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
416 #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
417 #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
418 #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
419 #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
420 #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
421 #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
422#endif
423
424#elif defined(__AVX512F__)
425
426#define GGML_SIMD
427
428// F32 AVX512
429
430#define GGML_F32_STEP 64
431#define GGML_F32_EPR 16
432
433#define GGML_F32x16 __m512
434#define GGML_F32x16_ZERO _mm512_setzero_ps()
435#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
436#define GGML_F32x16_LOAD _mm512_loadu_ps
437#define GGML_F32x16_STORE _mm512_storeu_ps
438// _mm512_fmadd_ps is defined in AVX512F so no guard is required
439#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
440#define GGML_F32x16_ADD _mm512_add_ps
441#define GGML_F32x16_MUL _mm512_mul_ps
442#define GGML_F32x16_REDUCE(res, x) \
443do { \
444 int offset = GGML_F32_ARR >> 1; \
445 for (int i = 0; i < offset; ++i) { \
446 x[i] = _mm512_add_ps(x[i], x[offset+i]); \
447 } \
448 offset >>= 1; \
449 for (int i = 0; i < offset; ++i) { \
450 x[i] = _mm512_add_ps(x[i], x[offset+i]); \
451 } \
452 offset >>= 1; \
453 for (int i = 0; i < offset; ++i) { \
454 x[i] = _mm512_add_ps(x[i], x[offset+i]); \
455 } \
456 res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
457} while (0)
458
459// TODO: is this optimal ?
460
461#define GGML_F32_VEC GGML_F32x16
462#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
463#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
464#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
465#define GGML_F32_VEC_STORE GGML_F32x16_STORE
466#define GGML_F32_VEC_FMA GGML_F32x16_FMA
467#define GGML_F32_VEC_ADD GGML_F32x16_ADD
468#define GGML_F32_VEC_MUL GGML_F32x16_MUL
469#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
470
// F16 AVX512

#define GGML_F16_STEP 64
#define GGML_F16_EPR 16

// AVX512 has an FP16 extension (AVX512_FP16), but it is not assumed to be available here, so we use FP32 instead
479
480#define GGML_F32Cx16 __m512
481#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
482#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
483
484// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
485// so F16C guard isn't required
486#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
487#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
488
489#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
490#define GGML_F32Cx16_ADD _mm512_add_ps
491#define GGML_F32Cx16_MUL _mm512_mul_ps
492#define GGML_F32Cx16_REDUCE(res, x) \
493do { \
494 int offset = GGML_F32_ARR >> 1; \
495 for (int i = 0; i < offset; ++i) { \
496 x[i] = _mm512_add_ps(x[i], x[offset+i]); \
497 } \
498 offset >>= 1; \
499 for (int i = 0; i < offset; ++i) { \
500 x[i] = _mm512_add_ps(x[i], x[offset+i]); \
501 } \
502 offset >>= 1; \
503 for (int i = 0; i < offset; ++i) { \
504 x[i] = _mm512_add_ps(x[i], x[offset+i]); \
505 } \
506 res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
507} while (0)
508
509#define GGML_F16_VEC GGML_F32Cx16
510#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
511#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
512#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
513#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
514#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
515#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
516#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
517
518#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
519#elif defined(__AVX__)
520
521#define GGML_SIMD
522
523// F32 AVX
524
525#define GGML_F32_STEP 32
526#define GGML_F32_EPR 8
527
528#define GGML_F32x8 __m256
529#define GGML_F32x8_ZERO _mm256_setzero_ps()
530#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
531#define GGML_F32x8_LOAD _mm256_loadu_ps
532#define GGML_F32x8_STORE _mm256_storeu_ps
533#if defined(__FMA__)
534 #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
535#else
536 #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
537#endif
538#define GGML_F32x8_ADD _mm256_add_ps
539#define GGML_F32x8_MUL _mm256_mul_ps
540#define GGML_F32x8_REDUCE(res, x) \
541do { \
542 int offset = GGML_F32_ARR >> 1; \
543 for (int i = 0; i < offset; ++i) { \
544 x[i] = _mm256_add_ps(x[i], x[offset+i]); \
545 } \
546 offset >>= 1; \
547 for (int i = 0; i < offset; ++i) { \
548 x[i] = _mm256_add_ps(x[i], x[offset+i]); \
549 } \
550 offset >>= 1; \
551 for (int i = 0; i < offset; ++i) { \
552 x[i] = _mm256_add_ps(x[i], x[offset+i]); \
553 } \
554 const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
555 _mm256_extractf128_ps(x[0], 1)); \
556 const __m128 t1 = _mm_hadd_ps(t0, t0); \
557 res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
558} while (0)
559// TODO: is this optimal ?
560
561#define GGML_F32_VEC GGML_F32x8
562#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
563#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
564#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
565#define GGML_F32_VEC_STORE GGML_F32x8_STORE
566#define GGML_F32_VEC_FMA GGML_F32x8_FMA
567#define GGML_F32_VEC_ADD GGML_F32x8_ADD
568#define GGML_F32_VEC_MUL GGML_F32x8_MUL
569#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
570
571// F16 AVX
572
573#define GGML_F16_STEP 32
574#define GGML_F16_EPR 8
575
576// F16 arithmetic is not supported by AVX, so we use F32 instead
577
578#define GGML_F32Cx8 __m256
579#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
580#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
581
582#if defined(__F16C__)
583// the _mm256_cvt intrinsics require F16C
584#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
585#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
586#else
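// scalar fallback when F16C is not available: widen/narrow through a temporary
// float buffer using the GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16 macros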
587static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
588 float tmp[8];
589
590 for (int i = 0; i < 8; i++) {
591 tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
592 }
593
594 return _mm256_loadu_ps(tmp);
595}
596static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
597 float arr[8];
598
599 _mm256_storeu_ps(arr, y);
600
601 for (int i = 0; i < 8; i++)
602 x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
603}
604#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
605#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
606#endif
607
608#define GGML_F32Cx8_FMA GGML_F32x8_FMA
609#define GGML_F32Cx8_ADD _mm256_add_ps
610#define GGML_F32Cx8_MUL _mm256_mul_ps
611#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
612
613#define GGML_F16_VEC GGML_F32Cx8
614#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
615#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
616#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
617#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
618#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
619#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
620#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
621#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
622
623#elif defined(__POWER9_VECTOR__)
624
625#define GGML_SIMD
626
627// F32 POWER9
628
629#define GGML_F32_STEP 32
630#define GGML_F32_EPR 4
631
632#define GGML_F32x4 vector float
633#define GGML_F32x4_ZERO {0.0f}
634#define GGML_F32x4_SET1 vec_splats
635#define GGML_F32x4_LOAD(p) vec_xl(0, p)
636#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
637#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
638#define GGML_F32x4_ADD vec_add
639#define GGML_F32x4_MUL vec_mul
640#define GGML_F32x4_REDUCE(res, x) \
641{ \
642 int offset = GGML_F32_ARR >> 1; \
643 for (int i = 0; i < offset; ++i) { \
644 x[i] = vec_add(x[i], x[offset+i]); \
645 } \
646 offset >>= 1; \
647 for (int i = 0; i < offset; ++i) { \
648 x[i] = vec_add(x[i], x[offset+i]); \
649 } \
650 offset >>= 1; \
651 for (int i = 0; i < offset; ++i) { \
652 x[i] = vec_add(x[i], x[offset+i]); \
653 } \
654 res = vec_extract(x[0], 0) + \
655 vec_extract(x[0], 1) + \
656 vec_extract(x[0], 2) + \
657 vec_extract(x[0], 3); \
658}
659
660#define GGML_F32_VEC GGML_F32x4
661#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
662#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
663#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
664#define GGML_F32_VEC_STORE GGML_F32x4_STORE
665#define GGML_F32_VEC_FMA GGML_F32x4_FMA
666#define GGML_F32_VEC_ADD GGML_F32x4_ADD
667#define GGML_F32_VEC_MUL GGML_F32x4_MUL
668#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
669
670// F16 POWER9
671#define GGML_F16_STEP GGML_F32_STEP
672#define GGML_F16_EPR GGML_F32_EPR
673#define GGML_F16_VEC GGML_F32x4
674#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
675#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
676#define GGML_F16_VEC_FMA GGML_F32x4_FMA
677#define GGML_F16_VEC_ADD GGML_F32x4_ADD
678#define GGML_F16_VEC_MUL GGML_F32x4_MUL
679#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
680// Use vec_xl, not vec_ld, in case the load address is not aligned.
681#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
682 vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
683 vec_extract_fp32_from_shortl(vec_xl(0, p))
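// even i takes the low four halfwords of an 8-halfword load (shortl); odd i re-reads the
// same 8 halfwords at p - GGML_F16_EPR and takes the high four (shorth)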
684static inline unsigned char ggml_endian_byte(int i) {
685 uint16_t tmp_val = 1;
686 return ((unsigned char *)&tmp_val)[i];
687}
688#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
689#define GGML_F16_VEC_STORE(p, r, i) \
690 if (i & 0x1) \
691 vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
692 r[i - GGML_ENDIAN_BYTE(0)]), \
693 0, p - GGML_F16_EPR)
694
695#elif defined(__wasm_simd128__)
696
697#define GGML_SIMD
698
699// F32 WASM
700
701#define GGML_F32_STEP 16
702#define GGML_F32_EPR 4
703
704#define GGML_F32x4 v128_t
705#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
706#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
707#define GGML_F32x4_LOAD wasm_v128_load
708#define GGML_F32x4_STORE wasm_v128_store
709#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
710#define GGML_F32x4_ADD wasm_f32x4_add
711#define GGML_F32x4_MUL wasm_f32x4_mul
712#define GGML_F32x4_REDUCE(res, x) \
713{ \
714 int offset = GGML_F32_ARR >> 1; \
715 for (int i = 0; i < offset; ++i) { \
716 x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
717 } \
718 offset >>= 1; \
719 for (int i = 0; i < offset; ++i) { \
720 x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
721 } \
722 offset >>= 1; \
723 for (int i = 0; i < offset; ++i) { \
724 x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
725 } \
726 res = wasm_f32x4_extract_lane(x[0], 0) + \
727 wasm_f32x4_extract_lane(x[0], 1) + \
728 wasm_f32x4_extract_lane(x[0], 2) + \
729 wasm_f32x4_extract_lane(x[0], 3); \
730}
731
732#define GGML_F32_VEC GGML_F32x4
733#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
734#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
735#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
736#define GGML_F32_VEC_STORE GGML_F32x4_STORE
737#define GGML_F32_VEC_FMA GGML_F32x4_FMA
738#define GGML_F32_VEC_ADD GGML_F32x4_ADD
739#define GGML_F32_VEC_MUL GGML_F32x4_MUL
740#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
741
742// F16 WASM
743
744#define GGML_F16_STEP 16
745#define GGML_F16_EPR 4
746
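// F16 is emulated on WASM: loads widen four FP16 values to an F32 vector via
// GGML_CPU_FP16_TO_FP32 and stores narrow them back via GGML_CPU_FP32_TO_FP16
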
747inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
748 float tmp[4];
749
750 tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
751 tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
752 tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
753 tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);
754
755 return wasm_v128_load(tmp);
756}
757
758inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
759 float tmp[4];
760
761 wasm_v128_store(tmp, x);
762
763 p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
764 p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
765 p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
766 p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
767}
768
769#define GGML_F16x4 v128_t
770#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
771#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
772#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
773#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
774#define GGML_F16x4_FMA GGML_F32x4_FMA
775#define GGML_F16x4_ADD wasm_f32x4_add
776#define GGML_F16x4_MUL wasm_f32x4_mul
777#define GGML_F16x4_REDUCE(res, x) \
778{ \
779 int offset = GGML_F16_ARR >> 1; \
780 for (int i = 0; i < offset; ++i) { \
781 x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
782 } \
783 offset >>= 1; \
784 for (int i = 0; i < offset; ++i) { \
785 x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
786 } \
787 offset >>= 1; \
788 for (int i = 0; i < offset; ++i) { \
789 x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
790 } \
791 res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \
792 wasm_f32x4_extract_lane(x[0], 1) + \
793 wasm_f32x4_extract_lane(x[0], 2) + \
794 wasm_f32x4_extract_lane(x[0], 3)); \
795}
796
797#define GGML_F16_VEC GGML_F16x4
798#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
799#define GGML_F16_VEC_SET1 GGML_F16x4_SET1
800#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
801#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
802#define GGML_F16_VEC_FMA GGML_F16x4_FMA
803#define GGML_F16_VEC_ADD GGML_F16x4_ADD
804#define GGML_F16_VEC_MUL GGML_F16x4_MUL
805#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
806
807#elif defined(__SSE3__)
808
809#define GGML_SIMD
810
811// F32 SSE
812
813#define GGML_F32_STEP 32
814#define GGML_F32_EPR 4
815
816#define GGML_F32x4 __m128
817#define GGML_F32x4_ZERO _mm_setzero_ps()
818#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
819#define GGML_F32x4_LOAD _mm_loadu_ps
820#define GGML_F32x4_STORE _mm_storeu_ps
821#if defined(__FMA__)
822 // TODO: Does this work?
823 #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
824#else
825 #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
826#endif
827#define GGML_F32x4_ADD _mm_add_ps
828#define GGML_F32x4_MUL _mm_mul_ps
829#define GGML_F32x4_REDUCE(res, x) \
830{ \
831 int offset = GGML_F32_ARR >> 1; \
832 for (int i = 0; i < offset; ++i) { \
833 x[i] = _mm_add_ps(x[i], x[offset+i]); \
834 } \
835 offset >>= 1; \
836 for (int i = 0; i < offset; ++i) { \
837 x[i] = _mm_add_ps(x[i], x[offset+i]); \
838 } \
839 offset >>= 1; \
840 for (int i = 0; i < offset; ++i) { \
841 x[i] = _mm_add_ps(x[i], x[offset+i]); \
842 } \
843 const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
844 res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
845}
846// TODO: is this optimal ?
847
848#define GGML_F32_VEC GGML_F32x4
849#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
850#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
851#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
852#define GGML_F32_VEC_STORE GGML_F32x4_STORE
853#define GGML_F32_VEC_FMA GGML_F32x4_FMA
854#define GGML_F32_VEC_ADD GGML_F32x4_ADD
855#define GGML_F32_VEC_MUL GGML_F32x4_MUL
856#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
857
858// F16 SSE
859
860#define GGML_F16_STEP 32
861#define GGML_F16_EPR 4
862
863static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
864 float tmp[4];
865
866 tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
867 tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
868 tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
869 tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
870
871 return _mm_loadu_ps(tmp);
872}
873
874static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
875 float arr[4];
876
877 _mm_storeu_ps(arr, y);
878
879 x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
880 x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
881 x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
882 x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
883}
884
885#define GGML_F32Cx4 __m128
886#define GGML_F32Cx4_ZERO _mm_setzero_ps()
887#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
888#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
889#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
890#define GGML_F32Cx4_FMA GGML_F32x4_FMA
891#define GGML_F32Cx4_ADD _mm_add_ps
892#define GGML_F32Cx4_MUL _mm_mul_ps
893#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
894
895#define GGML_F16_VEC GGML_F32Cx4
896#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
897#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
898#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
899#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
900#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
901#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
902#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
903#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
904
905#elif defined(__loongarch_asx)
906
907#define GGML_SIMD
908
909// F32 LASX
910#define GGML_F32_STEP 32
911#define GGML_F32_EPR 8
912
913#define GGML_F32x8 __m256
914#define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
915#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
916#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
917#define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
918#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
919#define GGML_F32x8_ADD __lasx_xvfadd_s
920#define GGML_F32x8_MUL __lasx_xvfmul_s
921#define GGML_F32x8_REDUCE(res, x) \
922do { \
923 int offset = GGML_F32_ARR >> 1; \
924 for (int i = 0; i < offset; ++i) { \
925 x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
926 } \
927 offset >>= 1; \
928 for (int i = 0; i < offset; ++i) { \
929 x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
930 } \
931 offset >>= 1; \
932 for (int i = 0; i < offset; ++i) { \
933 x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
934 } \
935 float *tmp_p = (float *)&x[0]; \
936 res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
937} while (0)
938// TODO: is this optimal ?
939
940#define GGML_F32_VEC GGML_F32x8
941#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
942#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
943#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
944#define GGML_F32_VEC_STORE GGML_F32x8_STORE
945#define GGML_F32_VEC_FMA GGML_F32x8_FMA
946#define GGML_F32_VEC_ADD GGML_F32x8_ADD
947#define GGML_F32_VEC_MUL GGML_F32x8_MUL
948#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
949
950// F16 LASX
951
952#define GGML_F16_STEP 32
953#define GGML_F16_EPR 8
954
955// F16 arithmetic is not supported by LASX, so we use F32 instead
956
957#define GGML_F32Cx8 __m256
958#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
959#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
960
961static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
962 __m256i a;
963 memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
964 a = __lasx_xvpermi_d(a, 0 | (1 << 4));
965 return __lasx_xvfcvtl_s_h(a);
966}
967
968static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
969 __m256i a = __lasx_xvfcvt_h_s(y, y);
970 a = __lasx_xvpermi_d(a, 0 | (2 << 2));
971 memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
972}
973#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
974#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
975
976#define GGML_F32Cx8_FMA GGML_F32x8_FMA
977#define GGML_F32Cx8_ADD __lasx_xvfadd_s
978#define GGML_F32Cx8_MUL __lasx_xvfmul_s
979#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
980
981#define GGML_F16_VEC GGML_F32Cx8
982#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
983#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
984#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
985#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
986#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
987#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
988#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
989#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
990
991#elif defined(__loongarch_sx)
992
993#define GGML_SIMD
994
995// F32 LSX
996
997#define GGML_F32_STEP 32
998#define GGML_F32_EPR 4
999
1000#define GGML_F32x4 __m128
1001#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
1002#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
1003#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
1004#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
1005#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
1006#define GGML_F32x4_ADD __lsx_vfadd_s
1007#define GGML_F32x4_MUL __lsx_vfmul_s
1008
1009#define GGML_F32x4_REDUCE(res, x) \
1010{ \
1011 int offset = GGML_F32_ARR >> 1; \
1012 for (int i = 0; i < offset; ++i) { \
1013 x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1014 } \
1015 offset >>= 1; \
1016 for (int i = 0; i < offset; ++i) { \
1017 x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1018 } \
1019 offset >>= 1; \
1020 for (int i = 0; i < offset; ++i) { \
1021 x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1022 } \
1023 __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
1024 __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
1025 __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
1026 __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
1027 __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
1028 __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
1029 res = (ggml_float) ((v4f32)t5)[0]; \
1030}
1031
1032#define GGML_F32_VEC GGML_F32x4
1033#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
1034#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
1035#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
1036#define GGML_F32_VEC_STORE GGML_F32x4_STORE
1037#define GGML_F32_VEC_FMA GGML_F32x4_FMA
1038#define GGML_F32_VEC_ADD GGML_F32x4_ADD
1039#define GGML_F32_VEC_MUL GGML_F32x4_MUL
1040#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
1041
1042// F16 LSX
1043
1044#define GGML_F16_STEP 32
1045#define GGML_F16_EPR 4
1046
1047static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
1048 float tmp[4];
1049
1050 tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
1051 tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
1052 tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
1053 tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
1054
1055 return (__m128)__lsx_vld(tmp, 0);
1056}
1057
1058static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
1059 float arr[4];
1060
1061 __lsx_vst(y, arr, 0);
1062
1063 x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
1064 x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
1065 x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
1066 x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
1067}
1068
1069#define GGML_F32Cx4 __m128
1070#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
1071#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
1072#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
1073#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
1074#define GGML_F32Cx4_FMA GGML_F32x4_FMA
1075#define GGML_F32Cx4_ADD __lsx_vfadd_s
1076#define GGML_F32Cx4_MUL __lsx_vfmul_s
1077#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
1078
1079#define GGML_F16_VEC GGML_F32Cx4
1080#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
1081#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
1082#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
1083#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
1084#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
1085#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
1086#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
1087#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
1088
1089#elif defined(__VXE__) || defined(__VXE2__)
1090
1091#define GGML_SIMD
1092
1093// F32 s390x
1094
1095#define GGML_F32_STEP 32
1096#define GGML_F32_EPR 4
1097
1098#define GGML_F32x4 float32x4_t
1099#define GGML_F32x4_ZERO vec_splats(0.0f)
1100#define GGML_F32x4_SET1 vec_splats
1101#define GGML_F32x4_LOAD(p) vec_xl(0, p)
1102#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
1103#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
1104#define GGML_F32x4_ADD vec_add
1105#define GGML_F32x4_MUL vec_mul
1106#define GGML_F32x4_REDUCE(res, x) \
1107{ \
1108 int offset = GGML_F32_ARR >> 1; \
1109 for (int i = 0; i < offset; ++i) { \
1110 x[i] = vec_add(x[i], x[offset + i]); \
1111 } \
1112 offset >>= 1; \
1113 for (int i = 0; i < offset; ++i) { \
1114 x[i] = vec_add(x[i], x[offset + i]); \
1115 } \
1116 offset >>= 1; \
1117 for (int i = 0; i < offset; ++i) { \
1118 x[i] = vec_add(x[i], x[offset + i]); \
1119 } \
1120 float32x4_t tmp = x[0] + vec_reve(x[0]); \
1121 res = tmp[0] + tmp[1]; \
1122}
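// note: vec_reve reverses the lane order, so tmp[0] + tmp[1] already equals the
// sum of all four lanes of x[0]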
1123
1124#define GGML_F32_VEC GGML_F32x4
1125#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
1126#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
1127#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
1128#define GGML_F32_VEC_STORE GGML_F32x4_STORE
1129#define GGML_F32_VEC_FMA GGML_F32x4_FMA
1130#define GGML_F32_VEC_ADD GGML_F32x4_ADD
1131#define GGML_F32_VEC_MUL GGML_F32x4_MUL
1132#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
1133
1134// F16 s390x
1135#define GGML_F16_STEP GGML_F32_STEP
1136#define GGML_F16_EPR GGML_F32_EPR
1137
1138static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
1139 float tmp[4];
1140
1141 for (int i = 0; i < 4; i++) {
1142 tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
1143 }
1144
1145 // note: keep type-cast here to prevent compiler bugs
1146 // see: https://github.com/ggml-org/llama.cpp/issues/12846
1147 return vec_xl(0, (const float *)(tmp));
1148}
1149
1150static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
1151 float arr[4];
1152
1153 // note: keep type-cast here to prevent compiler bugs
1154 // see: https://github.com/ggml-org/llama.cpp/issues/12846
1155 vec_xst(v_y, 0, (float *)(arr));
1156
1157 for (int i = 0; i < 4; i++) {
1158 x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
1159 }
1160}
1161
1162#define GGML_F16_VEC GGML_F32x4
1163#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
1164#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
1165#define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
1166#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
1167#define GGML_F16_VEC_FMA GGML_F32x4_FMA
1168#define GGML_F16_VEC_ADD GGML_F32x4_ADD
1169#define GGML_F16_VEC_MUL GGML_F32x4_MUL
1170#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
1171
1172#elif defined(__riscv_v_intrinsic)
1173
1174// compatible with vlen >= 128
1175
1176#define GGML_SIMD
1177
1178// F32
1179
1180#define GGML_F32_STEP 16
1181#define GGML_F32_EPR 4
1182
1183#define GGML_F32x4 vfloat32m1_t
1184#define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
1185#define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
1186#define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
1187#define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
1188#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
1189#define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
1190#define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
1191
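// note: GGML_F32x4_REDUCE is not defined in this branch, so the GGML_F32_VEC_REDUCE alias
// below is only usable if the caller supplies its own reduction (for example with
// __riscv_vfredusum_vs_f32m1_f32m1); RVV callers are typically special-cased instead of
// using the generic GGML_F32_ARR accumulator-array pattern, which sizeless RVV types do not allow
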
1192#define GGML_F32_VEC GGML_F32x4
1193#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
1194#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
1195#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
1196#define GGML_F32_VEC_STORE GGML_F32x4_STORE
1197#define GGML_F32_VEC_FMA GGML_F32x4_FMA
1198#define GGML_F32_VEC_ADD GGML_F32x4_ADD
1199#define GGML_F32_VEC_MUL GGML_F32x4_MUL
1200#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
1201
1202#endif
1203
1204// GGML_F32_ARR / GGML_F16_ARR
1205// number of registers to use per step
1206#ifdef GGML_SIMD
1207#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
1208#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
1209#endif
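
// Illustrative sketch (an editorial addition, not part of the mapping itself) of how the
// GGML_F32_VEC macros are typically consumed: accumulate GGML_F32_STEP elements per
// iteration across GGML_F32_ARR registers of GGML_F32_EPR lanes each, then reduce.
// SVE and RVV are excluded because their sizeless vector types cannot be stored in
// arrays and their reduce contracts differ; a scalar loop handles the leftover tail.
#if defined(GGML_SIMD) && !defined(__ARM_FEATURE_SVE) && !defined(__riscv_v_intrinsic)
static inline ggml_float ggml_simd_dot_f32_example(int n, const float * x, const float * y) {
    ggml_float sumf = 0.0;

    const int np = (n & ~(GGML_F32_STEP - 1)); // largest multiple of GGML_F32_STEP

    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; ++j) {
            const GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            const GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

            sum[j] = GGML_F32_VEC_FMA(sum[j], ax, ay);
        }
    }

    // horizontal reduction of the GGML_F32_ARR partial sums into a scalar
    GGML_F32_VEC_REDUCE(sumf, sum);

    // leftover scalar tail
    for (int i = np; i < n; ++i) {
        sumf += (ggml_float)(x[i]*y[i]);
    }

    return sumf;
}
#endif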
1210
1211#ifdef __cplusplus
1212}
1213#endif
1214