| 1 | #pragma once |
| 2 | |
| 3 | #include "ggml-cpu-impl.h" |
| 4 | |
| 5 | #ifdef __ARM_FEATURE_SVE |
| 6 | #include <arm_sve.h> |
| 7 | #endif // __ARM_FEATURE_SVE |
| 8 | |
| 9 | #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) |
| 10 | // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: |
| 11 | // |
| 12 | // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ |
| 13 | // |
| 14 | #include <arm_neon.h> |
| 15 | #endif |
| 16 | |
| 17 | #if defined(__F16C__) |
| 18 | #include <immintrin.h> |
| 19 | #endif |
| 20 | |
| 21 | #if defined(__riscv_v_intrinsic) |
| 22 | #include <riscv_vector.h> |
| 23 | #endif |
| 24 | |
| 25 | #ifdef __cplusplus |
| 26 | extern "C" { |
| 27 | #endif |
| 28 | |
| 29 | // |
| 30 | // simd mappings |
| 31 | // |
| 32 | |
// FP16 <-> FP32 conversion
| 34 | |
| 35 | // 16-bit float |
| 36 | // on Arm, we use __fp16 |
| 37 | // on x86, we use uint16_t |
| 38 | // |
| 39 | // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 |
// for MUSA compilers,            we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
| 41 | // |
| 42 | #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) |
| 43 | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x) |
| 44 | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x) |
| 45 | |
| 46 | #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) |
| 47 | |
| 48 | static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) { |
| 49 | __fp16 tmp; |
| 50 | memcpy(&tmp, &h, sizeof(ggml_fp16_t)); |
| 51 | return (float)tmp; |
| 52 | } |
| 53 | |
| 54 | static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) { |
| 55 | ggml_fp16_t res; |
| 56 | __fp16 tmp = f; |
| 57 | memcpy(&res, &tmp, sizeof(ggml_fp16_t)); |
| 58 | return res; |
| 59 | } |
| 60 | #elif defined(__F16C__) |
| 61 | #ifdef _MSC_VER |
| 62 | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) |
| 63 | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) |
| 64 | #else |
| 65 | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) |
| 66 | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) |
| 67 | #endif |
| 68 | #elif defined(__POWER9_VECTOR__) |
| 69 | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x) |
| 70 | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x) |
| 71 | /* the inline asm below is about 12% faster than the lookup method */ |
| 72 | #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) |
| 73 | #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) |
| 74 | |
| 75 | static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) { |
| 76 | float f; |
| 77 | double d; |
| 78 | __asm__( |
| 79 | "mtfprd %0,%2\n" |
| 80 | "xscvhpdp %0,%0\n" |
| 81 | "frsp %1,%0\n" : |
| 82 | /* temp */ "=d" (d), |
| 83 | /* out */ "=f" (f): |
| 84 | /* in */ "r" (h)); |
| 85 | return f; |
| 86 | } |
| 87 | |
| 88 | static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) { |
| 89 | double d; |
| 90 | ggml_fp16_t r; |
| 91 | __asm__( /* xscvdphp can work on double or single precision */ |
| 92 | "xscvdphp %0,%2\n" |
| 93 | "mffprd %1,%0\n" : |
| 94 | /* temp */ "=d" (d), |
| 95 | /* out */ "=r" (r): |
| 96 | /* in */ "f" (f)); |
| 97 | return r; |
| 98 | } |
| 99 | #elif defined(__riscv) && defined(__riscv_zfhmin) |
| 100 | static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) { |
| 101 | _Float16 hf; |
| 102 | memcpy(&hf, &h, sizeof(ggml_fp16_t)); |
| 103 | return hf; |
| 104 | } |
| 105 | |
| 106 | static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) { |
| 107 | ggml_fp16_t res; |
| 108 | _Float16 hf = (_Float16)f; |
| 109 | memcpy(&res, &hf, sizeof(ggml_fp16_t)); |
| 110 | return res; |
| 111 | } |
| 112 | |
| 113 | #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x) |
| 114 | #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x) |
| 115 | #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) |
| 116 | #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) |
| 117 | #endif |
| 118 | |
| 119 | // precomputed f32 table for f16 (256 KB) |
| 120 | // defined in ggml-cpu.c, initialized in ggml_cpu_init() |
| 121 | extern float ggml_table_f32_f16[1 << 16]; |
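
// every possible 16-bit pattern is mapped to its FP32 value once at startup; a sketch of
// the kind of initialization performed by ggml_cpu_init() (names here are illustrative,
// not the actual implementation, and the scalar conversion used in practice may differ):
//
//     for (uint32_t i = 0; i < (1 << 16); ++i) {
//         const uint16_t bits = (uint16_t) i;
//         ggml_fp16_t h;
//         memcpy(&h, &bits, sizeof(h)); // reinterpret the bit pattern as FP16
//         ggml_table_f32_f16[i] = GGML_CPU_COMPUTE_FP16_TO_FP32(h);
//     }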
| 122 | |
// On ARM NEON, it is faster to convert FP16 directly than to call into ggml_lookup_fp16_to_fp32,
// so GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 are defined above for NEON.
// The same is true for POWER9.
| 126 | #if !defined(GGML_CPU_FP16_TO_FP32) |
| 127 | inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { |
| 128 | uint16_t s; |
memcpy(&s, &f, sizeof(uint16_t));
| 130 | return ggml_table_f32_f16[s]; |
| 131 | } |
| 132 | |
| 133 | #define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) |
| 134 | #endif |
| 135 | |
| 136 | #if !defined(GGML_CPU_FP32_TO_FP16) |
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
| 138 | #endif |
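
// illustrative round trip through the scalar conversion macros defined above (a sketch,
// not part of the API; variable names are assumptions):
//
//     float       f    = 1.5f;
//     ggml_fp16_t h    = GGML_CPU_FP32_TO_FP16(f);  // narrow to FP16 (may round)
//     float       back = GGML_CPU_FP16_TO_FP32(h);  // widen via native path or lookup table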
| 139 | |
| 140 | |
| 141 | // we define a common set of C macros which map to specific intrinsics based on the current architecture |
| 142 | // we then implement the fundamental computation operations below using only these macros |
// adding support for new architectures requires defining the corresponding SIMD macros
| 144 | // |
| 145 | // GGML_F32_STEP / GGML_F16_STEP |
| 146 | // number of elements to process in a single step |
| 147 | // |
| 148 | // GGML_F32_EPR / GGML_F16_EPR |
// number of elements that fit in a single register
| 150 | // |
| 151 | |
| 152 | #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA) |
| 153 | |
| 154 | #define GGML_SIMD |
| 155 | |
| 156 | // F32 SVE |
| 157 | #define GGML_F32_EPR 8 |
| 158 | #define DEFAULT_PG svptrue_b32() |
| 159 | |
| 160 | #define GGML_F32xt svfloat32_t |
| 161 | #define GGML_F32xt_ZERO svdup_n_f32(0.0f) |
| 162 | #define GGML_F32xt_SET1(x) svdup_n_f32(x) |
| 163 | #define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a) |
| 164 | #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__) |
| 165 | #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b) |
| 166 | #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__) |
| 167 | #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a) |
| 168 | #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__) |
| 169 | #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b) |
| 170 | #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__) |
| 171 | #define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b) |
| 172 | #define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__) |
| 173 | #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a) |
| 174 | #define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__) |
| 175 | #define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \ |
| 176 | { \ |
| 177 | sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \ |
| 178 | sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \ |
| 179 | sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \ |
| 180 | sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \ |
| 181 | sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \ |
| 182 | sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \ |
| 183 | sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \ |
| 184 | (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \ |
| 185 | } |
| 186 | #define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__) |
| 187 | |
| 188 | #define GGML_F32_VEC GGML_F32xt |
| 189 | #define GGML_F32_VEC_ZERO GGML_F32xt_ZERO |
| 190 | #define GGML_F32_VEC_SET1 GGML_F32xt_SET1 |
| 191 | #define GGML_F32_VEC_LOAD GGML_F32xt_LOAD |
| 192 | #define GGML_F32_VEC_STORE GGML_F32xt_STORE |
| 193 | #define GGML_F32_VEC_FMA GGML_F32xt_FMA |
| 194 | #define GGML_F32_VEC_ADD GGML_F32xt_ADD |
| 195 | #define GGML_F32_VEC_MUL GGML_F32xt_MUL |
| 196 | #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE |
| 197 | |
| 198 | // F16 SVE |
| 199 | #define DEFAULT_PG32 svptrue_b32() |
| 200 | #define DEFAULT_PG16 svptrue_b16() |
| 201 | |
| 202 | #define GGML_F32Cxt svfloat16_t |
| 203 | #define GGML_F32Cxt_ZERO svdup_n_f16(0.0f) |
| 204 | #define GGML_F32Cxt_SET1(x) svdup_n_f16(x) |
| 205 | #define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p)) |
| 206 | #define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec)) |
| 207 | |
| 208 | #define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a) |
| 209 | #define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__) |
| 210 | #define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b) |
| 211 | #define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__) |
| 212 | #define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b) |
| 213 | #define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__) |
| 214 | #define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED |
| 215 | |
| 216 | #define GGML_F16x_VEC GGML_F32Cxt |
| 217 | #define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO |
| 218 | #define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1 |
| 219 | #define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p) |
| 220 | #define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r) |
| 221 | #define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA |
| 222 | #define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD |
| 223 | #define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL |
| 224 | #define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE |
| 225 | |
| 226 | #define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a) |
| 227 | #define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__) |
| 228 | |
| 229 | #define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \ |
| 230 | { \ |
| 231 | sum1 = svadd_f16_x(pg16, sum1, sum2); \ |
| 232 | sum3 = svadd_f16_x(pg16, sum3, sum4); \ |
| 233 | sum1 = svadd_f16_x(pg16, sum1, sum3); \ |
| 234 | __fp16 sum_f16 = svaddv_f16(pg16, sum1); \ |
| 235 | (res) = (ggml_float) sum_f16; \ |
| 236 | } |
| 237 | #define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__) |
| 238 | |
| 239 | // F16 NEON |
| 240 | |
| 241 | #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) |
| 242 | #define GGML_F16_STEP 32 |
| 243 | #define GGML_F16_EPR 8 |
| 244 | |
| 245 | #define GGML_F16x8 float16x8_t |
| 246 | #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) |
| 247 | #define GGML_F16x8_SET1(x) vdupq_n_f16(x) |
| 248 | #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) |
| 249 | #define GGML_F16x8_STORE vst1q_f16 |
| 250 | #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) |
| 251 | #define GGML_F16x8_ADD vaddq_f16 |
| 252 | #define GGML_F16x8_MUL vmulq_f16 |
| 253 | #define GGML_F16x8_REDUCE(res, x) \ |
| 254 | do { \ |
| 255 | int offset = GGML_F16_ARR >> 1; \ |
| 256 | for (int i = 0; i < offset; ++i) { \ |
| 257 | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
| 258 | } \ |
| 259 | offset >>= 1; \ |
| 260 | for (int i = 0; i < offset; ++i) { \ |
| 261 | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
| 262 | } \ |
| 263 | offset >>= 1; \ |
| 264 | for (int i = 0; i < offset; ++i) { \ |
| 265 | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
| 266 | } \ |
| 267 | const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ |
| 268 | const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ |
| 269 | (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ |
| 270 | } while (0) |
| 271 | |
| 272 | #define GGML_F16_VEC GGML_F16x8 |
| 273 | #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO |
| 274 | #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 |
| 275 | #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) |
| 276 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i]) |
| 277 | #define GGML_F16_VEC_FMA GGML_F16x8_FMA |
| 278 | #define GGML_F16_VEC_ADD GGML_F16x8_ADD |
| 279 | #define GGML_F16_VEC_MUL GGML_F16x8_MUL |
| 280 | #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE |
| 281 | #else |
| 282 | // if FP16 vector arithmetic is not supported, we use FP32 instead |
| 283 | // and take advantage of the vcvt_ functions to convert to/from FP16 |
| 284 | |
| 285 | #define GGML_F16_STEP 16 |
| 286 | #define GGML_F16_EPR 4 |
| 287 | |
| 288 | #define GGML_F32Cx4 float32x4_t |
| 289 | #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) |
| 290 | #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) |
| 291 | #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) |
| 292 | #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) |
| 293 | #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) |
| 294 | #define GGML_F32Cx4_ADD vaddq_f32 |
| 295 | #define GGML_F32Cx4_MUL vmulq_f32 |
| 296 | #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE |
| 297 | |
| 298 | #define GGML_F16_VEC GGML_F32Cx4 |
| 299 | #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO |
| 300 | #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 |
| 301 | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) |
| 302 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i]) |
| 303 | #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA |
| 304 | #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD |
| 305 | #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL |
| 306 | #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE |
| 307 | #endif |
| 308 | |
| 309 | #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) |
| 310 | |
| 311 | #define GGML_SIMD |
| 312 | |
| 313 | // F32 NEON |
| 314 | |
| 315 | #define GGML_F32_STEP 16 |
| 316 | #define GGML_F32_EPR 4 |
| 317 | |
| 318 | #define GGML_F32x4 float32x4_t |
| 319 | #define GGML_F32x4_ZERO vdupq_n_f32(0.0f) |
| 320 | #define GGML_F32x4_SET1(x) vdupq_n_f32(x) |
| 321 | #define GGML_F32x4_LOAD vld1q_f32 |
| 322 | #define GGML_F32x4_STORE vst1q_f32 |
| 323 | #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) |
| 324 | #define GGML_F32x4_ADD vaddq_f32 |
| 325 | #define GGML_F32x4_MUL vmulq_f32 |
| 326 | #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) |
| 327 | #define GGML_F32x4_REDUCE(res, x) \ |
| 328 | { \ |
| 329 | int offset = GGML_F32_ARR >> 1; \ |
| 330 | for (int i = 0; i < offset; ++i) { \ |
| 331 | (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ |
| 332 | } \ |
| 333 | offset >>= 1; \ |
| 334 | for (int i = 0; i < offset; ++i) { \ |
| 335 | (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ |
| 336 | } \ |
| 337 | offset >>= 1; \ |
| 338 | for (int i = 0; i < offset; ++i) { \ |
| 339 | (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ |
| 340 | } \ |
| 341 | (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \ |
| 342 | } |
| 343 | |
| 344 | #define GGML_F32_VEC GGML_F32x4 |
| 345 | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
| 346 | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
| 347 | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
| 348 | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
| 349 | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
| 350 | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
| 351 | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
| 352 | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
| 353 | |
| 354 | // F16 NEON |
| 355 | |
| 356 | #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) |
| 357 | #define GGML_F16_STEP 32 |
| 358 | #define GGML_F16_EPR 8 |
| 359 | |
| 360 | #define GGML_F16x8 float16x8_t |
| 361 | #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) |
| 362 | #define GGML_F16x8_SET1(x) vdupq_n_f16(x) |
| 363 | #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) |
| 364 | #define GGML_F16x8_STORE vst1q_f16 |
| 365 | #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) |
| 366 | #define GGML_F16x8_ADD vaddq_f16 |
| 367 | #define GGML_F16x8_MUL vmulq_f16 |
| 368 | #define GGML_F16x8_REDUCE(res, x) \ |
| 369 | do { \ |
| 370 | int offset = GGML_F16_ARR >> 1; \ |
| 371 | for (int i = 0; i < offset; ++i) { \ |
| 372 | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
| 373 | } \ |
| 374 | offset >>= 1; \ |
| 375 | for (int i = 0; i < offset; ++i) { \ |
| 376 | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
| 377 | } \ |
| 378 | offset >>= 1; \ |
| 379 | for (int i = 0; i < offset; ++i) { \ |
| 380 | (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ |
| 381 | } \ |
| 382 | const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ |
| 383 | const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ |
| 384 | (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ |
| 385 | } while (0) |
| 386 | |
| 387 | #define GGML_F16_VEC GGML_F16x8 |
| 388 | #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO |
| 389 | #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 |
| 390 | #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) |
| 391 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i]) |
| 392 | #define GGML_F16_VEC_FMA GGML_F16x8_FMA |
| 393 | #define GGML_F16_VEC_ADD GGML_F16x8_ADD |
| 394 | #define GGML_F16_VEC_MUL GGML_F16x8_MUL |
| 395 | #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE |
| 396 | #else |
| 397 | // if FP16 vector arithmetic is not supported, we use FP32 instead |
| 398 | // and take advantage of the vcvt_ functions to convert to/from FP16 |
| 399 | |
| 400 | #define GGML_F16_STEP 16 |
| 401 | #define GGML_F16_EPR 4 |
| 402 | |
| 403 | #define GGML_F32Cx4 float32x4_t |
| 404 | #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) |
| 405 | #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) |
| 406 | #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) |
| 407 | #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) |
| 408 | #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) |
| 409 | #define GGML_F32Cx4_ADD vaddq_f32 |
| 410 | #define GGML_F32Cx4_MUL vmulq_f32 |
| 411 | #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE |
| 412 | |
| 413 | #define GGML_F16_VEC GGML_F32Cx4 |
| 414 | #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO |
| 415 | #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 |
| 416 | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) |
| 417 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i]) |
| 418 | #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA |
| 419 | #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD |
| 420 | #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL |
| 421 | #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE |
| 422 | #endif |
| 423 | |
| 424 | #elif defined(__AVX512F__) |
| 425 | |
| 426 | #define GGML_SIMD |
| 427 | |
| 428 | // F32 AVX512 |
| 429 | |
| 430 | #define GGML_F32_STEP 64 |
| 431 | #define GGML_F32_EPR 16 |
| 432 | |
| 433 | #define GGML_F32x16 __m512 |
| 434 | #define GGML_F32x16_ZERO _mm512_setzero_ps() |
| 435 | #define GGML_F32x16_SET1(x) _mm512_set1_ps(x) |
| 436 | #define GGML_F32x16_LOAD _mm512_loadu_ps |
| 437 | #define GGML_F32x16_STORE _mm512_storeu_ps |
| 438 | // _mm512_fmadd_ps is defined in AVX512F so no guard is required |
| 439 | #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) |
| 440 | #define GGML_F32x16_ADD _mm512_add_ps |
| 441 | #define GGML_F32x16_MUL _mm512_mul_ps |
| 442 | #define GGML_F32x16_REDUCE(res, x) \ |
| 443 | do { \ |
| 444 | int offset = GGML_F32_ARR >> 1; \ |
| 445 | for (int i = 0; i < offset; ++i) { \ |
| 446 | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
| 447 | } \ |
| 448 | offset >>= 1; \ |
| 449 | for (int i = 0; i < offset; ++i) { \ |
| 450 | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
| 451 | } \ |
| 452 | offset >>= 1; \ |
| 453 | for (int i = 0; i < offset; ++i) { \ |
| 454 | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
| 455 | } \ |
| 456 | res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ |
| 457 | } while (0) |
| 458 | |
| 459 | // TODO: is this optimal ? |
| 460 | |
| 461 | #define GGML_F32_VEC GGML_F32x16 |
| 462 | #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO |
| 463 | #define GGML_F32_VEC_SET1 GGML_F32x16_SET1 |
| 464 | #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD |
| 465 | #define GGML_F32_VEC_STORE GGML_F32x16_STORE |
| 466 | #define GGML_F32_VEC_FMA GGML_F32x16_FMA |
| 467 | #define GGML_F32_VEC_ADD GGML_F32x16_ADD |
| 468 | #define GGML_F32_VEC_MUL GGML_F32x16_MUL |
| 469 | #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE |
| 470 | |
// F16 AVX512

| 475 | #define GGML_F16_STEP 64 |
| 476 | #define GGML_F16_EPR 16 |
| 477 | |
| 478 | // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead |
| 479 | |
| 480 | #define GGML_F32Cx16 __m512 |
| 481 | #define GGML_F32Cx16_ZERO _mm512_setzero_ps() |
| 482 | #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) |
| 483 | |
| 484 | // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F |
| 485 | // so F16C guard isn't required |
| 486 | #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x))) |
| 487 | #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) |
| 488 | |
| 489 | #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) |
| 490 | #define GGML_F32Cx16_ADD _mm512_add_ps |
| 491 | #define GGML_F32Cx16_MUL _mm512_mul_ps |
| 492 | #define GGML_F32Cx16_REDUCE(res, x) \ |
| 493 | do { \ |
| 494 | int offset = GGML_F32_ARR >> 1; \ |
| 495 | for (int i = 0; i < offset; ++i) { \ |
| 496 | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
| 497 | } \ |
| 498 | offset >>= 1; \ |
| 499 | for (int i = 0; i < offset; ++i) { \ |
| 500 | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
| 501 | } \ |
| 502 | offset >>= 1; \ |
| 503 | for (int i = 0; i < offset; ++i) { \ |
| 504 | x[i] = _mm512_add_ps(x[i], x[offset+i]); \ |
| 505 | } \ |
| 506 | res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ |
| 507 | } while (0) |
| 508 | |
| 509 | #define GGML_F16_VEC GGML_F32Cx16 |
| 510 | #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO |
| 511 | #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1 |
| 512 | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p) |
| 513 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i]) |
| 514 | #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA |
| 515 | #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD |
| 516 | #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL |
| 517 | |
| 518 | #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE |
| 519 | #elif defined(__AVX__) |
| 520 | |
| 521 | #define GGML_SIMD |
| 522 | |
| 523 | // F32 AVX |
| 524 | |
| 525 | #define GGML_F32_STEP 32 |
| 526 | #define GGML_F32_EPR 8 |
| 527 | |
| 528 | #define GGML_F32x8 __m256 |
| 529 | #define GGML_F32x8_ZERO _mm256_setzero_ps() |
| 530 | #define GGML_F32x8_SET1(x) _mm256_set1_ps(x) |
| 531 | #define GGML_F32x8_LOAD _mm256_loadu_ps |
| 532 | #define GGML_F32x8_STORE _mm256_storeu_ps |
| 533 | #if defined(__FMA__) |
| 534 | #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) |
| 535 | #else |
| 536 | #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) |
| 537 | #endif |
| 538 | #define GGML_F32x8_ADD _mm256_add_ps |
| 539 | #define GGML_F32x8_MUL _mm256_mul_ps |
| 540 | #define GGML_F32x8_REDUCE(res, x) \ |
| 541 | do { \ |
| 542 | int offset = GGML_F32_ARR >> 1; \ |
| 543 | for (int i = 0; i < offset; ++i) { \ |
| 544 | x[i] = _mm256_add_ps(x[i], x[offset+i]); \ |
| 545 | } \ |
| 546 | offset >>= 1; \ |
| 547 | for (int i = 0; i < offset; ++i) { \ |
| 548 | x[i] = _mm256_add_ps(x[i], x[offset+i]); \ |
| 549 | } \ |
| 550 | offset >>= 1; \ |
| 551 | for (int i = 0; i < offset; ++i) { \ |
| 552 | x[i] = _mm256_add_ps(x[i], x[offset+i]); \ |
| 553 | } \ |
| 554 | const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ |
| 555 | _mm256_extractf128_ps(x[0], 1)); \ |
| 556 | const __m128 t1 = _mm_hadd_ps(t0, t0); \ |
| 557 | res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ |
| 558 | } while (0) |
| 559 | // TODO: is this optimal ? |
| 560 | |
| 561 | #define GGML_F32_VEC GGML_F32x8 |
| 562 | #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO |
| 563 | #define GGML_F32_VEC_SET1 GGML_F32x8_SET1 |
| 564 | #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD |
| 565 | #define GGML_F32_VEC_STORE GGML_F32x8_STORE |
| 566 | #define GGML_F32_VEC_FMA GGML_F32x8_FMA |
| 567 | #define GGML_F32_VEC_ADD GGML_F32x8_ADD |
| 568 | #define GGML_F32_VEC_MUL GGML_F32x8_MUL |
| 569 | #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE |
| 570 | |
| 571 | // F16 AVX |
| 572 | |
| 573 | #define GGML_F16_STEP 32 |
| 574 | #define GGML_F16_EPR 8 |
| 575 | |
| 576 | // F16 arithmetic is not supported by AVX, so we use F32 instead |
| 577 | |
| 578 | #define GGML_F32Cx8 __m256 |
| 579 | #define GGML_F32Cx8_ZERO _mm256_setzero_ps() |
| 580 | #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) |
| 581 | |
| 582 | #if defined(__F16C__) |
| 583 | // the _mm256_cvt intrinsics require F16C |
| 584 | #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) |
| 585 | #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) |
| 586 | #else |
| 587 | static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { |
| 588 | float tmp[8]; |
| 589 | |
| 590 | for (int i = 0; i < 8; i++) { |
| 591 | tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); |
| 592 | } |
| 593 | |
| 594 | return _mm256_loadu_ps(tmp); |
| 595 | } |
| 596 | static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { |
| 597 | float arr[8]; |
| 598 | |
| 599 | _mm256_storeu_ps(arr, y); |
| 600 | |
| 601 | for (int i = 0; i < 8; i++) |
| 602 | x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); |
| 603 | } |
| 604 | #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) |
| 605 | #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) |
| 606 | #endif |
| 607 | |
| 608 | #define GGML_F32Cx8_FMA GGML_F32x8_FMA |
| 609 | #define GGML_F32Cx8_ADD _mm256_add_ps |
| 610 | #define GGML_F32Cx8_MUL _mm256_mul_ps |
| 611 | #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE |
| 612 | |
| 613 | #define GGML_F16_VEC GGML_F32Cx8 |
| 614 | #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO |
| 615 | #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 |
| 616 | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) |
| 617 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) |
| 618 | #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA |
| 619 | #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD |
| 620 | #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL |
| 621 | #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE |
| 622 | |
| 623 | #elif defined(__POWER9_VECTOR__) |
| 624 | |
| 625 | #define GGML_SIMD |
| 626 | |
| 627 | // F32 POWER9 |
| 628 | |
| 629 | #define GGML_F32_STEP 32 |
| 630 | #define GGML_F32_EPR 4 |
| 631 | |
| 632 | #define GGML_F32x4 vector float |
| 633 | #define GGML_F32x4_ZERO {0.0f} |
| 634 | #define GGML_F32x4_SET1 vec_splats |
| 635 | #define GGML_F32x4_LOAD(p) vec_xl(0, p) |
| 636 | #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) |
| 637 | #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) |
| 638 | #define GGML_F32x4_ADD vec_add |
| 639 | #define GGML_F32x4_MUL vec_mul |
| 640 | #define GGML_F32x4_REDUCE(res, x) \ |
| 641 | { \ |
| 642 | int offset = GGML_F32_ARR >> 1; \ |
| 643 | for (int i = 0; i < offset; ++i) { \ |
| 644 | x[i] = vec_add(x[i], x[offset+i]); \ |
| 645 | } \ |
| 646 | offset >>= 1; \ |
| 647 | for (int i = 0; i < offset; ++i) { \ |
| 648 | x[i] = vec_add(x[i], x[offset+i]); \ |
| 649 | } \ |
| 650 | offset >>= 1; \ |
| 651 | for (int i = 0; i < offset; ++i) { \ |
| 652 | x[i] = vec_add(x[i], x[offset+i]); \ |
| 653 | } \ |
| 654 | res = vec_extract(x[0], 0) + \ |
| 655 | vec_extract(x[0], 1) + \ |
| 656 | vec_extract(x[0], 2) + \ |
| 657 | vec_extract(x[0], 3); \ |
| 658 | } |
| 659 | |
| 660 | #define GGML_F32_VEC GGML_F32x4 |
| 661 | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
| 662 | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
| 663 | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
| 664 | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
| 665 | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
| 666 | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
| 667 | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
| 668 | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
| 669 | |
| 670 | // F16 POWER9 |
| 671 | #define GGML_F16_STEP GGML_F32_STEP |
| 672 | #define GGML_F16_EPR GGML_F32_EPR |
| 673 | #define GGML_F16_VEC GGML_F32x4 |
| 674 | #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO |
| 675 | #define GGML_F16_VEC_SET1 GGML_F32x4_SET1 |
| 676 | #define GGML_F16_VEC_FMA GGML_F32x4_FMA |
| 677 | #define GGML_F16_VEC_ADD GGML_F32x4_ADD |
| 678 | #define GGML_F16_VEC_MUL GGML_F32x4_MUL |
| 679 | #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE |
| 680 | // Use vec_xl, not vec_ld, in case the load address is not aligned. |
| 681 | #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ |
| 682 | vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ |
| 683 | vec_extract_fp32_from_shortl(vec_xl(0, p)) |
| 684 | static inline unsigned char ggml_endian_byte(int i) { |
| 685 | uint16_t tmp_val = 1; |
| 686 | return ((unsigned char *)&tmp_val)[i]; |
| 687 | } |
| 688 | #define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i) |
| 689 | #define GGML_F16_VEC_STORE(p, r, i) \ |
| 690 | if (i & 0x1) \ |
| 691 | vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ |
| 692 | r[i - GGML_ENDIAN_BYTE(0)]), \ |
| 693 | 0, p - GGML_F16_EPR) |
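
// note: the index argument of GGML_F16_VEC_LOAD/STORE exists for this backend: 8 halves
// are loaded at once and expanded into two F32 vectors, so an even i selects the low
// half, an odd i the high half, and the store only writes on odd i (packing the pair
// r[i-1], r[i] in endian-dependent order back into 8 halves). A sketch of the intended
// call pattern (the names ay and p are illustrative):
//
//     for (int j = 0; j < GGML_F16_ARR; j++) {
//         ay[j] = GGML_F16_VEC_LOAD(p + j*GGML_F16_EPR, j);
//         // ... use ay[j] ...
//         GGML_F16_VEC_STORE(p + j*GGML_F16_EPR, ay, j);
//     }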
| 694 | |
| 695 | #elif defined(__wasm_simd128__) |
| 696 | |
| 697 | #define GGML_SIMD |
| 698 | |
| 699 | // F32 WASM |
| 700 | |
| 701 | #define GGML_F32_STEP 16 |
| 702 | #define GGML_F32_EPR 4 |
| 703 | |
| 704 | #define GGML_F32x4 v128_t |
| 705 | #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) |
| 706 | #define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) |
| 707 | #define GGML_F32x4_LOAD wasm_v128_load |
| 708 | #define GGML_F32x4_STORE wasm_v128_store |
| 709 | #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) |
| 710 | #define GGML_F32x4_ADD wasm_f32x4_add |
| 711 | #define GGML_F32x4_MUL wasm_f32x4_mul |
| 712 | #define GGML_F32x4_REDUCE(res, x) \ |
| 713 | { \ |
| 714 | int offset = GGML_F32_ARR >> 1; \ |
| 715 | for (int i = 0; i < offset; ++i) { \ |
| 716 | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
| 717 | } \ |
| 718 | offset >>= 1; \ |
| 719 | for (int i = 0; i < offset; ++i) { \ |
| 720 | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
| 721 | } \ |
| 722 | offset >>= 1; \ |
| 723 | for (int i = 0; i < offset; ++i) { \ |
| 724 | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
| 725 | } \ |
| 726 | res = wasm_f32x4_extract_lane(x[0], 0) + \ |
| 727 | wasm_f32x4_extract_lane(x[0], 1) + \ |
| 728 | wasm_f32x4_extract_lane(x[0], 2) + \ |
| 729 | wasm_f32x4_extract_lane(x[0], 3); \ |
| 730 | } |
| 731 | |
| 732 | #define GGML_F32_VEC GGML_F32x4 |
| 733 | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
| 734 | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
| 735 | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
| 736 | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
| 737 | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
| 738 | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
| 739 | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
| 740 | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
| 741 | |
| 742 | // F16 WASM |
| 743 | |
| 744 | #define GGML_F16_STEP 16 |
| 745 | #define GGML_F16_EPR 4 |
| 746 | |
| 747 | inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { |
| 748 | float tmp[4]; |
| 749 | |
| 750 | tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]); |
| 751 | tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]); |
| 752 | tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]); |
| 753 | tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]); |
| 754 | |
| 755 | return wasm_v128_load(tmp); |
| 756 | } |
| 757 | |
| 758 | inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { |
| 759 | float tmp[4]; |
| 760 | |
| 761 | wasm_v128_store(tmp, x); |
| 762 | |
| 763 | p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]); |
| 764 | p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]); |
| 765 | p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]); |
| 766 | p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]); |
| 767 | } |
| 768 | |
| 769 | #define GGML_F16x4 v128_t |
| 770 | #define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) |
| 771 | #define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) |
| 772 | #define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) |
| 773 | #define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) |
| 774 | #define GGML_F16x4_FMA GGML_F32x4_FMA |
| 775 | #define GGML_F16x4_ADD wasm_f32x4_add |
| 776 | #define GGML_F16x4_MUL wasm_f32x4_mul |
| 777 | #define GGML_F16x4_REDUCE(res, x) \ |
| 778 | { \ |
| 779 | int offset = GGML_F16_ARR >> 1; \ |
| 780 | for (int i = 0; i < offset; ++i) { \ |
| 781 | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
| 782 | } \ |
| 783 | offset >>= 1; \ |
| 784 | for (int i = 0; i < offset; ++i) { \ |
| 785 | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
| 786 | } \ |
| 787 | offset >>= 1; \ |
| 788 | for (int i = 0; i < offset; ++i) { \ |
| 789 | x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ |
| 790 | } \ |
| 791 | res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \ |
| 792 | wasm_f32x4_extract_lane(x[0], 1) + \ |
| 793 | wasm_f32x4_extract_lane(x[0], 2) + \ |
| 794 | wasm_f32x4_extract_lane(x[0], 3)); \ |
| 795 | } |
| 796 | |
| 797 | #define GGML_F16_VEC GGML_F16x4 |
| 798 | #define GGML_F16_VEC_ZERO GGML_F16x4_ZERO |
| 799 | #define GGML_F16_VEC_SET1 GGML_F16x4_SET1 |
| 800 | #define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) |
| 801 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) |
| 802 | #define GGML_F16_VEC_FMA GGML_F16x4_FMA |
| 803 | #define GGML_F16_VEC_ADD GGML_F16x4_ADD |
| 804 | #define GGML_F16_VEC_MUL GGML_F16x4_MUL |
| 805 | #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE |
| 806 | |
| 807 | #elif defined(__SSE3__) |
| 808 | |
| 809 | #define GGML_SIMD |
| 810 | |
| 811 | // F32 SSE |
| 812 | |
| 813 | #define GGML_F32_STEP 32 |
| 814 | #define GGML_F32_EPR 4 |
| 815 | |
| 816 | #define GGML_F32x4 __m128 |
| 817 | #define GGML_F32x4_ZERO _mm_setzero_ps() |
| 818 | #define GGML_F32x4_SET1(x) _mm_set1_ps(x) |
| 819 | #define GGML_F32x4_LOAD _mm_loadu_ps |
| 820 | #define GGML_F32x4_STORE _mm_storeu_ps |
| 821 | #if defined(__FMA__) |
| 822 | // TODO: Does this work? |
| 823 | #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) |
| 824 | #else |
| 825 | #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) |
| 826 | #endif |
| 827 | #define GGML_F32x4_ADD _mm_add_ps |
| 828 | #define GGML_F32x4_MUL _mm_mul_ps |
| 829 | #define GGML_F32x4_REDUCE(res, x) \ |
| 830 | { \ |
| 831 | int offset = GGML_F32_ARR >> 1; \ |
| 832 | for (int i = 0; i < offset; ++i) { \ |
| 833 | x[i] = _mm_add_ps(x[i], x[offset+i]); \ |
| 834 | } \ |
| 835 | offset >>= 1; \ |
| 836 | for (int i = 0; i < offset; ++i) { \ |
| 837 | x[i] = _mm_add_ps(x[i], x[offset+i]); \ |
| 838 | } \ |
| 839 | offset >>= 1; \ |
| 840 | for (int i = 0; i < offset; ++i) { \ |
| 841 | x[i] = _mm_add_ps(x[i], x[offset+i]); \ |
| 842 | } \ |
| 843 | const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ |
| 844 | res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ |
| 845 | } |
| 846 | // TODO: is this optimal ? |
| 847 | |
| 848 | #define GGML_F32_VEC GGML_F32x4 |
| 849 | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
| 850 | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
| 851 | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
| 852 | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
| 853 | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
| 854 | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
| 855 | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
| 856 | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
| 857 | |
| 858 | // F16 SSE |
| 859 | |
| 860 | #define GGML_F16_STEP 32 |
| 861 | #define GGML_F16_EPR 4 |
| 862 | |
| 863 | static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) { |
| 864 | float tmp[4]; |
| 865 | |
| 866 | tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); |
| 867 | tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); |
| 868 | tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); |
| 869 | tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); |
| 870 | |
| 871 | return _mm_loadu_ps(tmp); |
| 872 | } |
| 873 | |
| 874 | static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) { |
| 875 | float arr[4]; |
| 876 | |
| 877 | _mm_storeu_ps(arr, y); |
| 878 | |
| 879 | x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); |
| 880 | x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); |
| 881 | x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); |
| 882 | x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); |
| 883 | } |
| 884 | |
| 885 | #define GGML_F32Cx4 __m128 |
| 886 | #define GGML_F32Cx4_ZERO _mm_setzero_ps() |
| 887 | #define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) |
| 888 | #define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) |
| 889 | #define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) |
| 890 | #define GGML_F32Cx4_FMA GGML_F32x4_FMA |
| 891 | #define GGML_F32Cx4_ADD _mm_add_ps |
| 892 | #define GGML_F32Cx4_MUL _mm_mul_ps |
| 893 | #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE |
| 894 | |
| 895 | #define GGML_F16_VEC GGML_F32Cx4 |
| 896 | #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO |
| 897 | #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 |
| 898 | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) |
| 899 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) |
| 900 | #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA |
| 901 | #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD |
| 902 | #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL |
| 903 | #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE |
| 904 | |
| 905 | #elif defined(__loongarch_asx) |
| 906 | |
| 907 | #define GGML_SIMD |
| 908 | |
| 909 | // F32 LASX |
| 910 | #define GGML_F32_STEP 32 |
| 911 | #define GGML_F32_EPR 8 |
| 912 | |
| 913 | #define GGML_F32x8 __m256 |
| 914 | #define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0) |
| 915 | #define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x)) |
| 916 | #define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0) |
| 917 | #define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0) |
| 918 | #define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a) |
| 919 | #define GGML_F32x8_ADD __lasx_xvfadd_s |
| 920 | #define GGML_F32x8_MUL __lasx_xvfmul_s |
| 921 | #define GGML_F32x8_REDUCE(res, x) \ |
| 922 | do { \ |
| 923 | int offset = GGML_F32_ARR >> 1; \ |
| 924 | for (int i = 0; i < offset; ++i) { \ |
| 925 | x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ |
| 926 | } \ |
| 927 | offset >>= 1; \ |
| 928 | for (int i = 0; i < offset; ++i) { \ |
| 929 | x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ |
| 930 | } \ |
| 931 | offset >>= 1; \ |
| 932 | for (int i = 0; i < offset; ++i) { \ |
| 933 | x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ |
| 934 | } \ |
| 935 | float *tmp_p = (float *)&x[0]; \ |
| 936 | res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \ |
| 937 | } while (0) |
| 938 | // TODO: is this optimal ? |
| 939 | |
| 940 | #define GGML_F32_VEC GGML_F32x8 |
| 941 | #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO |
| 942 | #define GGML_F32_VEC_SET1 GGML_F32x8_SET1 |
| 943 | #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD |
| 944 | #define GGML_F32_VEC_STORE GGML_F32x8_STORE |
| 945 | #define GGML_F32_VEC_FMA GGML_F32x8_FMA |
| 946 | #define GGML_F32_VEC_ADD GGML_F32x8_ADD |
| 947 | #define GGML_F32_VEC_MUL GGML_F32x8_MUL |
| 948 | #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE |
| 949 | |
| 950 | // F16 LASX |
| 951 | |
| 952 | #define GGML_F16_STEP 32 |
| 953 | #define GGML_F16_EPR 8 |
| 954 | |
| 955 | // F16 arithmetic is not supported by LASX, so we use F32 instead |
| 956 | |
| 957 | #define GGML_F32Cx8 __m256 |
| 958 | #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0) |
| 959 | #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x)) |
| 960 | |
| 961 | static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) { |
| 962 | __m256i a; |
| 963 | memcpy(&a, x, sizeof(ggml_fp16_t) * 8); |
| 964 | a = __lasx_xvpermi_d(a, 0 | (1 << 4)); |
| 965 | return __lasx_xvfcvtl_s_h(a); |
| 966 | } |
| 967 | |
| 968 | static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { |
| 969 | __m256i a = __lasx_xvfcvt_h_s(y, y); |
| 970 | a = __lasx_xvpermi_d(a, 0 | (2 << 2)); |
| 971 | memcpy(x, &a, sizeof(ggml_fp16_t) * 8); |
| 972 | } |
| 973 | #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x) |
| 974 | #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y) |
| 975 | |
| 976 | #define GGML_F32Cx8_FMA GGML_F32x8_FMA |
| 977 | #define GGML_F32Cx8_ADD __lasx_xvfadd_s |
| 978 | #define GGML_F32Cx8_MUL __lasx_xvfmul_s |
| 979 | #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE |
| 980 | |
| 981 | #define GGML_F16_VEC GGML_F32Cx8 |
| 982 | #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO |
| 983 | #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 |
| 984 | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) |
| 985 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) |
| 986 | #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA |
| 987 | #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD |
| 988 | #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL |
| 989 | #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE |
| 990 | |
| 991 | #elif defined(__loongarch_sx) |
| 992 | |
| 993 | #define GGML_SIMD |
| 994 | |
| 995 | // F32 LSX |
| 996 | |
| 997 | #define GGML_F32_STEP 32 |
| 998 | #define GGML_F32_EPR 4 |
| 999 | |
| 1000 | #define GGML_F32x4 __m128 |
| 1001 | #define GGML_F32x4_ZERO (__m128)__lsx_vldi(0) |
| 1002 | #define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x)) |
| 1003 | #define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0) |
| 1004 | #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0) |
| 1005 | #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) |
| 1006 | #define GGML_F32x4_ADD __lsx_vfadd_s |
| 1007 | #define GGML_F32x4_MUL __lsx_vfmul_s |
| 1008 | |
| 1009 | #define GGML_F32x4_REDUCE(res, x) \ |
| 1010 | { \ |
| 1011 | int offset = GGML_F32_ARR >> 1; \ |
| 1012 | for (int i = 0; i < offset; ++i) { \ |
| 1013 | x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ |
| 1014 | } \ |
| 1015 | offset >>= 1; \ |
| 1016 | for (int i = 0; i < offset; ++i) { \ |
| 1017 | x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ |
| 1018 | } \ |
| 1019 | offset >>= 1; \ |
| 1020 | for (int i = 0; i < offset; ++i) { \ |
| 1021 | x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ |
| 1022 | } \ |
| 1023 | __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \ |
| 1024 | __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \ |
| 1025 | __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \ |
| 1026 | __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \ |
| 1027 | __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \ |
| 1028 | __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \ |
| 1029 | res = (ggml_float) ((v4f32)t5)[0]; \ |
| 1030 | } |
| 1031 | |
| 1032 | #define GGML_F32_VEC GGML_F32x4 |
| 1033 | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
| 1034 | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
| 1035 | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
| 1036 | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
| 1037 | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
| 1038 | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
| 1039 | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
| 1040 | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
| 1041 | |
| 1042 | // F16 LSX |
| 1043 | |
| 1044 | #define GGML_F16_STEP 32 |
| 1045 | #define GGML_F16_EPR 4 |
| 1046 | |
| 1047 | static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { |
| 1048 | float tmp[4]; |
| 1049 | |
| 1050 | tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); |
| 1051 | tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); |
| 1052 | tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); |
| 1053 | tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); |
| 1054 | |
| 1055 | return (__m128)__lsx_vld(tmp, 0); |
| 1056 | } |
| 1057 | |
| 1058 | static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { |
| 1059 | float arr[4]; |
| 1060 | |
| 1061 | __lsx_vst(y, arr, 0); |
| 1062 | |
| 1063 | x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); |
| 1064 | x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); |
| 1065 | x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); |
| 1066 | x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); |
| 1067 | } |
| 1068 | |
| 1069 | #define GGML_F32Cx4 __m128 |
| 1070 | #define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0) |
| 1071 | #define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x)) |
| 1072 | #define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x) |
| 1073 | #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y) |
| 1074 | #define GGML_F32Cx4_FMA GGML_F32x4_FMA |
| 1075 | #define GGML_F32Cx4_ADD __lsx_vfadd_s |
| 1076 | #define GGML_F32Cx4_MUL __lsx_vfmul_s |
| 1077 | #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE |
| 1078 | |
| 1079 | #define GGML_F16_VEC GGML_F32Cx4 |
| 1080 | #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO |
| 1081 | #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 |
| 1082 | #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) |
| 1083 | #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) |
| 1084 | #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA |
| 1085 | #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD |
| 1086 | #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL |
| 1087 | #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE |
| 1088 | |
| 1089 | #elif defined(__VXE__) || defined(__VXE2__) |
| 1090 | |
| 1091 | #define GGML_SIMD |
| 1092 | |
| 1093 | // F32 s390x |
| 1094 | |
| 1095 | #define GGML_F32_STEP 32 |
| 1096 | #define GGML_F32_EPR 4 |
| 1097 | |
| 1098 | #define GGML_F32x4 float32x4_t |
| 1099 | #define GGML_F32x4_ZERO vec_splats(0.0f) |
| 1100 | #define GGML_F32x4_SET1 vec_splats |
| 1101 | #define GGML_F32x4_LOAD(p) vec_xl(0, p) |
| 1102 | #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) |
| 1103 | #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) |
| 1104 | #define GGML_F32x4_ADD vec_add |
| 1105 | #define GGML_F32x4_MUL vec_mul |
| 1106 | #define GGML_F32x4_REDUCE(res, x) \ |
| 1107 | { \ |
| 1108 | int offset = GGML_F32_ARR >> 1; \ |
| 1109 | for (int i = 0; i < offset; ++i) { \ |
| 1110 | x[i] = vec_add(x[i], x[offset + i]); \ |
| 1111 | } \ |
| 1112 | offset >>= 1; \ |
| 1113 | for (int i = 0; i < offset; ++i) { \ |
| 1114 | x[i] = vec_add(x[i], x[offset + i]); \ |
| 1115 | } \ |
| 1116 | offset >>= 1; \ |
| 1117 | for (int i = 0; i < offset; ++i) { \ |
| 1118 | x[i] = vec_add(x[i], x[offset + i]); \ |
| 1119 | } \ |
| 1120 | float32x4_t tmp = x[0] + vec_reve(x[0]); \ |
| 1121 | res = tmp[0] + tmp[1]; \ |
| 1122 | } |
| 1123 | |
| 1124 | #define GGML_F32_VEC GGML_F32x4 |
| 1125 | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
| 1126 | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
| 1127 | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
| 1128 | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
| 1129 | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
| 1130 | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
| 1131 | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
| 1132 | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
| 1133 | |
| 1134 | // F16 s390x |
| 1135 | #define GGML_F16_STEP GGML_F32_STEP |
| 1136 | #define GGML_F16_EPR GGML_F32_EPR |
| 1137 | |
| 1138 | static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { |
| 1139 | float tmp[4]; |
| 1140 | |
| 1141 | for (int i = 0; i < 4; i++) { |
| 1142 | tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); |
| 1143 | } |
| 1144 | |
| 1145 | // note: keep type-cast here to prevent compiler bugs |
| 1146 | // see: https://github.com/ggml-org/llama.cpp/issues/12846 |
| 1147 | return vec_xl(0, (const float *)(tmp)); |
| 1148 | } |
| 1149 | |
| 1150 | static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { |
| 1151 | float arr[4]; |
| 1152 | |
| 1153 | // note: keep type-cast here to prevent compiler bugs |
| 1154 | // see: https://github.com/ggml-org/llama.cpp/issues/12846 |
| 1155 | vec_xst(v_y, 0, (float *)(arr)); |
| 1156 | |
| 1157 | for (int i = 0; i < 4; i++) { |
| 1158 | x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); |
| 1159 | } |
| 1160 | } |
| 1161 | |
| 1162 | #define GGML_F16_VEC GGML_F32x4 |
| 1163 | #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO |
| 1164 | #define GGML_F16_VEC_SET1 GGML_F32x4_SET1 |
| 1165 | #define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p) |
| 1166 | #define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i]) |
| 1167 | #define GGML_F16_VEC_FMA GGML_F32x4_FMA |
| 1168 | #define GGML_F16_VEC_ADD GGML_F32x4_ADD |
| 1169 | #define GGML_F16_VEC_MUL GGML_F32x4_MUL |
| 1170 | #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE |
| 1171 | |
| 1172 | #elif defined(__riscv_v_intrinsic) |
| 1173 | |
| 1174 | // compatible with vlen >= 128 |
| 1175 | |
| 1176 | #define GGML_SIMD |
| 1177 | |
| 1178 | // F32 |
| 1179 | |
| 1180 | #define GGML_F32_STEP 16 |
| 1181 | #define GGML_F32_EPR 4 |
| 1182 | |
| 1183 | #define GGML_F32x4 vfloat32m1_t |
| 1184 | #define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR) |
| 1185 | #define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR) |
| 1186 | #define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR) |
| 1187 | #define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR) |
| 1188 | #define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR) |
| 1189 | #define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR) |
| 1190 | #define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR) |
| 1191 | |
| 1192 | #define GGML_F32_VEC GGML_F32x4 |
| 1193 | #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO |
| 1194 | #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 |
| 1195 | #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD |
| 1196 | #define GGML_F32_VEC_STORE GGML_F32x4_STORE |
| 1197 | #define GGML_F32_VEC_FMA GGML_F32x4_FMA |
| 1198 | #define GGML_F32_VEC_ADD GGML_F32x4_ADD |
| 1199 | #define GGML_F32_VEC_MUL GGML_F32x4_MUL |
| 1200 | #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE |
| 1201 | |
| 1202 | #endif |
| 1203 | |
| 1204 | // GGML_F32_ARR / GGML_F16_ARR |
| 1205 | // number of registers to use per step |
| 1206 | #ifdef GGML_SIMD |
| 1207 | #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) |
| 1208 | #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) |
| 1209 | #endif |
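
// illustrative sketch of a non-reducing kernel built from the same macros (the pattern
// of kernels such as ggml_vec_mad_f32: y[i] += x[i]*v; names x, y, v, n are illustrative):
//
//     const int np = (n & ~(GGML_F32_STEP - 1));
//
//     GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
//
//     GGML_F32_VEC ax[GGML_F32_ARR];
//     GGML_F32_VEC ay[GGML_F32_ARR];
//
//     for (int i = 0; i < np; i += GGML_F32_STEP) {
//         for (int j = 0; j < GGML_F32_ARR; j++) {
//             ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
//             ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
//             ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
//
//             GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
//         }
//     }
//
//     for (int i = np; i < n; ++i) { // scalar tail
//         y[i] += x[i]*v;
//     }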
| 1210 | |
| 1211 | #ifdef __cplusplus |
| 1212 | } |
| 1213 | #endif |
| 1214 | |