| 1 | // Vectorized functions for fundamental operations |
| 2 | |
| 3 | #pragma once |
| 4 | |
| 5 | #include "ggml-impl.h" |
| 6 | #include "simd-mappings.h" |
| 7 | #include "ggml.h" |
| 8 | #include "ggml-cpu.h" |
| 9 | |
| 10 | #if defined(GGML_USE_ACCELERATE) |
| 11 | #include <Accelerate/Accelerate.h> |
| 12 | #endif |
| 13 | |
| 14 | // floating point type used to accumulate sums |
| 15 | typedef double ggml_float; |
| 16 | |
| 17 | #define GGML_GELU_FP16 |
| 18 | #define GGML_GELU_QUICK_FP16 |
| 19 | |
| 20 | #define GGML_SOFT_MAX_UNROLL 4 |
| 21 | #define GGML_VEC_DOT_UNROLL 2 |
| 22 | #define GGML_VEC_MAD_UNROLL 32 |
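// GGML_VEC_DOT_UNROLL is the number of rows ggml_vec_dot_f16_unroll() processes per call,
// GGML_VEC_MAD_UNROLL the number of rows ggml_vec_mad_f32_unroll() processes per call, and
// GGML_SOFT_MAX_UNROLL the corresponding unroll factor used by the soft-max kernels.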
| 23 | |
| 24 | #ifdef __cplusplus |
| 25 | extern "C" { |
| 26 | #endif |
| 27 | |
| 28 | // |
| 29 | // global data |
| 30 | // |
| 31 | |
| 32 | // precomputed gelu table for f16 (128 KB) |
| 33 | extern ggml_fp16_t ggml_table_gelu_f16[1 << 16]; |
| 34 | |
| 35 | // precomputed quick gelu table for f16 (128 KB) |
| 36 | extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; |
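// The tables are indexed directly by the 16-bit pattern of the f16 argument
// (see ggml_vec_gelu_f16() below), hence the 1 << 16 entries (2 bytes each = 128 KB).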
| 37 | |
| 38 | // |
| 39 | // fundamental operations |
| 40 | // |
| 41 | |
| 42 | void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); |
| 43 | void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); |
| 44 | void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); |
| 45 | |
| 46 | void ggml_vec_silu_f32(const int n, float * y, const float * x); |
| 47 | ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); // also centers y (y = y - mean) |
| 48 | ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max); |
| 49 | ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max); |
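// Illustrative call for a plain dot product (a minimal sketch; `a`, `b` and `out` are
// hypothetical buffers): the byte strides bs/bx/by only matter when a kernel computes more
// than one result per call (nrc > 1); for a single product pass 0 and nrc = 1, exactly as
// ggml_vec_norm_f32() does further below:
//
//     float a[64], b[64], out;
//     ggml_vec_dot_f32(64, &out, 0, a, 0, b, 0, 1);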
| 50 | |
| 51 | inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } |
| 52 | inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } |
| 53 | |
| 54 | inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } |
| 55 | inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } |
| 56 | |
| 57 | inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } |
| 58 | inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } |
| 59 | |
| 60 | inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { |
| 61 | int i = 0; |
| 62 | #if defined(__AVX2__) |
| 63 | for (; i + 7 < n; i += 8) { |
| 64 | __m256 vx = _mm256_loadu_ps(x + i); |
| 65 | __m256 vy = _mm256_loadu_ps(y + i); |
| 66 | __m256 vz = _mm256_add_ps(vx, vy); |
| 67 | _mm256_storeu_ps(z + i, vz); |
| 68 | } |
| 69 | #endif |
| 70 | for (; i < n; ++i) { |
| 71 | z[i] = x[i] + y[i]; |
| 72 | } |
| 73 | } |
| 74 | |
| 75 | inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { |
| 76 | for (int i = 0; i < n; ++i) { |
| 77 | z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i])); |
| 78 | } |
| 79 | } |
| 80 | inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } |
| 81 | inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } |
| 82 | inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } |
| 83 | inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } |
| 84 | inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { |
| 85 | for (int i = 0; i < n; ++i) { |
| 86 | z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i])); |
| 87 | } |
| 88 | } |
| 89 | inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } |
| 90 | inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } |
| 91 | inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } |
| 92 | inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 93 | for (int i = 0; i < n; ++i) { |
| 94 | y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i])); |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } |
| 99 | inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { |
| 100 | for (int i = 0; i < n; ++i) { |
| 101 | z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i])); |
| 102 | } |
| 103 | } |
| 104 | inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } |
| 105 | inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { |
| 106 | for (int i = 0; i < n; ++i) { |
| 107 | z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i])); |
| 108 | } |
| 109 | } |
| 110 | |
| 111 | // compute GGML_VEC_DOT_UNROLL dot products at once |
| 112 | // xs - x row stride in bytes |
| 113 | inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) { |
| 114 | ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; |
| 115 | |
| 116 | ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL]; |
| 117 | |
| 118 | for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { |
| 119 | x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); |
| 120 | } |
| 121 | |
| 122 | #if defined(GGML_SIMD) |
| 123 | #if defined(__ARM_FEATURE_SVE) |
| 124 | |
| 125 | const int sve_register_length = svcntb() * 8; |
| 126 | const int ggml_f16_epr = sve_register_length / 16; // f16 elements per SVE register |
| 127 | const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers |
| 128 | |
| 129 | const int np = (n & ~(ggml_f16_step - 1)); |
| 130 | |
| 131 | svfloat16_t sum_00 = svdup_n_f16(0.0f); |
| 132 | svfloat16_t sum_01 = svdup_n_f16(0.0f); |
| 133 | svfloat16_t sum_02 = svdup_n_f16(0.0f); |
| 134 | svfloat16_t sum_03 = svdup_n_f16(0.0f); |
| 135 | |
| 136 | svfloat16_t sum_10 = svdup_n_f16(0.0f); |
| 137 | svfloat16_t sum_11 = svdup_n_f16(0.0f); |
| 138 | svfloat16_t sum_12 = svdup_n_f16(0.0f); |
| 139 | svfloat16_t sum_13 = svdup_n_f16(0.0f); |
| 140 | |
| 141 | svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; |
| 142 | svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; |
| 143 | |
| 144 | for (int i = 0; i < np; i += ggml_f16_step) { |
| 145 | ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements |
| 146 | |
| 147 | ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements |
| 148 | sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1 |
| 149 | ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements |
| 150 | sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1); |
| 151 | |
| 152 | ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements |
| 153 | |
| 154 | ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements |
| 155 | sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2); |
| 156 | ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1); |
| 157 | sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2); |
| 158 | |
| 159 | ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); |
| 160 | |
| 161 | ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2); |
| 162 | sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3); |
| 163 | ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2); |
| 164 | sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3); |
| 165 | |
| 166 | ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); |
| 167 | |
| 168 | ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3); |
| 169 | sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4); |
| 170 | ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3); |
| 171 | sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4); |
| 172 | |
| 173 | ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); |
| 174 | |
| 175 | ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4); |
| 176 | |
| 177 | sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5); |
| 178 | ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4); |
| 179 | sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5); |
| 180 | |
| 181 | ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); |
| 182 | |
| 183 | ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5); |
| 184 | |
| 185 | sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6); |
| 186 | ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5); |
| 187 | sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6); |
| 188 | |
| 189 | ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); |
| 190 | |
| 191 | ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6); |
| 192 | |
| 193 | sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7); |
| 194 | ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6); |
| 195 | sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7); |
| 196 | |
| 197 | ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); |
| 198 | |
| 199 | ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7); |
| 200 | |
| 201 | sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8); |
| 202 | ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7); |
| 203 | sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8); |
| 204 | } |
| 205 | |
| 206 | const int np2 = (n & ~(ggml_f16_epr - 1)); |
| 207 | for (int k = np; k < np2; k += ggml_f16_epr) { |
| 208 | svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); |
| 209 | |
| 210 | svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0); |
| 211 | sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry); |
| 212 | rx = GGML_F16x_VEC_LOAD(x[1] + k, 0); |
| 213 | sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry); |
| 214 | } |
| 215 | |
| 216 | if (np2 < n) { |
| 217 | svbool_t pg = svwhilelt_b16(np2, n); |
| 218 | svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2)); |
| 219 | svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2)); |
| 220 | svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); |
| 221 | |
| 222 | sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00); |
| 223 | sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10); |
| 224 | } |
| 225 | GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03); |
| 226 | GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13); |
| 227 | #elif defined(__riscv_v_intrinsic) |
| 228 | // todo: RVV impl |
| 229 | for (int i = 0; i < n; ++i) { |
| 230 | for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { |
| 231 | sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); |
| 232 | } |
| 233 | } |
| 234 | #else |
| 235 | const int np = (n & ~(GGML_F16_STEP - 1)); |
| 236 | |
| 237 | GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; |
| 238 | |
| 239 | GGML_F16_VEC ax[GGML_F16_ARR]; |
| 240 | GGML_F16_VEC ay[GGML_F16_ARR]; |
| 241 | |
| 242 | for (int i = 0; i < np; i += GGML_F16_STEP) { |
| 243 | for (int j = 0; j < GGML_F16_ARR; j++) { |
| 244 | ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); |
| 245 | |
| 246 | for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { |
| 247 | ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); |
| 248 | |
| 249 | sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); |
| 250 | } |
| 251 | } |
| 252 | } |
| 253 | |
| 254 | // reduce the partial sums of each unrolled row into sumf[k] |
| 255 | for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { |
| 256 | GGML_F16_VEC_REDUCE(sumf[k], sum[k]); |
| 257 | } |
| 258 | |
| 259 | // leftovers |
| 260 | for (int i = np; i < n; ++i) { |
| 261 | for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { |
| 262 | sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); |
| 263 | } |
| 264 | } |
| 265 | #endif |
| 266 | #else |
| 267 | for (int i = 0; i < n; ++i) { |
| 268 | for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { |
| 269 | sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); |
| 270 | } |
| 271 | } |
| 272 | #endif |
| 273 | |
| 274 | for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { |
| 275 | s[i] = (float)sumf[i]; |
| 276 | } |
| 277 | } |
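// Illustrative call (a minimal sketch; `mat_row0`, `vec` and `row_stride` are hypothetical):
// this computes GGML_VEC_DOT_UNROLL (= 2) dot products of consecutive f16 rows against the
// same vector, with xs giving the distance between rows in bytes:
//
//     float s[GGML_VEC_DOT_UNROLL];
//     ggml_vec_dot_f16_unroll(n, row_stride, s, (void *) mat_row0, vec);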
| 278 | |
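// ggml_vec_mad_f32: y := y + x*v (a BLAS-style axpy), specialized per architecture below.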
| 279 | inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { |
| 280 | #if defined(GGML_SIMD) |
| 281 | #if defined(__ARM_FEATURE_SVE) |
| 282 | |
| 283 | const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; |
| 284 | const int ggml_f32_epr = sve_register_length / 32; // f32 elements per register; SVE128:4, SVE256:8, SVE512:16 |
| 285 | const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers |
| 286 | GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); |
| 287 | |
| 288 | const int np = (n & ~(ggml_f32_step - 1)); |
| 289 | svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; |
| 290 | svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; |
| 291 | for (int i = 0; i < np; i += ggml_f32_step) { |
| 292 | |
| 293 | ax1 = GGML_F32_VEC_LOAD(x + i); |
| 294 | ay1 = GGML_F32_VEC_LOAD(y + i); |
| 295 | ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx); |
| 296 | |
| 297 | GGML_F32_VEC_STORE(y + i, ay1); |
| 298 | |
| 299 | ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr); |
| 300 | ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); |
| 301 | ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx); |
| 302 | |
| 303 | GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2); |
| 304 | |
| 305 | ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr); |
| 306 | ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr); |
| 307 | ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx); |
| 308 | |
| 309 | GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3); |
| 310 | |
| 311 | ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr); |
| 312 | ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr); |
| 313 | ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx); |
| 314 | |
| 315 | GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4); |
| 316 | |
| 317 | ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr); |
| 318 | ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr); |
| 319 | ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx); |
| 320 | |
| 321 | GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5); |
| 322 | |
| 323 | ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr); |
| 324 | ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr); |
| 325 | ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx); |
| 326 | |
| 327 | GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6); |
| 328 | |
| 329 | ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr); |
| 330 | ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr); |
| 331 | ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx); |
| 332 | |
| 333 | GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7); |
| 334 | |
| 335 | ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr); |
| 336 | ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr); |
| 337 | ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx); |
| 338 | |
| 339 | GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8); |
| 340 | } |
| 341 | // leftovers |
| 342 | // Since the loop above is unrolled 8x, the leftovers lie in the range [0, ggml_f32_step) and are handled by the loop below |
| 343 | const int np2 = (n & ~(ggml_f32_epr - 1)); |
| 344 | for (int i = np; i < np2; i += ggml_f32_epr) { |
| 345 | ax1 = GGML_F32_VEC_LOAD(x + i); |
| 346 | ay1 = GGML_F32_VEC_LOAD(y + i); |
| 347 | ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx); |
| 348 | |
| 349 | GGML_F32_VEC_STORE(y + i, ay1); |
| 350 | } |
| 351 | // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only |
| 352 | if (np2 < n) { |
| 353 | svbool_t pg = svwhilelt_b32(np2, n); |
| 354 | ax1 = svld1_f32(pg, x + np2); |
| 355 | ay1 = svld1_f32(pg, y + np2); |
| 356 | ay1 = svmad_f32_m(pg, ax1, vx, ay1); |
| 357 | |
| 358 | svst1_f32(pg, y + np2, ay1); |
| 359 | } |
| 360 | #elif defined(__riscv_v_intrinsic) |
| 361 | for (int i = 0, avl; i < n; i += avl) { |
| 362 | avl = __riscv_vsetvl_e32m8(n - i); |
| 363 | vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl); |
| 364 | vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl); |
| 365 | vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl); |
| 366 | __riscv_vse32_v_f32m8(&y[i], ny, avl); |
| 367 | } |
| 368 | #else |
| 369 | const int np = (n & ~(GGML_F32_STEP - 1)); |
| 370 | |
| 371 | GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); |
| 372 | |
| 373 | GGML_F32_VEC ax[GGML_F32_ARR]; |
| 374 | GGML_F32_VEC ay[GGML_F32_ARR]; |
| 375 | |
| 376 | for (int i = 0; i < np; i += GGML_F32_STEP) { |
| 377 | for (int j = 0; j < GGML_F32_ARR; j++) { |
| 378 | ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); |
| 379 | ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); |
| 380 | ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); |
| 381 | |
| 382 | GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); |
| 383 | } |
| 384 | } |
| 385 | |
| 386 | // leftovers |
| 387 | for (int i = np; i < n; ++i) { |
| 388 | y[i] += x[i]*v; |
| 389 | } |
| 390 | #endif |
| 391 | #else |
| 392 | // scalar |
| 393 | for (int i = 0; i < n; ++i) { |
| 394 | y[i] += x[i]*v; |
| 395 | } |
| 396 | #endif |
| 397 | } |
| 398 | |
| 399 | inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) { |
| 400 | #if defined(GGML_SIMD) |
| 401 | #if defined(__ARM_FEATURE_SVE) |
| 402 | const int sve_register_length = svcntb() * 8; |
| 403 | const int ggml_f16_epr = sve_register_length / 16; |
| 404 | const int ggml_f16_step = 8 * ggml_f16_epr; |
| 405 | |
| 406 | GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); |
| 407 | |
| 408 | const int np = (n & ~(ggml_f16_step - 1)); |
| 409 | |
| 410 | svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; |
| 411 | svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; |
| 412 | for (int i = 0; i < np; i += ggml_f16_step) { |
| 413 | ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); |
| 414 | ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); |
| 415 | ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx); |
| 416 | |
| 417 | GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0); |
| 418 | |
| 419 | ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); |
| 420 | ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); |
| 421 | ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx); |
| 422 | |
| 423 | GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1); |
| 424 | |
| 425 | ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); |
| 426 | ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); |
| 427 | ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx); |
| 428 | |
| 429 | GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2); |
| 430 | |
| 431 | ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); |
| 432 | ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); |
| 433 | ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx); |
| 434 | |
| 435 | GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3); |
| 436 | |
| 437 | ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); |
| 438 | ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); |
| 439 | ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx); |
| 440 | |
| 441 | GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4); |
| 442 | |
| 443 | ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); |
| 444 | ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); |
| 445 | ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx); |
| 446 | |
| 447 | GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5); |
| 448 | |
| 449 | ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); |
| 450 | ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); |
| 451 | ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx); |
| 452 | |
| 453 | GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6); |
| 454 | |
| 455 | ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); |
| 456 | ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); |
| 457 | ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx); |
| 458 | |
| 459 | GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7); |
| 460 | } |
| 461 | const int np2 = (n & ~(ggml_f16_epr - 1)); |
| 462 | for (int k = np; k < np2; k += ggml_f16_epr) { |
| 463 | svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); |
| 464 | svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); |
| 465 | ry = GGML_F16x_VEC_FMA(ry, rx, vx); |
| 466 | |
| 467 | GGML_F16x_VEC_STORE(y + k, ry, 0); |
| 468 | } |
| 469 | |
| 470 | if (np2 < n) { |
| 471 | svbool_t pg = svwhilelt_b16(np2, n); |
| 472 | svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); |
| 473 | svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); |
| 474 | hy = svmad_f16_x(pg, hx, vx, hy); |
| 475 | svst1_f16(pg, (__fp16 *)(y + np2), hy); |
| 476 | } |
| 477 | |
| 478 | #elif defined(__riscv_v_intrinsic) |
| 479 | // todo: RVV impl |
| 480 | // scalar |
| 481 | for (int i = 0; i < n; ++i) { |
| 482 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); |
| 483 | } |
| 484 | #else |
| 485 | const int np = (n & ~(GGML_F16_STEP - 1)); |
| 486 | |
| 487 | GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); |
| 488 | |
| 489 | GGML_F16_VEC ax[GGML_F16_ARR]; |
| 490 | GGML_F16_VEC ay[GGML_F16_ARR]; |
| 491 | |
| 492 | for (int i = 0; i < np; i += GGML_F16_STEP) { |
| 493 | for (int j = 0; j < GGML_F16_ARR; j++) { |
| 494 | ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); |
| 495 | ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); |
| 496 | ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); |
| 497 | |
| 498 | GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); |
| 499 | } |
| 500 | } |
| 501 | |
| 502 | // leftovers |
| 503 | for (int i = np; i < n; ++i) { |
| 504 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); |
| 505 | } |
| 506 | #endif |
| 507 | #else |
| 508 | // scalar |
| 509 | for (int i = 0; i < n; ++i) { |
| 510 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); |
| 511 | } |
| 512 | #endif |
| 513 | } |
| 514 | |
| 515 | // xs and vs are byte strides of x and v |
| 516 | inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) { |
| 517 | |
| 518 | const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL]; |
| 519 | const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL]; |
| 520 | |
| 521 | for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { |
| 522 | x[i] = (const float *) ((const char *) xv + i*xs); |
| 523 | v[i] = (const float *) ((const char *) vv + i*vs); |
| 524 | } |
| 525 | |
| 526 | #if defined(GGML_SIMD) |
| 527 | #if defined(__ARM_FEATURE_SVE) |
| 528 | // route to the scalar implementation // TODO: write SVE code |
| 529 | for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { |
| 530 | for (int i = 0; i < n; ++i) { |
| 531 | y[i] += x[k][i]*v[k][0]; |
| 532 | } |
| 533 | } |
| 534 | #elif defined(__riscv_v_intrinsic) |
| 535 | for (int i = 0, avl; i < n; i += avl) { |
| 536 | avl = __riscv_vsetvl_e32m8(n - i); |
| 537 | vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl); |
| 538 | for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) { |
| 539 | vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl); |
| 540 | ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl); |
| 541 | } |
| 542 | __riscv_vse32_v_f32m8(&y[i], ay, avl); |
| 543 | } |
| 544 | #else |
| 545 | const int np = (n & ~(GGML_F32_STEP - 1)); |
| 546 | |
| 547 | GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; |
| 548 | |
| 549 | for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { |
| 550 | vx[k] = GGML_F32_VEC_SET1(v[k][0]); |
| 551 | } |
| 552 | |
| 553 | GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR]; |
| 554 | GGML_F32_VEC ay[GGML_F32_ARR]; |
| 555 | |
| 556 | for (int i = 0; i < np; i += GGML_F32_STEP) { |
| 557 | for (int j = 0; j < GGML_F32_ARR; j++) { |
| 558 | ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); |
| 559 | |
| 560 | for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { |
| 561 | ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR); |
| 562 | ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); |
| 563 | } |
| 564 | |
| 565 | GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); |
| 566 | } |
| 567 | } |
| 568 | |
| 569 | // leftovers |
| 570 | for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { |
| 571 | for (int i = np; i < n; ++i) { |
| 572 | y[i] += x[k][i]*v[k][0]; |
| 573 | } |
| 574 | } |
| 575 | #endif |
| 576 | #else |
| 577 | // scalar |
| 578 | for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { |
| 579 | for (int i = 0; i < n; ++i) { |
| 580 | y[i] += x[k][i]*v[k][0]; |
| 581 | } |
| 582 | } |
| 583 | #endif |
| 584 | } |
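// In other words, ggml_vec_mad_f32_unroll() accumulates y[i] += sum_k x[k][i]*v[k][0],
// where row k of x and v starts k*xs and k*vs bytes past xv and vv respectively
// (see the pointer setup at the top of the function).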
| 585 | |
| 586 | inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) { |
| 587 | #if defined(GGML_USE_ACCELERATE) |
| 588 | vDSP_vsmsa(x, 1, &s, &b, y, 1, n); |
| 589 | #elif defined(GGML_SIMD) |
| 590 | #if defined(__ARM_FEATURE_SVE) |
| 591 | // scalar ; TODO: Write SVE code |
| 592 | for (int i = 0; i < n; ++i) { |
| 593 | y[i] = x[i]*s + b; |
| 594 | } |
| 595 | #elif defined(__riscv_v_intrinsic) |
| 596 | for (int i = 0, avl; i < n; i += avl) { |
| 597 | avl = __riscv_vsetvl_e32m8(n - i); |
| 598 | vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl); |
| 599 | vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl); |
| 600 | vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl); |
| 601 | __riscv_vse32_v_f32m8(&y[i], ny, avl); |
| 602 | } |
| 603 | #else |
| 604 | const int np = (n & ~(GGML_F32_STEP - 1)); |
| 605 | |
| 606 | GGML_F32_VEC vs = GGML_F32_VEC_SET1(s); |
| 607 | GGML_F32_VEC vb = GGML_F32_VEC_SET1(b); |
| 608 | |
| 609 | GGML_F32_VEC ay[GGML_F32_ARR]; |
| 610 | |
| 611 | for (int i = 0; i < np; i += GGML_F32_STEP) { |
| 612 | for (int j = 0; j < GGML_F32_ARR; j++) { |
| 613 | ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); |
| 614 | ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs); |
| 615 | |
| 616 | GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); |
| 617 | } |
| 618 | } |
| 619 | |
| 620 | // leftovers |
| 621 | for (int i = np; i < n; ++i) { |
| 622 | y[i] = x[i]*s + b; |
| 623 | } |
| 624 | #endif |
| 625 | #else |
| 626 | // scalar |
| 627 | for (int i = 0; i < n; ++i) { |
| 628 | y[i] = x[i]*s + b; |
| 629 | } |
| 630 | #endif |
| 631 | } |
| 632 | |
| 633 | //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } |
| 634 | inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { |
| 635 | #if defined(GGML_USE_ACCELERATE) |
| 636 | vDSP_vsmul(y, 1, &v, y, 1, n); |
| 637 | #elif defined(GGML_SIMD) |
| 638 | #if defined(__ARM_FEATURE_SVE) |
| 639 | const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; |
| 640 | const int ggml_f32_epr = sve_register_length / 32; // f32 elements per register; SVE128:4, SVE256:8, SVE512:16 |
| 641 | const int ggml_f32_step = 2 * ggml_f32_epr; |
| 642 | |
| 643 | GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); |
| 644 | const int np = (n & ~(ggml_f32_step - 1)); |
| 645 | svfloat32_t ay1; |
| 646 | svfloat32_t ay2; |
| 647 | for (int i = 0; i < np; i += ggml_f32_step) { |
| 648 | ay1 = GGML_F32_VEC_LOAD(y + i); |
| 649 | ay1 = GGML_F32_VEC_MUL(ay1, vx); |
| 650 | GGML_F32_VEC_STORE(y + i, ay1); |
| 651 | |
| 652 | ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); |
| 653 | ay2 = GGML_F32_VEC_MUL(ay2, vx); |
| 654 | GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2); |
| 655 | } |
| 656 | // leftovers |
| 657 | // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmul on available elements only |
| 658 | for (int i = np; i < n; i += ggml_f32_epr) { |
| 659 | svbool_t pg = svwhilelt_b32(i, n); |
| 660 | ay1 = svld1_f32(pg, y + i); |
| 661 | ay1 = svmul_f32_m(pg, ay1, vx); |
| 662 | svst1_f32(pg, y + i, ay1); |
| 663 | } |
| 664 | #elif defined(__riscv_v_intrinsic) |
| 665 | for (int i = 0, avl; i < n; i += avl) { |
| 666 | avl = __riscv_vsetvl_e32m8(n - i); |
| 667 | vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl); |
| 668 | vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl); |
| 669 | __riscv_vse32_v_f32m8(&y[i], ny, avl); |
| 670 | } |
| 671 | #else |
| 672 | const int np = (n & ~(GGML_F32_STEP - 1)); |
| 673 | |
| 674 | GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); |
| 675 | |
| 676 | GGML_F32_VEC ay[GGML_F32_ARR]; |
| 677 | |
| 678 | for (int i = 0; i < np; i += GGML_F32_STEP) { |
| 679 | for (int j = 0; j < GGML_F32_ARR; j++) { |
| 680 | ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); |
| 681 | ay[j] = GGML_F32_VEC_MUL(ay[j], vx); |
| 682 | |
| 683 | GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); |
| 684 | } |
| 685 | } |
| 686 | |
| 687 | // leftovers |
| 688 | for (int i = np; i < n; ++i) { |
| 689 | y[i] *= v; |
| 690 | } |
| 691 | #endif |
| 692 | #else |
| 693 | // scalar |
| 694 | for (int i = 0; i < n; ++i) { |
| 695 | y[i] *= v; |
| 696 | } |
| 697 | #endif |
| 698 | } |
| 699 | |
| 700 | inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) { |
| 701 | #if defined(GGML_SIMD) |
| 702 | #if defined(__ARM_FEATURE_SVE) |
| 703 | const int sve_register_length = svcntb() * 8; |
| 704 | const int ggml_f16_epr = sve_register_length / 16; |
| 705 | const int ggml_f16_step = 2 * ggml_f16_epr; |
| 706 | |
| 707 | GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); |
| 708 | const int np = (n & ~(ggml_f16_step - 1)); |
| 709 | svfloat16_t ay1, ay2; |
| 710 | |
| 711 | for (int i = 0; i < np; i += ggml_f16_step) { |
| 712 | ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0); |
| 713 | ay1 = GGML_F16x_VEC_MUL(ay1, vx); |
| 714 | GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0); |
| 715 | |
| 716 | ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1); |
| 717 | ay2 = GGML_F16x_VEC_MUL(ay2, vx); |
| 718 | GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1); |
| 719 | } |
| 720 | // leftovers |
| 721 | // maximum number of leftover elements will be less than ggml_f16_epr. Apply predicated svmul on available elements only |
| 722 | if (np < n) { |
| 723 | svbool_t pg = svwhilelt_b16(np, n); |
| 724 | svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np)); |
| 725 | svfloat16_t out = svmul_f16_m(pg, hy, vx); |
| 726 | svst1_f16(pg, (__fp16 *)(y + np), out); |
| 727 | } |
| 728 | #elif defined(__riscv_v_intrinsic) |
| 729 | // todo: RVV impl |
| 730 | // scalar |
| 731 | for (int i = 0; i < n; ++i) { |
| 732 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); |
| 733 | } |
| 734 | #else |
| 735 | const int np = (n & ~(GGML_F16_STEP - 1)); |
| 736 | |
| 737 | GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); |
| 738 | |
| 739 | GGML_F16_VEC ay[GGML_F16_ARR]; |
| 740 | |
| 741 | for (int i = 0; i < np; i += GGML_F16_STEP) { |
| 742 | for (int j = 0; j < GGML_F16_ARR; j++) { |
| 743 | ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); |
| 744 | ay[j] = GGML_F16_VEC_MUL(ay[j], vx); |
| 745 | |
| 746 | GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); |
| 747 | } |
| 748 | } |
| 749 | |
| 750 | // leftovers |
| 751 | for (int i = np; i < n; ++i) { |
| 752 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); |
| 753 | } |
| 754 | #endif |
| 755 | #else |
| 756 | // scalar |
| 757 | for (int i = 0; i < n; ++i) { |
| 758 | y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); |
| 759 | } |
| 760 | #endif |
| 761 | } |
| 762 | |
| 763 | inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } |
| 764 | inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } |
| 765 | inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 766 | for (int i = 0; i < n; ++i) { |
| 767 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
| 768 | y[i] = GGML_CPU_FP32_TO_FP16(v*v); |
| 769 | } |
| 770 | } |
| 771 | inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } |
| 772 | inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 773 | for (int i = 0; i < n; ++i) { |
| 774 | y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i]))); |
| 775 | } |
| 776 | } |
| 777 | inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } |
| 778 | inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 779 | for (int i = 0; i < n; ++i) { |
| 780 | y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i]))); |
| 781 | } |
| 782 | } |
| 783 | inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); } |
| 784 | inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 785 | for (int i = 0; i < n; ++i) { |
| 786 | y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i]))); |
| 787 | } |
| 788 | } |
| 789 | inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); } |
| 790 | inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 791 | for (int i = 0; i < n; ++i) { |
| 792 | y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i]))); |
| 793 | } |
| 794 | } |
| 795 | inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } |
| 796 | inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 797 | for (int i = 0; i < n; ++i) { |
| 798 | y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i]))); |
| 799 | } |
| 800 | } |
| 801 | inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } |
| 802 | inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 803 | for (int i = 0; i < n; ++i) { |
| 804 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
| 805 | y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); |
| 806 | } |
| 807 | } |
| 808 | inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } |
| 809 | inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 810 | for (int i = 0; i < n; ++i) { |
| 811 | y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); |
| 812 | } |
| 813 | } |
| 814 | inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } |
| 815 | inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 816 | for (int i = 0; i < n; ++i) { |
| 817 | y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i]))); |
| 818 | } |
| 819 | } |
| 820 | inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } |
| 821 | inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 822 | for (int i = 0; i < n; ++i) { |
| 823 | const float v = GGML_CPU_FP16_TO_FP32(x[i]); |
| 824 | y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v)); |
| 825 | } |
| 826 | } |
| 827 | inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } |
| 828 | inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 829 | for (int i = 0; i < n; ++i) { |
| 830 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
| 831 | y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f); |
| 832 | } |
| 833 | } |
| 834 | inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } |
| 835 | inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) { |
| 836 | for (int i = 0; i < n; ++i) { |
| 837 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
| 838 | y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); |
| 839 | } |
| 840 | } |
| 841 | inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } |
| 842 | inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 843 | for (int i = 0; i < n; ++i) { |
| 844 | y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i])))); |
| 845 | } |
| 846 | } |
| 847 | // TODO: optimize performance |
| 848 | inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } |
| 849 | inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 850 | for (int i = 0; i < n; ++i) { |
| 851 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
| 852 | y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); |
| 853 | } |
| 854 | } |
| 855 | inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } |
| 856 | inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 857 | for (int i = 0; i < n; ++i) { |
| 858 | y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); |
| 859 | } |
| 860 | } |
| 861 | inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } |
| 862 | inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 863 | for (int i = 0; i < n; ++i) { |
| 864 | y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i]))); |
| 865 | } |
| 866 | } |
| 867 | |
| 868 | static const float GELU_COEF_A = 0.044715f; |
| 869 | static const float GELU_QUICK_COEF = -1.702f; |
| 870 | static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; |
| 871 | static const float SQRT_2_INV = 0.70710678118654752440084436210484f; |
| 872 | |
| 873 | inline static float ggml_gelu_f32(float x) { |
| 874 | return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); |
| 875 | } |
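// This is the usual tanh approximation of GELU,
//   GELU(x) ~= 0.5*x*(1 + tanh(sqrt(2/pi)*(x + GELU_COEF_A*x^3))),
// written with the cubic factored as x*(1 + GELU_COEF_A*x*x); the exact erf form,
// 0.5*x*(1 + erf(x/sqrt(2))), is what ggml_vec_gelu_erf_f32/_f16 compute.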
| 876 | |
| 877 | inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 878 | const uint16_t * i16 = (const uint16_t *) x; |
| 879 | for (int i = 0; i < n; ++i) { |
| 880 | y[i] = ggml_table_gelu_f16[i16[i]]; |
| 881 | } |
| 882 | } |
| 883 | |
| 884 | inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 885 | for (int i = 0; i < n; ++i) { |
| 886 | float xi = GGML_CPU_FP16_TO_FP32(x[i]); |
| 887 | float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); |
| 888 | y[i] = GGML_CPU_FP32_TO_FP16(res); |
| 889 | } |
| 890 | } |
| 891 | |
| 892 | #ifdef GGML_GELU_FP16 |
| 893 | inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { |
| 894 | uint16_t t; |
| 895 | for (int i = 0; i < n; ++i) { |
| 896 | if (x[i] <= -10.0f) { |
| 897 | y[i] = 0.0f; |
| 898 | } else if (x[i] >= 10.0f) { |
| 899 | y[i] = x[i]; |
| 900 | } else { |
| 901 | ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); |
| 902 | memcpy(&t, &fp16, sizeof(uint16_t)); |
| 903 | y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]); |
| 904 | } |
| 905 | } |
| 906 | } |
| 907 | #else |
| 908 | inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { |
| 909 | for (int i = 0; i < n; ++i) { |
| 910 | y[i] = ggml_gelu_f32(x[i]); |
| 911 | } |
| 912 | } |
| 913 | #endif |
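// Note on the GGML_GELU_FP16 table path above: outside [-10, 10] the result is clamped to
// 0 or x directly (GELU is numerically indistinguishable from those values there), so the
// f16 round-trip through the table is only taken where the nonlinearity actually matters.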
| 914 | |
| 915 | inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) { |
| 916 | for (int i = 0; i < n; ++i) { |
| 917 | float xi = x[i]; |
| 918 | y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); |
| 919 | } |
| 920 | } |
| 921 | |
| 922 | inline static float ggml_gelu_quick_f32(float x) { |
| 923 | return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); |
| 924 | } |
| 925 | |
| 926 | //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 927 | // const uint16_t * i16 = (const uint16_t *) x; |
| 928 | // for (int i = 0; i < n; ++i) { |
| 929 | // y[i] = ggml_table_gelu_quick_f16[i16[i]]; |
| 930 | // } |
| 931 | //} |
| 932 | |
| 933 | #ifdef GGML_GELU_QUICK_FP16 |
| 934 | inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { |
| 935 | uint16_t t; |
| 936 | for (int i = 0; i < n; ++i) { |
| 937 | ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); |
| 938 | memcpy(&t, &fp16, sizeof(uint16_t)); |
| 939 | y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); |
| 940 | } |
| 941 | } |
| 942 | #else |
| 943 | inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { |
| 944 | for (int i = 0; i < n; ++i) { |
| 945 | y[i] = ggml_gelu_quick_f32(x[i]); |
| 946 | } |
| 947 | } |
| 948 | #endif |
| 949 | |
| 950 | inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { |
| 951 | for (int i = 0; i < n; ++i) { |
| 952 | float v = GGML_CPU_FP16_TO_FP32(x[i]); |
| 953 | y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); |
| 954 | } |
| 955 | } |
| 956 | |
| 957 | // Sigmoid Linear Unit (SiLU) function |
| 958 | inline static float ggml_silu_f32(float x) { |
| 959 | return x/(1.0f + expf(-x)); |
| 960 | } |
| 961 | inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) { |
| 962 | float v = GGML_CPU_FP16_TO_FP32(x); |
| 963 | return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v))); |
| 964 | } |
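// Equivalently, SiLU(x) = x*sigmoid(x). The SIMD helpers ggml_v_expf()/ggml_v_silu()
// further below evaluate the same expression one vector register at a time and are
// intended as building blocks for the ggml_vec_silu_f32() declared at the top of this header.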
| 965 | |
| 966 | #if __FINITE_MATH_ONLY__ |
| 967 | #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix" |
| 968 | #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461" |
| 969 | #endif |
| 970 | |
| 971 | /* Below function was borrowed from the GitHub repository: |
| 972 | https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */ |
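// Rough reading of the routine below (not authoritative): exp(x) is rewritten as
// 2^(x*log2(e)); the integer part n is applied with svscale (multiply by 2^n), the coarse
// fractional part drives the FEXPA table lookup, and the small residual z is corrected with
// the short polynomial 1 + ln2*z + half_ln2_sq*z^2 built from the ln2 / half_ln2_sq constants.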
| 973 | #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__) |
| 974 | inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) { |
| 975 | // Constants |
| 976 | const svfloat32_t log2_e = svdup_n_f32(1.4426950409f); |
| 977 | const svfloat32_t ln2 = svdup_n_f32(0.6931473921f); |
| 978 | const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f); |
| 979 | const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1)); |
| 980 | const svfloat32_t one = svdup_n_f32(1.0f); |
| 981 | const svfloat32_t inactive1 = svdup_n_f32(0.0f); |
| 982 | const svint32_t inactive2 = svdup_n_s32(0); |
| 983 | |
| 984 | // Algorithm starts here |
| 985 | svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e) |
| 986 | svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // round to int (float) |
| 987 | svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n |
| 988 | |
| 989 | t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y) |
| 990 | t1 = svadd_f32_m(pg, t1, one); // b = a + 1 |
| 991 | |
| 992 | svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32) |
| 993 | svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v) |
| 994 | t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n) |
| 995 | |
| 996 | // and_(t2.d, t1.d, not_mask17.d) |
| 997 | svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17)); |
| 998 | t5 = svsub_f32_m(pg, t1, t5); // z |
| 999 | t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z |
| 1000 | t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z) |
| 1001 | t0 = svmul_f32_m(pg, t0, t4); // Final result |
| 1002 | |
| 1003 | return t0; |
| 1004 | } |
| 1005 | #endif |
| 1006 | |
| 1007 | #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__) |
| 1008 | |
| 1009 | inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) { |
| 1010 | const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f); |
| 1011 | const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f); |
| 1012 | const svfloat32_t n = svsub_f32_x(pg, z, r); |
| 1013 | const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f); |
| 1014 | const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23); |
| 1015 | const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1)))); |
| 1016 | const svbool_t c = svacgt_n_f32(pg, n, 126); |
| 1017 | const svfloat32_t u = svmul_f32_x(pg, b, b); |
| 1018 | const svfloat32_t j = svmla_f32_x(pg, |
| 1019 | svmul_n_f32_x(pg, b, 0x1.ffffecp-1f), |
| 1020 | svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b), |
| 1021 | svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u); |
| 1022 | const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000); |
| 1023 | const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000)); |
| 1024 | const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d)); |
| 1025 | return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1), |
| 1026 | svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j))); |
| 1027 | } |
| 1028 | |
| 1029 | // computes silu x/(1+exp(-x)) in single precision vector |
| 1030 | inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) { |
| 1031 | const svfloat32_t one = svdup_n_f32_x(pg, 1.0f); |
| 1032 | const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f); |
| 1033 | const svfloat32_t neg_x = svsub_f32_x(pg, zero, x); |
| 1034 | const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x); |
| 1035 | const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x); |
| 1036 | return svdiv_f32_x(pg, x, one_plus_exp_neg_x); |
| 1037 | } |
| 1038 | |
| 1039 | #elif defined(__ARM_NEON) && defined(__aarch64__) |
| 1040 | |
| 1041 | // adapted from arm limited optimized routine |
| 1042 | // the maximum error is 1.45358 plus 0.5 ulps |
| 1043 | // numbers above 88.38 will flush to infinity |
| 1044 | // numbers beneath -103.97 will flush to zero |
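// Sketch of the range reduction shared by the ggml_v_expf() implementations in this header
// (a reading of the code, not authoritative): adding the shifter 0x1.8p23f to x*log2(e)
// rounds it to the nearest integer n; b = x - n*ln2 is the reduced argument (ln2 split into
// 0x1.62e4p-1f and 0x1.7f7d1cp-20f for accuracy); the polynomial j approximates expm1(b);
// and 2^n is rebuilt by shifting n's integer bits into the exponent field, with the s1/s2
// split handling inputs whose scale factor would otherwise over- or underflow.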
| 1045 | inline static float32x4_t ggml_v_expf(float32x4_t x) { |
| 1046 | const float32x4_t r = vdupq_n_f32(0x1.8p23f); |
| 1047 | const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f)); |
| 1048 | const float32x4_t n = vsubq_f32(z, r); |
| 1049 | const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n, |
| 1050 | vdupq_n_f32(0x1.7f7d1cp-20f)); |
| 1051 | const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23); |
| 1052 | const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1)))); |
| 1053 | const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126)); |
| 1054 | const float32x4_t u = vmulq_f32(b, b); |
| 1055 | const float32x4_t j = vfmaq_f32( |
| 1056 | vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b), |
| 1057 | vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b), |
| 1058 | vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u); |
| 1059 | if (!vpaddd_u64(vreinterpretq_u64_u32(c))) |
| 1060 | return vfmaq_f32(k, j, k); |
| 1061 | const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000)); |
| 1062 | const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000))); |
| 1063 | const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d)); |
| 1064 | return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1), |
| 1065 | vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j))); |
| 1066 | } |
| 1067 | |
| 1068 | // computes silu x/(1+exp(-x)) in single precision vector |
| 1069 | inline static float32x4_t ggml_v_silu(float32x4_t x) { |
| 1070 | const float32x4_t one = vdupq_n_f32(1.0f); |
| 1071 | const float32x4_t zero = vdupq_n_f32(0.0f); |
| 1072 | const float32x4_t neg_x = vsubq_f32(zero, x); |
| 1073 | const float32x4_t exp_neg_x = ggml_v_expf(neg_x); |
| 1074 | const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x); |
| 1075 | return vdivq_f32(x, one_plus_exp_neg_x); |
| 1076 | } |
| 1077 | |
| 1078 | #elif defined(__AVX512F__) && defined(__AVX512DQ__) |
| 1079 | |
| 1080 | // adapted from arm limited optimized routine |
| 1081 | // the maximum error is 1.45358 plus 0.5 ulps |
| 1082 | // numbers above 88.38 will flush to infinity |
| 1083 | // numbers beneath -103.97 will flush to zero |
| 1084 | inline static __m512 ggml_v_expf(__m512 x) { |
| 1085 | const __m512 r = _mm512_set1_ps(0x1.8p23f); |
| 1086 | const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); |
| 1087 | const __m512 n = _mm512_sub_ps(z, r); |
| 1088 | const __m512 b = |
| 1089 | _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f), |
| 1090 | _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x)); |
| 1091 | const __mmask16 d = |
| 1092 | _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ); |
| 1093 | const __m512 u = _mm512_mul_ps(b, b); |
| 1094 | const __m512 j = _mm512_fmadd_ps( |
| 1095 | _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b, |
| 1096 | _mm512_set1_ps(0x1.573e2ep-5f)), |
| 1097 | u, |
| 1098 | _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b, |
| 1099 | _mm512_set1_ps(0x1.fffdb6p-2f))), |
| 1100 | u, |
| 1101 | _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F))); |
| 1102 | const __m512 res = _mm512_scalef_ps(j, n); |
| 1103 | if (_mm512_kortestz(d, d)) |
| 1104 | return res; |
| 1105 | const __m512 zero = _mm512_setzero_ps(); |
| 1106 | const __m512 alt = _mm512_mask_blend_ps( |
| 1107 | _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero); |
| 1108 | return _mm512_mask_blend_ps(d, res, alt); |
| 1109 | } |
| 1110 | |
| 1111 | // computes silu x/(1+exp(-x)) in single precision vector |
| 1112 | inline static __m512 ggml_v_silu(__m512 x) { |
| 1113 | const __m512 one = _mm512_set1_ps(1); |
| 1114 | const __m512 zero = _mm512_setzero_ps(); |
| 1115 | const __m512 neg_x = _mm512_sub_ps(zero, x); |
| 1116 | const __m512 exp_neg_x = ggml_v_expf(neg_x); |
| 1117 | const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x); |
| 1118 | return _mm512_div_ps(x, one_plus_exp_neg_x); |
| 1119 | } |
| 1120 | |
| 1121 | #elif defined(__AVX2__) && defined(__FMA__) |
| 1122 | |
| 1123 | // adapted from arm limited optimized routine |
| 1124 | // the maximum error is 1.45358 plus 0.5 ulps |
| 1125 | // numbers above 88.38 will flush to infinity |
| 1126 | // numbers beneath -103.97 will flush to zero |
| 1127 | inline static __m256 ggml_v_expf(__m256 x) { |
| 1128 | const __m256 r = _mm256_set1_ps(0x1.8p23f); |
| 1129 | const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r); |
| 1130 | const __m256 n = _mm256_sub_ps(z, r); |
| 1131 | const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f), |
| 1132 | _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x)); |
| 1133 | const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23); |
| 1134 | const __m256 k = _mm256_castsi256_ps( |
| 1135 | _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1)))); |
| 1136 | const __m256i c = _mm256_castps_si256( |
| 1137 | _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), |
| 1138 | _mm256_set1_ps(126), _CMP_GT_OQ)); |
| 1139 | const __m256 u = _mm256_mul_ps(b, b); |
| 1140 | const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b, |
| 1141 | _mm256_set1_ps(0x1.573e2ep-5f)), u, |
| 1142 | _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b, |
| 1143 | _mm256_set1_ps(0x1.fffdb6p-2f))), |
| 1144 | u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b)); |
| 1145 | if (!_mm256_movemask_ps(_mm256_castsi256_ps(c))) |
| 1146 | return _mm256_fmadd_ps(j, k, k); |
| 1147 | const __m256i g = _mm256_and_si256( |
| 1148 | _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)), |
| 1149 | _mm256_set1_epi32(0x82000000u)); |
| 1150 | const __m256 s1 = |
| 1151 | _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u))); |
| 1152 | const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g)); |
| 1153 | const __m256i d = _mm256_castps_si256( |
| 1154 | _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), |
| 1155 | _mm256_set1_ps(192), _CMP_GT_OQ)); |
| 1156 | return _mm256_or_ps( |
| 1157 | _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)), |
| 1158 | _mm256_andnot_ps( |
| 1159 | _mm256_castsi256_ps(d), |
| 1160 | _mm256_or_ps( |
| 1161 | _mm256_and_ps(_mm256_castsi256_ps(c), |
| 1162 | _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)), |
| 1163 | _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k))))); |
| 1164 | } |
| 1165 | |
| 1166 | // computes silu x/(1+exp(-x)) in single precision vector |
| 1167 | inline static __m256 ggml_v_silu(__m256 x) { |
| 1168 | const __m256 one = _mm256_set1_ps(1); |
| 1169 | const __m256 zero = _mm256_setzero_ps(); |
| 1170 | const __m256 neg_x = _mm256_sub_ps(zero, x); |
| 1171 | const __m256 exp_neg_x = ggml_v_expf(neg_x); |
| 1172 | const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x); |
| 1173 | return _mm256_div_ps(x, one_plus_exp_neg_x); |
| 1174 | } |
| 1175 | |
| 1176 | #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON |
| 1177 | |
| 1178 | #if defined(__FMA__) |
| 1179 | #define MADD128(x, y, z) _mm_fmadd_ps(x, y, z) |
| 1180 | #define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z) |
| 1181 | #else |
| 1182 | #define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z) |
| 1183 | #define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y)) |
| 1184 | #endif |
| 1185 | |
| 1186 | // adapted from arm limited optimized routine |
| 1187 | // the maximum error is 1.45358 plus 0.5 ulps |
| 1188 | // numbers above 88.38 will flush to infinity |
| 1189 | // numbers beneath -103.97 will flush to zero |
inline static __m128 ggml_v_expf(__m128 x) {
  const __m128 r = _mm_set1_ps(0x1.8p23f);
  const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
  const __m128 n = _mm_sub_ps(z, r);
  const __m128 b =
      NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
  const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
  const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
  const __m128i c =
      _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
  const __m128 u = _mm_mul_ps(b, b);
  const __m128 j =
      MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                      MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
              u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
  if (!_mm_movemask_epi8(c))
    return MADD128(j, k, k);
  const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                  _mm_set1_epi32(0x82000000u));
  const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
  const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
  const __m128i d =
      _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
  return _mm_or_ps(
      _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
      _mm_andnot_ps(_mm_castsi128_ps(d),
                    _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                              _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m128 ggml_v_silu(__m128 x) {
    const __m128 one = _mm_set1_ps(1);
    const __m128 zero = _mm_setzero_ps();
    const __m128 neg_x = _mm_sub_ps(zero, x);
    const __m128 exp_neg_x = ggml_v_expf(neg_x);
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
    return _mm_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__riscv_v_intrinsic)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
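//
// same algorithm as the x86 paths above; the bitwise select logic is
// expressed with mask registers and vmerge instead of and/andnot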
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
#ifdef __riscv_xtheadvector
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
#else
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
#endif
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
                                                    0x1.7f7d1cp-20f, n, vl);
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
        __riscv_vfmacc_vv_f32m2(
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
            u, vl), u, vl);
    if (!__riscv_vcpop_m_b16(c, vl))
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
        c, vl);
    return __riscv_vmerge_vvm_f32m2(
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
        vl);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
    const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
    const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
    return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
}

#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic

inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_silu_f16(x[i]);
    }
}

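// silu backward: with s = 1/(1 + exp(-x)),
//   d/dx silu(x) = d/dx (x*s) = s + x*s*(1 - s) = s*(1 + x*(1 - s)),
// scaled by the incoming gradient dy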
inline static float ggml_silu_backward_f32(float x, float dy) {
    const float s = 1.0f/(1.0f + expf(-x));
    return dy*s*(1.0f + x*(1.0f - s));
}

inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
    const float v = GGML_CPU_FP16_TO_FP32(x);
    const float s = 1.0f/(1.0f + expf(-v));
    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
}

inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
    }
}

inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
    }
}

inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
    }
}

inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
    }
}

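// when GGML_GELU_FP16 is defined, gelu is read from the precomputed
// ggml_table_gelu_f16 table, indexed by the raw fp16 bit pattern of x;
// |x| >= 10 is handled directly, since gelu(x) is ~0 for x <= -10 and
// ~x for x >= 10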
#ifdef GGML_GELU_FP16
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i] * g[i];
        } else {
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
        }
    }
}
#else
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
    }
}

void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);

inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
    }
}

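// erf-based gelu variant: gelu(x) = 0.5*x*(1 + erf(x/sqrt(2))), evaluated
// with erff instead of the lookup table (SQRT_2_INV is assumed to be
// 1/sqrt(2), defined elsewhere in this header)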
inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
    }
}

inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
    }
}

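// gelu_quick (the faster sigmoid-based gelu approximation) follows the same
// table-lookup pattern as the gelu path above, but without the |x| >= 10
// short-circuit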
#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
    }
}
#else
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
    }
}

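// reductions: sums are accumulated in ggml_float before being rounded back
// to float, which reduces rounding error on long rows; when
// GGML_USE_ACCELERATE is defined the vDSP routines are used instead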
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = (float)sum;
#else
    vDSP_sve(x, 1, s, n);
#endif
}

inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = sum;
}

inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_CPU_FP16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_BF16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    *s = max;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}

inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    ggml_vec_norm_f32(n, s, x);
    *s = 1.f/(*s);
}

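// note: if several elements are tied for the maximum, the index of the last
// occurrence is returned (idx is updated whenever x[i] equals the running max)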
inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float max = -INFINITY;
    int idx = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
        if (max == x[i]) { idx = i; }
    }
    *s = idx;
}

#ifdef __cplusplus
}
#endif
