| 1 | #pragma once |
| 2 | |
| 3 | // GGML CPU internal header |
| 4 | |
| 5 | #include "ggml.h" |
| 6 | #include "ggml-impl.h" |
| 7 | |
| 8 | #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ |
| 9 | //#include <stddef.h> |
| 10 | #include <stdbool.h> |
| 11 | #include <string.h> // memcpy |
| 12 | #include <math.h> // fabsf |
| 13 | |
| 14 | #ifdef __cplusplus |
| 15 | extern "C" { |
| 16 | #endif |
| 17 | |
| 18 | struct ggml_compute_params { |
| 19 | // ith = thread index, nth = number of threads |
| 20 | int ith, nth; |
| 21 | |
| 22 | // work buffer for all threads |
| 23 | size_t wsize; |
| 24 | void * wdata; |
| 25 | |
| 26 | struct ggml_threadpool * threadpool; |
| 27 | }; |
| 28 | |
| 29 | |
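// MSVC does not accept GNU-style casts between SIMD vector types, so these
// wrappers compile to a no-op there and to an explicit cast elsewhere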
| 30 | #if defined(_MSC_VER) |
| 31 | |
| 32 | #define m512bh(p) p |
| 33 | #define m512i(p) p |
| 34 | |
| 35 | #else |
| 36 | |
| 37 | #define m512bh(p) (__m512bh)(p) |
| 38 | #define m512i(p) (__m512i)(p) |
| 39 | |
| 40 | #endif |
| 41 | |
// __FMA__ and __F16C__ are not defined by MSVC, but they are implied when AVX2/AVX512 is enabled
| 43 | #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) |
| 44 | #ifndef __FMA__ |
| 45 | #define __FMA__ |
| 46 | #endif |
| 47 | #ifndef __F16C__ |
| 48 | #define __F16C__ |
| 49 | #endif |
| 50 | #endif |
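
// With the defines above, downstream code can gate FMA paths on __FMA__ alone,
// e.g. (illustrative sketch; "a", "b" and "acc" are placeholder __m256 values):
//
//     #if defined(__FMA__)
//         acc = _mm256_fmadd_ps(a, b, acc);              // fused a*b + acc
//     #else
//         acc = _mm256_add_ps(_mm256_mul_ps(a, b), acc); // unfused fallback
//     #endif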
| 51 | |
// __SSE3__ and __SSSE3__ are not defined by MSVC, but SSE3/SSSE3 are implied when AVX/AVX2/AVX512 are enabled
| 53 | #if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) |
| 54 | #ifndef __SSE3__ |
| 55 | #define __SSE3__ |
| 56 | #endif |
| 57 | #ifndef __SSSE3__ |
| 58 | #define __SSSE3__ |
| 59 | #endif |
| 60 | #endif |
| 61 | |
| 62 | #if defined(__s390x__) && defined(__VEC__) |
| 63 | #ifndef __VXE__ |
| 64 | #define __VXE__ |
| 65 | #endif // __VXE__ |
| 66 | #ifndef __VXE2__ |
| 67 | #define __VXE2__ |
| 68 | #endif // __VXE2__ |
| 69 | #endif // __s390x__ && __VEC__ |
| 70 | |
| 71 | #if defined(__ARM_FEATURE_SVE) && defined(__linux__) |
| 72 | #include <sys/prctl.h> |
| 73 | #endif |
| 74 | |
| 75 | #if defined(__ARM_NEON) |
| 76 | |
| 77 | // ref: https://github.com/ggml-org/llama.cpp/pull/5404 |
| 78 | #ifdef _MSC_VER |
| 79 | #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } |
| 80 | #else |
| 81 | #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } |
| 82 | #endif // _MSC_VER |
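
// Usage sketch: both variants expand to an initializer for four 32-bit lanes;
// the MSVC form packs them into the two 64-bit halves its NEON types expect:
//
//     const uint32x4_t idx = ggml_vld1q_u32(0, 1, 2, 3);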
| 83 | |
| 84 | #if !defined(__aarch64__) |
| 85 | |
| 86 | // 32-bit ARM compatibility |
| 87 | |
| 88 | // vaddlvq_s16 |
| 89 | // vpaddq_s16 |
| 90 | // vpaddq_s32 |
| 91 | // vaddvq_s32 |
| 92 | // vaddvq_f32 |
| 93 | // vmaxvq_f32 |
| 94 | // vcvtnq_s32_f32 |
| 95 | // vzip1_u8 |
| 96 | // vzip2_u8 |
| 97 | |
| 98 | inline static int32_t vaddlvq_s16(int16x8_t v) { |
| 99 | int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); |
| 100 | return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); |
| 101 | } |
| 102 | |
| 103 | inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { |
| 104 | int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); |
| 105 | int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); |
| 106 | return vcombine_s16(a0, b0); |
| 107 | } |
| 108 | |
| 109 | inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { |
| 110 | int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); |
| 111 | int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); |
| 112 | return vcombine_s32(a0, b0); |
| 113 | } |
| 114 | |
| 115 | inline static int32_t vaddvq_s32(int32x4_t v) { |
| 116 | return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); |
| 117 | } |
| 118 | |
| 119 | inline static float vaddvq_f32(float32x4_t v) { |
| 120 | return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); |
| 121 | } |
| 122 | |
| 123 | inline static float vmaxvq_f32(float32x4_t v) { |
| 124 | return |
| 125 | MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), |
| 126 | MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); |
| 127 | } |
| 128 | |
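// note: roundf() rounds ties away from zero, while the AArch64 instruction
// rounds ties to even, so results can differ for exact *.5 inputs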
| 129 | inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { |
| 130 | int32x4_t res; |
| 131 | |
| 132 | res[0] = roundf(vgetq_lane_f32(v, 0)); |
| 133 | res[1] = roundf(vgetq_lane_f32(v, 1)); |
| 134 | res[2] = roundf(vgetq_lane_f32(v, 2)); |
| 135 | res[3] = roundf(vgetq_lane_f32(v, 3)); |
| 136 | |
| 137 | return res; |
| 138 | } |
| 139 | |
| 140 | inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { |
| 141 | uint8x8_t res; |
| 142 | |
| 143 | res[0] = a[0]; res[1] = b[0]; |
| 144 | res[2] = a[1]; res[3] = b[1]; |
| 145 | res[4] = a[2]; res[5] = b[2]; |
| 146 | res[6] = a[3]; res[7] = b[3]; |
| 147 | |
| 148 | return res; |
| 149 | } |
| 150 | |
| 151 | inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { |
| 152 | uint8x8_t res; |
| 153 | |
| 154 | res[0] = a[4]; res[1] = b[4]; |
| 155 | res[2] = a[5]; res[3] = b[5]; |
| 156 | res[4] = a[6]; res[5] = b[6]; |
| 157 | res[6] = a[7]; res[7] = b[7]; |
| 158 | |
| 159 | return res; |
| 160 | } |
| 161 | |
| 162 | // vld1q_s16_x2 |
| 163 | // vld1q_u8_x2 |
| 164 | // vld1q_u8_x4 |
| 165 | // vld1q_s8_x2 |
| 166 | // vld1q_s8_x4 |
| 167 | // TODO: double-check these work correctly |
| 168 | |
| 169 | typedef struct ggml_int16x8x2_t { |
| 170 | int16x8_t val[2]; |
| 171 | } ggml_int16x8x2_t; |
| 172 | |
| 173 | inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { |
| 174 | ggml_int16x8x2_t res; |
| 175 | |
| 176 | res.val[0] = vld1q_s16(ptr + 0); |
| 177 | res.val[1] = vld1q_s16(ptr + 8); |
| 178 | |
| 179 | return res; |
| 180 | } |
| 181 | |
| 182 | typedef struct ggml_uint8x16x2_t { |
| 183 | uint8x16_t val[2]; |
| 184 | } ggml_uint8x16x2_t; |
| 185 | |
| 186 | inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { |
| 187 | ggml_uint8x16x2_t res; |
| 188 | |
| 189 | res.val[0] = vld1q_u8(ptr + 0); |
| 190 | res.val[1] = vld1q_u8(ptr + 16); |
| 191 | |
| 192 | return res; |
| 193 | } |
| 194 | |
| 195 | typedef struct ggml_uint8x16x4_t { |
| 196 | uint8x16_t val[4]; |
| 197 | } ggml_uint8x16x4_t; |
| 198 | |
| 199 | inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { |
| 200 | ggml_uint8x16x4_t res; |
| 201 | |
| 202 | res.val[0] = vld1q_u8(ptr + 0); |
| 203 | res.val[1] = vld1q_u8(ptr + 16); |
| 204 | res.val[2] = vld1q_u8(ptr + 32); |
| 205 | res.val[3] = vld1q_u8(ptr + 48); |
| 206 | |
| 207 | return res; |
| 208 | } |
| 209 | |
| 210 | typedef struct ggml_int8x16x2_t { |
| 211 | int8x16_t val[2]; |
| 212 | } ggml_int8x16x2_t; |
| 213 | |
| 214 | inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { |
| 215 | ggml_int8x16x2_t res; |
| 216 | |
| 217 | res.val[0] = vld1q_s8(ptr + 0); |
| 218 | res.val[1] = vld1q_s8(ptr + 16); |
| 219 | |
| 220 | return res; |
| 221 | } |
| 222 | |
| 223 | typedef struct ggml_int8x16x4_t { |
| 224 | int8x16_t val[4]; |
| 225 | } ggml_int8x16x4_t; |
| 226 | |
| 227 | inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { |
| 228 | ggml_int8x16x4_t res; |
| 229 | |
| 230 | res.val[0] = vld1q_s8(ptr + 0); |
| 231 | res.val[1] = vld1q_s8(ptr + 16); |
| 232 | res.val[2] = vld1q_s8(ptr + 32); |
| 233 | res.val[3] = vld1q_s8(ptr + 48); |
| 234 | |
| 235 | return res; |
| 236 | } |
| 237 | |
| 238 | // NOTE: not tested |
| 239 | inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { |
| 240 | int8x16_t res; |
| 241 | |
| 242 | res[ 0] = a[b[ 0]]; |
| 243 | res[ 1] = a[b[ 1]]; |
| 244 | res[ 2] = a[b[ 2]]; |
| 245 | res[ 3] = a[b[ 3]]; |
| 246 | res[ 4] = a[b[ 4]]; |
| 247 | res[ 5] = a[b[ 5]]; |
| 248 | res[ 6] = a[b[ 6]]; |
| 249 | res[ 7] = a[b[ 7]]; |
| 250 | res[ 8] = a[b[ 8]]; |
| 251 | res[ 9] = a[b[ 9]]; |
| 252 | res[10] = a[b[10]]; |
| 253 | res[11] = a[b[11]]; |
| 254 | res[12] = a[b[12]]; |
| 255 | res[13] = a[b[13]]; |
| 256 | res[14] = a[b[14]]; |
| 257 | res[15] = a[b[15]]; |
| 258 | |
| 259 | return res; |
| 260 | } |
| 261 | |
| 262 | // NOTE: not tested |
| 263 | inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { |
| 264 | uint8x16_t res; |
| 265 | |
| 266 | res[ 0] = a[b[ 0]]; |
| 267 | res[ 1] = a[b[ 1]]; |
| 268 | res[ 2] = a[b[ 2]]; |
| 269 | res[ 3] = a[b[ 3]]; |
| 270 | res[ 4] = a[b[ 4]]; |
| 271 | res[ 5] = a[b[ 5]]; |
| 272 | res[ 6] = a[b[ 6]]; |
| 273 | res[ 7] = a[b[ 7]]; |
| 274 | res[ 8] = a[b[ 8]]; |
| 275 | res[ 9] = a[b[ 9]]; |
| 276 | res[10] = a[b[10]]; |
| 277 | res[11] = a[b[11]]; |
| 278 | res[12] = a[b[12]]; |
| 279 | res[13] = a[b[13]]; |
| 280 | res[14] = a[b[14]]; |
| 281 | res[15] = a[b[15]]; |
| 282 | |
| 283 | return res; |
| 284 | } |
| 285 | |
| 286 | #else |
| 287 | |
| 288 | #define ggml_int16x8x2_t int16x8x2_t |
| 289 | #define ggml_uint8x16x2_t uint8x16x2_t |
| 290 | #define ggml_uint8x16x4_t uint8x16x4_t |
| 291 | #define ggml_int8x16x2_t int8x16x2_t |
| 292 | #define ggml_int8x16x4_t int8x16x4_t |
| 293 | |
| 294 | #define ggml_vld1q_s16_x2 vld1q_s16_x2 |
| 295 | #define ggml_vld1q_u8_x2 vld1q_u8_x2 |
| 296 | #define ggml_vld1q_u8_x4 vld1q_u8_x4 |
| 297 | #define ggml_vld1q_s8_x2 vld1q_s8_x2 |
| 298 | #define ggml_vld1q_s8_x4 vld1q_s8_x4 |
| 299 | #define ggml_vqtbl1q_s8 vqtbl1q_s8 |
| 300 | #define ggml_vqtbl1q_u8 vqtbl1q_u8 |
| 301 | |
| 302 | #endif // !defined(__aarch64__) |
| 303 | |
| 304 | #if !defined(__ARM_FEATURE_DOTPROD) |
| 305 | |
// emulate vdotq_s32: widening multiplies followed by pairwise additions
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
| 307 | const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); |
| 308 | const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); |
| 309 | |
| 310 | return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); |
| 311 | } |
| 312 | |
| 313 | #else |
| 314 | |
| 315 | #define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) |
| 316 | |
| 317 | #endif // !defined(__ARM_FEATURE_DOTPROD) |
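
// Usage sketch (illustrative): each 32-bit lane of the accumulator gathers the
// dot product of the corresponding four int8 pairs:
//
//     int32x4_t acc = vdupq_n_s32(0);
//     acc = ggml_vdotq_s32(acc, a, b);
//     const int32_t sum = vaddvq_s32(acc); // horizontal reduction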
| 318 | |
| 319 | #endif // defined(__ARM_NEON) |
| 320 | |
| 321 | #ifdef __wasm_simd128__ |
| 322 | #include <wasm_simd128.h> |
| 323 | #endif |
| 324 | |
| 325 | #ifdef __POWER9_VECTOR__ |
| 326 | #include <altivec.h> |
| 327 | #endif |
| 328 | |
| 329 | #if defined(_MSC_VER) || defined(__MINGW32__) |
| 330 | #include <intrin.h> |
| 331 | #elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) |
| 332 | #include <immintrin.h> |
| 333 | #endif |
| 334 | |
| 335 | #ifdef __riscv_v_intrinsic |
| 336 | #include <riscv_vector.h> |
| 337 | #endif |
| 338 | |
| 339 | #if defined(__loongarch64) |
| 340 | #if defined(__loongarch_asx) |
| 341 | #include <lasxintrin.h> |
| 342 | #endif |
| 343 | #if defined(__loongarch_sx) |
| 344 | #include <lsxintrin.h> |
| 345 | #endif |
| 346 | #endif |
| 347 | |
| 348 | #if defined(__VXE__) || defined(__VXE2__) |
| 349 | #include <vecintrin.h> |
| 350 | |
| 351 | #define vec_neg(a) (-(a)) // Vector Negate |
| 352 | #define vec_add(a, b) ((a) + (b)) // Vector Add |
| 353 | #define vec_sub(a, b) ((a) - (b)) // Vector Subtract |
| 354 | #define vec_mul(a, b) ((a) * (b)) // Vector Multiply |
| 355 | #define vec_div(a, b) ((a) / (b)) // Vector Divide |
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
#define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right Algebraic (>> is arithmetic for signed element types)
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right (logical for unsigned element types)
| 359 | #define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet |
| 360 | #define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet |
| 361 | |
| 362 | #ifndef vec_and |
| 363 | #define vec_and(a, b) ((a) & (b)) // Vector AND |
| 364 | #endif |
| 365 | |
| 366 | #ifndef vec_or |
| 367 | #define vec_or(a, b) ((a) | (b)) // Vector OR |
| 368 | #endif |
| 369 | |
| 370 | #ifndef vec_xor |
| 371 | #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR |
| 372 | #endif |
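
// These macros expand to GNU vector-extension operators and work element-wise
// on the vector typedefs declared below, e.g. (illustrative):
//
//     float32x4_t c = vec_add(vec_mul(a, b), acc); // element-wise a*b + acc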
| 373 | |
| 374 | typedef signed char char8x16_t __attribute__((vector_size(16))); |
| 375 | typedef unsigned char uchar8x16_t __attribute__((vector_size(16))); |
| 376 | |
| 377 | typedef int8_t int8x16_t __attribute__((vector_size(16))); |
| 378 | typedef int16_t int16x8_t __attribute__((vector_size(16))); |
| 379 | typedef int32_t int32x4_t __attribute__((vector_size(16))); |
| 380 | |
| 381 | typedef uint8_t uint8x16_t __attribute__((vector_size(16))); |
| 382 | typedef uint16_t uint16x8_t __attribute__((vector_size(16))); |
| 383 | typedef uint32_t uint32x4_t __attribute__((vector_size(16))); |
| 384 | |
| 385 | typedef float float32x4_t __attribute__((vector_size(16))); |
| 386 | typedef double double64x2_t __attribute__((vector_size(16))); |
| 387 | |
| 388 | typedef signed long long long64x2_t __attribute__((vector_size(16))); |
| 389 | typedef unsigned long long ulong64x2_t __attribute__((vector_size(16))); |
| 390 | |
| 391 | typedef struct ggml_uint8x16x2_t { |
| 392 | uint8x16_t val[2]; |
| 393 | } ggml_uint8x16x2_t; |
| 394 | |
| 395 | inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) { |
| 396 | ggml_uint8x16x2_t res; |
| 397 | |
| 398 | res.val[0] = vec_xl( 0, ptr); |
| 399 | res.val[1] = vec_xl(16, ptr); |
| 400 | |
| 401 | return res; |
| 402 | } |
| 403 | |
| 404 | typedef struct ggml_uint8x16x4_t { |
| 405 | uint8x16_t val[4]; |
| 406 | } ggml_uint8x16x4_t; |
| 407 | |
| 408 | inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) { |
| 409 | ggml_uint8x16x4_t res; |
| 410 | |
| 411 | res.val[0] = vec_xl( 0, ptr); |
| 412 | res.val[1] = vec_xl(16, ptr); |
| 413 | res.val[2] = vec_xl(32, ptr); |
| 414 | res.val[3] = vec_xl(48, ptr); |
| 415 | |
| 416 | return res; |
| 417 | } |
| 418 | |
| 419 | typedef struct ggml_int8x16x4_t { |
| 420 | int8x16_t val[4]; |
| 421 | } ggml_int8x16x4_t; |
| 422 | |
| 423 | inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) { |
| 424 | ggml_int8x16x4_t res; |
| 425 | |
| 426 | res.val[0] = vec_xl( 0, ptr); |
| 427 | res.val[1] = vec_xl(16, ptr); |
| 428 | res.val[2] = vec_xl(32, ptr); |
| 429 | res.val[3] = vec_xl(48, ptr); |
| 430 | |
| 431 | return res; |
| 432 | } |
| 433 | |
| 434 | typedef struct ggml_int16x8x2_t { |
| 435 | int16x8_t val[2]; |
| 436 | } ggml_int16x8x2_t; |
| 437 | |
| 438 | inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) { |
| 439 | ggml_int16x8x2_t res; |
| 440 | |
| 441 | res.val[0] = vec_xl( 0, ptr); |
| 442 | res.val[1] = vec_xl(16, ptr); |
| 443 | |
| 444 | return res; |
| 445 | } |
| 446 | |
| 447 | /* |
| 448 | ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs |
| 449 | ! or iq4_nl for example implementation. |
| 450 | */ |
| 451 | inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) { |
| 452 | int8x16_t res; |
| 453 | |
| 454 | res[ 0] = a[b[ 0]]; |
| 455 | res[ 1] = a[b[ 1]]; |
| 456 | res[ 2] = a[b[ 2]]; |
| 457 | res[ 3] = a[b[ 3]]; |
| 458 | res[ 4] = a[b[ 4]]; |
| 459 | res[ 5] = a[b[ 5]]; |
| 460 | res[ 6] = a[b[ 6]]; |
| 461 | res[ 7] = a[b[ 7]]; |
| 462 | res[ 8] = a[b[ 8]]; |
| 463 | res[ 9] = a[b[ 9]]; |
| 464 | res[10] = a[b[10]]; |
| 465 | res[11] = a[b[11]]; |
| 466 | res[12] = a[b[12]]; |
| 467 | res[13] = a[b[13]]; |
| 468 | res[14] = a[b[14]]; |
| 469 | res[15] = a[b[15]]; |
| 470 | |
| 471 | return res; |
| 472 | } |
| 473 | |
| 474 | inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) { |
| 475 | const uchar8x16_t v_maske = { 0, 1, 4, 5, 8, 9, 12, 13, |
| 476 | 16, 17, 20, 21, 24, 25, 28, 29 }; |
| 477 | |
| 478 | const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b); |
| 479 | const int16x8_t v_abe = vec_perm(a, b, v_maske); |
| 480 | return v_abo + v_abe; |
| 481 | } |
| 482 | |
| 483 | /** |
| 484 | * @see https://github.com/ggml-org/llama.cpp/pull/14037 |
| 485 | */ |
| 486 | inline static float vec_hsum_f32x4(float32x4_t v) { |
| 487 | float32x4_t v_temp = v + vec_reve(v); |
| 488 | return v_temp[0] + v_temp[1]; |
| 489 | } |
| 490 | |
| 491 | inline static int32_t vec_hsum_i32x4(int32x4_t v) { |
| 492 | int32x4_t v_temp = v + vec_reve(v); |
| 493 | return v_temp[0] + v_temp[1]; |
| 494 | } |
| 495 | |
| 496 | inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) { |
| 497 | const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b); |
| 498 | return acc + (vec_unpackh(p) + vec_unpackl(p)); |
| 499 | } |
| 500 | |
| 501 | #endif |
| 502 | |
| 503 | #if defined(__loongarch_sx) |
/* replicate a scalar float across all lanes of a 128-bit vector */
| 505 | static __m128 __lsx_vreplfr2vr_s(const float val) { |
| 506 | v4f32 res = {val, val, val, val}; |
| 507 | return (__m128)res; |
| 508 | } |
| 509 | #endif |
| 510 | |
| 511 | #if defined(__loongarch_asx) |
// replicate a scalar float across all lanes of a 256-bit vector
static __m256 __lasx_xvreplfr2vr_s(const float val) {
| 513 | v8f32 res = {val, val, val, val, val, val, val, val}; |
| 514 | return (__m256)res; |
| 515 | } |
| 516 | #endif |
| 517 | |
| 518 | // TODO: move to ggml-threading |
| 519 | void ggml_barrier(struct ggml_threadpool * tp); |
| 520 | |
| 521 | void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value); |
| 522 | int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value); |
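
// Usage sketch (hypothetical kernel; "n_chunks" and "process_chunk" are
// placeholders, and this assumes ggml_threadpool_chunk_add returns the value
// before the increment, i.e. fetch-and-add):
//
//     if (params->ith == 0) {
//         ggml_threadpool_chunk_set(params->threadpool, params->nth);
//     }
//     ggml_barrier(params->threadpool);    // all threads see the counter
//     int chunk = params->ith;             // statically assigned first chunk
//     while (chunk < n_chunks) {
//         process_chunk(chunk);
//         chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
//     }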
| 523 | |
| 524 | #ifdef __cplusplus |
| 525 | } |
| 526 | #endif |
| 527 | |