#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "repack.h"
#include "traits.h"
#include "ggml-impl.h"
#include "amx/amx.h"

#include <cctype>
#include <cstdio>   // fopen, fgets, fclose (used for /proc/cpuinfo parsing)
#include <cstring>  // strcmp, strncmp, strchr, strlen, memcpy
#include <string>
#include <vector>

#ifdef GGML_USE_CPU_HBM
#  include "hbm.h"
#endif

#ifdef GGML_USE_CPU_KLEIDIAI
#  include "kleidiai/kleidiai.h"
#endif

#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
#  include "spacemit/ime.h"
#endif

#if defined(_WIN32)
#  define WIN32_LEAN_AND_MEAN
#  ifndef NOMINMAX
#    define NOMINMAX
#  endif
#  include <windows.h>
#else
#  include <unistd.h>
#endif

#if defined(__APPLE__)
#  include <sys/sysctl.h>
#  include <sys/types.h>
#endif

// ggml-backend interface
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
        if (ggml_backend_amx_buffer_type()) {
            bufts.push_back(ggml_backend_amx_buffer_type());
        }
#endif

#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
        if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
            bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
        }
#endif

#ifdef GGML_USE_CPU_KLEIDIAI
        if (ggml_backend_cpu_kleidiai_buffer_type()) {
            bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
        }
#endif

#ifdef GGML_USE_CPU_REPACK
        if (ggml_backend_cpu_repack_buffer_type()) {
            bufts.push_back(ggml_backend_cpu_repack_buffer_type());
        }
#endif

        return bufts;
    }();

    return bufts;
}

static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
    static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
        std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
        bufts.push_back(nullptr);
        return bufts;
    }();

    return extra_bufts.data();

    GGML_UNUSED(device);
}

static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
    for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
        if (extra == buft) {
            return true;
        }
    }
    return false;
}

// CPU backend - backend (stream)

struct ggml_backend_cpu_context {
    int n_threads;
    ggml_threadpool_t threadpool;

    uint8_t * work_data;
    size_t work_size;

    ggml_abort_callback abort_callback;
    void * abort_callback_data;
};

static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
    return "CPU";

    GGML_UNUSED(backend);
}

static void ggml_backend_cpu_free(ggml_backend_t backend) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    delete[] cpu_ctx->work_data;
    delete cpu_ctx;
    delete backend;
}

struct ggml_backend_plan_cpu {
    struct ggml_cplan cplan;
    struct ggml_cgraph cgraph;
};

static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;

    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
    cpu_plan->cgraph = *cgraph; // FIXME: deep copy

    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
        if (cpu_plan->cplan.work_data == NULL) {
            delete cpu_plan;
            return NULL;
        }
    }

    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;

    return cpu_plan;
}

static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

    delete[] cpu_plan->cplan.work_data;
    delete cpu_plan;

    GGML_UNUSED(backend);
}

static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

    GGML_UNUSED(backend);
}

static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

    if (cpu_ctx->work_size < cplan.work_size) {
        delete[] cpu_ctx->work_data;
        cpu_ctx->work_data = new uint8_t[cplan.work_size];
        if (cpu_ctx->work_data == NULL) {
            cpu_ctx->work_size = 0;
            return GGML_STATUS_ALLOC_FAILED;
        }
        cpu_ctx->work_size = cplan.work_size;
    }
    cplan.work_data = (uint8_t *)cpu_ctx->work_data;

    cplan.abort_callback = cpu_ctx->abort_callback;
    cplan.abort_callback_data = cpu_ctx->abort_callback_data;

    return ggml_graph_compute(cgraph, &cplan);
}

static const struct ggml_backend_i ggml_backend_cpu_i = {
    /* .get_name = */ ggml_backend_cpu_get_name,
    /* .free = */ ggml_backend_cpu_free,
    /* .set_tensor_async = */ NULL,
    /* .get_tensor_async = */ NULL,
    /* .cpy_tensor_async = */ NULL,
    /* .synchronize = */ NULL,
    /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_update = */ NULL,
    /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute = */ ggml_backend_cpu_graph_compute,
    /* .event_record = */ NULL,
    /* .event_wait = */ NULL,
    /* .graph_optimize = */ NULL,
};

static ggml_guid_t ggml_backend_cpu_guid(void) {
    static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
    return &guid;
}

ggml_backend_t ggml_backend_cpu_init(void) {
    // initialize CPU backend now to avoid slowing the first graph computation
    ggml_cpu_init();

    struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
    if (ctx == NULL) {
        return NULL;
    }

    ctx->n_threads = GGML_DEFAULT_N_THREADS;
    ctx->threadpool = NULL;
    ctx->work_data = NULL;
    ctx->work_size = 0;
    ctx->abort_callback = NULL;
    ctx->abort_callback_data = NULL;

    ggml_backend_t cpu_backend = new ggml_backend {
        /* .guid = */ ggml_backend_cpu_guid(),
        /* .iface = */ ggml_backend_cpu_i,
        /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ ctx,
    };

    if (cpu_backend == NULL) {
        delete ctx;
        return NULL;
    }

    return cpu_backend;
}
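
// Usage sketch (illustrative, not part of this file): a minimal sequence for
// driving the CPU backend directly, assuming a graph `gf` of type
// `struct ggml_cgraph *` built elsewhere with the ggml graph API.
//
//     ggml_backend_t backend = ggml_backend_cpu_init();
//     ggml_backend_cpu_set_n_threads(backend, 8);
//     enum ggml_status status = ggml_backend_graph_compute(backend, gf);
//     GGML_ASSERT(status == GGML_STATUS_SUCCESS);
//     ggml_backend_free(backend);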

bool ggml_backend_is_cpu(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
}

void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    ctx->n_threads = n_threads;
}

void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;

    if (ctx->threadpool && ctx->threadpool != threadpool) {
        // already had a different threadpool, pause/suspend it before switching
        ggml_threadpool_pause(ctx->threadpool);
    }
    ctx->threadpool = threadpool;
}

void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    ctx->abort_callback = abort_callback;
    ctx->abort_callback_data = abort_callback_data;
}
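
// Usage sketch (illustrative): an abort callback returns true to cancel the
// current graph computation; it is checked between graph nodes. The callback
// name and its `deadline_us` payload below are hypothetical.
//
//     static bool abort_after_deadline(void * data) {
//         const int64_t deadline_us = *(const int64_t *) data;
//         return ggml_time_us() > deadline_us; // returning true aborts
//     }
//
//     int64_t deadline_us = ggml_time_us() + 5 * 1000 * 1000; // 5 s from now
//     ggml_backend_cpu_set_abort_callback(backend, abort_after_deadline, &deadline_us);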

// CPU backend - device

struct ggml_backend_cpu_device_context {
    std::string description = "CPU";

    ggml_backend_cpu_device_context() {
#ifdef __APPLE__
        size_t len = 0;
        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
            description.resize(len);
            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
        }
#elif defined(__linux__)
        FILE * f = fopen("/proc/cpuinfo", "r");
        if (f) {
            char buf[1024];
            while (fgets(buf, sizeof(buf), f)) {
                if (strncmp(buf, "model name", 10) == 0) {
                    char * p = strchr(buf, ':');
                    if (p) {
                        p++;
                        while (std::isspace(*p)) {
                            p++;
                        }
                        while (std::isspace(p[strlen(p) - 1])) {
                            p[strlen(p) - 1] = '\0';
                        }
                        description = p;
                        break;
                    }
                }
            }
            fclose(f);
        }
#elif defined(_WIN32)
        HKEY hKey;
        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
                         TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
                         0,
                         KEY_READ,
                         &hKey) == ERROR_SUCCESS) {
            DWORD cpu_brand_size = 0;
            if (RegQueryValueExA(hKey,
                                 "ProcessorNameString",
                                 NULL,
                                 NULL,
                                 NULL,
                                 &cpu_brand_size) == ERROR_SUCCESS) {
                description.resize(cpu_brand_size);
                if (RegQueryValueExA(hKey,
                                     "ProcessorNameString",
                                     NULL,
                                     NULL,
                                     (LPBYTE)&description[0], // NOLINT
                                     &cpu_brand_size) == ERROR_SUCCESS) {
                    if (description.find('\0') != std::string::npos) {
                        description.resize(description.find('\0'));
                    }
                }
            }
            RegCloseKey(hKey);
        }
#endif
    }
};

static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
    return "CPU";

    GGML_UNUSED(dev);
}

static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
    struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;

    return ctx->description.c_str();
}

static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
#ifdef _WIN32
    MEMORYSTATUSEX status;
    status.dwLength = sizeof(status);
    GlobalMemoryStatusEx(&status);
    *total = status.ullTotalPhys;
    *free = status.ullAvailPhys;
#else
    long pages = sysconf(_SC_PHYS_PAGES);
    long page_size = sysconf(_SC_PAGE_SIZE);
    *total = pages * page_size;

    // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
    *free = *total;
#endif // _WIN32

    GGML_UNUSED(dev);
}

static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
    return GGML_BACKEND_DEVICE_TYPE_CPU;

    GGML_UNUSED(dev);
}

static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name = ggml_backend_cpu_device_get_name(dev);
    props->description = ggml_backend_cpu_device_get_description(dev);
    props->type = ggml_backend_cpu_device_get_type(dev);
    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = {
        /* .async = */ false,
        /* .host_buffer = */ false,
        /* .buffer_from_host_ptr = */ true,
        /* .events = */ false,
    };
}

static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
    return ggml_backend_cpu_init();

    GGML_UNUSED(dev);
    GGML_UNUSED(params);
}

static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
    return ggml_backend_cpu_buffer_type();

    GGML_UNUSED(dev);
}

static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
    return ggml_backend_cpu_buffer_from_ptr(ptr, size);

    GGML_UNUSED(dev);
    GGML_UNUSED(max_tensor_size);
}

static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

    if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
        return true;
    }

    // check extra buffer types
    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
    for (int i = 0; i < 4; i++) {
        if (op->src[i] && op->src[i]->buffer &&
            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
            return buf_extra->supports_op(dev, op);
        }
    }

    switch (op->op) {
        case GGML_OP_CPY:
        case GGML_OP_SET_ROWS:
            return
                op->type != GGML_TYPE_IQ3_XXS &&
                op->type != GGML_TYPE_IQ3_S &&
                op->type != GGML_TYPE_IQ2_XXS &&
                op->type != GGML_TYPE_IQ2_XS &&
                op->type != GGML_TYPE_IQ2_S &&
                op->type != GGML_TYPE_IQ1_S &&
                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
        case GGML_OP_MUL_MAT:
            return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
        case GGML_OP_SOFT_MAX_BACK: {
            if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
                return false;
            }
            float max_bias = 0.0f;

            memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));

            return max_bias == 0.0f;
        }
        case GGML_OP_IM2COL_BACK:
            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
        case GGML_OP_GET_ROWS_BACK:
            return src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16;
        case GGML_OP_OUT_PROD:
            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
                   src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
        default:
            return true;
    }
}
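
// Usage sketch (illustrative): applications and the scheduler reach this check
// through the public device API rather than calling the function above
// directly; `node` is a hypothetical tensor from a graph.
//
//     ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
//     if (dev && ggml_backend_dev_supports_op(dev, node)) {
//         // the CPU backend can compute this node
//     }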

static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft);
    GGML_UNUSED(dev);
}

static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
    /* .get_name = */ ggml_backend_cpu_device_get_name,
    /* .get_description = */ ggml_backend_cpu_device_get_description,
    /* .get_memory = */ ggml_backend_cpu_device_get_memory,
    /* .get_type = */ ggml_backend_cpu_device_get_type,
    /* .get_props = */ ggml_backend_cpu_device_get_props,
    /* .init_backend = */ ggml_backend_cpu_device_init_backend,
    /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
    /* .get_host_buffer_type = */ NULL,
    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
    /* .supports_op = */ ggml_backend_cpu_device_supports_op,
    /* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
    /* .offload_op = */ NULL,
    /* .event_new = */ NULL,
    /* .event_free = */ NULL,
    /* .event_synchronize = */ NULL,
};

// CPU backend - backend (reg)

static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
    return "CPU";

    GGML_UNUSED(reg);
}

static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
    return 1;

    GGML_UNUSED(reg);
}

static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_cpu_device_context ctx;
    static ggml_backend_device ggml_backend_cpu_device = {
        /* .iface = */ ggml_backend_cpu_device_i,
        /* .reg = */ reg,
        /* .context = */ &ctx,
    };

    return &ggml_backend_cpu_device;
}

// This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
// and additionally to allow other backends to expose their own list of features that applications can query using the same API
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
    static std::vector<ggml_backend_feature> features = []() {
        ggml_cpu_init();

        std::vector<ggml_backend_feature> features;
        if (ggml_cpu_has_sse3()) {
            features.push_back({ "SSE3", "1" });
        }
        if (ggml_cpu_has_ssse3()) {
            features.push_back({ "SSSE3", "1" });
        }
        if (ggml_cpu_has_avx()) {
            features.push_back({ "AVX", "1" });
        }
        if (ggml_cpu_has_avx_vnni()) {
            features.push_back({ "AVX_VNNI", "1" });
        }
        if (ggml_cpu_has_avx2()) {
            features.push_back({ "AVX2", "1" });
        }
        if (ggml_cpu_has_f16c()) {
            features.push_back({ "F16C", "1" });
        }
        if (ggml_cpu_has_fma()) {
            features.push_back({ "FMA", "1" });
        }
        if (ggml_cpu_has_bmi2()) {
            features.push_back({ "BMI2", "1" });
        }
        if (ggml_cpu_has_avx512()) {
            features.push_back({ "AVX512", "1" });
        }
        if (ggml_cpu_has_avx512_vbmi()) {
            features.push_back({ "AVX512_VBMI", "1" });
        }
        if (ggml_cpu_has_avx512_vnni()) {
            features.push_back({ "AVX512_VNNI", "1" });
        }
        if (ggml_cpu_has_avx512_bf16()) {
            features.push_back({ "AVX512_BF16", "1" });
        }
        if (ggml_cpu_has_amx_int8()) {
            features.push_back({ "AMX_INT8", "1" });
        }
        if (ggml_cpu_has_neon()) {
            features.push_back({ "NEON", "1" });
        }
        if (ggml_cpu_has_arm_fma()) {
            features.push_back({ "ARM_FMA", "1" });
        }
        if (ggml_cpu_has_fp16_va()) {
            features.push_back({ "FP16_VA", "1" });
        }
        if (ggml_cpu_has_matmul_int8()) {
            features.push_back({ "MATMUL_INT8", "1" });
        }
        if (ggml_cpu_has_sve()) {
            features.push_back({ "SVE", "1" });
        }
        if (ggml_cpu_has_dotprod()) {
            features.push_back({ "DOTPROD", "1" });
        }
        if (ggml_cpu_get_sve_cnt() > 0) {
            static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
            features.push_back({ "SVE_CNT", sve_cnt.c_str() });
        }
        if (ggml_cpu_has_sme()) {
            features.push_back({ "SME", "1" });
        }
        if (ggml_cpu_has_riscv_v()) {
            features.push_back({ "RISCV_V", "1" });
        }
        if (ggml_cpu_has_vsx()) {
            features.push_back({ "VSX", "1" });
        }
        if (ggml_cpu_has_vxe()) {
            features.push_back({ "VXE", "1" });
        }
        if (ggml_cpu_has_wasm_simd()) {
            features.push_back({ "WASM_SIMD", "1" });
        }
        if (ggml_cpu_has_llamafile()) {
            features.push_back({ "LLAMAFILE", "1" });
        }
    #ifdef GGML_USE_ACCELERATE
        features.push_back({ "ACCELERATE", "1" });
    #endif
    #ifdef GGML_USE_CPU_HBM
        features.push_back({ "CPU_HBM", "1" });
    #endif
    #ifdef GGML_USE_OPENMP
        features.push_back({ "OPENMP", "1" });
    #endif
    #ifdef GGML_USE_CPU_KLEIDIAI
        features.push_back({ "KLEIDIAI", "1" });
    #endif
    #ifdef GGML_USE_CPU_REPACK
        features.push_back({ "REPACK", "1" });
    #endif

        features.push_back({ nullptr, nullptr });

        return features;
    }();

    return features.data();

    GGML_UNUSED(reg);
}
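
// Usage sketch (illustrative): when the backend is loaded dynamically, the
// feature table above is reached via ggml_backend_reg_get_proc_address() and
// walked until the { nullptr, nullptr } sentinel pushed at the end.
//
//     ggml_backend_reg_t reg = ggml_backend_cpu_reg();
//     auto get_features = (ggml_backend_get_features_t)
//         ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
//     if (get_features) {
//         for (ggml_backend_feature * f = get_features(reg); f->name; f++) {
//             printf("%s = %s\n", f->name, f->value);
//         }
//     }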

static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
        ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
        ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_get_features") == 0) {
        return (void *)ggml_backend_cpu_get_features;
    }
    if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
        return (void *)ggml_backend_cpu_set_abort_callback;
    }
    if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
        return (void *)ggml_numa_init;
    }
    if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
        return (void *)ggml_is_numa;
    }

    // threadpool - TODO: move to ggml-base
    if (strcmp(name, "ggml_threadpool_new") == 0) {
        return (void *)ggml_threadpool_new;
    }
    if (strcmp(name, "ggml_threadpool_free") == 0) {
        return (void *)ggml_threadpool_free;
    }
    if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
        return (void *)ggml_backend_cpu_set_threadpool;
    }

    return NULL;

    GGML_UNUSED(reg);
}

static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
    /* .get_name = */ ggml_backend_cpu_reg_get_name,
    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
    /* .get_device = */ ggml_backend_cpu_reg_get_device,
    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
};

ggml_backend_reg_t ggml_backend_cpu_reg(void) {
    // init CPU feature detection
    ggml_cpu_init();

    static struct ggml_backend_reg ggml_backend_cpu_reg = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface = */ ggml_backend_cpu_reg_i,
        /* .context = */ NULL,
    };

    return &ggml_backend_cpu_reg;
}

GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)