1#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
2#define _USE_MATH_DEFINES // For M_PI on MSVC
3
4#include "ggml-backend.h"
5#include "ggml-impl.h"
6#include "ggml-threading.h"
7#include "ggml-cpu.h"
8#include "ggml.h"
9
10// FIXME: required here for quantization functions
11#include "ggml-quants.h"
12
13#ifdef GGML_USE_CPU_HBM
14#include <hbwmalloc.h>
15#endif
16
17#if defined(_MSC_VER) || defined(__MINGW32__)
18#include <malloc.h> // using malloc.h with MSC/MINGW
19#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
20#include <alloca.h>
21#endif
22
23#include <assert.h>
24#include <errno.h>
25#include <time.h>
26#include <math.h>
27#include <stdlib.h>
28#include <string.h>
29#include <stdint.h>
30#include <inttypes.h>
31#include <stdio.h>
32#include <float.h>
33#include <limits.h>
34#include <stdarg.h>
35#include <signal.h>
36#if defined(__gnu_linux__)
37#include <syscall.h>
38#endif
39
40#if defined(__APPLE__)
41#include <unistd.h>
42#include <mach/mach.h>
43#include <TargetConditionals.h>
44#endif
45
46#if defined(_WIN32)
47#define WIN32_LEAN_AND_MEAN
48#ifndef NOMINMAX
49 #define NOMINMAX
50#endif
51#include <windows.h>
52#endif
53
54#define UNUSED GGML_UNUSED
55
56#if defined(_MSC_VER)
57#define m512bh(p) p
58#define m512i(p) p
59#else
60#define m512bh(p) (__m512bh)(p)
61#define m512i(p) (__m512i)(p)
62#endif
63
64#if defined(__linux__) || \
65 defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
66 (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH)
67
68#include <unistd.h>
69#include <sys/types.h>
70#include <sys/stat.h>
71#include <sys/wait.h>
72#if defined(__linux__)
73#include <sys/prctl.h>
74#endif
75
76#if defined(__ANDROID__)
77#include <unwind.h>
78#include <dlfcn.h>
79#include <stdio.h>
80
81struct backtrace_state {
82 void ** current;
83 void ** end;
84};
85
86static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
87 struct backtrace_state * state = (struct backtrace_state *)arg;
88 uintptr_t pc = _Unwind_GetIP(context);
89 if (pc) {
90 if (state->current == state->end) {
91 return _URC_END_OF_STACK;
92 } else {
93 *state->current++ = (void*)pc;
94 }
95 }
96 return _URC_NO_REASON;
97}
98
99static void ggml_print_backtrace_symbols(void) {
100 const int max = 100;
101 void* buffer[max];
102
103 struct backtrace_state state = {buffer, buffer + max};
104 _Unwind_Backtrace(unwind_callback, &state);
105
106 int count = state.current - buffer;
107
108 for (int idx = 0; idx < count; ++idx) {
109 const void * addr = buffer[idx];
110 const char * symbol = "";
111
112 Dl_info info;
113 if (dladdr(addr, &info) && info.dli_sname) {
114 symbol = info.dli_sname;
115 }
116
117 fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
118 }
119}
120#elif defined(__linux__) && defined(__GLIBC__)
121#include <execinfo.h>
122static void ggml_print_backtrace_symbols(void) {
123 void * trace[100];
124 int nptrs = backtrace(array: trace, size: sizeof(trace)/sizeof(trace[0]));
125 backtrace_symbols_fd(array: trace, size: nptrs, STDERR_FILENO);
126}
127#else
128static void ggml_print_backtrace_symbols(void) {
129 // platform not supported
130}
131#endif
132
133void ggml_print_backtrace(void) {
134 const char * GGML_NO_BACKTRACE = getenv(name: "GGML_NO_BACKTRACE");
135 if (GGML_NO_BACKTRACE) {
136 return;
137 }
138#if defined(__linux__)
139 FILE * f = fopen(filename: "/proc/self/status", modes: "r");
140 size_t size = 0;
141 char * line = NULL;
142 ssize_t length = 0;
143 while ((length = getline(lineptr: &line, n: &size, stream: f)) > 0) {
144 if (!strncmp(s1: line, s2: "TracerPid:", n: sizeof("TracerPid:") - 1) &&
145 (length != sizeof("TracerPid:\t0\n") - 1 || line[length - 2] != '0')) {
146 // Already being debugged, and the breakpoint is the later abort()
147 free(ptr: line);
148 fclose(stream: f);
149 return;
150 }
151 }
152 free(ptr: line);
153 fclose(stream: f);
154 int lock[2] = { -1, -1 };
155 (void) !pipe(pipedes: lock); // Don't start gdb until after PR_SET_PTRACER
156#endif
157 const int parent_pid = getpid();
158 const int child_pid = fork();
159 if (child_pid < 0) { // error
160#if defined(__linux__)
161 close(fd: lock[1]);
162 close(fd: lock[0]);
163#endif
164 return;
165 } else if (child_pid == 0) { // child
166 char attach[32];
167 snprintf(s: attach, maxlen: sizeof(attach), format: "attach %d", parent_pid);
168#if defined(__linux__)
169 close(fd: lock[1]);
170 (void) !read(fd: lock[0], buf: lock, nbytes: 1);
171 close(fd: lock[0]);
172#endif
173 // try gdb
174 execlp(file: "gdb", arg: "gdb", "--batch",
175 "-ex", "set style enabled on",
176 "-ex", attach,
177 "-ex", "bt -frame-info source-and-location",
178 "-ex", "detach",
179 "-ex", "quit",
180 (char *) NULL);
181 // try lldb
182 execlp(file: "lldb", arg: "lldb", "--batch",
183 "-o", "bt",
184 "-o", "quit",
185 "-p", &attach[sizeof("attach ") - 1],
186 (char *) NULL);
187 // gdb failed, fallback to backtrace_symbols
188 ggml_print_backtrace_symbols();
189 _Exit(status: 0);
190 } else { // parent
191#if defined(__linux__)
192 prctl(PR_SET_PTRACER, child_pid);
193 close(fd: lock[1]);
194 close(fd: lock[0]);
195#endif
196 waitpid(pid: child_pid, NULL, options: 0);
197 }
198}
199#else
200void ggml_print_backtrace(void) {
201 // platform not supported
202}
203#endif
204
205static ggml_abort_callback_t g_abort_callback = NULL;
206
207// Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
208GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback) {
209 ggml_abort_callback_t ret_val = g_abort_callback;
210 g_abort_callback = callback;
211 return ret_val;
212}
213
214void ggml_abort(const char * file, int line, const char * fmt, ...) {
215 fflush(stdout);
216
217 char message[2048];
218 int offset = snprintf(s: message, maxlen: sizeof(message), format: "%s:%d: ", file, line);
219
220 va_list args;
221 va_start(args, fmt);
222 vsnprintf(s: message + offset, maxlen: sizeof(message) - offset, format: fmt, arg: args);
223 va_end(args);
224
225 if (g_abort_callback) {
226 g_abort_callback(message);
227 } else {
228 // default: print error and backtrace to stderr
229 fprintf(stderr, format: "%s\n", message);
230 ggml_print_backtrace();
231 }
232
233 abort();
234}
235
236// ggml_print_backtrace is registered with std::set_terminate by ggml.cpp
237
238//
239// logging
240//
241
242struct ggml_logger_state {
243 ggml_log_callback log_callback;
244 void * log_callback_user_data;
245};
246static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
247
248static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
249 if (format == NULL) {
250 return;
251 }
252 va_list args_copy;
253 va_copy(args_copy, args);
254 char buffer[128];
255 int len = vsnprintf(s: buffer, maxlen: 128, format: format, arg: args);
256 if (len < 128) {
257 g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
258 } else {
259 char * buffer2 = (char *) calloc(nmemb: len + 1, size: sizeof(char));
260 vsnprintf(s: buffer2, maxlen: len + 1, format: format, arg: args_copy);
261 buffer2[len] = 0;
262 g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
263 free(ptr: buffer2);
264 }
265 va_end(args_copy);
266}
267
268void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
269 va_list args;
270 va_start(args, format);
271 ggml_log_internal_v(level, format, args);
272 va_end(args);
273}
274
275void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
276 (void) level;
277 (void) user_data;
278 fputs(s: text, stderr);
279 fflush(stderr);
280}
281
282//
283// end of logging block
284//
285
286#ifdef GGML_USE_ACCELERATE
287// uncomment to use vDSP for soft max computation
288// note: not sure if it is actually faster
289//#define GGML_SOFT_MAX_ACCELERATE
290#endif
291
292
293void * ggml_aligned_malloc(size_t size) {
294#if defined(__s390x__)
295 const int alignment = 256;
296#else
297 const int alignment = 64;
298#endif
299
300#if defined(_MSC_VER) || defined(__MINGW32__)
301 return _aligned_malloc(size, alignment);
302#else
303 if (size == 0) {
304 GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
305 return NULL;
306 }
307 void * aligned_memory = NULL;
308 #ifdef GGML_USE_CPU_HBM
309 int result = hbw_posix_memalign(&aligned_memory, alignment, size);
310 #elif TARGET_OS_OSX
311 GGML_UNUSED(alignment);
312 kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
313 int result = EFAULT;
314 switch (alloc_status) {
315 case KERN_SUCCESS:
316 result = 0;
317 break;
318 case KERN_INVALID_ADDRESS:
319 result = EINVAL;
320 break;
321 case KERN_NO_SPACE:
322 result = ENOMEM;
323 break;
324 default:
325 result = EFAULT;
326 break;
327 }
328 #else
329 int result = posix_memalign(memptr: &aligned_memory, alignment: alignment, size: size);
330 #endif
331 if (result != 0) {
332 // Handle allocation failure
333 const char *error_desc = "unknown allocation error";
334 switch (result) {
335 case EINVAL:
336 error_desc = "invalid alignment value";
337 break;
338 case ENOMEM:
339 error_desc = "insufficient memory";
340 break;
341 }
342 GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
343 return NULL;
344 }
345 return aligned_memory;
346#endif
347}
348
349void ggml_aligned_free(void * ptr, size_t size) {
350 GGML_UNUSED(size);
351#if defined(_MSC_VER) || defined(__MINGW32__)
352 _aligned_free(ptr);
353#elif GGML_USE_CPU_HBM
354 if (ptr != NULL) {
355 hbw_free(ptr);
356 }
357#elif TARGET_OS_OSX
358 if (ptr != NULL) {
359 vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
360 }
361#else
362 free(ptr: ptr);
363#endif
364}
365
366
367inline static void * ggml_malloc(size_t size) {
368 if (size == 0) {
369 GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
370 return NULL;
371 }
372 void * result = malloc(size: size);
373 if (result == NULL) {
374 GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
375 GGML_ABORT("fatal error");
376 }
377 return result;
378}
379
380// calloc
381inline static void * ggml_calloc(size_t num, size_t size) {
382 if (num == 0 || size == 0) {
383 GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
384 return NULL;
385 }
386 void * result = calloc(nmemb: num, size: size);
387 if (result == NULL) {
388 GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
389 GGML_ABORT("fatal error");
390 }
391 return result;
392}
393
394#define GGML_MALLOC(size) ggml_malloc(size)
395#define GGML_CALLOC(num, size) ggml_calloc(num, size)
396
397#define GGML_FREE(ptr) free(ptr)
398
399const char * ggml_status_to_string(enum ggml_status status) {
400 switch (status) {
401 case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
402 case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
403 case GGML_STATUS_SUCCESS: return "GGML status: success";
404 case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
405 }
406
407 return "GGML status: unknown";
408}
409
410float ggml_fp16_to_fp32(ggml_fp16_t x) {
411#define ggml_fp16_to_fp32 do_not_use__ggml_fp16_to_fp32__in_ggml
412 return GGML_FP16_TO_FP32(x);
413}
414
415ggml_fp16_t ggml_fp32_to_fp16(float x) {
416#define ggml_fp32_to_fp16 do_not_use__ggml_fp32_to_fp16__in_ggml
417 return GGML_FP32_TO_FP16(x);
418}
419
420float ggml_bf16_to_fp32(ggml_bf16_t x) {
421#define ggml_bf16_to_fp32 do_not_use__ggml_bf16_to_fp32__in_ggml
422 return GGML_BF16_TO_FP32(x); // it just left shifts
423}
424
425ggml_bf16_t ggml_fp32_to_bf16(float x) {
426#define ggml_fp32_to_bf16 do_not_use__ggml_fp32_to_bf16__in_ggml
427 return GGML_FP32_TO_BF16(x);
428}
429
430void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
431 for (int64_t i = 0; i < n; i++) {
432 y[i] = GGML_FP16_TO_FP32(x[i]);
433 }
434}
435
436void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
437 int i = 0;
438 for (; i < n; ++i) {
439 y[i] = GGML_FP32_TO_FP16(x[i]);
440 }
441}
442
443void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
444 int i = 0;
445 for (; i < n; ++i) {
446 y[i] = GGML_BF16_TO_FP32(x[i]);
447 }
448}
449
450void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
451 for (int i = 0; i < n; i++) {
452 y[i] = ggml_compute_fp32_to_bf16(s: x[i]);
453 }
454}
455
456void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
457 int i = 0;
458#if defined(__AVX512BF16__)
459 // subnormals are flushed to zero on this platform
460 for (; i + 32 <= n; i += 32) {
461 _mm512_storeu_si512(
462 (__m512i *)(y + i),
463 m512i(_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
464 _mm512_loadu_ps(x + i))));
465 }
466#endif
467 for (; i < n; i++) {
468 y[i] = GGML_FP32_TO_BF16(x[i]);
469 }
470}
471
472bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
473 return memcmp(s1: guid_a, s2: guid_b, n: sizeof(ggml_guid)) == 0;
474}
475
476const char * ggml_version(void) {
477 return GGML_VERSION;
478}
479
480const char * ggml_commit(void) {
481 return GGML_COMMIT;
482}
483
484//
485// timing
486//
487
488#if defined(_MSC_VER) || defined(__MINGW32__)
489static int64_t timer_freq, timer_start;
490void ggml_time_init(void) {
491 LARGE_INTEGER t;
492 QueryPerformanceFrequency(&t);
493 timer_freq = t.QuadPart;
494
495 // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
496 // and the uptime is high enough.
497 // We subtract the program start time to reduce the likelihood of that happening.
498 QueryPerformanceCounter(&t);
499 timer_start = t.QuadPart;
500}
501int64_t ggml_time_ms(void) {
502 LARGE_INTEGER t;
503 QueryPerformanceCounter(&t);
504 return ((t.QuadPart-timer_start) * 1000) / timer_freq;
505}
506int64_t ggml_time_us(void) {
507 LARGE_INTEGER t;
508 QueryPerformanceCounter(&t);
509 return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
510}
511#else
512void ggml_time_init(void) {}
513int64_t ggml_time_ms(void) {
514 struct timespec ts;
515 clock_gettime(CLOCK_MONOTONIC, tp: &ts);
516 return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
517}
518
519int64_t ggml_time_us(void) {
520 struct timespec ts;
521 clock_gettime(CLOCK_MONOTONIC, tp: &ts);
522 return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
523}
524#endif
525
526int64_t ggml_cycles(void) {
527 return clock();
528}
529
530int64_t ggml_cycles_per_ms(void) {
531 return CLOCKS_PER_SEC/1000;
532}
533
534//
535// cross-platform UTF-8 file paths
536//
537
538#ifdef _WIN32
539static wchar_t * ggml_mbstowcs(const char * mbs) {
540 int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
541 if (!wlen) {
542 errno = EINVAL;
543 return NULL;
544 }
545
546 wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
547 wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
548 if (!wlen) {
549 GGML_FREE(wbuf);
550 errno = EINVAL;
551 return NULL;
552 }
553
554 return wbuf;
555}
556#endif
557
558FILE * ggml_fopen(const char * fname, const char * mode) {
559#ifdef _WIN32
560 FILE * file = NULL;
561
562 // convert fname (UTF-8)
563 wchar_t * wfname = ggml_mbstowcs(fname);
564 if (wfname) {
565 // convert mode (ANSI)
566 wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
567 wchar_t * wmode_p = wmode;
568 do {
569 *wmode_p++ = (wchar_t)*mode;
570 } while (*mode++);
571
572 // open file
573 file = _wfopen(wfname, wmode);
574
575 GGML_FREE(wfname);
576 GGML_FREE(wmode);
577 }
578
579 return file;
580#else
581 return fopen(filename: fname, modes: mode);
582#endif
583
584}
585
586static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
587 [GGML_TYPE_I8] = {
588 .type_name = "i8",
589 .blck_size = 1,
590 .type_size = sizeof(int8_t),
591 .is_quantized = false,
592 },
593 [GGML_TYPE_I16] = {
594 .type_name = "i16",
595 .blck_size = 1,
596 .type_size = sizeof(int16_t),
597 .is_quantized = false,
598 },
599 [GGML_TYPE_I32] = {
600 .type_name = "i32",
601 .blck_size = 1,
602 .type_size = sizeof(int32_t),
603 .is_quantized = false,
604 },
605 [GGML_TYPE_I64] = {
606 .type_name = "i64",
607 .blck_size = 1,
608 .type_size = sizeof(int64_t),
609 .is_quantized = false,
610 },
611 [GGML_TYPE_F64] = {
612 .type_name = "f64",
613 .blck_size = 1,
614 .type_size = sizeof(double),
615 .is_quantized = false,
616 },
617 [GGML_TYPE_F32] = {
618 .type_name = "f32",
619 .blck_size = 1,
620 .type_size = sizeof(float),
621 .is_quantized = false,
622 },
623 [GGML_TYPE_F16] = {
624 .type_name = "f16",
625 .blck_size = 1,
626 .type_size = sizeof(ggml_fp16_t),
627 .is_quantized = false,
628 .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
629 .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
630 },
631 [GGML_TYPE_Q4_0] = {
632 .type_name = "q4_0",
633 .blck_size = QK4_0,
634 .type_size = sizeof(block_q4_0),
635 .is_quantized = true,
636 .to_float = (ggml_to_float_t) dequantize_row_q4_0,
637 .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref,
638 },
639 [GGML_TYPE_Q4_1] = {
640 .type_name = "q4_1",
641 .blck_size = QK4_1,
642 .type_size = sizeof(block_q4_1),
643 .is_quantized = true,
644 .to_float = (ggml_to_float_t) dequantize_row_q4_1,
645 .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
646 },
647 [4] = { // GGML_TYPE_Q4_2
648 .type_name = "DEPRECATED",
649 .blck_size = 0,
650 .type_size = 0,
651 .is_quantized = false,
652 },
653 [5] = { // GGML_TYPE_Q4_3
654 .type_name = "DEPRECATED",
655 .blck_size = 0,
656 .type_size = 0,
657 .is_quantized = false,
658 },
659 [GGML_TYPE_Q5_0] = {
660 .type_name = "q5_0",
661 .blck_size = QK5_0,
662 .type_size = sizeof(block_q5_0),
663 .is_quantized = true,
664 .to_float = (ggml_to_float_t) dequantize_row_q5_0,
665 .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref,
666 },
667 [GGML_TYPE_Q5_1] = {
668 .type_name = "q5_1",
669 .blck_size = QK5_1,
670 .type_size = sizeof(block_q5_1),
671 .is_quantized = true,
672 .to_float = (ggml_to_float_t) dequantize_row_q5_1,
673 .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref,
674 },
675 [GGML_TYPE_Q8_0] = {
676 .type_name = "q8_0",
677 .blck_size = QK8_0,
678 .type_size = sizeof(block_q8_0),
679 .is_quantized = true,
680 .to_float = (ggml_to_float_t) dequantize_row_q8_0,
681 .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref,
682 },
683 [GGML_TYPE_Q8_1] = {
684 .type_name = "q8_1",
685 .blck_size = QK8_1,
686 .type_size = sizeof(block_q8_1),
687 .is_quantized = true,
688 .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
689 },
690 [GGML_TYPE_MXFP4] = {
691 .type_name = "mxfp4",
692 .blck_size = QK_MXFP4,
693 .type_size = sizeof(block_mxfp4),
694 .is_quantized = true,
695 .to_float = (ggml_to_float_t) dequantize_row_mxfp4,
696 .from_float_ref = (ggml_from_float_t)quantize_row_mxfp4_ref,
697 },
698 [GGML_TYPE_Q2_K] = {
699 .type_name = "q2_K",
700 .blck_size = QK_K,
701 .type_size = sizeof(block_q2_K),
702 .is_quantized = true,
703 .to_float = (ggml_to_float_t) dequantize_row_q2_K,
704 .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref,
705 },
706 [GGML_TYPE_Q3_K] = {
707 .type_name = "q3_K",
708 .blck_size = QK_K,
709 .type_size = sizeof(block_q3_K),
710 .is_quantized = true,
711 .to_float = (ggml_to_float_t) dequantize_row_q3_K,
712 .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref,
713 },
714 [GGML_TYPE_Q4_K] = {
715 .type_name = "q4_K",
716 .blck_size = QK_K,
717 .type_size = sizeof(block_q4_K),
718 .is_quantized = true,
719 .to_float = (ggml_to_float_t) dequantize_row_q4_K,
720 .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref,
721 },
722 [GGML_TYPE_Q5_K] = {
723 .type_name = "q5_K",
724 .blck_size = QK_K,
725 .type_size = sizeof(block_q5_K),
726 .is_quantized = true,
727 .to_float = (ggml_to_float_t) dequantize_row_q5_K,
728 .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref,
729 },
730 [GGML_TYPE_Q6_K] = {
731 .type_name = "q6_K",
732 .blck_size = QK_K,
733 .type_size = sizeof(block_q6_K),
734 .is_quantized = true,
735 .to_float = (ggml_to_float_t) dequantize_row_q6_K,
736 .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
737 },
738 [GGML_TYPE_IQ2_XXS] = {
739 .type_name = "iq2_xxs",
740 .blck_size = QK_K,
741 .type_size = sizeof(block_iq2_xxs),
742 .is_quantized = true,
743 .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
744 .from_float_ref = NULL,
745 },
746 [GGML_TYPE_IQ2_XS] = {
747 .type_name = "iq2_xs",
748 .blck_size = QK_K,
749 .type_size = sizeof(block_iq2_xs),
750 .is_quantized = true,
751 .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
752 .from_float_ref = NULL,
753 },
754 [GGML_TYPE_IQ3_XXS] = {
755 .type_name = "iq3_xxs",
756 .blck_size = QK_K,
757 .type_size = sizeof(block_iq3_xxs),
758 .is_quantized = true,
759 .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
760 .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
761 },
762 [GGML_TYPE_IQ3_S] = {
763 .type_name = "iq3_s",
764 .blck_size = QK_K,
765 .type_size = sizeof(block_iq3_s),
766 .is_quantized = true,
767 .to_float = (ggml_to_float_t) dequantize_row_iq3_s,
768 .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
769 },
770 [GGML_TYPE_IQ2_S] = {
771 .type_name = "iq2_s",
772 .blck_size = QK_K,
773 .type_size = sizeof(block_iq2_s),
774 .is_quantized = true,
775 .to_float = (ggml_to_float_t) dequantize_row_iq2_s,
776 .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref,
777 },
778 [GGML_TYPE_IQ1_S] = {
779 .type_name = "iq1_s",
780 .blck_size = QK_K,
781 .type_size = sizeof(block_iq1_s),
782 .is_quantized = true,
783 .to_float = (ggml_to_float_t) dequantize_row_iq1_s,
784 .from_float_ref = NULL,
785 },
786 [GGML_TYPE_IQ1_M] = {
787 .type_name = "iq1_m",
788 .blck_size = QK_K,
789 .type_size = sizeof(block_iq1_m),
790 .is_quantized = true,
791 .to_float = (ggml_to_float_t) dequantize_row_iq1_m,
792 .from_float_ref = NULL,
793 },
794 [GGML_TYPE_IQ4_NL] = {
795 .type_name = "iq4_nl",
796 .blck_size = QK4_NL,
797 .type_size = sizeof(block_iq4_nl),
798 .is_quantized = true,
799 .to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
800 .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
801 },
802 [GGML_TYPE_IQ4_XS] = {
803 .type_name = "iq4_xs",
804 .blck_size = QK_K,
805 .type_size = sizeof(block_iq4_xs),
806 .is_quantized = true,
807 .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
808 .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
809 },
810 [GGML_TYPE_Q8_K] = {
811 .type_name = "q8_K",
812 .blck_size = QK_K,
813 .type_size = sizeof(block_q8_K),
814 .is_quantized = true,
815 },
816 [GGML_TYPE_BF16] = {
817 .type_name = "bf16",
818 .blck_size = 1,
819 .type_size = sizeof(ggml_bf16_t),
820 .is_quantized = false,
821 .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
822 .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
823 },
824 [31] = { // GGML_TYPE_Q4_0_4_4
825 .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
826 .blck_size = 0,
827 .type_size = 0,
828 .is_quantized = false,
829 },
830 [32] = { // GGML_TYPE_Q4_0_4_8
831 .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
832 .blck_size = 0,
833 .type_size = 0,
834 .is_quantized = false,
835 },
836 [33] = { // GGML_TYPE_Q4_0_8_8
837 .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
838 .blck_size = 0,
839 .type_size = 0,
840 .is_quantized = false,
841 },
842 [GGML_TYPE_TQ1_0] = {
843 .type_name = "tq1_0",
844 .blck_size = QK_K,
845 .type_size = sizeof(block_tq1_0),
846 .is_quantized = true,
847 .to_float = (ggml_to_float_t) dequantize_row_tq1_0,
848 .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref,
849 },
850 [GGML_TYPE_TQ2_0] = {
851 .type_name = "tq2_0",
852 .blck_size = QK_K,
853 .type_size = sizeof(block_tq2_0),
854 .is_quantized = true,
855 .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
856 .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
857 },
858 [36] = { // GGML_TYPE_IQ4_NL_4_4
859 .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
860 .blck_size = 0,
861 .type_size = 0,
862 .is_quantized = false,
863 },
864 [37] = { // GGML_TYPE_IQ4_NL_4_8
865 .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
866 .blck_size = 0,
867 .type_size = 0,
868 .is_quantized = false,
869 },
870 [38] = { // GGML_TYPE_IQ4_NL_8_8
871 .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
872 .blck_size = 0,
873 .type_size = 0,
874 .is_quantized = false,
875 },
876};
877
878const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
879 GGML_ASSERT(type < GGML_TYPE_COUNT);
880 return &type_traits[type];
881}
882
883//
884// ggml object
885//
886
887struct ggml_object {
888 size_t offs;
889 size_t size;
890
891 struct ggml_object * next;
892
893 enum ggml_object_type type;
894
895 char padding[4];
896};
897
898static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
899
900//
901// ggml context
902//
903
904struct ggml_context {
905 size_t mem_size;
906 void * mem_buffer;
907 bool mem_buffer_owned;
908 bool no_alloc;
909
910 int n_objects;
911
912 struct ggml_object * objects_begin;
913 struct ggml_object * objects_end;
914};
915
916//
917// data types
918//
919
920static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
921 "NONE",
922
923 "DUP",
924 "ADD",
925 "ADD_ID",
926 "ADD1",
927 "ACC",
928 "SUB",
929 "MUL",
930 "DIV",
931 "SQR",
932 "SQRT",
933 "LOG",
934 "SIN",
935 "COS",
936 "SUM",
937 "SUM_ROWS",
938 "MEAN",
939 "ARGMAX",
940 "COUNT_EQUAL",
941 "REPEAT",
942 "REPEAT_BACK",
943 "CONCAT",
944 "SILU_BACK",
945 "NORM",
946 "RMS_NORM",
947 "RMS_NORM_BACK",
948 "GROUP_NORM",
949 "L2_NORM",
950
951 "MUL_MAT",
952 "MUL_MAT_ID",
953 "OUT_PROD",
954
955 "SCALE",
956 "SET",
957 "CPY",
958 "CONT",
959 "RESHAPE",
960 "VIEW",
961 "PERMUTE",
962 "TRANSPOSE",
963 "GET_ROWS",
964 "GET_ROWS_BACK",
965 "SET_ROWS",
966 "DIAG",
967 "DIAG_MASK_INF",
968 "DIAG_MASK_ZERO",
969 "SOFT_MAX",
970 "SOFT_MAX_BACK",
971 "ROPE",
972 "ROPE_BACK",
973 "CLAMP",
974 "CONV_TRANSPOSE_1D",
975 "IM2COL",
976 "IM2COL_BACK",
977 "IM2COL_3D",
978 "CONV_2D",
979 "CONV_3D",
980 "CONV_2D_DW",
981 "CONV_TRANSPOSE_2D",
982 "POOL_1D",
983 "POOL_2D",
984 "POOL_2D_BACK",
985 "UPSCALE",
986 "PAD",
987 "PAD_REFLECT_1D",
988 "ROLL",
989 "ARANGE",
990 "TIMESTEP_EMBEDDING",
991 "ARGSORT",
992 "LEAKY_RELU",
993
994 "FLASH_ATTN_EXT",
995 "FLASH_ATTN_BACK",
996 "SSM_CONV",
997 "SSM_SCAN",
998 "WIN_PART",
999 "WIN_UNPART",
1000 "GET_REL_POS",
1001 "ADD_REL_POS",
1002 "RWKV_WKV6",
1003 "GATED_LINEAR_ATTN",
1004 "RWKV_WKV7",
1005
1006 "UNARY",
1007
1008 "MAP_CUSTOM1",
1009 "MAP_CUSTOM2",
1010 "MAP_CUSTOM3",
1011
1012 "CUSTOM",
1013
1014 "CROSS_ENTROPY_LOSS",
1015 "CROSS_ENTROPY_LOSS_BACK",
1016 "OPT_STEP_ADAMW",
1017 "OPT_STEP_SGD",
1018
1019 "GLU",
1020};
1021
1022static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
1023
1024static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1025 "none",
1026
1027 "x",
1028 "x+y",
1029 "x[i]+y",
1030 "x+y",
1031 "view(x,nb,offset)+=y->x",
1032 "x-y",
1033 "x*y",
1034 "x/y",
1035 "x^2",
1036 "√x",
1037 "log(x)",
1038 "sin(x)",
1039 "cos(x)",
1040 "Σx",
1041 "Σx_k",
1042 "Σx/n",
1043 "argmax(x)",
1044 "count_equal(x)",
1045 "repeat(x)",
1046 "repeat_back(x)",
1047 "concat(x, y)",
1048 "silu_back(x)",
1049 "norm(x)",
1050 "rms_norm(x)",
1051 "rms_norm_back(x)",
1052 "group_norm(x)",
1053 "l2_norm(x)",
1054
1055 "X*Y",
1056 "X[i]*Y",
1057 "X*Y",
1058
1059 "x*v",
1060 "y-\\>view(x)",
1061 "x-\\>y",
1062 "cont(x)",
1063 "reshape(x)",
1064 "view(x)",
1065 "permute(x)",
1066 "transpose(x)",
1067 "get_rows(x)",
1068 "get_rows_back(x)",
1069 "set_rows(x)",
1070 "diag(x)",
1071 "diag_mask_inf(x)",
1072 "diag_mask_zero(x)",
1073 "soft_max(x)",
1074 "soft_max_back(x)",
1075 "rope(x)",
1076 "rope_back(x)",
1077 "clamp(x)",
1078 "conv_transpose_1d(x)",
1079 "im2col(x)",
1080 "im2col_back(x)",
1081 "im2col_3d(x)",
1082 "conv_2d(x)",
1083 "conv_3d(x)",
1084 "conv_2d_dw(x)",
1085 "conv_transpose_2d(x)",
1086 "pool_1d(x)",
1087 "pool_2d(x)",
1088 "pool_2d_back(x)",
1089 "upscale(x)",
1090 "pad(x)",
1091 "pad_reflect_1d(x)",
1092 "roll(x)",
1093 "arange(start, stop, step)",
1094 "timestep_embedding(timesteps, dim, max_period)",
1095 "argsort(x)",
1096 "leaky_relu(x)",
1097
1098 "flash_attn_ext(x)",
1099 "flash_attn_back(x)",
1100 "ssm_conv(x)",
1101 "ssm_scan(x)",
1102 "win_part(x)",
1103 "win_unpart(x)",
1104 "get_rel_pos(x)",
1105 "add_rel_pos(x)",
1106 "rwkv_wkv6(k, v, r, tf, td, s)",
1107 "gated_linear_attn(k, v, q, gate, s)",
1108 "rwkv_wkv7(r, w, k, v, a, b, s)",
1109
1110 "unary(x)",
1111
1112 "map_custom(x)",
1113 "map_custom(x,y)",
1114 "map_custom(x,y,z)",
1115
1116 "custom(x)",
1117
1118 "cross_entropy_loss(x,y)",
1119 "cross_entropy_loss_back(x,y)",
1120 "adamw(x)",
1121 "sgd(x)",
1122
1123 "glu(x)",
1124};
1125
1126static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
1127
1128static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1129
1130static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
1131 "ABS",
1132 "SGN",
1133 "NEG",
1134 "STEP",
1135 "TANH",
1136 "ELU",
1137 "RELU",
1138 "SIGMOID",
1139 "GELU",
1140 "GELU_QUICK",
1141 "SILU",
1142 "HARDSWISH",
1143 "HARDSIGMOID",
1144 "EXP",
1145 "GELU_ERF",
1146 "XIELU",
1147 "FLOOR",
1148 "CEIL",
1149 "ROUND",
1150 "TRUNC",
1151};
1152
1153static_assert(GGML_UNARY_OP_COUNT == 20, "GGML_UNARY_OP_COUNT != 20");
1154
1155static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
1156 "REGLU",
1157 "GEGLU",
1158 "SWIGLU",
1159 "SWIGLU_OAI",
1160 "GEGLU_ERF",
1161 "GEGLU_QUICK",
1162};
1163
1164static_assert(GGML_GLU_OP_COUNT == 6, "GGML_GLU_OP_COUNT != 6");
1165
1166
1167static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
1168static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
1169
1170
1171////////////////////////////////////////////////////////////////////////////////
1172
1173void ggml_print_object(const struct ggml_object * obj) {
1174 GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
1175 obj->type, obj->offs, obj->size, (const void *) obj->next);
1176}
1177
1178void ggml_print_objects(const struct ggml_context * ctx) {
1179 struct ggml_object * obj = ctx->objects_begin;
1180
1181 GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx);
1182
1183 while (obj != NULL) {
1184 ggml_print_object(obj);
1185 obj = obj->next;
1186 }
1187
1188 GGML_LOG_INFO("%s: --- end ---\n", __func__);
1189}
1190
1191int64_t ggml_nelements(const struct ggml_tensor * tensor) {
1192 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1193
1194 return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
1195}
1196
1197int64_t ggml_nrows(const struct ggml_tensor * tensor) {
1198 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1199
1200 return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
1201}
1202
1203size_t ggml_nbytes(const struct ggml_tensor * tensor) {
1204 for (int i = 0; i < GGML_MAX_DIMS; ++i) {
1205 if (tensor->ne[i] <= 0) {
1206 return 0;
1207 }
1208 }
1209
1210 size_t nbytes;
1211 const size_t blck_size = ggml_blck_size(type: tensor->type);
1212 if (blck_size == 1) {
1213 nbytes = ggml_type_size(type: tensor->type);
1214 for (int i = 0; i < GGML_MAX_DIMS; ++i) {
1215 nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
1216 }
1217 }
1218 else {
1219 nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
1220 for (int i = 1; i < GGML_MAX_DIMS; ++i) {
1221 nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
1222 }
1223 }
1224
1225 return nbytes;
1226}
1227
1228size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
1229 return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
1230}
1231
1232int64_t ggml_blck_size(enum ggml_type type) {
1233 return type_traits[type].blck_size;
1234}
1235
1236size_t ggml_type_size(enum ggml_type type) {
1237 return type_traits[type].type_size;
1238}
1239
1240size_t ggml_row_size(enum ggml_type type, int64_t ne) {
1241 assert(ne % ggml_blck_size(type) == 0);
1242 return ggml_type_size(type)*ne/ggml_blck_size(type);
1243}
1244
1245double ggml_type_sizef(enum ggml_type type) {
1246 return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
1247}
1248
1249const char * ggml_type_name(enum ggml_type type) {
1250 return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
1251}
1252
1253bool ggml_is_quantized(enum ggml_type type) {
1254 return type_traits[type].is_quantized;
1255}
1256
1257const char * ggml_op_name(enum ggml_op op) {
1258 return GGML_OP_NAME[op];
1259}
1260
1261const char * ggml_op_symbol(enum ggml_op op) {
1262 return GGML_OP_SYMBOL[op];
1263}
1264
1265const char * ggml_unary_op_name(enum ggml_unary_op op) {
1266 return GGML_UNARY_OP_NAME[op];
1267}
1268
1269const char * ggml_glu_op_name(enum ggml_glu_op op) {
1270 return GGML_GLU_OP_NAME[op];
1271}
1272
1273const char * ggml_op_desc(const struct ggml_tensor * t) {
1274 if (t->op == GGML_OP_UNARY) {
1275 enum ggml_unary_op uop = ggml_get_unary_op(tensor: t);
1276 return ggml_unary_op_name(op: uop);
1277 }
1278 if (t->op == GGML_OP_GLU) {
1279 enum ggml_glu_op gop = ggml_get_glu_op(tensor: t);
1280 return ggml_glu_op_name(op: gop);
1281 }
1282 return ggml_op_name(op: t->op);
1283}
1284
1285size_t ggml_element_size(const struct ggml_tensor * tensor) {
1286 return ggml_type_size(type: tensor->type);
1287}
1288
1289bool ggml_is_scalar(const struct ggml_tensor * tensor) {
1290 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1291
1292 return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
1293}
1294
1295bool ggml_is_vector(const struct ggml_tensor * tensor) {
1296 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1297
1298 return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
1299}
1300
1301bool ggml_is_matrix(const struct ggml_tensor * tensor) {
1302 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1303
1304 return tensor->ne[2] == 1 && tensor->ne[3] == 1;
1305}
1306
1307bool ggml_is_3d(const struct ggml_tensor * tensor) {
1308 return tensor->ne[3] == 1;
1309}
1310
1311int ggml_n_dims(const struct ggml_tensor * tensor) {
1312 for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
1313 if (tensor->ne[i] > 1) {
1314 return i + 1;
1315 }
1316 }
1317 return 1;
1318}
1319
1320enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
1321 enum ggml_type wtype = GGML_TYPE_COUNT;
1322
1323 switch (ftype) {
1324 case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
1325 case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
1326 case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break;
1327 case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
1328 case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
1329 case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
1330 case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
1331 case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
1332 case GGML_FTYPE_MOSTLY_MXFP4: wtype = GGML_TYPE_MXFP4; break;
1333 case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
1334 case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
1335 case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
1336 case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
1337 case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
1338 case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
1339 case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
1340 case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
1341 case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
1342 case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
1343 case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
1344 case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
1345 case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
1346 case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
1347 case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
1348 case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
1349 }
1350
1351 GGML_ASSERT(wtype != GGML_TYPE_COUNT);
1352
1353 return wtype;
1354}
1355
1356size_t ggml_tensor_overhead(void) {
1357 return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
1358}
1359
1360bool ggml_is_transposed(const struct ggml_tensor * tensor) {
1361 return tensor->nb[0] > tensor->nb[1];
1362}
1363
1364static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
1365 size_t next_nb = ggml_type_size(type: tensor->type);
1366 if (tensor->ne[0] != ggml_blck_size(type: tensor->type) && tensor->nb[0] != next_nb) {
1367 return false;
1368 }
1369 next_nb *= tensor->ne[0]/ggml_blck_size(type: tensor->type);
1370 for (int i = 1; i < GGML_MAX_DIMS; i++) {
1371 if (tensor->ne[i] != 1) {
1372 if (i > n) {
1373 if (tensor->nb[i] != next_nb) {
1374 return false;
1375 }
1376 next_nb *= tensor->ne[i];
1377 } else {
1378 // this dimension does not need to be contiguous
1379 next_nb = tensor->ne[i]*tensor->nb[i];
1380 }
1381 }
1382 }
1383 return true;
1384}
1385
1386bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
1387 return ggml_is_contiguous_0(tensor);
1388}
1389
1390bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
1391 return ggml_is_contiguous_n(tensor, n: 0);
1392}
1393
1394bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
1395 return ggml_is_contiguous_n(tensor, n: 1);
1396}
1397
1398bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
1399 return ggml_is_contiguous_n(tensor, n: 2);
1400}
1401
1402bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) {
1403 return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(type: tensor->type)/ggml_blck_size(type: tensor->type);
1404}
1405
1406bool ggml_is_permuted(const struct ggml_tensor * tensor) {
1407 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1408
1409 return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
1410}
1411
1412bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
1413 return
1414 tensor->nb[0] > tensor->nb[2] &&
1415 tensor->nb[1] > tensor->nb[0] &&
1416 tensor->nb[2] == ggml_type_size(type: tensor->type);
1417}
1418
1419bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor) {
1420 return
1421 tensor->ne[0] == ggml_blck_size(type: tensor->type) ||
1422 tensor->nb[0] == ggml_type_size(type: tensor->type);
1423}
1424
1425static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
1426 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1427
1428 return
1429 tensor->nb[0] == ggml_type_size(type: tensor->type) &&
1430 tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
1431 tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
1432}
1433
1434bool ggml_is_empty(const struct ggml_tensor * tensor) {
1435 for (int i = 0; i < GGML_MAX_DIMS; ++i) {
1436 if (tensor->ne[i] == 0) {
1437 // empty if any dimension has no elements
1438 return true;
1439 }
1440 }
1441 return false;
1442}
1443
1444bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1445 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1446
1447 return
1448 (t0->ne[0] == t1->ne[0]) &&
1449 (t0->ne[1] == t1->ne[1]) &&
1450 (t0->ne[2] == t1->ne[2]) &&
1451 (t0->ne[3] == t1->ne[3]);
1452}
1453
1454bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1455 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1456
1457 return
1458 (t0->nb[0] == t1->nb[0]) &&
1459 (t0->nb[1] == t1->nb[1]) &&
1460 (t0->nb[2] == t1->nb[2]) &&
1461 (t0->nb[3] == t1->nb[3]);
1462}
1463
1464// check if t1 can be represented as a repetition of t0
1465bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1466 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1467
1468 return ggml_is_empty(tensor: t0) ? ggml_is_empty(tensor: t1) :
1469 (t1->ne[0]%t0->ne[0] == 0) &&
1470 (t1->ne[1]%t0->ne[1] == 0) &&
1471 (t1->ne[2]%t0->ne[2] == 0) &&
1472 (t1->ne[3]%t0->ne[3] == 0);
1473}
1474
1475static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1476 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1477
1478 return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
1479}
1480
1481// assert that pointer is aligned to GGML_MEM_ALIGN
1482#define GGML_ASSERT_ALIGNED(ptr) \
1483 GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
1484
1485////////////////////////////////////////////////////////////////////////////////
1486
1487struct ggml_context * ggml_init(struct ggml_init_params params) {
1488 static bool is_first_call = true;
1489
1490 ggml_critical_section_start();
1491
1492 if (is_first_call) {
1493 // initialize time system (required on Windows)
1494 ggml_time_init();
1495
1496 is_first_call = false;
1497 }
1498
1499 ggml_critical_section_end();
1500
1501 struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));
1502
1503 // allow to call ggml_init with 0 size
1504 if (params.mem_size == 0) {
1505 params.mem_size = GGML_MEM_ALIGN;
1506 }
1507
1508 const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
1509
1510 *ctx = (struct ggml_context) {
1511 /*.mem_size =*/ mem_size,
1512 /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(size: mem_size),
1513 /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
1514 /*.no_alloc =*/ params.no_alloc,
1515 /*.n_objects =*/ 0,
1516 /*.objects_begin =*/ NULL,
1517 /*.objects_end =*/ NULL,
1518 };
1519
1520 GGML_ASSERT(ctx->mem_buffer != NULL);
1521
1522 GGML_ASSERT_ALIGNED(ctx->mem_buffer);
1523
1524 GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
1525
1526 return ctx;
1527}
1528
1529void ggml_reset(struct ggml_context * ctx) {
1530 if (ctx == NULL) {
1531 return;
1532 }
1533
1534 ctx->n_objects = 0;
1535 ctx->objects_begin = NULL;
1536 ctx->objects_end = NULL;
1537}
1538
1539void ggml_free(struct ggml_context * ctx) {
1540 if (ctx == NULL) {
1541 return;
1542 }
1543
1544 if (ctx->mem_buffer_owned) {
1545 ggml_aligned_free(ptr: ctx->mem_buffer, size: ctx->mem_size);
1546 }
1547
1548 GGML_FREE(ctx);
1549}
1550
1551size_t ggml_used_mem(const struct ggml_context * ctx) {
1552 return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
1553}
1554
1555bool ggml_get_no_alloc(struct ggml_context * ctx) {
1556 return ctx->no_alloc;
1557}
1558
1559void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
1560 ctx->no_alloc = no_alloc;
1561}
1562
1563void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
1564 return ctx->mem_buffer;
1565}
1566
1567size_t ggml_get_mem_size(const struct ggml_context * ctx) {
1568 return ctx->mem_size;
1569}
1570
1571size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
1572 size_t max_size = 0;
1573
1574 for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
1575 size_t bytes = ggml_nbytes(tensor);
1576 max_size = MAX(max_size, bytes);
1577 }
1578
1579 return max_size;
1580}
1581
1582////////////////////////////////////////////////////////////////////////////////
1583
1584static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
1585 // always insert objects at the end of the context's memory pool
1586 struct ggml_object * obj_cur = ctx->objects_end;
1587
1588 const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
1589 const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
1590 const size_t cur_end = cur_offs + cur_size;
1591
1592 // align to GGML_MEM_ALIGN
1593 size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
1594
1595 char * const mem_buffer = ctx->mem_buffer;
1596 struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
1597
1598 if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
1599 GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
1600 __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
1601#ifndef NDEBUG
1602 GGML_ABORT("not enough space in the context's memory pool");
1603#endif
1604 return NULL;
1605 }
1606
1607 *obj_new = (struct ggml_object) {
1608 .offs = cur_end + GGML_OBJECT_SIZE,
1609 .size = size_needed,
1610 .next = NULL,
1611 .type = type,
1612 };
1613
1614 GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs);
1615
1616 if (obj_cur != NULL) {
1617 obj_cur->next = obj_new;
1618 } else {
1619 // this is the first object in this context
1620 ctx->objects_begin = obj_new;
1621 }
1622
1623 ctx->objects_end = obj_new;
1624
1625 //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
1626
1627 return obj_new;
1628}
1629
1630static struct ggml_tensor * ggml_new_tensor_impl(
1631 struct ggml_context * ctx,
1632 enum ggml_type type,
1633 int n_dims,
1634 const int64_t * ne,
1635 struct ggml_tensor * view_src,
1636 size_t view_offs) {
1637
1638 GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
1639 GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
1640
1641 // find the base tensor and absolute offset
1642 if (view_src != NULL && view_src->view_src != NULL) {
1643 view_offs += view_src->view_offs;
1644 view_src = view_src->view_src;
1645 }
1646
1647 size_t data_size = ggml_row_size(type, ne: ne[0]);
1648 for (int i = 1; i < n_dims; i++) {
1649 data_size *= ne[i];
1650 }
1651
1652 GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
1653
1654 void * data = view_src != NULL ? view_src->data : NULL;
1655 if (data != NULL) {
1656 data = (char *) data + view_offs;
1657 }
1658
1659 size_t obj_alloc_size = 0;
1660
1661 if (view_src == NULL && !ctx->no_alloc) {
1662 // allocate tensor data in the context's memory pool
1663 obj_alloc_size = data_size;
1664 }
1665
1666 struct ggml_object * const obj_new = ggml_new_object(ctx, type: GGML_OBJECT_TYPE_TENSOR, size: GGML_TENSOR_SIZE + obj_alloc_size);
1667 GGML_ASSERT(obj_new);
1668
1669 struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
1670
1671 *result = (struct ggml_tensor) {
1672 /*.type =*/ type,
1673 /*.buffer =*/ NULL,
1674 /*.ne =*/ { 1, 1, 1, 1 },
1675 /*.nb =*/ { 0, 0, 0, 0 },
1676 /*.op =*/ GGML_OP_NONE,
1677 /*.op_params =*/ { 0 },
1678 /*.flags =*/ 0,
1679 /*.src =*/ { NULL },
1680 /*.view_src =*/ view_src,
1681 /*.view_offs =*/ view_offs,
1682 /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
1683 /*.name =*/ { 0 },
1684 /*.extra =*/ NULL,
1685 /*.padding =*/ { 0 },
1686 };
1687
1688 // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
1689 //GGML_ASSERT_ALIGNED(result->data);
1690
1691 for (int i = 0; i < n_dims; i++) {
1692 result->ne[i] = ne[i];
1693 }
1694
1695 result->nb[0] = ggml_type_size(type);
1696 result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
1697 for (int i = 2; i < GGML_MAX_DIMS; i++) {
1698 result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
1699 }
1700
1701 ctx->n_objects++;
1702
1703 return result;
1704}
1705
1706struct ggml_tensor * ggml_new_tensor(
1707 struct ggml_context * ctx,
1708 enum ggml_type type,
1709 int n_dims,
1710 const int64_t * ne) {
1711 return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, view_offs: 0);
1712}
1713
1714struct ggml_tensor * ggml_new_tensor_1d(
1715 struct ggml_context * ctx,
1716 enum ggml_type type,
1717 int64_t ne0) {
1718 return ggml_new_tensor(ctx, type, n_dims: 1, ne: &ne0);
1719}
1720
1721struct ggml_tensor * ggml_new_tensor_2d(
1722 struct ggml_context * ctx,
1723 enum ggml_type type,
1724 int64_t ne0,
1725 int64_t ne1) {
1726 const int64_t ne[2] = { ne0, ne1 };
1727 return ggml_new_tensor(ctx, type, n_dims: 2, ne);
1728}
1729
1730struct ggml_tensor * ggml_new_tensor_3d(
1731 struct ggml_context * ctx,
1732 enum ggml_type type,
1733 int64_t ne0,
1734 int64_t ne1,
1735 int64_t ne2) {
1736 const int64_t ne[3] = { ne0, ne1, ne2 };
1737 return ggml_new_tensor(ctx, type, n_dims: 3, ne);
1738}
1739
1740struct ggml_tensor * ggml_new_tensor_4d(
1741 struct ggml_context * ctx,
1742 enum ggml_type type,
1743 int64_t ne0,
1744 int64_t ne1,
1745 int64_t ne2,
1746 int64_t ne3) {
1747 const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
1748 return ggml_new_tensor(ctx, type, n_dims: 4, ne);
1749}
1750
1751void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
1752 struct ggml_object * obj = ggml_new_object(ctx, type: GGML_OBJECT_TYPE_WORK_BUFFER, size: nbytes);
1753
1754 return (uint8_t *)ctx->mem_buffer + obj->offs;
1755}
1756
1757struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
1758 return ggml_new_tensor(ctx, type: src->type, GGML_MAX_DIMS, ne: src->ne);
1759}
1760
1761void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) {
1762 const int64_t ne2 = tensor->ne[2];
1763 const int64_t ne1 = tensor->ne[1];
1764 const int64_t ne0 = tensor->ne[0];
1765
1766 const int64_t i3_ = (i/(ne2*ne1*ne0));
1767 const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0);
1768 const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0;
1769 const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0);
1770
1771 if (i0) {
1772 * i0 = i0_;
1773 }
1774 if (i1) {
1775 * i1 = i1_;
1776 }
1777 if (i2) {
1778 * i2 = i2_;
1779 }
1780 if (i3) {
1781 * i3 = i3_;
1782 }
1783}
1784
1785void * ggml_get_data(const struct ggml_tensor * tensor) {
1786 return tensor->data;
1787}
1788
1789float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
1790 assert(tensor->type == GGML_TYPE_F32);
1791 return (float *)(tensor->data);
1792}
1793
1794enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
1795 GGML_ASSERT(tensor->op == GGML_OP_UNARY);
1796 return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, i: 0);
1797}
1798
1799enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor) {
1800 GGML_ASSERT(tensor->op == GGML_OP_GLU);
1801 return (enum ggml_glu_op) ggml_get_op_params_i32(tensor, i: 0);
1802}
1803
1804const char * ggml_get_name(const struct ggml_tensor * tensor) {
1805 return tensor->name;
1806}
1807
1808struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
1809 size_t i;
1810 for (i = 0; i < sizeof(tensor->name) - 1 && name[i] != '\0'; i++) {
1811 tensor->name[i] = name[i];
1812 }
1813 tensor->name[i] = '\0';
1814 return tensor;
1815}
1816
1817struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
1818 va_list args;
1819 va_start(args, fmt);
1820 vsnprintf(s: tensor->name, maxlen: sizeof(tensor->name), format: fmt, arg: args);
1821 va_end(args);
1822 return tensor;
1823}
1824
1825struct ggml_tensor * ggml_view_tensor(
1826 struct ggml_context * ctx,
1827 struct ggml_tensor * src) {
1828 struct ggml_tensor * result = ggml_new_tensor_impl(ctx, type: src->type, GGML_MAX_DIMS, ne: src->ne, view_src: src, view_offs: 0);
1829 ggml_format_name(tensor: result, fmt: "%s (view)", src->name);
1830
1831 for (int i = 0; i < GGML_MAX_DIMS; i++) {
1832 result->nb[i] = src->nb[i];
1833 }
1834
1835 return result;
1836}
1837
1838struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
1839 struct ggml_object * obj = ctx->objects_begin;
1840
1841 char * const mem_buffer = ctx->mem_buffer;
1842
1843 while (obj != NULL) {
1844 if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
1845 return (struct ggml_tensor *)(mem_buffer + obj->offs);
1846 }
1847
1848 obj = obj->next;
1849 }
1850
1851 return NULL;
1852}
1853
1854struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
1855 struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
1856 obj = obj->next;
1857
1858 char * const mem_buffer = ctx->mem_buffer;
1859
1860 while (obj != NULL) {
1861 if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
1862 return (struct ggml_tensor *)(mem_buffer + obj->offs);
1863 }
1864
1865 obj = obj->next;
1866 }
1867
1868 return NULL;
1869}
1870
1871struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
1872 struct ggml_object * obj = ctx->objects_begin;
1873
1874 char * const mem_buffer = ctx->mem_buffer;
1875
1876 while (obj != NULL) {
1877 if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
1878 struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
1879 if (strcmp(s1: cur->name, s2: name) == 0) {
1880 return cur;
1881 }
1882 }
1883
1884 obj = obj->next;
1885 }
1886
1887 return NULL;
1888}
1889
1890////////////////////////////////////////////////////////////////////////////////
1891
1892// ggml_dup
1893
1894static struct ggml_tensor * ggml_dup_impl(
1895 struct ggml_context * ctx,
1896 struct ggml_tensor * a,
1897 bool inplace) {
1898 struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, src: a) : ggml_dup_tensor(ctx, src: a);
1899
1900 result->op = GGML_OP_DUP;
1901 result->src[0] = a;
1902
1903 return result;
1904}
1905
1906struct ggml_tensor * ggml_dup(
1907 struct ggml_context * ctx,
1908 struct ggml_tensor * a) {
1909 return ggml_dup_impl(ctx, a, false);
1910}
1911
1912struct ggml_tensor * ggml_dup_inplace(
1913 struct ggml_context * ctx,
1914 struct ggml_tensor * a) {
1915 return ggml_dup_impl(ctx, a, true);
1916}
1917
1918// ggml_add
1919
1920static struct ggml_tensor * ggml_add_impl(
1921 struct ggml_context * ctx,
1922 struct ggml_tensor * a,
1923 struct ggml_tensor * b,
1924 bool inplace) {
1925 GGML_ASSERT(ggml_can_repeat(b, a));
1926
1927 struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, src: a) : ggml_dup_tensor(ctx, src: a);
1928
1929 result->op = GGML_OP_ADD;
1930 result->src[0] = a;
1931 result->src[1] = b;
1932
1933 return result;
1934}
1935
1936struct ggml_tensor * ggml_add(
1937 struct ggml_context * ctx,
1938 struct ggml_tensor * a,
1939 struct ggml_tensor * b) {
1940 return ggml_add_impl(ctx, a, b, false);
1941}
1942
1943struct ggml_tensor * ggml_add_inplace(
1944 struct ggml_context * ctx,
1945 struct ggml_tensor * a,
1946 struct ggml_tensor * b) {
1947 return ggml_add_impl(ctx, a, b, true);
1948}
1949
1950// ggml_add_cast
1951
1952static struct ggml_tensor * ggml_add_cast_impl(
1953 struct ggml_context * ctx,
1954 struct ggml_tensor * a,
1955 struct ggml_tensor * b,
1956 enum ggml_type type) {
1957 // TODO: support less-strict constraint
1958 // GGML_ASSERT(ggml_can_repeat(b, a));
1959 GGML_ASSERT(ggml_can_repeat_rows(b, a));
1960
1961 // currently only supported for quantized input and f16
1962 GGML_ASSERT(ggml_is_quantized(a->type) ||
1963 a->type == GGML_TYPE_F16 ||
1964 a->type == GGML_TYPE_BF16);
1965
1966 struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, ne: a->ne);
1967
1968 result->op = GGML_OP_ADD;
1969 result->src[0] = a;
1970 result->src[1] = b;
1971
1972 return result;
1973}
1974
1975struct ggml_tensor * ggml_add_cast(
1976 struct ggml_context * ctx,
1977 struct ggml_tensor * a,
1978 struct ggml_tensor * b,
1979 enum ggml_type type) {
1980 return ggml_add_cast_impl(ctx, a, b, type);
1981}
1982
1983struct ggml_tensor * ggml_add_id(
1984 struct ggml_context * ctx,
1985 struct ggml_tensor * a,
1986 struct ggml_tensor * b,
1987 struct ggml_tensor * ids) {
1988
1989 GGML_ASSERT(a->ne[0] == b->ne[0]);
1990 GGML_ASSERT(a->ne[1] == ids->ne[0]);
1991 GGML_ASSERT(a->ne[2] == ids->ne[1]);
1992 GGML_ASSERT(ids->type == GGML_TYPE_I32);
1993
1994 struct ggml_tensor * result = ggml_dup_tensor(ctx, src: a);
1995
1996 result->op = GGML_OP_ADD_ID;
1997 result->src[0] = a;
1998 result->src[1] = b;
1999 result->src[2] = ids;
2000
2001 return result;
2002}
2003
2004// ggml_add1
2005
2006static struct ggml_tensor * ggml_add1_impl(
2007 struct ggml_context * ctx,
2008 struct ggml_tensor * a,
2009 struct ggml_tensor * b,
2010 bool inplace) {
2011 GGML_ASSERT(ggml_is_scalar(b));
2012 GGML_ASSERT(ggml_is_padded_1d(a));
2013
2014 struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, src: a) : ggml_dup_tensor(ctx, src: a);
2015
2016 result->op = GGML_OP_ADD1;
2017 result->src[0] = a;
2018 result->src[1] = b;
2019
2020 return result;
2021}
2022
2023struct ggml_tensor * ggml_add1(
2024 struct ggml_context * ctx,
2025 struct ggml_tensor * a,
2026 struct ggml_tensor * b) {
2027 return ggml_add1_impl(ctx, a, b, false);
2028}
2029
2030struct ggml_tensor * ggml_add1_inplace(
2031 struct ggml_context * ctx,
2032 struct ggml_tensor * a,
2033 struct ggml_tensor * b) {
2034 return ggml_add1_impl(ctx, a, b, true);
2035}
2036
2037// ggml_acc
2038
2039static struct ggml_tensor * ggml_acc_impl(
2040 struct ggml_context * ctx,
2041 struct ggml_tensor * a,
2042 struct ggml_tensor * b,
2043 size_t nb1,
2044 size_t nb2,
2045 size_t nb3,
2046 size_t offset,
2047 bool inplace) {
2048 GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a));
2049 GGML_ASSERT(ggml_is_contiguous(a));
2050 GGML_ASSERT(a->type == GGML_TYPE_F32);
2051 GGML_ASSERT(b->type == GGML_TYPE_F32);
2052
2053    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2054
2055 int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
2056    ggml_set_op_params(result, params, sizeof(params));
2057
2058 result->op = GGML_OP_ACC;
2059 result->src[0] = a;
2060 result->src[1] = b;
2061
2062 return result;
2063}
2064
2065struct ggml_tensor * ggml_acc(
2066 struct ggml_context * ctx,
2067 struct ggml_tensor * a,
2068 struct ggml_tensor * b,
2069 size_t nb1,
2070 size_t nb2,
2071 size_t nb3,
2072 size_t offset) {
2073 return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
2074}
2075
2076struct ggml_tensor * ggml_acc_inplace(
2077 struct ggml_context * ctx,
2078 struct ggml_tensor * a,
2079 struct ggml_tensor * b,
2080 size_t nb1,
2081 size_t nb2,
2082 size_t nb3,
2083 size_t offset) {
2084 return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
2085}
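
// Illustrative sketch (hypothetical names): ggml_acc adds all of b into a strided
// view of a described by (nb1, nb2, nb3, offset); strides and offset are in bytes.
//
//     // add `patch` (4 floats) on top of elements [2..5] of a 16-element F32 tensor `buf`:
//     struct ggml_tensor * out = ggml_acc(ctx0, buf, patch,
//             buf->nb[1], buf->nb[2], buf->nb[3], 2*sizeof(float));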
2086
2087// ggml_sub
2088
2089static struct ggml_tensor * ggml_sub_impl(
2090 struct ggml_context * ctx,
2091 struct ggml_tensor * a,
2092 struct ggml_tensor * b,
2093 bool inplace) {
2094 GGML_ASSERT(ggml_can_repeat(b, a));
2095
2096    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2097
2098 result->op = GGML_OP_SUB;
2099 result->src[0] = a;
2100 result->src[1] = b;
2101
2102 return result;
2103}
2104
2105struct ggml_tensor * ggml_sub(
2106 struct ggml_context * ctx,
2107 struct ggml_tensor * a,
2108 struct ggml_tensor * b) {
2109 return ggml_sub_impl(ctx, a, b, false);
2110}
2111
2112struct ggml_tensor * ggml_sub_inplace(
2113 struct ggml_context * ctx,
2114 struct ggml_tensor * a,
2115 struct ggml_tensor * b) {
2116 return ggml_sub_impl(ctx, a, b, true);
2117}
2118
2119// ggml_mul
2120
2121static struct ggml_tensor * ggml_mul_impl(
2122 struct ggml_context * ctx,
2123 struct ggml_tensor * a,
2124 struct ggml_tensor * b,
2125 bool inplace) {
2126 GGML_ASSERT(ggml_can_repeat(b, a));
2127
2128    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2129
2130 result->op = GGML_OP_MUL;
2131 result->src[0] = a;
2132 result->src[1] = b;
2133
2134 return result;
2135}
2136
2137struct ggml_tensor * ggml_mul(
2138 struct ggml_context * ctx,
2139 struct ggml_tensor * a,
2140 struct ggml_tensor * b) {
2141 return ggml_mul_impl(ctx, a, b, false);
2142}
2143
2144struct ggml_tensor * ggml_mul_inplace(
2145 struct ggml_context * ctx,
2146 struct ggml_tensor * a,
2147 struct ggml_tensor * b) {
2148 return ggml_mul_impl(ctx, a, b, true);
2149}
2150
2151// ggml_div
2152
2153static struct ggml_tensor * ggml_div_impl(
2154 struct ggml_context * ctx,
2155 struct ggml_tensor * a,
2156 struct ggml_tensor * b,
2157 bool inplace) {
2158 GGML_ASSERT(ggml_can_repeat(b, a));
2159
2160    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2161
2162 result->op = GGML_OP_DIV;
2163 result->src[0] = a;
2164 result->src[1] = b;
2165
2166 return result;
2167}
2168
2169struct ggml_tensor * ggml_div(
2170 struct ggml_context * ctx,
2171 struct ggml_tensor * a,
2172 struct ggml_tensor * b) {
2173 return ggml_div_impl(ctx, a, b, false);
2174}
2175
2176struct ggml_tensor * ggml_div_inplace(
2177 struct ggml_context * ctx,
2178 struct ggml_tensor * a,
2179 struct ggml_tensor * b) {
2180 return ggml_div_impl(ctx, a, b, true);
2181}
2182
2183// ggml_sqr
2184
2185static struct ggml_tensor * ggml_sqr_impl(
2186 struct ggml_context * ctx,
2187 struct ggml_tensor * a,
2188 bool inplace) {
2189    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2190
2191 result->op = GGML_OP_SQR;
2192 result->src[0] = a;
2193
2194 return result;
2195}
2196
2197struct ggml_tensor * ggml_sqr(
2198 struct ggml_context * ctx,
2199 struct ggml_tensor * a) {
2200 return ggml_sqr_impl(ctx, a, false);
2201}
2202
2203struct ggml_tensor * ggml_sqr_inplace(
2204 struct ggml_context * ctx,
2205 struct ggml_tensor * a) {
2206 return ggml_sqr_impl(ctx, a, true);
2207}
2208
2209// ggml_sqrt
2210
2211static struct ggml_tensor * ggml_sqrt_impl(
2212 struct ggml_context * ctx,
2213 struct ggml_tensor * a,
2214 bool inplace) {
2215    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2216
2217 result->op = GGML_OP_SQRT;
2218 result->src[0] = a;
2219
2220 return result;
2221}
2222
2223struct ggml_tensor * ggml_sqrt(
2224 struct ggml_context * ctx,
2225 struct ggml_tensor * a) {
2226 return ggml_sqrt_impl(ctx, a, false);
2227}
2228
2229struct ggml_tensor * ggml_sqrt_inplace(
2230 struct ggml_context * ctx,
2231 struct ggml_tensor * a) {
2232 return ggml_sqrt_impl(ctx, a, true);
2233}
2234
2235// ggml_log
2236
2237static struct ggml_tensor * ggml_log_impl(
2238 struct ggml_context * ctx,
2239 struct ggml_tensor * a,
2240 bool inplace) {
2241    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2242
2243 result->op = GGML_OP_LOG;
2244 result->src[0] = a;
2245
2246 return result;
2247}
2248
2249struct ggml_tensor * ggml_log(
2250 struct ggml_context * ctx,
2251 struct ggml_tensor * a) {
2252 return ggml_log_impl(ctx, a, false);
2253}
2254
2255struct ggml_tensor * ggml_log_inplace(
2256 struct ggml_context * ctx,
2257 struct ggml_tensor * a) {
2258 return ggml_log_impl(ctx, a, true);
2259}
2260
2261// ggml_sin
2262
2263static struct ggml_tensor * ggml_sin_impl(
2264 struct ggml_context * ctx,
2265 struct ggml_tensor * a,
2266 bool inplace) {
2267    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2268
2269 result->op = GGML_OP_SIN;
2270 result->src[0] = a;
2271
2272 return result;
2273}
2274
2275struct ggml_tensor * ggml_sin(
2276 struct ggml_context * ctx,
2277 struct ggml_tensor * a) {
2278 return ggml_sin_impl(ctx, a, false);
2279}
2280
2281struct ggml_tensor * ggml_sin_inplace(
2282 struct ggml_context * ctx,
2283 struct ggml_tensor * a) {
2284 return ggml_sin_impl(ctx, a, true);
2285}
2286
2287// ggml_cos
2288
2289static struct ggml_tensor * ggml_cos_impl(
2290 struct ggml_context * ctx,
2291 struct ggml_tensor * a,
2292 bool inplace) {
2293    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2294
2295 result->op = GGML_OP_COS;
2296 result->src[0] = a;
2297
2298 return result;
2299}
2300
2301struct ggml_tensor * ggml_cos(
2302 struct ggml_context * ctx,
2303 struct ggml_tensor * a) {
2304 return ggml_cos_impl(ctx, a, false);
2305}
2306
2307struct ggml_tensor * ggml_cos_inplace(
2308 struct ggml_context * ctx,
2309 struct ggml_tensor * a) {
2310 return ggml_cos_impl(ctx, a, true);
2311}
2312
2313// ggml_sum
2314
2315struct ggml_tensor * ggml_sum(
2316 struct ggml_context * ctx,
2317 struct ggml_tensor * a) {
2318    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
2319
2320 result->op = GGML_OP_SUM;
2321 result->src[0] = a;
2322
2323 return result;
2324}
2325
2326// ggml_sum_rows
2327
2328struct ggml_tensor * ggml_sum_rows(
2329 struct ggml_context * ctx,
2330 struct ggml_tensor * a) {
2331 int64_t ne[GGML_MAX_DIMS] = { 1 };
2332 for (int i = 1; i < GGML_MAX_DIMS; ++i) {
2333 ne[i] = a->ne[i];
2334 }
2335
2336    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
2337
2338 result->op = GGML_OP_SUM_ROWS;
2339 result->src[0] = a;
2340
2341 return result;
2342}
2343
2344// ggml_mean
2345
2346struct ggml_tensor * ggml_mean(
2347 struct ggml_context * ctx,
2348 struct ggml_tensor * a) {
2349 int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
2350    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
2351
2352 result->op = GGML_OP_MEAN;
2353 result->src[0] = a;
2354
2355 return result;
2356}
2357
2358// ggml_argmax
2359
2360struct ggml_tensor * ggml_argmax(
2361 struct ggml_context * ctx,
2362 struct ggml_tensor * a) {
2363 GGML_ASSERT(ggml_is_matrix(a));
2364 GGML_ASSERT(a->ne[0] <= INT32_MAX);
2365
2366    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
2367
2368 result->op = GGML_OP_ARGMAX;
2369 result->src[0] = a;
2370
2371 return result;
2372}
2373
2374// ggml_count_equal
2375
2376struct ggml_tensor * ggml_count_equal(
2377 struct ggml_context * ctx,
2378 struct ggml_tensor * a,
2379 struct ggml_tensor * b) {
2380 GGML_ASSERT(ggml_are_same_shape(a, b));
2381
2382    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 1);
2383
2384 result->op = GGML_OP_COUNT_EQUAL;
2385 result->src[0] = a;
2386 result->src[1] = b;
2387
2388 return result;
2389}
2390
2391// ggml_repeat
2392
2393struct ggml_tensor * ggml_repeat(
2394 struct ggml_context * ctx,
2395 struct ggml_tensor * a,
2396 struct ggml_tensor * b) {
2397 GGML_ASSERT(ggml_can_repeat(a, b));
2398
2399    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
2400
2401 result->op = GGML_OP_REPEAT;
2402 result->src[0] = a;
2403
2404 return result;
2405}
2406
2407struct ggml_tensor * ggml_repeat_4d(
2408 struct ggml_context * ctx,
2409 struct ggml_tensor * a,
2410 int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
2411    const bool can_repeat = ggml_is_empty(a) || (
2412 (ne0 % a->ne[0] == 0) &&
2413 (ne1 % a->ne[1] == 0) &&
2414 (ne2 % a->ne[2] == 0) &&
2415 (ne3 % a->ne[3] == 0)
2416 );
2417 GGML_ASSERT(can_repeat);
2418
2419    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
2420
2421 result->op = GGML_OP_REPEAT;
2422 result->src[0] = a;
2423
2424 return result;
2425}
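
// Illustrative sketch (hypothetical names): ggml_repeat tiles a until it has the
// shape of b, while ggml_repeat_4d tiles to an explicit target shape.
//
//     struct ggml_tensor * row  = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 4, 1);
//     struct ggml_tensor * tile = ggml_repeat_4d(ctx0, row, 4, 8, 1, 1); // row repeated over 8 rows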
2426
2427// ggml_repeat_back
2428
2429struct ggml_tensor * ggml_repeat_back(
2430 struct ggml_context * ctx,
2431 struct ggml_tensor * a,
2432 struct ggml_tensor * b) {
2433 GGML_ASSERT(ggml_can_repeat(b, a));
2434
2435    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
2436
2437 result->op = GGML_OP_REPEAT_BACK;
2438 result->src[0] = a;
2439
2440 return result;
2441}
2442
2443// ggml_concat
2444
2445struct ggml_tensor * ggml_concat(
2446 struct ggml_context * ctx,
2447 struct ggml_tensor * a,
2448 struct ggml_tensor * b,
2449 int dim) {
2450 GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
2451 GGML_ASSERT(a->type == b->type);
2452
2453 int64_t ne[GGML_MAX_DIMS];
2454 for (int d = 0; d < GGML_MAX_DIMS; ++d) {
2455 if (d == dim) {
2456 ne[d] = a->ne[d] + b->ne[d];
2457 continue;
2458 }
2459 GGML_ASSERT(a->ne[d] == b->ne[d]);
2460 ne[d] = a->ne[d];
2461 }
2462
2463    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
2464
2465    ggml_set_op_params_i32(result, 0, dim);
2466
2467 result->op = GGML_OP_CONCAT;
2468 result->src[0] = a;
2469 result->src[1] = b;
2470
2471 return result;
2472}
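
// Illustrative sketch (hypothetical names): concatenation along dim 1 of two
// tensors that agree on every other dimension.
//
//     struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 8, 3);
//     struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 8, 5);
//     struct ggml_tensor * c = ggml_concat(ctx0, a, b, 1); // c is [8, 8]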
2473
2474// ggml_abs
2475
2476struct ggml_tensor * ggml_abs(
2477 struct ggml_context * ctx,
2478 struct ggml_tensor * a) {
2479    return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
2480}
2481
2482struct ggml_tensor * ggml_abs_inplace(
2483 struct ggml_context * ctx,
2484 struct ggml_tensor * a) {
2485    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
2486}
2487
2488// ggml_sgn
2489
2490struct ggml_tensor * ggml_sgn(
2491 struct ggml_context * ctx,
2492 struct ggml_tensor * a) {
2493    return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
2494}
2495
2496struct ggml_tensor * ggml_sgn_inplace(
2497 struct ggml_context * ctx,
2498 struct ggml_tensor * a) {
2499    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
2500}
2501
2502// ggml_neg
2503
2504struct ggml_tensor * ggml_neg(
2505 struct ggml_context * ctx,
2506 struct ggml_tensor * a) {
2507    return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
2508}
2509
2510struct ggml_tensor * ggml_neg_inplace(
2511 struct ggml_context * ctx,
2512 struct ggml_tensor * a) {
2513    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
2514}
2515
2516// ggml_step
2517
2518struct ggml_tensor * ggml_step(
2519 struct ggml_context * ctx,
2520 struct ggml_tensor * a) {
2521    return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
2522}
2523
2524struct ggml_tensor * ggml_step_inplace(
2525 struct ggml_context * ctx,
2526 struct ggml_tensor * a) {
2527    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
2528}
2529
2530// ggml_tanh
2531
2532struct ggml_tensor * ggml_tanh(
2533 struct ggml_context * ctx,
2534 struct ggml_tensor * a) {
2535    return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
2536}
2537
2538struct ggml_tensor * ggml_tanh_inplace(
2539 struct ggml_context * ctx,
2540 struct ggml_tensor * a) {
2541    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
2542}
2543
2544// ggml_elu
2545
2546struct ggml_tensor * ggml_elu(
2547 struct ggml_context * ctx,
2548 struct ggml_tensor * a) {
2549    return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
2550}
2551
2552struct ggml_tensor * ggml_elu_inplace(
2553 struct ggml_context * ctx,
2554 struct ggml_tensor * a) {
2555    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
2556}
2557
2558// ggml_relu
2559
2560struct ggml_tensor * ggml_relu(
2561 struct ggml_context * ctx,
2562 struct ggml_tensor * a) {
2563    return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
2564}
2565
2566struct ggml_tensor * ggml_relu_inplace(
2567 struct ggml_context * ctx,
2568 struct ggml_tensor * a) {
2569    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
2570}
2571
2572// ggml_leaky_relu
2573
2574struct ggml_tensor * ggml_leaky_relu(
2575 struct ggml_context * ctx,
2576 struct ggml_tensor * a,
2577 float negative_slope,
2578 bool inplace) {
2579    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2580
2581    ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
2582
2583 result->op = GGML_OP_LEAKY_RELU;
2584 result->src[0] = a;
2585
2586 return result;
2587}
2588
2589// ggml_sigmoid
2590
2591struct ggml_tensor * ggml_sigmoid(
2592 struct ggml_context * ctx,
2593 struct ggml_tensor * a) {
2594    return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
2595}
2596
2597struct ggml_tensor * ggml_sigmoid_inplace(
2598 struct ggml_context * ctx,
2599 struct ggml_tensor * a) {
2600    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
2601}
2602
2603// ggml_gelu
2604
2605struct ggml_tensor * ggml_gelu(
2606 struct ggml_context * ctx,
2607 struct ggml_tensor * a) {
2608    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
2609}
2610
2611struct ggml_tensor * ggml_gelu_inplace(
2612 struct ggml_context * ctx,
2613 struct ggml_tensor * a) {
2614    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
2615}
2616
2617// ggml_gelu_erf
2618
2619struct ggml_tensor * ggml_gelu_erf(
2620 struct ggml_context * ctx,
2621 struct ggml_tensor * a) {
2622    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_ERF);
2623}
2624
2625struct ggml_tensor * ggml_gelu_erf_inplace(
2626 struct ggml_context * ctx,
2627 struct ggml_tensor * a) {
2628    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_ERF);
2629}
2630
2631// ggml_gelu_quick
2632
2633struct ggml_tensor * ggml_gelu_quick(
2634 struct ggml_context * ctx,
2635 struct ggml_tensor * a) {
2636    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
2637}
2638
2639struct ggml_tensor * ggml_gelu_quick_inplace(
2640 struct ggml_context * ctx,
2641 struct ggml_tensor * a) {
2642    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
2643}
2644
2645// ggml_silu
2646
2647struct ggml_tensor * ggml_silu(
2648 struct ggml_context * ctx,
2649 struct ggml_tensor * a) {
2650    return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
2651}
2652
2653struct ggml_tensor * ggml_silu_inplace(
2654 struct ggml_context * ctx,
2655 struct ggml_tensor * a) {
2656    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
2657}
2658
2659// ggml_xielu
2660
2661struct ggml_tensor * ggml_xielu(
2662 struct ggml_context * ctx,
2663 struct ggml_tensor * a,
2664 float alpha_n,
2665 float alpha_p,
2666 float beta,
2667 float eps) {
2668    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
2669
2670    ggml_set_op_params_i32(result, 0, (int32_t) GGML_UNARY_OP_XIELU);
2671    ggml_set_op_params_f32(result, 1, beta + ggml_softplus(alpha_n));
2672    ggml_set_op_params_f32(result, 2, ggml_softplus(alpha_p));
2673    ggml_set_op_params_f32(result, 3, beta);
2674    ggml_set_op_params_f32(result, 4, eps);
2675
2676 result->op = GGML_OP_UNARY;
2677 result->src[0] = a;
2678
2679 return result;
2680}
2681
2682// ggml_silu_back
2683
2684struct ggml_tensor * ggml_silu_back(
2685 struct ggml_context * ctx,
2686 struct ggml_tensor * a,
2687 struct ggml_tensor * b) {
2688    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
2689
2690 result->op = GGML_OP_SILU_BACK;
2691 result->src[0] = a;
2692 result->src[1] = b;
2693
2694 return result;
2695}
2696
2697// ggml_hardswish
2698
2699struct ggml_tensor * ggml_hardswish(
2700 struct ggml_context * ctx,
2701 struct ggml_tensor * a) {
2702    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
2703}
2704
2705// ggml_hardsigmoid
2706
2707struct ggml_tensor * ggml_hardsigmoid(
2708 struct ggml_context * ctx,
2709 struct ggml_tensor * a) {
2710    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
2711}
2712
2713// ggml_exp
2714
2715struct ggml_tensor * ggml_exp(
2716 struct ggml_context * ctx,
2717 struct ggml_tensor * a) {
2718    return ggml_unary(ctx, a, GGML_UNARY_OP_EXP);
2719}
2720
2721struct ggml_tensor * ggml_exp_inplace(
2722 struct ggml_context * ctx,
2723 struct ggml_tensor * a) {
2724    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXP);
2725}
2726
2727// ggml_glu
2728
2729static struct ggml_tensor * ggml_glu_impl(
2730 struct ggml_context * ctx,
2731 struct ggml_tensor * a,
2732 struct ggml_tensor * b,
2733 enum ggml_glu_op op,
2734 bool swapped) {
2735 GGML_ASSERT(ggml_is_contiguous_1(a));
2736
2737 if (b) {
2738 GGML_ASSERT(ggml_is_contiguous_1(b));
2739 GGML_ASSERT(ggml_are_same_shape(a, b));
2740 GGML_ASSERT(a->type == b->type);
2741 }
2742
2743 int64_t ne[GGML_MAX_DIMS] = { a->ne[0] / 2 }; for (int i = 1; i < GGML_MAX_DIMS; i++) ne[i] = a->ne[i];
2744    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b ? a->ne : ne, NULL, 0);
2745
2746    ggml_set_op_params_i32(result, 0, (int32_t) op);
2747    ggml_set_op_params_i32(result, 1, (int32_t) swapped);
2748
2749 result->op = GGML_OP_GLU;
2750 result->src[0] = a;
2751 result->src[1] = b;
2752
2753 return result;
2754}
2755
2756// ggml_floor
2757
2758struct ggml_tensor * ggml_floor(
2759 struct ggml_context * ctx,
2760 struct ggml_tensor * a) {
2761    return ggml_unary(ctx, a, GGML_UNARY_OP_FLOOR);
2762}
2763
2764struct ggml_tensor * ggml_floor_inplace(
2765 struct ggml_context * ctx,
2766 struct ggml_tensor * a) {
2767    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_FLOOR);
2768}
2769
2770// ggml_ceil
2771
2772struct ggml_tensor * ggml_ceil(
2773 struct ggml_context * ctx,
2774 struct ggml_tensor * a) {
2775    return ggml_unary(ctx, a, GGML_UNARY_OP_CEIL);
2776}
2777
2778struct ggml_tensor * ggml_ceil_inplace(
2779 struct ggml_context * ctx,
2780 struct ggml_tensor * a) {
2781    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_CEIL);
2782}
2783
2784// ggml_round
2785
2786struct ggml_tensor * ggml_round(
2787 struct ggml_context * ctx,
2788 struct ggml_tensor * a) {
2789    return ggml_unary(ctx, a, GGML_UNARY_OP_ROUND);
2790}
2791
2792struct ggml_tensor * ggml_round_inplace(
2793 struct ggml_context * ctx,
2794 struct ggml_tensor * a) {
2795    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ROUND);
2796}
2797
2798// ggml_trunc
2799
2800struct ggml_tensor * ggml_trunc(
2801 struct ggml_context * ctx,
2802 struct ggml_tensor * a) {
2803    return ggml_unary(ctx, a, GGML_UNARY_OP_TRUNC);
2804}
2805
2806struct ggml_tensor * ggml_trunc_inplace(
2807 struct ggml_context * ctx,
2808 struct ggml_tensor * a) {
2809    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TRUNC);
2810}
2811
2812struct ggml_tensor * ggml_glu(
2813 struct ggml_context * ctx,
2814 struct ggml_tensor * a,
2815 enum ggml_glu_op op,
2816 bool swapped) {
2817 return ggml_glu_impl(ctx, a, NULL, op, swapped);
2818}
2819
2820struct ggml_tensor * ggml_glu_split(
2821 struct ggml_context * ctx,
2822 struct ggml_tensor * a,
2823 struct ggml_tensor * b,
2824 enum ggml_glu_op op) {
2825 return ggml_glu_impl(ctx, a, b, op, false);
2826}
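
// Illustrative sketch (hypothetical names): the fused GLU variants split the input
// in half along dim 0 and use one half to gate the other, so the result has
// ne[0] == a->ne[0]/2; the *_split variants instead take the two halves as
// separate, equally shaped tensors and keep that full shape.
//
//     struct ggml_tensor * x = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ff, n_tokens);
//     struct ggml_tensor * y = ggml_glu(ctx0, x, GGML_GLU_OP_SWIGLU, false);  // y: [n_ff, n_tokens]
//     struct ggml_tensor * z = ggml_glu_split(ctx0, gate, up, GGML_GLU_OP_SWIGLU); // z has gate's shape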
2827
2828// ggml_reglu
2829
2830struct ggml_tensor * ggml_reglu(
2831 struct ggml_context * ctx,
2832 struct ggml_tensor * a) {
2833    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, false);
2834}
2835
2836struct ggml_tensor * ggml_reglu_swapped(
2837 struct ggml_context * ctx,
2838 struct ggml_tensor * a) {
2839    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, true);
2840}
2841
2842struct ggml_tensor * ggml_reglu_split(
2843 struct ggml_context * ctx,
2844 struct ggml_tensor * a,
2845 struct ggml_tensor * b) {
2846    return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_REGLU, false);
2847}
2848
2849// ggml_geglu
2850
2851struct ggml_tensor * ggml_geglu(
2852 struct ggml_context * ctx,
2853 struct ggml_tensor * a) {
2854    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, false);
2855}
2856
2857struct ggml_tensor * ggml_geglu_swapped(
2858 struct ggml_context * ctx,
2859 struct ggml_tensor * a) {
2860    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, true);
2861}
2862
2863struct ggml_tensor * ggml_geglu_split(
2864 struct ggml_context * ctx,
2865 struct ggml_tensor * a,
2866 struct ggml_tensor * b) {
2867    return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU, false);
2868}
2869
2870// ggml_swiglu
2871
2872struct ggml_tensor * ggml_swiglu(
2873 struct ggml_context * ctx,
2874 struct ggml_tensor * a) {
2875    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, false);
2876}
2877
2878struct ggml_tensor * ggml_swiglu_swapped(
2879 struct ggml_context * ctx,
2880 struct ggml_tensor * a) {
2881    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, true);
2882}
2883
2884struct ggml_tensor * ggml_swiglu_split(
2885 struct ggml_context * ctx,
2886 struct ggml_tensor * a,
2887 struct ggml_tensor * b) {
2888    return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU, false);
2889}
2890
2891// ggml_geglu_erf
2892
2893struct ggml_tensor * ggml_geglu_erf(
2894 struct ggml_context * ctx,
2895 struct ggml_tensor * a) {
2896    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_ERF, false);
2897}
2898
2899struct ggml_tensor * ggml_geglu_erf_swapped(
2900 struct ggml_context * ctx,
2901 struct ggml_tensor * a) {
2902    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_ERF, true);
2903}
2904
2905struct ggml_tensor * ggml_geglu_erf_split(
2906 struct ggml_context * ctx,
2907 struct ggml_tensor * a,
2908 struct ggml_tensor * b) {
2909    return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_ERF, false);
2910}
2911
2912// ggml_geglu_quick
2913
2914struct ggml_tensor * ggml_geglu_quick(
2915 struct ggml_context * ctx,
2916 struct ggml_tensor * a) {
2917    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_QUICK, false);
2918}
2919
2920struct ggml_tensor * ggml_geglu_quick_swapped(
2921 struct ggml_context * ctx,
2922 struct ggml_tensor * a) {
2923    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU_QUICK, true);
2924}
2925
2926struct ggml_tensor * ggml_geglu_quick_split(
2927 struct ggml_context * ctx,
2928 struct ggml_tensor * a,
2929 struct ggml_tensor * b) {
2930    return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_QUICK, false);
2931}
2932
2933struct ggml_tensor * ggml_swiglu_oai(
2934 struct ggml_context * ctx,
2935 struct ggml_tensor * a,
2936 struct ggml_tensor * b,
2937 float alpha,
2938 float limit) {
2939    struct ggml_tensor * result = ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU_OAI, false);
2940    ggml_set_op_params_f32(result, 2, alpha);
2941    ggml_set_op_params_f32(result, 3, limit);
2942
2943 return result;
2944}
2945
2946// ggml_norm
2947
2948static struct ggml_tensor * ggml_norm_impl(
2949 struct ggml_context * ctx,
2950 struct ggml_tensor * a,
2951 float eps,
2952 bool inplace) {
2953    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2954
2955    ggml_set_op_params(result, &eps, sizeof(eps));
2956
2957 result->op = GGML_OP_NORM;
2958 result->src[0] = a;
2959
2960 return result;
2961}
2962
2963struct ggml_tensor * ggml_norm(
2964 struct ggml_context * ctx,
2965 struct ggml_tensor * a,
2966 float eps) {
2967 return ggml_norm_impl(ctx, a, eps, false);
2968}
2969
2970struct ggml_tensor * ggml_norm_inplace(
2971 struct ggml_context * ctx,
2972 struct ggml_tensor * a,
2973 float eps) {
2974 return ggml_norm_impl(ctx, a, eps, true);
2975}
2976
2977// ggml_rms_norm
2978
2979static struct ggml_tensor * ggml_rms_norm_impl(
2980 struct ggml_context * ctx,
2981 struct ggml_tensor * a,
2982 float eps,
2983 bool inplace) {
2984    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2985
2986    ggml_set_op_params(result, &eps, sizeof(eps));
2987
2988 result->op = GGML_OP_RMS_NORM;
2989 result->src[0] = a;
2990
2991 return result;
2992}
2993
2994struct ggml_tensor * ggml_rms_norm(
2995 struct ggml_context * ctx,
2996 struct ggml_tensor * a,
2997 float eps) {
2998 return ggml_rms_norm_impl(ctx, a, eps, false);
2999}
3000
3001struct ggml_tensor * ggml_rms_norm_inplace(
3002 struct ggml_context * ctx,
3003 struct ggml_tensor * a,
3004 float eps) {
3005 return ggml_rms_norm_impl(ctx, a, eps, true);
3006}
3007
3008// ggml_rms_norm_back
3009
3010struct ggml_tensor * ggml_rms_norm_back(
3011 struct ggml_context * ctx,
3012 struct ggml_tensor * a,
3013 struct ggml_tensor * b,
3014 float eps) {
3015    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
3016
3017    ggml_set_op_params(result, &eps, sizeof(eps));
3018
3019 result->op = GGML_OP_RMS_NORM_BACK;
3020 result->src[0] = a;
3021 result->src[1] = b;
3022
3023 return result;
3024}
3025
3026// ggml_group_norm
3027
3028static struct ggml_tensor * ggml_group_norm_impl(
3029 struct ggml_context * ctx,
3030 struct ggml_tensor * a,
3031 int n_groups,
3032 float eps,
3033 bool inplace) {
3034    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3035
3036    ggml_set_op_params_i32(result, 0, n_groups);
3037    ggml_set_op_params_f32(result, 1, eps);
3038
3039 result->op = GGML_OP_GROUP_NORM;
3040 result->src[0] = a;
3041
3042 return result;
3043}
3044
3045struct ggml_tensor * ggml_group_norm(
3046 struct ggml_context * ctx,
3047 struct ggml_tensor * a,
3048 int n_groups,
3049 float eps) {
3050 return ggml_group_norm_impl(ctx, a, n_groups, eps, false);
3051}
3052
3053struct ggml_tensor * ggml_group_norm_inplace(
3054 struct ggml_context * ctx,
3055 struct ggml_tensor * a,
3056 int n_groups,
3057 float eps) {
3058 return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
3059}
3060
3061// ggml_l2_norm
3062
3063static struct ggml_tensor * ggml_l2_norm_impl(
3064 struct ggml_context * ctx,
3065 struct ggml_tensor * a,
3066 float eps,
3067 bool inplace) {
3068    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3069
3070    ggml_set_op_params_f32(result, 0, eps);
3071
3072 result->op = GGML_OP_L2_NORM;
3073 result->src[0] = a;
3074
3075 return result;
3076}
3077
3078struct ggml_tensor * ggml_l2_norm(
3079 struct ggml_context * ctx,
3080 struct ggml_tensor * a,
3081 float eps) {
3082 return ggml_l2_norm_impl(ctx, a, eps, false);
3083}
3084
3085struct ggml_tensor * ggml_l2_norm_inplace(
3086 struct ggml_context * ctx,
3087 struct ggml_tensor * a,
3088 float eps) {
3089 return ggml_l2_norm_impl(ctx, a, eps, true);
3090}
3091
3092// ggml_mul_mat
3093
3094static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
3095 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3096
3097 return (t0->ne[0] == t1->ne[0]) &&
3098 (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
3099 (t1->ne[3]%t0->ne[3] == 0);
3100}
3101
3102struct ggml_tensor * ggml_mul_mat(
3103 struct ggml_context * ctx,
3104 struct ggml_tensor * a,
3105 struct ggml_tensor * b) {
3106 GGML_ASSERT(ggml_can_mul_mat(a, b));
3107 GGML_ASSERT(!ggml_is_transposed(a));
3108
3109 const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
3110    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3111
3112 result->op = GGML_OP_MUL_MAT;
3113 result->src[0] = a;
3114 result->src[1] = b;
3115
3116 return result;
3117}
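
// Illustrative sketch (hypothetical names): with ggml's convention that ne[0] is the
// row length, a weight of shape [n_in, n_out] times activations [n_in, n_tokens]
// yields [n_out, n_tokens]; the result is always F32.
//
//     struct ggml_tensor * W = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_in, n_out);
//     struct ggml_tensor * x = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_in, n_tokens);
//     struct ggml_tensor * y = ggml_mul_mat(ctx0, W, x); // y->ne = { n_out, n_tokens, 1, 1 }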
3118
3119void ggml_mul_mat_set_prec(
3120 struct ggml_tensor * a,
3121 enum ggml_prec prec) {
3122 GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
3123
3124 const int32_t prec_i32 = (int32_t) prec;
3125
3126    ggml_set_op_params_i32(a, 0, prec_i32);
3127}
3128
3129// ggml_mul_mat_id
3130
3131/*
3132 c = ggml_mul_mat_id(ctx, as, b, ids);
3133
3134 as -> [cols, rows, n_expert]
3135 b -> [cols, n_expert_used, n_tokens]
3136 ids -> [n_expert_used, n_tokens] (i32)
3137 c -> [rows, n_expert_used, n_tokens]
3138
3139 in b, n_expert_used can be broadcasted to match the n_expert_used of ids
3140
3141 c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
3142*/
3143struct ggml_tensor * ggml_mul_mat_id(
3144 struct ggml_context * ctx,
3145 struct ggml_tensor * as,
3146 struct ggml_tensor * b,
3147 struct ggml_tensor * ids) {
3148 GGML_ASSERT(!ggml_is_transposed(as));
3149 GGML_ASSERT(ids->type == GGML_TYPE_I32);
3150
3151 GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
3152 GGML_ASSERT(b->ne[3] == 1); // b is 3d
3153 GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
3154 GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
3155 GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
3156 GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
3157
3158 const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
3159    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3160
3161 result->op = GGML_OP_MUL_MAT_ID;
3162 result->src[0] = as;
3163 result->src[1] = b;
3164 result->src[2] = ids;
3165
3166 return result;
3167}
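
// Illustrative MoE-style call (hypothetical names), following the shape comment
// above; ids would typically hold the per-token expert indices chosen by the router.
//
//     // as: [n_in, n_out, n_expert], b: [n_in, n_expert_used, n_tokens],
//     // ids: [n_expert_used, n_tokens] (GGML_TYPE_I32)
//     struct ggml_tensor * c = ggml_mul_mat_id(ctx0, as, b, ids); // [n_out, n_expert_used, n_tokens]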
3168
3169// ggml_out_prod
3170
3171static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
3172 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3173
3174 return (t0->ne[1] == t1->ne[1]) &&
3175 (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
3176 (t1->ne[3]%t0->ne[3] == 0);
3177}
3178
3179struct ggml_tensor * ggml_out_prod(
3180 struct ggml_context * ctx,
3181 struct ggml_tensor * a,
3182 struct ggml_tensor * b) {
3183 GGML_ASSERT(ggml_can_out_prod(a, b));
3184 GGML_ASSERT(!ggml_is_transposed(a));
3185
3186 // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
3187 const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
3188    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3189
3190 result->op = GGML_OP_OUT_PROD;
3191 result->src[0] = a;
3192 result->src[1] = b;
3193
3194 return result;
3195}
3196
3197// ggml_scale
3198
3199static struct ggml_tensor * ggml_scale_impl(
3200 struct ggml_context * ctx,
3201 struct ggml_tensor * a,
3202 float s,
3203 float b,
3204 bool inplace) {
3205 GGML_ASSERT(ggml_is_padded_1d(a));
3206
3207    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3208
3209 float params[2] = { s, b };
3210    ggml_set_op_params(result, &params, sizeof(params));
3211
3212 result->op = GGML_OP_SCALE;
3213 result->src[0] = a;
3214
3215 return result;
3216}
3217
3218struct ggml_tensor * ggml_scale(
3219 struct ggml_context * ctx,
3220 struct ggml_tensor * a,
3221 float s) {
3222    return ggml_scale_impl(ctx, a, s, 0.0, false);
3223}
3224
3225struct ggml_tensor * ggml_scale_inplace(
3226 struct ggml_context * ctx,
3227 struct ggml_tensor * a,
3228 float s) {
3229    return ggml_scale_impl(ctx, a, s, 0.0, true);
3230}
3231
3232struct ggml_tensor * ggml_scale_bias(
3233 struct ggml_context * ctx,
3234 struct ggml_tensor * a,
3235 float s,
3236 float b) {
3237 return ggml_scale_impl(ctx, a, s, b, false);
3238}
3239
3240struct ggml_tensor * ggml_scale_bias_inplace(
3241 struct ggml_context * ctx,
3242 struct ggml_tensor * a,
3243 float s,
3244 float b) {
3245 return ggml_scale_impl(ctx, a, s, b, true);
3246}
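
// Illustrative sketch: ggml_scale computes s*a elementwise and ggml_scale_bias
// computes s*a + b; both floats live in op_params rather than in a tensor.
//
//     struct ggml_tensor * y = ggml_scale_bias(ctx0, x, 0.5f, 1.0f); // y = 0.5*x + 1.0 (ctx0/x hypothetical)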
3247
3248// ggml_set
3249
3250static struct ggml_tensor * ggml_set_impl(
3251 struct ggml_context * ctx,
3252 struct ggml_tensor * a,
3253 struct ggml_tensor * b,
3254 size_t nb1,
3255 size_t nb2,
3256 size_t nb3,
3257 size_t offset,
3258 bool inplace) {
3259 GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b));
3260
3261 // make a view of the destination
3262    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3263
3264 GGML_ASSERT(offset < (size_t)(1 << 30));
3265 int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
3266    ggml_set_op_params(result, params, sizeof(params));
3267
3268 result->op = GGML_OP_SET;
3269 result->src[0] = a;
3270 result->src[1] = b;
3271
3272 return result;
3273}
3274
3275struct ggml_tensor * ggml_set(
3276 struct ggml_context * ctx,
3277 struct ggml_tensor * a,
3278 struct ggml_tensor * b,
3279 size_t nb1,
3280 size_t nb2,
3281 size_t nb3,
3282 size_t offset) {
3283 return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
3284}
3285
3286struct ggml_tensor * ggml_set_inplace(
3287 struct ggml_context * ctx,
3288 struct ggml_tensor * a,
3289 struct ggml_tensor * b,
3290 size_t nb1,
3291 size_t nb2,
3292 size_t nb3,
3293 size_t offset) {
3294 return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
3295}
3296
3297struct ggml_tensor * ggml_set_1d(
3298 struct ggml_context * ctx,
3299 struct ggml_tensor * a,
3300 struct ggml_tensor * b,
3301 size_t offset) {
3302    return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false);
3303}
3304
3305struct ggml_tensor * ggml_set_1d_inplace(
3306 struct ggml_context * ctx,
3307 struct ggml_tensor * a,
3308 struct ggml_tensor * b,
3309 size_t offset) {
3310    return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true);
3311}
3312
3313struct ggml_tensor * ggml_set_2d(
3314 struct ggml_context * ctx,
3315 struct ggml_tensor * a,
3316 struct ggml_tensor * b,
3317 size_t nb1,
3318 size_t offset) {
3319    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
3320}
3321
3322struct ggml_tensor * ggml_set_2d_inplace(
3323 struct ggml_context * ctx,
3324 struct ggml_tensor * a,
3325 struct ggml_tensor * b,
3326 size_t nb1,
3327 size_t offset) {
3328    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
3329}
3330
3331// ggml_cpy
3332
3333static struct ggml_tensor * ggml_cpy_impl(
3334 struct ggml_context * ctx,
3335 struct ggml_tensor * a,
3336 struct ggml_tensor * b) {
3337 GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
3338
3339 // make a view of the destination
3340    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
3341    if (strlen(b->name) > 0) {
3342        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
3343    } else {
3344        ggml_format_name(result, "%s (copy)", a->name);
3345 }
3346
3347 result->op = GGML_OP_CPY;
3348 result->src[0] = a;
3349 result->src[1] = b;
3350
3351 return result;
3352}
3353
3354struct ggml_tensor * ggml_cpy(
3355 struct ggml_context * ctx,
3356 struct ggml_tensor * a,
3357 struct ggml_tensor * b) {
3358 return ggml_cpy_impl(ctx, a, b);
3359}
3360
3361struct ggml_tensor * ggml_cast(
3362 struct ggml_context * ctx,
3363 struct ggml_tensor * a,
3364 enum ggml_type type) {
3365    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
3366    ggml_format_name(result, "%s (copy)", a->name);
3367
3368 result->op = GGML_OP_CPY;
3369 result->src[0] = a;
3370 result->src[1] = result;
3371
3372 return result;
3373}
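
// Illustrative sketch (hypothetical names): ggml_cpy converts into the type and
// layout of an existing destination and returns a view of it, while ggml_cast
// allocates a fresh tensor of the requested type.
//
//     struct ggml_tensor * x_f16 = ggml_cast(ctx0, x, GGML_TYPE_F16);
//     struct ggml_tensor * into  = ggml_cpy(ctx0, x, dst); // dst must have the same element count as x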
3374
3375// ggml_cont
3376
3377static struct ggml_tensor * ggml_cont_impl(
3378 struct ggml_context * ctx,
3379 struct ggml_tensor * a) {
3380    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
3381    ggml_format_name(result, "%s (cont)", a->name);
3382
3383 result->op = GGML_OP_CONT;
3384 result->src[0] = a;
3385
3386 return result;
3387}
3388
3389struct ggml_tensor * ggml_cont(
3390 struct ggml_context * ctx,
3391 struct ggml_tensor * a) {
3392 return ggml_cont_impl(ctx, a);
3393}
3394
3395// make contiguous, with new shape
3396GGML_API struct ggml_tensor * ggml_cont_1d(
3397 struct ggml_context * ctx,
3398 struct ggml_tensor * a,
3399 int64_t ne0) {
3400    return ggml_cont_4d(ctx, a, ne0, 1, 1, 1);
3401}
3402
3403GGML_API struct ggml_tensor * ggml_cont_2d(
3404 struct ggml_context * ctx,
3405 struct ggml_tensor * a,
3406 int64_t ne0,
3407 int64_t ne1) {
3408    return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1);
3409}
3410
3411GGML_API struct ggml_tensor * ggml_cont_3d(
3412 struct ggml_context * ctx,
3413 struct ggml_tensor * a,
3414 int64_t ne0,
3415 int64_t ne1,
3416 int64_t ne2) {
3417    return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1);
3418}
3419
3420struct ggml_tensor * ggml_cont_4d(
3421 struct ggml_context * ctx,
3422 struct ggml_tensor * a,
3423 int64_t ne0,
3424 int64_t ne1,
3425 int64_t ne2,
3426 int64_t ne3) {
3427 GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
3428
3429    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
3430    ggml_format_name(result, "%s (cont)", a->name);
3431
3432 result->op = GGML_OP_CONT;
3433 result->src[0] = a;
3434
3435 return result;
3436}
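
// Illustrative sketch (hypothetical names): ggml_cont materializes a strided view
// (e.g. the result of a permute) into a contiguous tensor; the _1d/_2d/_3d/_4d
// variants additionally give the copy a new shape.
//
//     struct ggml_tensor * t  = ggml_permute(ctx0, x, 1, 0, 2, 3); // non-contiguous view
//     struct ggml_tensor * tc = ggml_cont(ctx0, t);                // contiguous copy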
3437
3438// ggml_reshape
3439
3440struct ggml_tensor * ggml_reshape(
3441 struct ggml_context * ctx,
3442 struct ggml_tensor * a,
3443 struct ggml_tensor * b) {
3444 GGML_ASSERT(ggml_is_contiguous(a));
3445 // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous.
3446 GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
3447
3448    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
3449    ggml_format_name(result, "%s (reshaped)", a->name);
3450
3451 result->op = GGML_OP_RESHAPE;
3452 result->src[0] = a;
3453
3454 return result;
3455}
3456
3457struct ggml_tensor * ggml_reshape_1d(
3458 struct ggml_context * ctx,
3459 struct ggml_tensor * a,
3460 int64_t ne0) {
3461 GGML_ASSERT(ggml_is_contiguous(a));
3462 GGML_ASSERT(ggml_nelements(a) == ne0);
3463
3464 const int64_t ne[1] = { ne0 };
3465    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
3466    ggml_format_name(result, "%s (reshaped)", a->name);
3467
3468 result->op = GGML_OP_RESHAPE;
3469 result->src[0] = a;
3470
3471 return result;
3472}
3473
3474struct ggml_tensor * ggml_reshape_2d(
3475 struct ggml_context * ctx,
3476 struct ggml_tensor * a,
3477 int64_t ne0,
3478 int64_t ne1) {
3479 GGML_ASSERT(ggml_is_contiguous(a));
3480 GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
3481
3482 const int64_t ne[2] = { ne0, ne1 };
3483    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
3484    ggml_format_name(result, "%s (reshaped)", a->name);
3485
3486 result->op = GGML_OP_RESHAPE;
3487 result->src[0] = a;
3488
3489 return result;
3490}
3491
3492struct ggml_tensor * ggml_reshape_3d(
3493 struct ggml_context * ctx,
3494 struct ggml_tensor * a,
3495 int64_t ne0,
3496 int64_t ne1,
3497 int64_t ne2) {
3498 GGML_ASSERT(ggml_is_contiguous(a));
3499 GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
3500
3501 const int64_t ne[3] = { ne0, ne1, ne2 };
3502    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
3503    ggml_format_name(result, "%s (reshaped)", a->name);
3504
3505 result->op = GGML_OP_RESHAPE;
3506 result->src[0] = a;
3507
3508 return result;
3509}
3510
3511struct ggml_tensor * ggml_reshape_4d(
3512 struct ggml_context * ctx,
3513 struct ggml_tensor * a,
3514 int64_t ne0,
3515 int64_t ne1,
3516 int64_t ne2,
3517 int64_t ne3) {
3518 GGML_ASSERT(ggml_is_contiguous(a));
3519 GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
3520
3521 const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
3522    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
3523    ggml_format_name(result, "%s (reshaped)", a->name);
3524
3525 result->op = GGML_OP_RESHAPE;
3526 result->src[0] = a;
3527
3528 return result;
3529}
3530
3531static struct ggml_tensor * ggml_view_impl(
3532 struct ggml_context * ctx,
3533 struct ggml_tensor * a,
3534 int n_dims,
3535 const int64_t * ne,
3536 size_t offset) {
3537    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
3538    ggml_format_name(result, "%s (view)", a->name);
3539
3540    ggml_set_op_params(result, &offset, sizeof(offset));
3541
3542 result->op = GGML_OP_VIEW;
3543 result->src[0] = a;
3544
3545 return result;
3546}
3547
3548// ggml_view_1d
3549
3550struct ggml_tensor * ggml_view_1d(
3551 struct ggml_context * ctx,
3552 struct ggml_tensor * a,
3553 int64_t ne0,
3554 size_t offset) {
3555    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
3556
3557 return result;
3558}
3559
3560// ggml_view_2d
3561
3562struct ggml_tensor * ggml_view_2d(
3563 struct ggml_context * ctx,
3564 struct ggml_tensor * a,
3565 int64_t ne0,
3566 int64_t ne1,
3567 size_t nb1,
3568 size_t offset) {
3569 const int64_t ne[2] = { ne0, ne1 };
3570
3571    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
3572
3573 result->nb[1] = nb1;
3574 result->nb[2] = result->nb[1]*ne1;
3575 result->nb[3] = result->nb[2];
3576
3577 return result;
3578}
3579
3580// ggml_view_3d
3581
3582struct ggml_tensor * ggml_view_3d(
3583 struct ggml_context * ctx,
3584 struct ggml_tensor * a,
3585 int64_t ne0,
3586 int64_t ne1,
3587 int64_t ne2,
3588 size_t nb1,
3589 size_t nb2,
3590 size_t offset) {
3591 const int64_t ne[3] = { ne0, ne1, ne2 };
3592
3593    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
3594
3595 result->nb[1] = nb1;
3596 result->nb[2] = nb2;
3597 result->nb[3] = result->nb[2]*ne2;
3598
3599 return result;
3600}
3601
3602// ggml_view_4d
3603
3604struct ggml_tensor * ggml_view_4d(
3605 struct ggml_context * ctx,
3606 struct ggml_tensor * a,
3607 int64_t ne0,
3608 int64_t ne1,
3609 int64_t ne2,
3610 int64_t ne3,
3611 size_t nb1,
3612 size_t nb2,
3613 size_t nb3,
3614 size_t offset) {
3615 const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
3616
3617    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
3618
3619 result->nb[1] = nb1;
3620 result->nb[2] = nb2;
3621 result->nb[3] = nb3;
3622
3623 return result;
3624}
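
// Illustrative sketch (hypothetical names): views share the parent's data; the nb*
// strides and the offset are given in bytes.
//
//     // second row of a 2-D tensor x as a 1-D view:
//     struct ggml_tensor * row1 = ggml_view_1d(ctx0, x, x->ne[0], 1*x->nb[1]);
//     // leading 4x2 block of x, keeping x's row stride:
//     struct ggml_tensor * blk  = ggml_view_2d(ctx0, x, 4, 2, x->nb[1], 0);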
3625
3626// ggml_permute
3627
3628struct ggml_tensor * ggml_permute(
3629 struct ggml_context * ctx,
3630 struct ggml_tensor * a,
3631 int axis0,
3632 int axis1,
3633 int axis2,
3634 int axis3) {
3635 GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS);
3636 GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS);
3637 GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS);
3638 GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS);
3639
3640 GGML_ASSERT(axis0 != axis1);
3641 GGML_ASSERT(axis0 != axis2);
3642 GGML_ASSERT(axis0 != axis3);
3643 GGML_ASSERT(axis1 != axis2);
3644 GGML_ASSERT(axis1 != axis3);
3645 GGML_ASSERT(axis2 != axis3);
3646
3647    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
3648    ggml_format_name(result, "%s (permuted)", a->name);
3649
3650 int ne[GGML_MAX_DIMS];
3651 int nb[GGML_MAX_DIMS];
3652
3653 ne[axis0] = a->ne[0];
3654 ne[axis1] = a->ne[1];
3655 ne[axis2] = a->ne[2];
3656 ne[axis3] = a->ne[3];
3657
3658 nb[axis0] = a->nb[0];
3659 nb[axis1] = a->nb[1];
3660 nb[axis2] = a->nb[2];
3661 nb[axis3] = a->nb[3];
3662
3663 result->ne[0] = ne[0];
3664 result->ne[1] = ne[1];
3665 result->ne[2] = ne[2];
3666 result->ne[3] = ne[3];
3667
3668 result->nb[0] = nb[0];
3669 result->nb[1] = nb[1];
3670 result->nb[2] = nb[2];
3671 result->nb[3] = nb[3];
3672
3673 result->op = GGML_OP_PERMUTE;
3674 result->src[0] = a;
3675
3676 int32_t params[] = { axis0, axis1, axis2, axis3 };
3677 ggml_set_op_params(tensor: result, params, params_size: sizeof(params));
3678
3679 return result;
3680}
3681
3682// ggml_transpose
3683
3684struct ggml_tensor * ggml_transpose(
3685 struct ggml_context * ctx,
3686 struct ggml_tensor * a) {
3687    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
3688    ggml_format_name(result, "%s (transposed)", a->name);
3689
3690 result->ne[0] = a->ne[1];
3691 result->ne[1] = a->ne[0];
3692
3693 result->nb[0] = a->nb[1];
3694 result->nb[1] = a->nb[0];
3695
3696 result->op = GGML_OP_TRANSPOSE;
3697 result->src[0] = a;
3698
3699 return result;
3700}
3701
3702// ggml_get_rows
3703
3704struct ggml_tensor * ggml_get_rows(
3705 struct ggml_context * ctx,
3706 struct ggml_tensor * a,
3707 struct ggml_tensor * b) {
3708 GGML_ASSERT(a->ne[2] == b->ne[1]);
3709 GGML_ASSERT(a->ne[3] == b->ne[2]);
3710 GGML_ASSERT(b->ne[3] == 1);
3711 GGML_ASSERT(b->type == GGML_TYPE_I32);
3712
3713 // TODO: implement non F32 return
3714 enum ggml_type type = GGML_TYPE_F32;
3715 if (a->type == GGML_TYPE_I32) {
3716 type = a->type;
3717 }
3718    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
3719
3720 result->op = GGML_OP_GET_ROWS;
3721 result->src[0] = a;
3722 result->src[1] = b;
3723
3724 return result;
3725}
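
// Illustrative sketch (hypothetical names): the classic embedding lookup, where
// each I32 index in b selects a row of a.
//
//     // tok_embd: [n_embd, n_vocab], inp_tokens: [n_tokens] (GGML_TYPE_I32)
//     struct ggml_tensor * cur = ggml_get_rows(ctx0, tok_embd, inp_tokens); // [n_embd, n_tokens], F32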
3726
3727// ggml_get_rows_back
3728
3729struct ggml_tensor * ggml_get_rows_back(
3730 struct ggml_context * ctx,
3731 struct ggml_tensor * a,
3732 struct ggml_tensor * b,
3733 struct ggml_tensor * c) {
3734 GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
3735 GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
3736
3737 // TODO: implement non F32 return
3738 //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
3739    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
3740
3741 result->op = GGML_OP_GET_ROWS_BACK;
3742 result->src[0] = a;
3743 result->src[1] = b;
3744
3745 return result;
3746}
3747
3748// ggml_set_rows
3749
3750struct ggml_tensor * ggml_set_rows(
3751 struct ggml_context * ctx,
3752 struct ggml_tensor * a,
3753 struct ggml_tensor * b,
3754 struct ggml_tensor * c) {
3755 GGML_ASSERT(a->ne[0] == b->ne[0]);
3756 GGML_ASSERT(a->ne[2] == b->ne[2]);
3757 GGML_ASSERT(a->ne[3] == b->ne[3]);
3758 GGML_ASSERT(b->ne[1] == c->ne[0]);
3759 GGML_ASSERT(b->ne[2] % c->ne[1] == 0);
3760 GGML_ASSERT(b->ne[3] % c->ne[2] == 0);
3761 GGML_ASSERT(c->ne[3] == 1);
3762 GGML_ASSERT(b->type == GGML_TYPE_F32);
3763 GGML_ASSERT(c->type == GGML_TYPE_I64 || c->type == GGML_TYPE_I32);
3764
3765 GGML_ASSERT(ggml_is_contiguous_rows(a));
3766 GGML_ASSERT(ggml_is_contiguous_rows(b));
3767
3768    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
3769
3770 result->op = GGML_OP_SET_ROWS;
3771 result->src[0] = b;
3772 result->src[1] = c;
3773 result->src[2] = a; // note: order is weird due to legacy reasons (https://github.com/ggml-org/llama.cpp/pull/16063#discussion_r2385795931)
3774
3775 return result;
3776}
3777
3778// ggml_diag
3779
3780struct ggml_tensor * ggml_diag(
3781 struct ggml_context * ctx,
3782 struct ggml_tensor * a) {
3783 GGML_ASSERT(a->ne[1] == 1);
3784
3785 const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
3786    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
3787
3788 result->op = GGML_OP_DIAG;
3789 result->src[0] = a;
3790
3791 return result;
3792}
3793
3794// ggml_diag_mask_inf
3795
3796static struct ggml_tensor * ggml_diag_mask_inf_impl(
3797 struct ggml_context * ctx,
3798 struct ggml_tensor * a,
3799 int n_past,
3800 bool inplace) {
3801    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3802
3803 int32_t params[] = { n_past };
3804    ggml_set_op_params(result, params, sizeof(params));
3805
3806 result->op = GGML_OP_DIAG_MASK_INF;
3807 result->src[0] = a;
3808
3809 return result;
3810}
3811
3812struct ggml_tensor * ggml_diag_mask_inf(
3813 struct ggml_context * ctx,
3814 struct ggml_tensor * a,
3815 int n_past) {
3816 return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
3817}
3818
3819struct ggml_tensor * ggml_diag_mask_inf_inplace(
3820 struct ggml_context * ctx,
3821 struct ggml_tensor * a,
3822 int n_past) {
3823 return ggml_diag_mask_inf_impl(ctx, a, n_past, true);
3824}
3825
3826// ggml_diag_mask_zero
3827
3828static struct ggml_tensor * ggml_diag_mask_zero_impl(
3829 struct ggml_context * ctx,
3830 struct ggml_tensor * a,
3831 int n_past,
3832 bool inplace) {
3833    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3834
3835 int32_t params[] = { n_past };
3836    ggml_set_op_params(result, params, sizeof(params));
3837
3838 result->op = GGML_OP_DIAG_MASK_ZERO;
3839 result->src[0] = a;
3840
3841 return result;
3842}
3843
3844struct ggml_tensor * ggml_diag_mask_zero(
3845 struct ggml_context * ctx,
3846 struct ggml_tensor * a,
3847 int n_past) {
3848 return ggml_diag_mask_zero_impl(ctx, a, n_past, false);
3849}
3850
3851struct ggml_tensor * ggml_diag_mask_zero_inplace(
3852 struct ggml_context * ctx,
3853 struct ggml_tensor * a,
3854 int n_past) {
3855 return ggml_diag_mask_zero_impl(ctx, a, n_past, true);
3856}
3857
3858// ggml_soft_max
3859
3860static struct ggml_tensor * ggml_soft_max_impl(
3861 struct ggml_context * ctx,
3862 struct ggml_tensor * a,
3863 struct ggml_tensor * mask,
3864 float scale,
3865 float max_bias,
3866 bool inplace) {
3867 GGML_ASSERT(ggml_is_contiguous(a));
3868
3869 if (mask) {
3870 GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
3871 GGML_ASSERT(ggml_is_contiguous(mask));
3872 GGML_ASSERT(mask->ne[0] == a->ne[0]);
3873 GGML_ASSERT(mask->ne[1] >= a->ne[1]);
3874 GGML_ASSERT(a->ne[2]%mask->ne[2] == 0);
3875 GGML_ASSERT(a->ne[3]%mask->ne[3] == 0);
3876 }
3877
3878 if (max_bias > 0.0f) {
3879 GGML_ASSERT(mask);
3880 }
3881
3882    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3883
3884 float params[] = { scale, max_bias };
3885    ggml_set_op_params(result, params, sizeof(params));
3886
3887 result->op = GGML_OP_SOFT_MAX;
3888 result->src[0] = a;
3889 result->src[1] = mask;
3890
3891 return result;
3892}
3893
3894struct ggml_tensor * ggml_soft_max(
3895 struct ggml_context * ctx,
3896 struct ggml_tensor * a) {
3897    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
3898}
3899
3900struct ggml_tensor * ggml_soft_max_inplace(
3901 struct ggml_context * ctx,
3902 struct ggml_tensor * a) {
3903    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
3904}
3905
3906struct ggml_tensor * ggml_soft_max_ext(
3907 struct ggml_context * ctx,
3908 struct ggml_tensor * a,
3909 struct ggml_tensor * mask,
3910 float scale,
3911 float max_bias) {
3912 return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
3913}
3914
3915struct ggml_tensor * ggml_soft_max_ext_inplace(
3916 struct ggml_context * ctx,
3917 struct ggml_tensor * a,
3918 struct ggml_tensor * mask,
3919 float scale,
3920 float max_bias) {
3921 return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, true);
3922}
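
// Illustrative attention-style sketch (hypothetical names): softmax of the scaled
// scores with an additive mask; a non-zero max_bias enables ALiBi-style per-head slopes.
//
//     struct ggml_tensor * probs =
//             ggml_soft_max_ext(ctx0, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head), 0.0f);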
3923
3924void ggml_soft_max_add_sinks(
3925 struct ggml_tensor * a,
3926 struct ggml_tensor * sinks) {
3927 if (!sinks) {
3928 a->src[2] = NULL;
3929 return;
3930 }
3931
3932 GGML_ASSERT(a->op == GGML_OP_SOFT_MAX);
3933 GGML_ASSERT(a->src[2] == NULL);
3934 GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
3935 GGML_ASSERT(sinks->type == GGML_TYPE_F32);
3936
3937 a->src[2] = sinks;
3938}
3939
3940// ggml_soft_max_ext_back
3941
3942static struct ggml_tensor * ggml_soft_max_ext_back_impl(
3943 struct ggml_context * ctx,
3944 struct ggml_tensor * a,
3945 struct ggml_tensor * b,
3946 float scale,
3947 float max_bias,
3948 bool inplace) {
3949    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3950
3951 result->op = GGML_OP_SOFT_MAX_BACK;
3952 result->src[0] = a;
3953 result->src[1] = b;
3954
3955    memcpy((float *) result->op_params + 0, &scale, sizeof(float));
3956    memcpy((float *) result->op_params + 1, &max_bias, sizeof(float));
3957
3958 return result;
3959}
3960
3961struct ggml_tensor * ggml_soft_max_ext_back(
3962 struct ggml_context * ctx,
3963 struct ggml_tensor * a,
3964 struct ggml_tensor * b,
3965 float scale,
3966 float max_bias) {
3967 return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, false);
3968}
3969
3970struct ggml_tensor * ggml_soft_max_ext_back_inplace(
3971 struct ggml_context * ctx,
3972 struct ggml_tensor * a,
3973 struct ggml_tensor * b,
3974 float scale,
3975 float max_bias) {
3976 return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, true);
3977}
3978
3979// ggml_rope
3980
3981static struct ggml_tensor * ggml_rope_impl(
3982 struct ggml_context * ctx,
3983 struct ggml_tensor * a,
3984 struct ggml_tensor * b,
3985 struct ggml_tensor * c,
3986 int n_dims,
3987 int sections[GGML_MROPE_SECTIONS],
3988 int mode,
3989 int n_ctx_orig,
3990 float freq_base,
3991 float freq_scale,
3992 float ext_factor,
3993 float attn_factor,
3994 float beta_fast,
3995 float beta_slow,
3996 bool inplace) {
3997 GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
3998
3999 GGML_ASSERT(ggml_is_vector(b));
4000 GGML_ASSERT(b->type == GGML_TYPE_I32);
4001
4002 bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
4003 if (mrope_used) {
4004 GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
4005 } else {
4006 GGML_ASSERT(a->ne[2] == b->ne[0]);
4007 }
4008
4009 if (c) {
4010 GGML_ASSERT(c->type == GGML_TYPE_F32);
4011 GGML_ASSERT(c->ne[0] >= n_dims / 2);
4012 }
4013
4014    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
4015
4016 int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
4017    memcpy(params + 5, &freq_base, sizeof(float));
4018    memcpy(params + 6, &freq_scale, sizeof(float));
4019    memcpy(params + 7, &ext_factor, sizeof(float));
4020    memcpy(params + 8, &attn_factor, sizeof(float));
4021    memcpy(params + 9, &beta_fast, sizeof(float));
4022    memcpy(params + 10, &beta_slow, sizeof(float));
4023    if (mrope_used && sections) {
4024        memcpy(params + 11, sections, sizeof(int32_t) * GGML_MROPE_SECTIONS);
4025    } else {
4026        memset(params + 11, 0, sizeof(int32_t) * GGML_MROPE_SECTIONS);
4027    }
4028    ggml_set_op_params(result, params, sizeof(params));
4029
4030 result->op = GGML_OP_ROPE;
4031 result->src[0] = a;
4032 result->src[1] = b;
4033 result->src[2] = c;
4034
4035 return result;
4036}
4037
4038struct ggml_tensor * ggml_rope(
4039 struct ggml_context * ctx,
4040 struct ggml_tensor * a,
4041 struct ggml_tensor * b,
4042 int n_dims,
4043 int mode) {
4044 return ggml_rope_impl(
4045        ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
4046 );
4047}
4048
4049struct ggml_tensor * ggml_rope_multi(
4050 struct ggml_context * ctx,
4051 struct ggml_tensor * a,
4052 struct ggml_tensor * b,
4053 struct ggml_tensor * c,
4054 int n_dims,
4055 int sections[GGML_MROPE_SECTIONS],
4056 int mode,
4057 int n_ctx_orig,
4058 float freq_base,
4059 float freq_scale,
4060 float ext_factor,
4061 float attn_factor,
4062 float beta_fast,
4063 float beta_slow) {
4064 return ggml_rope_impl(
4065 ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
4066 ext_factor, attn_factor, beta_fast, beta_slow, false
4067 );
4068}
4069
4070struct ggml_tensor * ggml_rope_multi_inplace(
4071 struct ggml_context * ctx,
4072 struct ggml_tensor * a,
4073 struct ggml_tensor * b,
4074 struct ggml_tensor * c,
4075 int n_dims,
4076 int sections[GGML_MROPE_SECTIONS],
4077 int mode,
4078 int n_ctx_orig,
4079 float freq_base,
4080 float freq_scale,
4081 float ext_factor,
4082 float attn_factor,
4083 float beta_fast,
4084 float beta_slow) {
4085 return ggml_rope_impl(
4086 ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
4087 ext_factor, attn_factor, beta_fast, beta_slow, true
4088 );
4089}
4090
4091struct ggml_tensor * ggml_rope_inplace(
4092 struct ggml_context * ctx,
4093 struct ggml_tensor * a,
4094 struct ggml_tensor * b,
4095 int n_dims,
4096 int mode) {
4097 return ggml_rope_impl(
4098        ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
4099 );
4100}
4101
4102struct ggml_tensor * ggml_rope_ext(
4103 struct ggml_context * ctx,
4104 struct ggml_tensor * a,
4105 struct ggml_tensor * b,
4106 struct ggml_tensor * c,
4107 int n_dims,
4108 int mode,
4109 int n_ctx_orig,
4110 float freq_base,
4111 float freq_scale,
4112 float ext_factor,
4113 float attn_factor,
4114 float beta_fast,
4115 float beta_slow) {
4116 return ggml_rope_impl(
4117 ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
4118 ext_factor, attn_factor, beta_fast, beta_slow, false
4119 );
4120}
4121
4122struct ggml_tensor * ggml_rope_ext_inplace(
4123 struct ggml_context * ctx,
4124 struct ggml_tensor * a,
4125 struct ggml_tensor * b,
4126 struct ggml_tensor * c,
4127 int n_dims,
4128 int mode,
4129 int n_ctx_orig,
4130 float freq_base,
4131 float freq_scale,
4132 float ext_factor,
4133 float attn_factor,
4134 float beta_fast,
4135 float beta_slow) {
4136 return ggml_rope_impl(
4137 ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
4138 ext_factor, attn_factor, beta_fast, beta_slow, true
4139 );
4140}
4141
4142struct ggml_tensor * ggml_rope_custom(
4143 struct ggml_context * ctx,
4144 struct ggml_tensor * a,
4145 struct ggml_tensor * b,
4146 int n_dims,
4147 int mode,
4148 int n_ctx_orig,
4149 float freq_base,
4150 float freq_scale,
4151 float ext_factor,
4152 float attn_factor,
4153 float beta_fast,
4154 float beta_slow) {
4155 return ggml_rope_impl(
4156 ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
4157 ext_factor, attn_factor, beta_fast, beta_slow, false
4158 );
4159}
4160
4161struct ggml_tensor * ggml_rope_custom_inplace(
4162 struct ggml_context * ctx,
4163 struct ggml_tensor * a,
4164 struct ggml_tensor * b,
4165 int n_dims,
4166 int mode,
4167 int n_ctx_orig,
4168 float freq_base,
4169 float freq_scale,
4170 float ext_factor,
4171 float attn_factor,
4172 float beta_fast,
4173 float beta_slow) {
4174 return ggml_rope_impl(
4175 ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
4176 ext_factor, attn_factor, beta_fast, beta_slow, true
4177 );
4178}
4179
4180// Solving `n_rot = max_pos_emb * base^(-2 * x / n_dims) / (2 * pi)` for x, we get
4181// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2 * pi)) / (2 * log(base))`
4182static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
4183    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
4184}
4185
4186void ggml_rope_yarn_corr_dims(
4187 int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
4188) {
4189 // start and end correction dims
4190    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
4191    float end   =  ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
4192 dims[0] = MAX(0, start);
4193 dims[1] = MIN(n_dims - 1, end);
4194}
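
// Illustrative example (arbitrary but typical values): with n_dims = 128, n_ctx_orig = 4096,
// freq_base = 10000, beta_fast = 32, beta_slow = 1:
//   corr_dim(32) = 128 * log(4096 / (32 * 2*pi)) / (2 * log(10000)) ~= 20.9 -> floor -> 20
//   corr_dim(1)  = 128 * log(4096 / ( 1 * 2*pi)) / (2 * log(10000)) ~= 45.0 -> ceil  -> 46
// so dims = { 20, 46 } after clamping to [0, n_dims - 1].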
4195
4196// ggml_rope_back
4197
4198struct ggml_tensor * ggml_rope_ext_back(
4199 struct ggml_context * ctx,
4200 struct ggml_tensor * a,
4201 struct ggml_tensor * b,
4202 struct ggml_tensor * c,
4203 int n_dims,
4204 int mode,
4205 int n_ctx_orig,
4206 float freq_base,
4207 float freq_scale,
4208 float ext_factor,
4209 float attn_factor,
4210 float beta_fast,
4211 float beta_slow) {
4212 struct ggml_tensor * result = ggml_rope_ext(
4213 ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
4214 result->op = GGML_OP_ROPE_BACK;
4215 return result;
4216}
4217
4218struct ggml_tensor * ggml_rope_multi_back(
4219 struct ggml_context * ctx,
4220 struct ggml_tensor * a,
4221 struct ggml_tensor * b,
4222 struct ggml_tensor * c,
4223 int n_dims,
4224 int sections[4],
4225 int mode,
4226 int n_ctx_orig,
4227 float freq_base,
4228 float freq_scale,
4229 float ext_factor,
4230 float attn_factor,
4231 float beta_fast,
4232 float beta_slow) {
4233 struct ggml_tensor * result = ggml_rope_multi(
4234 ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
4235 result->op = GGML_OP_ROPE_BACK;
4236 return result;
4237}
4238// ggml_clamp
4239
4240struct ggml_tensor * ggml_clamp(
4241 struct ggml_context * ctx,
4242 struct ggml_tensor * a,
4243 float min,
4244 float max) {
4245    // TODO: when implementing the backward pass, fix this:
4246    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
4247
4248    float params[] = { min, max };
4249    ggml_set_op_params(result, params, sizeof(params));
4250
4251 result->op = GGML_OP_CLAMP;
4252 result->src[0] = a;
4253
4254 return result;
4255}
4256
4257static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
4258 return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
4259}
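
// Illustrative example (arbitrary values): ins = 224, ks = 3, s = 2, p = 1, d = 1 gives
//   (224 + 2*1 - 1*(3 - 1) - 1) / 2 + 1 = 223/2 + 1 = 112
// i.e. a 3x3, stride-2 convolution with 1 pixel of padding halves the spatial extent.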
4260
4261// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
4262// a: [OC,IC, KH, KW]
4263// b: [N, IC, IH, IW]
4264// result: [N, OH, OW, IC*KH*KW]
4265struct ggml_tensor * ggml_im2col(
4266 struct ggml_context * ctx,
4267 struct ggml_tensor * a,
4268 struct ggml_tensor * b,
4269 int s0,
4270 int s1,
4271 int p0,
4272 int p1,
4273 int d0,
4274 int d1,
4275 bool is_2D,
4276 enum ggml_type dst_type) {
4277 if (is_2D) {
4278 GGML_ASSERT(a->ne[2] == b->ne[2]);
4279 } else {
4280 //GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
4281 GGML_ASSERT(b->ne[1] == a->ne[1]);
4282 GGML_ASSERT(b->ne[3] == 1);
4283 }
4284
4285    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
4286    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
4287
4288 GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
4289 GGML_ASSERT((OW > 0) && "b too small compared to a");
4290
4291 const int64_t ne[4] = {
4292 is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
4293 OW,
4294 is_2D ? OH : b->ne[2],
4295 is_2D ? b->ne[3] : 1,
4296 };
4297
4298    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
4299    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
4300    ggml_set_op_params(result, params, sizeof(params));
4301
4302 result->op = GGML_OP_IM2COL;
4303 result->src[0] = a;
4304 result->src[1] = b;
4305
4306 return result;
4307}
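
// Illustrative shapes (arbitrary values): for the 2D case with
//   a = [KW=3, KH=3, IC=64, OC=128],  b = [IW=224, IH=224, IC=64, N=1],  s0=s1=d0=d1=1, p0=p1=1,
// OH = OW = 224 and the result is [IC*KH*KW = 576, OW = 224, OH = 224, N = 1]: each row of
// length 576 is one unrolled 3x3x64 input patch, ready to be multiplied by the reshaped kernel.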
4308
4309struct ggml_tensor * ggml_im2col_back(
4310 struct ggml_context * ctx,
4311 struct ggml_tensor * a,
4312 struct ggml_tensor * b,
4313 int64_t * ne,
4314 int s0,
4315 int s1,
4316 int p0,
4317 int p1,
4318 int d0,
4319 int d1,
4320 bool is_2D) {
4321    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4322    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
4323    ggml_set_op_params(result, params, sizeof(params));
4324
4325 result->op = GGML_OP_IM2COL_BACK;
4326 result->src[0] = a;
4327 result->src[1] = b;
4328
4329 return result;
4330}
4331
4332// ggml_conv_1d
4333
4334struct ggml_tensor * ggml_conv_1d(
4335 struct ggml_context * ctx,
4336 struct ggml_tensor * a,
4337 struct ggml_tensor * b,
4338 int s0,
4339 int p0,
4340 int d0) {
4341    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
4342
4343    struct ggml_tensor * result =
4344        ggml_mul_mat(ctx,
4345                ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
4346                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC,IC, K] => [OC, IC * K]
4347
4348    result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
4349
4350 return result;
4351}
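
// Usage sketch (illustrative only; the shapes and dtypes below are arbitrary examples):
//   struct ggml_tensor * w = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 3, 64, 128);   // kernel [K=3, IC=64, OC=128]
//   struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 100, 64, 1);   // input  [IL=100, IC=64, N=1]
//   struct ggml_tensor * y = ggml_conv_1d(ctx, w, x, /*s0*/ 1, /*p0*/ 1, /*d0*/ 1);
// with s0 = 1, p0 = 1, d0 = 1 the output length is (100 + 2 - 2 - 1) + 1 = 100, so y is [OL=100, OC=128, N=1].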
4352
4353// ggml_conv_1d_ph
4354
4355struct ggml_tensor* ggml_conv_1d_ph(
4356 struct ggml_context * ctx,
4357 struct ggml_tensor * a,
4358 struct ggml_tensor * b,
4359 int s,
4360 int d) {
4361    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
4362}
4363
4364// ggml_conv_1d_dw
4365
4366struct ggml_tensor * ggml_conv_1d_dw(
4367 struct ggml_context * ctx,
4368 struct ggml_tensor * a,
4369 struct ggml_tensor * b,
4370 int s0,
4371 int p0,
4372 int d0) {
4373    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
4374
4375    struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
4376
4377    struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
4378
4379    result = ggml_reshape_3d(ctx, result, result->ne[0], result->ne[2], 1);
4380
4381 return result;
4382}
4383
4384// ggml_conv_1d_dw_ph
4385
4386struct ggml_tensor * ggml_conv_1d_dw_ph(
4387 struct ggml_context * ctx,
4388 struct ggml_tensor * a,
4389 struct ggml_tensor * b,
4390 int s0,
4391 int d0) {
4392    return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
4393}
4394
4395// ggml_conv_transpose_1d
4396
4397static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
4398 return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
4399}
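
// Illustrative example (arbitrary values): ins = 100, ks = 4, s = 2, p = 0, d = 1 gives
//   (100 - 1)*2 - 0 + 1*(4 - 1) + 1 = 198 + 3 + 1 = 202
// i.e. a stride-2 transposed convolution roughly doubles the length.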
4400
4401GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
4402 struct ggml_context * ctx,
4403 struct ggml_tensor * a,
4404 struct ggml_tensor * b,
4405 int s0,
4406 int p0,
4407 int d0) {
4408 GGML_ASSERT(ggml_is_matrix(b));
4409 GGML_ASSERT(a->ne[2] == b->ne[1]);
4410 GGML_ASSERT(a->ne[3] == 1);
4411
4412 GGML_ASSERT(p0 == 0);
4413 GGML_ASSERT(d0 == 1);
4414
4415 const int64_t ne[4] = {
4416        ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
4417        a->ne[1], b->ne[2], 1,
4418    };
4419    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4420
4421    int32_t params[] = { s0, p0, d0 };
4422    ggml_set_op_params(result, params, sizeof(params));
4423
4424 result->op = GGML_OP_CONV_TRANSPOSE_1D;
4425 result->src[0] = a;
4426 result->src[1] = b;
4427
4428 return result;
4429}
4430
4431// ggml_conv_2d
4432
4433// a: [OC,IC, KH, KW]
4434// b: [N, IC, IH, IW]
4435// result: [N, OC, OH, OW]
4436struct ggml_tensor * ggml_conv_2d(
4437 struct ggml_context * ctx,
4438 struct ggml_tensor * a,
4439 struct ggml_tensor * b,
4440 int s0,
4441 int s1,
4442 int p0,
4443 int p1,
4444 int d0,
4445 int d1) {
4446    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW]
4447
4448    struct ggml_tensor * result =
4449        ggml_mul_mat(ctx,
4450                ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
4451                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3]));                       // [OC,IC, KH, KW] => [OC, IC * KH * KW]
4452
4453    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
4454    result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
4455
4456
4457 return result;
4458}
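
// Illustrative shape trace (arbitrary values): with a = [3, 3, 64, 128], b = [224, 224, 64, 1]
// and s0=s1=d0=d1=1, p0=p1=1:
//   im2col              -> [576, 224, 224, 1]
//   mul_mat of reshapes -> [50176, 128]          (= [224*224, OC])
//   reshape_4d          -> [224, 224, 1, 128]
//   permute + cont      -> [224, 224, 128, 1]    (= [OW, OH, OC, N])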
4459
4460// a: [OC*IC, KD, KH, KW]
4461// b: [N*IC, ID, IH, IW]
4462// result: [N*OD, OH, OW, IC * KD * KH * KW]
4463struct ggml_tensor * ggml_im2col_3d(
4464 struct ggml_context * ctx,
4465 struct ggml_tensor * a,
4466 struct ggml_tensor * b,
4467 int64_t IC,
4468 int s0, // stride width
4469 int s1, // stride height
4470 int s2, // stride depth
4471 int p0, // padding width
4472 int p1, // padding height
4473 int p2, // padding depth
4474 int d0, // dilation width
4475 int d1, // dilation height
4476 int d2, // dilation depth
4477 enum ggml_type dst_type) {
4478 const int64_t N = b->ne[3] / IC;
4479 const int64_t ID = b->ne[2];
4480 const int64_t IH = b->ne[1];
4481 const int64_t IW = b->ne[0];
4482
4483 const int64_t OC = a->ne[3] / IC;
4484 UNUSED(OC);
4485 const int64_t KD = a->ne[2];
4486 const int64_t KH = a->ne[1];
4487 const int64_t KW = a->ne[0];
4488    const int64_t OD = ggml_calc_conv_output_size(ID, KD, s2, p2, d2);
4489    const int64_t OH = ggml_calc_conv_output_size(IH, KH, s1, p1, d1);
4490    const int64_t OW = ggml_calc_conv_output_size(IW, KW, s0, p0, d0);
4491
4492 GGML_ASSERT((OD > 0) && "b too small compared to a");
4493 GGML_ASSERT((OH > 0) && "b too small compared to a");
4494 GGML_ASSERT((OW > 0) && "b too small compared to a");
4495
4496
4497 const int64_t ne[4] = {KW*KH*KD*IC, OW, OH, OD*N};
4498
4499    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
4500    int32_t params[] = { s0, s1, s2, p0, p1, p2, d0, d1, d2, (int32_t)IC};
4501    ggml_set_op_params(result, params, sizeof(params));
4502
4503 result->op = GGML_OP_IM2COL_3D;
4504 result->src[0] = a;
4505 result->src[1] = b;
4506
4507 return result;
4508}
4509
4510// a: [OC*IC, KD, KH, KW]
4511// b: [N*IC, ID, IH, IW]
4512// result: [N*OC, OD, OH, OW]
4513struct ggml_tensor * ggml_conv_3d(
4514 struct ggml_context * ctx,
4515 struct ggml_tensor * a,
4516 struct ggml_tensor * b,
4517 int64_t IC,
4518 int s0, // stride width
4519 int s1, // stride height
4520 int s2, // stride depth
4521 int p0, // padding width
4522 int p1, // padding height
4523 int p2, // padding depth
4524 int d0, // dilation width
4525 int d1, // dilation height
4526 int d2 // dilation depth
4527 ) {
4528    struct ggml_tensor * im2col = ggml_im2col_3d(ctx, a, b, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, a->type); // [N*OD, OH, OW, IC * KD * KH * KW]
4529
4530    int64_t OC = a->ne[3] / IC;
4531    int64_t N = b->ne[3] / IC;
4532    struct ggml_tensor * result =
4533        ggml_mul_mat(ctx,
4534                ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N*OD, OH, OW, IC * KD * KH * KW] => [N*OD*OH*OW, IC * KD * KH * KW]
4535                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2] * IC), OC));                        // [OC*IC, KD, KH, KW] => [OC, IC * KD * KH * KW]
4536
4537    int64_t OD = im2col->ne[3] / N;
4538    result = ggml_reshape_4d(ctx, result, im2col->ne[1]*im2col->ne[2], OD, N, OC); // [OC, N*OD*OH*OW] => [OC, N, OD, OH*OW]
4539    result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OD, OH*OW]
4540    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], OD, OC * N); // [N*OC, OD, OH, OW]
4541
4542 return result;
4543}
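
// Illustrative shapes (arbitrary values): with IC=4, OC=8, N=1, a 3x3x3 kernel (a = [3, 3, 3, 32])
// and a 16x16x16 input (b = [16, 16, 16, 4]), s=1, p=1, d=1 in every dimension, we get
// OD = OH = OW = 16 and the final result is [16, 16, 16, 8] = [OW, OH, OD, OC*N].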
4544
4545// ggml_conv_2d_sk_p0
4546
4547struct ggml_tensor * ggml_conv_2d_sk_p0(
4548 struct ggml_context * ctx,
4549 struct ggml_tensor * a,
4550 struct ggml_tensor * b) {
4551    return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
4552}
4553
4554// ggml_conv_2d_s1_ph
4555
4556struct ggml_tensor * ggml_conv_2d_s1_ph(
4557 struct ggml_context * ctx,
4558 struct ggml_tensor * a,
4559 struct ggml_tensor * b) {
4560    return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
4561}
4562
4563// ggml_conv_2d_dw
4564
4565struct ggml_tensor * ggml_conv_2d_dw(
4566 struct ggml_context * ctx,
4567 struct ggml_tensor * a,
4568 struct ggml_tensor * b,
4569 int s0,
4570 int s1,
4571 int p0,
4572 int p1,
4573 int d0,
4574 int d1) {
4575    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
4576    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
4577                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
4578                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
4579    struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
4580
4581    new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
4582    struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
4583    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
4584
4585 return result;
4586}
4587
4588// ggml_conv_2d_dw_direct
4589
4590struct ggml_tensor * ggml_conv_2d_dw_direct(
4591 struct ggml_context * ctx,
4592 struct ggml_tensor * a,
4593 struct ggml_tensor * b,
4594 int stride0,
4595 int stride1,
4596 int pad0,
4597 int pad1,
4598 int dilation0,
4599 int dilation1) {
4600 GGML_ASSERT(a->ne[2] == 1);
4601 GGML_ASSERT(a->ne[3] == b->ne[2]);
4602 int64_t ne[4];
4603    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
4604    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
4605 ne[2] = b->ne[2];
4606 ne[3] = b->ne[3];
4607
4608    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
4609
4610    if (ggml_is_contiguous_channels(b)) {
4611        // Result will be permuted the same way as input (CWHN order)
4612        const int64_t type_size = ggml_type_size(result->type);
4613 GGML_ASSERT(ggml_blck_size(result->type) == 1);
4614 result->nb[0] = result->ne[2] * type_size;
4615 result->nb[1] = result->ne[0] * result->nb[0];
4616 result->nb[2] = type_size;
4617 }
4618
4619 int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
4620    ggml_set_op_params(result, params, sizeof(params));
4621
4622 result->op = GGML_OP_CONV_2D_DW;
4623 result->src[0] = a;
4624 result->src[1] = b;
4625 return result;
4626}
4627
4628// ggml_conv_2d_direct
4629
4630struct ggml_tensor * ggml_conv_2d_direct(
4631 struct ggml_context * ctx,
4632 struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
4633 struct ggml_tensor * b, // input data [W, H, C, N]
4634 int s0, // stride dimension 0
4635 int s1, // stride dimension 1
4636 int p0, // padding dimension 0
4637 int p1, // padding dimension 1
4638 int d0, // dilation dimension 0
4639 int d1) {// dilation dimension 1
4640
4641 GGML_ASSERT(a->ne[2] == b->ne[2]);
4642 //GGML_ASSERT(a->type == b->type);
4643
4644 int64_t ne[4];
4645    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
4646    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
4647    ne[2] = a->ne[3];
4648    ne[3] = b->ne[3];
4649
4650    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
4651
4652    ggml_set_op_params_i32(result, 0, s0);
4653    ggml_set_op_params_i32(result, 1, s1);
4654    ggml_set_op_params_i32(result, 2, p0);
4655    ggml_set_op_params_i32(result, 3, p1);
4656    ggml_set_op_params_i32(result, 4, d0);
4657    ggml_set_op_params_i32(result, 5, d1);
4658
4659 result->op = GGML_OP_CONV_2D;
4660 result->src[0] = a;
4661 result->src[1] = b;
4662
4663 return result;
4664}
4665
4666// ggml_conv_3d_direct
4667
4668struct ggml_tensor * ggml_conv_3d_direct(
4669 struct ggml_context * ctx,
4670 struct ggml_tensor * a,
4671 struct ggml_tensor * b,
4672 int s0,
4673 int s1,
4674 int s2,
4675 int p0,
4676 int p1,
4677 int p2,
4678 int d0,
4679 int d1,
4680 int d2,
4681 int c,
4682 int n,
4683 int oc) {
4684
4685 GGML_ASSERT(a->ne[3] == (int64_t) c * oc);
4686 GGML_ASSERT(b->ne[3] == (int64_t) c * n);
4687
4688 int64_t ne[4];
4689    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
4690    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
4691    ne[2] = ggml_calc_conv_output_size(b->ne[2], a->ne[2], s2, p2, d2);
4692    ne[3] = (int64_t) oc * n;
4693
4694    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4695
4696    ggml_set_op_params_i32(result, 0, s0);
4697    ggml_set_op_params_i32(result, 1, s1);
4698    ggml_set_op_params_i32(result, 2, s2);
4699    ggml_set_op_params_i32(result, 3, p0);
4700    ggml_set_op_params_i32(result, 4, p1);
4701    ggml_set_op_params_i32(result, 5, p2);
4702    ggml_set_op_params_i32(result, 6, d0);
4703    ggml_set_op_params_i32(result, 7, d1);
4704    ggml_set_op_params_i32(result, 8, d2);
4705    ggml_set_op_params_i32(result, 9, c);
4706    ggml_set_op_params_i32(result, 10, n);
4707    ggml_set_op_params_i32(result, 11, oc);
4708
4709 result->op = GGML_OP_CONV_3D;
4710 result->src[0] = a;
4711 result->src[1] = b;
4712
4713 return result;
4714}
4715
4716// ggml_conv_transpose_2d_p0
4717
4718static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
4719 return (ins - 1) * s - 2 * p + ks;
4720}
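
// Illustrative example (arbitrary values): ins = 32, ks = 4, s = 2, p = 0 gives
//   (32 - 1)*2 - 0 + 4 = 66
// i.e. roughly a 2x upsampling plus the kernel overhang.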
4721
4722struct ggml_tensor * ggml_conv_transpose_2d_p0(
4723 struct ggml_context * ctx,
4724 struct ggml_tensor * a,
4725 struct ggml_tensor * b,
4726 int stride) {
4727 GGML_ASSERT(a->ne[3] == b->ne[2]);
4728
4729 const int64_t ne[4] = {
4730        ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
4731        ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
4732        a->ne[2], b->ne[3],
4733    };
4734
4735    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4736
4737    ggml_set_op_params_i32(result, 0, stride);
4738
4739 result->op = GGML_OP_CONV_TRANSPOSE_2D;
4740 result->src[0] = a;
4741 result->src[1] = b;
4742
4743 return result;
4744}
4745
4746// ggml_pool_*
4747
4748static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
4749 return (ins + 2 * p - ks) / s + 1;
4750}
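
// Illustrative example (arbitrary values): ins = 224, ks = 2, s = 2, p = 0 gives
//   (224 + 0 - 2) / 2 + 1 = 112
// i.e. the usual non-overlapping 2x2 pooling halves each pooled dimension.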
4751
4752// ggml_pool_1d
4753
4754struct ggml_tensor * ggml_pool_1d(
4755 struct ggml_context * ctx,
4756 struct ggml_tensor * a,
4757 enum ggml_op_pool op,
4758 int k0,
4759 int s0,
4760 int p0) {
4761 const int64_t ne[4] = {
4762        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
4763 a->ne[1],
4764 a->ne[2],
4765 a->ne[3],
4766 };
4767    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4768
4769    int32_t params[] = { op, k0, s0, p0 };
4770    ggml_set_op_params(result, params, sizeof(params));
4771
4772 result->op = GGML_OP_POOL_1D;
4773 result->src[0] = a;
4774
4775 return result;
4776}
4777
4778// ggml_pool_2d
4779
4780struct ggml_tensor * ggml_pool_2d(
4781 struct ggml_context * ctx,
4782 struct ggml_tensor * a,
4783 enum ggml_op_pool op,
4784 int k0,
4785 int k1,
4786 int s0,
4787 int s1,
4788 float p0,
4789 float p1) {
4790 struct ggml_tensor * result;
4791 const int64_t ne[4] = {
4792        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
4793        ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
4794 a->ne[2],
4795 a->ne[3],
4796 };
4797    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4798
4799    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
4800    ggml_set_op_params(result, params, sizeof(params));
4801
4802 result->op = GGML_OP_POOL_2D;
4803 result->src[0] = a;
4804
4805 return result;
4806}
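
// Usage sketch (illustrative only; the shapes below are arbitrary examples):
//   struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 224, 224, 64, 1);   // [W, H, C, N]
//   struct ggml_tensor * y = ggml_pool_2d(ctx, x, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);  // -> [112, 112, 64, 1]
// only the first two dimensions are pooled; channels and batch pass through unchanged.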
4807
4808struct ggml_tensor * ggml_pool_2d_back(
4809 struct ggml_context * ctx,
4810 struct ggml_tensor * a,
4811 struct ggml_tensor * af,
4812 enum ggml_op_pool op,
4813 int k0,
4814 int k1,
4815 int s0,
4816 int s1,
4817 float p0,
4818 float p1) {
4819 struct ggml_tensor * result;
4820    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne);
4821
4822    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
4823    ggml_set_op_params(result, params, sizeof(params));
4824
4825 result->op = GGML_OP_POOL_2D_BACK;
4826 result->src[0] = a;
4827 result->src[1] = af;
4828
4829 return result;
4830}
4831
4832// ggml_upscale / ggml_interpolate
4833
4834static struct ggml_tensor * ggml_interpolate_impl(
4835 struct ggml_context * ctx,
4836 struct ggml_tensor * a,
4837 int64_t ne0,
4838 int64_t ne1,
4839 int64_t ne2,
4840 int64_t ne3,
4841 uint32_t mode) {
4842 GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT);
4843
4844    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
4845
4846    ggml_set_op_params_i32(result, 0, (int32_t)mode);
4847
4848 result->op = GGML_OP_UPSCALE;
4849 result->src[0] = a;
4850
4851 return result;
4852}
4853
4854struct ggml_tensor * ggml_upscale(
4855 struct ggml_context * ctx,
4856 struct ggml_tensor * a,
4857 int scale_factor,
4858 enum ggml_scale_mode mode) {
4859 GGML_ASSERT(scale_factor > 1);
4860    return ggml_interpolate_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode);
4861}
4862
4863struct ggml_tensor * ggml_upscale_ext(
4864 struct ggml_context * ctx,
4865 struct ggml_tensor * a,
4866 int ne0,
4867 int ne1,
4868 int ne2,
4869 int ne3,
4870 enum ggml_scale_mode mode) {
4871 return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
4872}
4873
4874struct ggml_tensor * ggml_interpolate(
4875 struct ggml_context * ctx,
4876 struct ggml_tensor * a,
4877 int64_t ne0,
4878 int64_t ne1,
4879 int64_t ne2,
4880 int64_t ne3,
4881 uint32_t mode) {
4882 return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
4883}
4884
4885// ggml_pad
4886
4887struct ggml_tensor * ggml_pad(
4888 struct ggml_context * ctx,
4889 struct ggml_tensor * a,
4890 int p0,
4891 int p1,
4892 int p2,
4893 int p3) {
4894    return ggml_pad_ext(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3);
4895}
4896
4897struct ggml_tensor * ggml_pad_ext(
4898 struct ggml_context * ctx,
4899 struct ggml_tensor * a,
4900 int lp0,
4901 int rp0,
4902 int lp1,
4903 int rp1,
4904 int lp2,
4905 int rp2,
4906 int lp3,
4907 int rp3
4908 ) {
4909    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
4910            a->ne[0] + lp0 + rp0,
4911            a->ne[1] + lp1 + rp1,
4912            a->ne[2] + lp2 + rp2,
4913            a->ne[3] + lp3 + rp3);
4914
4915    ggml_set_op_params_i32(result, 0, lp0);
4916    ggml_set_op_params_i32(result, 1, rp0);
4917    ggml_set_op_params_i32(result, 2, lp1);
4918    ggml_set_op_params_i32(result, 3, rp1);
4919    ggml_set_op_params_i32(result, 4, lp2);
4920    ggml_set_op_params_i32(result, 5, rp2);
4921    ggml_set_op_params_i32(result, 6, lp3);
4922    ggml_set_op_params_i32(result, 7, rp3);
4923
4924
4925 result->op = GGML_OP_PAD;
4926 result->src[0] = a;
4927
4928 return result;
4929}
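
// Illustrative example (arbitrary shapes): for a = [10, 7, 3, 1],
//   ggml_pad_ext(ctx, a, /*lp0*/ 1, /*rp0*/ 2, 0, 0, 0, 0, 0, 0)
// returns a [13, 7, 3, 1] tensor: only dimension 0 grows, by lp0 elements on the left and rp0 on the right.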
4930
4931// ggml_pad_reflect_1d
4932
4933struct ggml_tensor * ggml_pad_reflect_1d(
4934 struct ggml_context * ctx,
4935 struct ggml_tensor * a,
4936 int p0,
4937 int p1) {
4938 GGML_ASSERT(p0 >= 0);
4939 GGML_ASSERT(p1 >= 0);
4940
4941    GGML_ASSERT(p0 < a->ne[0]); // padding length on each side must be less than the
4942    GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
4943
4944 GGML_ASSERT(ggml_is_contiguous(a));
4945 GGML_ASSERT(a->type == GGML_TYPE_F32);
4946
4947    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
4948            a->ne[0] + p0 + p1,
4949            a->ne[1],
4950            a->ne[2],
4951            a->ne[3]);
4952
4953    int32_t params[] = { p0, p1 };
4954    ggml_set_op_params(result, params, sizeof(params));
4955
4956 result->op = GGML_OP_PAD_REFLECT_1D;
4957 result->src[0] = a;
4958
4959 return result;
4960}
4961
4962// ggml_roll
4963
4964struct ggml_tensor * ggml_roll(
4965 struct ggml_context * ctx,
4966 struct ggml_tensor * a,
4967 int shift0,
4968 int shift1,
4969 int shift2,
4970 int shift3) {
4971 GGML_ASSERT(a->nb[0] == ggml_type_size(a->type));
4972 GGML_ASSERT(abs(shift0) < a->ne[0]);
4973 GGML_ASSERT(abs(shift1) < a->ne[1]);
4974 GGML_ASSERT(abs(shift2) < a->ne[2]);
4975 GGML_ASSERT(abs(shift3) < a->ne[3]);
4976
4977    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
4978
4979    ggml_set_op_params_i32(result, 0, shift0);
4980    ggml_set_op_params_i32(result, 1, shift1);
4981    ggml_set_op_params_i32(result, 2, shift2);
4982    ggml_set_op_params_i32(result, 3, shift3);
4983
4984 result->op = GGML_OP_ROLL;
4985 result->src[0] = a;
4986
4987 return result;
4988}
4989
4990// ggml_arange
4991
4992struct ggml_tensor * ggml_arange(
4993 struct ggml_context * ctx,
4994 float start,
4995 float stop,
4996 float step) {
4997 GGML_ASSERT(stop > start);
4998
4999    const int64_t steps = (int64_t) ceilf((stop - start) / step);
5000
5001    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
5002
5003    ggml_set_op_params_f32(result, 0, start);
5004    ggml_set_op_params_f32(result, 1, stop);
5005    ggml_set_op_params_f32(result, 2, step);
5006
5007 result->op = GGML_OP_ARANGE;
5008
5009 return result;
5010}
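
// Illustrative example (arbitrary values): ggml_arange(ctx, 0.0f, 5.0f, 1.5f) allocates
//   ceil((5.0 - 0.0) / 1.5) = 4 elements, which the op fills as { 0.0, 1.5, 3.0, 4.5 }.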
5011
5012// ggml_timestep_embedding
5013
5014struct ggml_tensor * ggml_timestep_embedding(
5015 struct ggml_context * ctx,
5016 struct ggml_tensor * timesteps,
5017 int dim,
5018 int max_period) {
5019
5020    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps->ne[0]);
5021
5022    ggml_set_op_params_i32(result, 0, dim);
5023    ggml_set_op_params_i32(result, 1, max_period);
5024
5025 result->op = GGML_OP_TIMESTEP_EMBEDDING;
5026 result->src[0] = timesteps;
5027
5028 return result;
5029}
5030
5031// ggml_argsort
5032
5033struct ggml_tensor * ggml_argsort(
5034 struct ggml_context * ctx,
5035 struct ggml_tensor * a,
5036 enum ggml_sort_order order) {
5037 GGML_ASSERT(a->ne[0] <= INT32_MAX);
5038    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
5039
5040    ggml_set_op_params_i32(result, 0, (int32_t) order);
5041
5042 result->op = GGML_OP_ARGSORT;
5043 result->src[0] = a;
5044
5045 return result;
5046}
5047
5048// ggml_top_k
5049
5050struct ggml_tensor * ggml_top_k(
5051 struct ggml_context * ctx,
5052 struct ggml_tensor * a,
5053 int k) {
5054 GGML_ASSERT(a->ne[0] >= k);
5055
5056    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
5057
5058    result = ggml_view_4d(ctx, result,
5059                k, result->ne[1], result->ne[2], result->ne[3],
5060                   result->nb[1], result->nb[2], result->nb[3],
5061                0);
5062
5063 return result;
5064}
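
// Note: the result is a view of the argsort output, so it holds I32 indices into each row of a
// (ordered by descending value), not the top-k values themselves. For example, for a row
// { 0.1, 0.9, 0.4, 0.7 } and k = 2 the corresponding result row is { 1, 3 }.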
5065
5066// ggml_flash_attn_ext
5067
5068struct ggml_tensor * ggml_flash_attn_ext(
5069 struct ggml_context * ctx,
5070 struct ggml_tensor * q,
5071 struct ggml_tensor * k,
5072 struct ggml_tensor * v,
5073 struct ggml_tensor * mask,
5074 float scale,
5075 float max_bias,
5076 float logit_softcap) {
5077 GGML_ASSERT(ggml_can_mul_mat(k, q));
5078 // TODO: check if vT can be multiplied by (k*qT)
5079
5080 GGML_ASSERT(q->ne[3] == k->ne[3]);
5081 GGML_ASSERT(q->ne[3] == v->ne[3]);
5082
5083 if (mask) {
5084 GGML_ASSERT(ggml_is_contiguous(mask));
5085 GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
5086 "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
5087 //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
5088
5089 GGML_ASSERT(q->ne[2] % mask->ne[2] == 0);
5090 GGML_ASSERT(q->ne[3] % mask->ne[3] == 0);
5091 }
5092
5093 if (max_bias > 0.0f) {
5094 GGML_ASSERT(mask);
5095 }
5096
5097 // permute(0, 2, 1, 3)
5098 int64_t ne[4] = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] };
5099    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5100
5101    float params[] = { scale, max_bias, logit_softcap };
5102    ggml_set_op_params(result, params, sizeof(params));
5103
5104 result->op = GGML_OP_FLASH_ATTN_EXT;
5105 result->src[0] = q;
5106 result->src[1] = k;
5107 result->src[2] = v;
5108 result->src[3] = mask;
5109
5110 return result;
5111}
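
// Illustrative shapes, following the ne[] computed above:
//   q = [d_head, n_q,  n_head,    n_batch]
//   k = [d_head, n_kv, n_head_kv, n_batch]
//   v = [d_v,    n_kv, n_head_kv, n_batch]
//   result = [d_v, n_head, n_q, n_batch]
// i.e. the output is already permuted so the head dimension precedes the query position; e.g.
// d_v = 128, n_head = 32, n_q = 64, n_batch = 1 gives a [128, 32, 64, 1] output.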
5112
5113void ggml_flash_attn_ext_set_prec(
5114 struct ggml_tensor * a,
5115 enum ggml_prec prec) {
5116 GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
5117
5118 const int32_t prec_i32 = (int32_t) prec;
5119
5120    ggml_set_op_params_i32(a, 3, prec_i32); // scale is on first pos, max_bias on second
5121}
5122
5123enum ggml_prec ggml_flash_attn_ext_get_prec(
5124 const struct ggml_tensor * a) {
5125 GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
5126
5127    const int32_t prec_i32 = ggml_get_op_params_i32(a, 3);
5128
5129 return (enum ggml_prec) prec_i32;
5130}
5131
5132void ggml_flash_attn_ext_add_sinks(
5133 struct ggml_tensor * a,
5134 struct ggml_tensor * sinks) {
5135 if (!sinks) {
5136 a->src[4] = NULL;
5137 return;
5138 }
5139
5140 GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
5141 GGML_ASSERT(a->src[4] == NULL);
5142 GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
5143 GGML_ASSERT(sinks->type == GGML_TYPE_F32);
5144
5145 a->src[4] = sinks;
5146}
5147
5148// ggml_flash_attn_back
5149
5150struct ggml_tensor * ggml_flash_attn_back(
5151 struct ggml_context * ctx,
5152 struct ggml_tensor * q,
5153 struct ggml_tensor * k,
5154 struct ggml_tensor * v,
5155 struct ggml_tensor * d,
5156 bool masked) {
5157 GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes");
5158
5159 GGML_ASSERT(ggml_can_mul_mat(k, q));
5160 // TODO: check if vT can be multiplied by (k*qT)
5161
5162 // d shape [D,N,ne2,ne3]
5163 // q shape [D,N,ne2,ne3]
5164 // k shape [D,M,kvne2,ne3]
5165 // v shape [M,D,kvne2,ne3]
5166
5167 const int64_t D = q->ne[0];
5168 const int64_t N = q->ne[1];
5169 const int64_t M = k->ne[1];
5170 const int64_t ne2 = q->ne[2];
5171 const int64_t ne3 = q->ne[3];
5172 const int64_t kvne2 = k->ne[2];
5173
5174 GGML_ASSERT(k->ne[0] == D);
5175 GGML_ASSERT(v->ne[0] == M);
5176 GGML_ASSERT(v->ne[1] == D);
5177 GGML_ASSERT(d->ne[0] == D);
5178 GGML_ASSERT(d->ne[1] == N);
5179 GGML_ASSERT(k->ne[2] == kvne2);
5180 GGML_ASSERT(k->ne[3] == ne3);
5181 GGML_ASSERT(v->ne[2] == kvne2);
5182 GGML_ASSERT(v->ne[3] == ne3);
5183 GGML_ASSERT(d->ne[2] == ne2);
5184 GGML_ASSERT(d->ne[3] == ne3);
5185
5186 GGML_ASSERT(ne2 % kvne2 == 0);
5187
5188    // store gradients of q, k and v as contiguous tensors concatenated in result.
5189    // note: v and gradv are actually transposed, i.e. v->ne[0] != D.
5190    const int64_t elem_q = ggml_nelements(q);
5191    const int64_t elem_k = ggml_nelements(k);
5192    const int64_t elem_v = ggml_nelements(v);
5193
5194 enum ggml_type result_type = GGML_TYPE_F32;
5195 GGML_ASSERT(ggml_blck_size(result_type) == 1);
5196    const size_t tsize = ggml_type_size(result_type);
5197
5198 const size_t offs_q = 0;
5199 const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
5200 const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
5201 const size_t end = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN);
5202
5203 const size_t nelements = (end + tsize - 1)/tsize;
5204
5205    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements);
5206
5207    int32_t masked_i = masked ? 1 : 0;
5208    ggml_set_op_params(result, &masked_i, sizeof(masked_i));
5209
5210 result->op = GGML_OP_FLASH_ATTN_BACK;
5211 result->src[0] = q;
5212 result->src[1] = k;
5213 result->src[2] = v;
5214 result->src[3] = d;
5215
5216 return result;
5217}
5218
5219// ggml_ssm_conv
5220
5221struct ggml_tensor * ggml_ssm_conv(
5222 struct ggml_context * ctx,
5223 struct ggml_tensor * sx,
5224 struct ggml_tensor * c) {
5225 GGML_ASSERT(ggml_is_3d(sx));
5226 GGML_ASSERT(ggml_is_matrix(c));
5227
5228 const int64_t d_conv = c->ne[0];
5229 const int64_t d_inner = c->ne[1];
5230 const int64_t n_t = sx->ne[0] - d_conv + 1; // tokens per sequence
5231 const int64_t n_s = sx->ne[2];
5232
5233 // TODO: maybe support other strides than 1?
5234 GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
5235 GGML_ASSERT(sx->ne[1] == d_inner);
5236 GGML_ASSERT(n_t >= 0);
5237
5238    struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_t, n_s);
5239
5240 result->op = GGML_OP_SSM_CONV;
5241 result->src[0] = sx;
5242 result->src[1] = c;
5243
5244 return result;
5245}
5246
5247// ggml_ssm_scan
5248
5249struct ggml_tensor * ggml_ssm_scan(
5250 struct ggml_context * ctx,
5251 struct ggml_tensor * s,
5252 struct ggml_tensor * x,
5253 struct ggml_tensor * dt,
5254 struct ggml_tensor * A,
5255 struct ggml_tensor * B,
5256 struct ggml_tensor * C,
5257 struct ggml_tensor * ids) {
5258 GGML_ASSERT(ggml_is_contiguous(s));
5259 GGML_ASSERT(ggml_is_contiguous(dt));
5260 GGML_ASSERT(ggml_is_contiguous(A));
5261 GGML_ASSERT(x->nb[0] == ggml_type_size(x->type));
5262 GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
5263 GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
5264 GGML_ASSERT(x->nb[1] == x->ne[0]*x->nb[0]);
5265 GGML_ASSERT(B->nb[1] == B->ne[0]*B->nb[0]);
5266 GGML_ASSERT(C->nb[1] == C->ne[0]*C->nb[0]);
5267 GGML_ASSERT(ggml_are_same_shape(B, C));
5268 GGML_ASSERT(ids->type == GGML_TYPE_I32);
5269
5270 {
5271 const int64_t d_state = s->ne[0];
5272 const int64_t head_dim = x->ne[0];
5273 const int64_t n_head = x->ne[1];
5274 const int64_t n_seq_tokens = x->ne[2];
5275 const int64_t n_seqs = x->ne[3];
5276
5277 GGML_ASSERT(dt->ne[0] == n_head);
5278 GGML_ASSERT(dt->ne[1] == n_seq_tokens);
5279 GGML_ASSERT(dt->ne[2] == n_seqs);
5280 GGML_ASSERT(ggml_is_3d(dt));
5281 GGML_ASSERT(s->ne[1] == head_dim);
5282 GGML_ASSERT(s->ne[2] == n_head);
5283 GGML_ASSERT(B->ne[0] == d_state);
5284 GGML_ASSERT(B->ne[2] == n_seq_tokens);
5285 GGML_ASSERT(B->ne[3] == n_seqs);
5286 GGML_ASSERT(ids->ne[0] == n_seqs);
5287 GGML_ASSERT(ggml_is_vector(ids));
5288 GGML_ASSERT(A->ne[1] == n_head);
5289 GGML_ASSERT(ggml_is_matrix(A));
5290
5291 if (A->ne[0] != 1) {
5292 // Mamba-1 has more granular decay factors
5293 GGML_ASSERT(A->ne[0] == d_state);
5294 }
5295 }
5296
5297 // concatenated y + ssm_states
5298    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + s->ne[0]*s->ne[1]*s->ne[2]*ids->ne[0]);
5299
5300 result->op = GGML_OP_SSM_SCAN;
5301 result->src[0] = s;
5302 result->src[1] = x;
5303 result->src[2] = dt;
5304 result->src[3] = A;
5305 result->src[4] = B;
5306 result->src[5] = C;
5307 result->src[6] = ids;
5308
5309 return result;
5310}
5311
5312// ggml_win_part
5313
5314struct ggml_tensor * ggml_win_part(
5315 struct ggml_context * ctx,
5316 struct ggml_tensor * a,
5317 int w) {
5318 GGML_ASSERT(a->ne[3] == 1);
5319 GGML_ASSERT(a->type == GGML_TYPE_F32);
5320
5321 // padding
5322 const int px = (w - a->ne[1]%w)%w;
5323 const int py = (w - a->ne[2]%w)%w;
5324
5325 const int npx = (px + a->ne[1])/w;
5326 const int npy = (py + a->ne[2])/w;
5327 const int np = npx*npy;
5328
5329 const int64_t ne[4] = { a->ne[0], w, w, np, };
5330    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5331
5332    int32_t params[] = { npx, npy, w };
5333    ggml_set_op_params(result, params, sizeof(params));
5334
5335 result->op = GGML_OP_WIN_PART;
5336 result->src[0] = a;
5337
5338 return result;
5339}
5340
5341// ggml_win_unpart
5342
5343struct ggml_tensor * ggml_win_unpart(
5344 struct ggml_context * ctx,
5345 struct ggml_tensor * a,
5346 int w0,
5347 int h0,
5348 int w) {
5349 GGML_ASSERT(a->type == GGML_TYPE_F32);
5350
5351 const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
5352    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
5353
5354    int32_t params[] = { w };
5355    ggml_set_op_params(result, params, sizeof(params));
5356
5357 result->op = GGML_OP_WIN_UNPART;
5358 result->src[0] = a;
5359
5360 return result;
5361}
5362
5363// ggml_get_rel_pos
5364
5365struct ggml_tensor * ggml_get_rel_pos(
5366 struct ggml_context * ctx,
5367 struct ggml_tensor * a,
5368 int qh,
5369 int kh) {
5370 GGML_ASSERT(qh == kh);
5371 GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
5372
5373 const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
5374    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
5375
5376 result->op = GGML_OP_GET_REL_POS;
5377 result->src[0] = a;
5378
5379 return result;
5380}
5381
5382// ggml_add_rel_pos
5383
5384static struct ggml_tensor * ggml_add_rel_pos_impl(
5385 struct ggml_context * ctx,
5386 struct ggml_tensor * a,
5387 struct ggml_tensor * pw,
5388 struct ggml_tensor * ph,
5389 bool inplace) {
5390 GGML_ASSERT(ggml_are_same_shape(pw, ph));
5391 GGML_ASSERT(ggml_is_contiguous(a));
5392 GGML_ASSERT(ggml_is_contiguous(pw));
5393 GGML_ASSERT(ggml_is_contiguous(ph));
5394 GGML_ASSERT(ph->type == GGML_TYPE_F32);
5395 GGML_ASSERT(pw->type == GGML_TYPE_F32);
5396 GGML_ASSERT(pw->ne[3] == a->ne[2]);
5397 GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
5398 GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
5399
5400    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5401    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
5402
5403 result->op = GGML_OP_ADD_REL_POS;
5404 result->src[0] = a;
5405 result->src[1] = pw;
5406 result->src[2] = ph;
5407
5408 return result;
5409}
5410
5411struct ggml_tensor * ggml_add_rel_pos(
5412 struct ggml_context * ctx,
5413 struct ggml_tensor * a,
5414 struct ggml_tensor * pw,
5415 struct ggml_tensor * ph) {
5416 return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
5417}
5418
5419struct ggml_tensor * ggml_add_rel_pos_inplace(
5420 struct ggml_context * ctx,
5421 struct ggml_tensor * a,
5422 struct ggml_tensor * pw,
5423 struct ggml_tensor * ph) {
5424 return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
5425}
5426
5427// ggml_rwkv_wkv6
5428
5429struct ggml_tensor * ggml_rwkv_wkv6(
5430 struct ggml_context * ctx,
5431 struct ggml_tensor * k,
5432 struct ggml_tensor * v,
5433 struct ggml_tensor * r,
5434 struct ggml_tensor * tf,
5435 struct ggml_tensor * td,
5436 struct ggml_tensor * state) {
5437 GGML_ASSERT(ggml_is_contiguous(k));
5438 GGML_ASSERT(ggml_is_contiguous(v));
5439 GGML_ASSERT(ggml_is_contiguous(r));
5440 GGML_ASSERT(ggml_is_contiguous(tf));
5441 GGML_ASSERT(ggml_is_contiguous(td));
5442 GGML_ASSERT(ggml_is_contiguous(state));
5443
5444 const int64_t S = k->ne[0];
5445 const int64_t H = k->ne[1];
5446 const int64_t n_tokens = k->ne[2];
5447 const int64_t n_seqs = state->ne[1];
5448 {
5449 GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
5450 GGML_ASSERT(r->ne[0] == S && r->ne[1] == H && r->ne[2] == n_tokens);
5451 GGML_ASSERT(td->ne[0] == S && td->ne[1] == H && td->ne[2] == n_tokens);
5452 GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
5453 }
5454
5455 // concat output and new_state
5456 const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
5457    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5458
5459 result->op = GGML_OP_RWKV_WKV6;
5460 result->src[0] = k;
5461 result->src[1] = v;
5462 result->src[2] = r;
5463 result->src[3] = tf;
5464 result->src[4] = td;
5465 result->src[5] = state;
5466
5467 return result;
5468}
5469
5470// ggml_gated_linear_attn
5471
5472struct ggml_tensor * ggml_gated_linear_attn(
5473 struct ggml_context * ctx,
5474 struct ggml_tensor * k,
5475 struct ggml_tensor * v,
5476 struct ggml_tensor * q,
5477 struct ggml_tensor * g,
5478 struct ggml_tensor * state,
5479 float scale) {
5480 GGML_ASSERT(ggml_is_contiguous(k));
5481 GGML_ASSERT(ggml_is_contiguous(v));
5482 GGML_ASSERT(ggml_is_contiguous(q));
5483 GGML_ASSERT(ggml_is_contiguous(g));
5484 GGML_ASSERT(ggml_is_contiguous(state));
5485
5486 const int64_t S = k->ne[0];
5487 const int64_t H = k->ne[1];
5488 const int64_t n_tokens = k->ne[2];
5489 const int64_t n_seqs = state->ne[1];
5490 {
5491 GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
5492 GGML_ASSERT(q->ne[0] == S && q->ne[1] == H && q->ne[2] == n_tokens);
5493 GGML_ASSERT(g->ne[0] == S && g->ne[1] == H && g->ne[2] == n_tokens);
5494 GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
5495 }
5496
5497 // concat output and new_state
5498 const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
5499    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5500
5501    ggml_set_op_params_f32(result, 0, scale);
5502
5503 result->op = GGML_OP_GATED_LINEAR_ATTN;
5504 result->src[0] = k;
5505 result->src[1] = v;
5506 result->src[2] = q;
5507 result->src[3] = g;
5508 result->src[4] = state;
5509
5510 return result;
5511}
5512
5513// ggml_rwkv_wkv7
5514
5515struct ggml_tensor * ggml_rwkv_wkv7(
5516 struct ggml_context * ctx,
5517 struct ggml_tensor * r,
5518 struct ggml_tensor * w,
5519 struct ggml_tensor * k,
5520 struct ggml_tensor * v,
5521 struct ggml_tensor * a,
5522 struct ggml_tensor * b,
5523 struct ggml_tensor * state) {
5524 GGML_ASSERT(ggml_is_contiguous(r));
5525 GGML_ASSERT(ggml_is_contiguous(w));
5526 GGML_ASSERT(ggml_is_contiguous(k));
5527 GGML_ASSERT(ggml_is_contiguous(v));
5528 GGML_ASSERT(ggml_is_contiguous(a));
5529 GGML_ASSERT(ggml_is_contiguous(b));
5530 GGML_ASSERT(ggml_is_contiguous(state));
5531
5532 const int64_t S = k->ne[0];
5533 const int64_t H = k->ne[1];
5534 const int64_t n_tokens = k->ne[2];
5535 const int64_t n_seqs = state->ne[1];
5536 {
5537 GGML_ASSERT(w->ne[0] == S && w->ne[1] == H && w->ne[2] == n_tokens);
5538 GGML_ASSERT(k->ne[0] == S && k->ne[1] == H && k->ne[2] == n_tokens);
5539 GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
5540 GGML_ASSERT(a->ne[0] == S && a->ne[1] == H && a->ne[2] == n_tokens);
5541 GGML_ASSERT(b->ne[0] == S && b->ne[1] == H && b->ne[2] == n_tokens);
5542 GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
5543 }
5544
5545 // concat output and new_state
5546 const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
5547    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5548
5549 result->op = GGML_OP_RWKV_WKV7;
5550 result->src[0] = r;
5551 result->src[1] = w;
5552 result->src[2] = k;
5553 result->src[3] = v;
5554 result->src[4] = a;
5555 result->src[5] = b;
5556 result->src[6] = state;
5557
5558 return result;
5559}
5560
5561// ggml_unary
5562
5563static struct ggml_tensor * ggml_unary_impl(
5564 struct ggml_context * ctx,
5565 struct ggml_tensor * a,
5566 enum ggml_unary_op op,
5567 bool inplace) {
5568 GGML_ASSERT(ggml_is_contiguous_1(a));
5569
5570    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5571
5572    ggml_set_op_params_i32(result, 0, (int32_t) op);
5573
5574 result->op = GGML_OP_UNARY;
5575 result->src[0] = a;
5576
5577 return result;
5578}
5579
5580struct ggml_tensor * ggml_unary(
5581 struct ggml_context * ctx,
5582 struct ggml_tensor * a,
5583 enum ggml_unary_op op) {
5584 return ggml_unary_impl(ctx, a, op, false);
5585}
5586
5587struct ggml_tensor * ggml_unary_inplace(
5588 struct ggml_context * ctx,
5589 struct ggml_tensor * a,
5590 enum ggml_unary_op op) {
5591 return ggml_unary_impl(ctx, a, op, true);
5592}
5593
5594// ggml_map_custom1
5595
5596static struct ggml_tensor * ggml_map_custom1_impl(
5597 struct ggml_context * ctx,
5598 struct ggml_tensor * a,
5599 const ggml_custom1_op_t fun,
5600 int n_tasks,
5601 void * userdata,
5602 bool inplace) {
5603 GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
5604
5605    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5606
5607 struct ggml_map_custom1_op_params params = {
5608 /*.fun =*/ fun,
5609 /*.n_tasks =*/ n_tasks,
5610 /*.userdata =*/ userdata
5611 };
5612    ggml_set_op_params(result, &params, sizeof(params));
5613
5614 result->op = GGML_OP_MAP_CUSTOM1;
5615 result->src[0] = a;
5616
5617 return result;
5618}
5619
5620struct ggml_tensor * ggml_map_custom1(
5621 struct ggml_context * ctx,
5622 struct ggml_tensor * a,
5623 const ggml_custom1_op_t fun,
5624 int n_tasks,
5625 void * userdata) {
5626 return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
5627}
5628
5629struct ggml_tensor * ggml_map_custom1_inplace(
5630 struct ggml_context * ctx,
5631 struct ggml_tensor * a,
5632 const ggml_custom1_op_t fun,
5633 int n_tasks,
5634 void * userdata) {
5635 return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
5636}
5637
5638// ggml_map_custom2
5639
5640static struct ggml_tensor * ggml_map_custom2_impl(
5641 struct ggml_context * ctx,
5642 struct ggml_tensor * a,
5643 struct ggml_tensor * b,
5644 const ggml_custom2_op_t fun,
5645 int n_tasks,
5646 void * userdata,
5647 bool inplace) {
5648 GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
5649
5650    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5651
5652 struct ggml_map_custom2_op_params params = {
5653 /*.fun =*/ fun,
5654 /*.n_tasks =*/ n_tasks,
5655 /*.userdata =*/ userdata
5656 };
5657    ggml_set_op_params(result, &params, sizeof(params));
5658
5659 result->op = GGML_OP_MAP_CUSTOM2;
5660 result->src[0] = a;
5661 result->src[1] = b;
5662
5663 return result;
5664}
5665
5666struct ggml_tensor * ggml_map_custom2(
5667 struct ggml_context * ctx,
5668 struct ggml_tensor * a,
5669 struct ggml_tensor * b,
5670 const ggml_custom2_op_t fun,
5671 int n_tasks,
5672 void * userdata) {
5673 return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
5674}
5675
5676struct ggml_tensor * ggml_map_custom2_inplace(
5677 struct ggml_context * ctx,
5678 struct ggml_tensor * a,
5679 struct ggml_tensor * b,
5680 const ggml_custom2_op_t fun,
5681 int n_tasks,
5682 void * userdata) {
5683 return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
5684}
5685
5686// ggml_map_custom3
5687
5688static struct ggml_tensor * ggml_map_custom3_impl(
5689 struct ggml_context * ctx,
5690 struct ggml_tensor * a,
5691 struct ggml_tensor * b,
5692 struct ggml_tensor * c,
5693 const ggml_custom3_op_t fun,
5694 int n_tasks,
5695 void * userdata,
5696 bool inplace) {
5697 GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
5698
5699    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5700
5701 struct ggml_map_custom3_op_params params = {
5702 /*.fun =*/ fun,
5703 /*.n_tasks =*/ n_tasks,
5704 /*.userdata =*/ userdata
5705 };
5706    ggml_set_op_params(result, &params, sizeof(params));
5707
5708 result->op = GGML_OP_MAP_CUSTOM3;
5709 result->src[0] = a;
5710 result->src[1] = b;
5711 result->src[2] = c;
5712
5713 return result;
5714}
5715
5716struct ggml_tensor * ggml_map_custom3(
5717 struct ggml_context * ctx,
5718 struct ggml_tensor * a,
5719 struct ggml_tensor * b,
5720 struct ggml_tensor * c,
5721 const ggml_custom3_op_t fun,
5722 int n_tasks,
5723 void * userdata) {
5724 return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
5725}
5726
5727struct ggml_tensor * ggml_map_custom3_inplace(
5728 struct ggml_context * ctx,
5729 struct ggml_tensor * a,
5730 struct ggml_tensor * b,
5731 struct ggml_tensor * c,
5732 const ggml_custom3_op_t fun,
5733 int n_tasks,
5734 void * userdata) {
5735 return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
5736}
5737
5738struct ggml_tensor * ggml_custom_4d(
5739 struct ggml_context * ctx,
5740 enum ggml_type type,
5741 int64_t ne0,
5742 int64_t ne1,
5743 int64_t ne2,
5744 int64_t ne3,
5745 struct ggml_tensor ** args,
5746 int n_args,
5747 ggml_custom_op_t fun,
5748 int n_tasks,
5749 void * userdata) {
5750
5751 GGML_ASSERT(n_args < GGML_MAX_SRC);
5752
5753 struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
5754
5755 struct ggml_custom_op_params params = {
5756 /*.fun =*/ fun,
5757 /*.n_tasks =*/ n_tasks,
5758 /*.userdata =*/ userdata
5759 };
5760    ggml_set_op_params(result, &params, sizeof(params));
5761
5762 result->op = GGML_OP_CUSTOM;
5763 for (int i = 0; i < n_args; i++) {
5764 result->src[i] = args[i];
5765 }
5766
5767 return result;
5768}
5769
5770struct ggml_tensor * ggml_custom_inplace(
5771 struct ggml_context * ctx,
5772 struct ggml_tensor * a,
5773 struct ggml_tensor ** args,
5774 int n_args,
5775 ggml_custom_op_t fun,
5776 int n_tasks,
5777 void * userdata) {
5778
5779 GGML_ASSERT(n_args < GGML_MAX_SRC - 1);
5780
5781    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
5782
5783 struct ggml_custom_op_params params = {
5784 /*.fun =*/ fun,
5785 /*.n_tasks =*/ n_tasks,
5786 /*.userdata =*/ userdata
5787 };
5788    ggml_set_op_params(result, &params, sizeof(params));
5789
5790 result->op = GGML_OP_CUSTOM;
5791 result->src[0] = a;
5792 for (int i = 0; i < n_args; i++) {
5793 result->src[i + 1] = args[i];
5794 }
5795
5796 return result;
5797}
5798// ggml_cross_entropy_loss
5799
5800struct ggml_tensor * ggml_cross_entropy_loss(
5801 struct ggml_context * ctx,
5802 struct ggml_tensor * a,
5803 struct ggml_tensor * b) {
5804 GGML_ASSERT(ggml_are_same_shape(a, b));
5805
5806    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
5807
5808 result->op = GGML_OP_CROSS_ENTROPY_LOSS;
5809 result->src[0] = a;
5810 result->src[1] = b;
5811
5812 return result;
5813}
5814
5815// ggml_cross_entropy_loss_back
5816
5817struct ggml_tensor * ggml_cross_entropy_loss_back(
5818 struct ggml_context * ctx,
5819 struct ggml_tensor * a,
5820 struct ggml_tensor * b,
5821 struct ggml_tensor * c) {
5822 GGML_ASSERT(ggml_is_scalar(a));
5823 GGML_ASSERT(ggml_are_same_shape(b, c));
5824
5825    struct ggml_tensor * result = ggml_dup_tensor(ctx, b);
5826
5827 result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
5828 result->src[0] = a;
5829 result->src[1] = b;
5830 result->src[2] = c;
5831
5832 return result;
5833}
5834
5835// opt_step_adamw
5836
5837struct ggml_tensor * ggml_opt_step_adamw(
5838 struct ggml_context * ctx,
5839 struct ggml_tensor * a,
5840 struct ggml_tensor * grad,
5841 struct ggml_tensor * m,
5842 struct ggml_tensor * v,
5843 struct ggml_tensor * adamw_params) {
5844 GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
5845 GGML_ASSERT(ggml_are_same_shape(a, grad));
5846 GGML_ASSERT(ggml_are_same_shape(a, m));
5847 GGML_ASSERT(ggml_are_same_shape(a, v));
5848 GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
5849 GGML_ASSERT(ggml_nelements(adamw_params) == 7);
5850
5851    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
5852
5853 result->op = GGML_OP_OPT_STEP_ADAMW;
5854 result->src[0] = a;
5855 result->src[1] = grad;
5856 result->src[2] = m;
5857 result->src[3] = v;
5858 result->src[4] = adamw_params;
5859
5860 return result;
5861}
5862
// opt_step_sgd

struct ggml_tensor * ggml_opt_step_sgd(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * grad,
        struct ggml_tensor  * params) {
    GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
    GGML_ASSERT(ggml_are_same_shape(a, grad));
    GGML_ASSERT(params->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_nelements(params) == 2);

    struct ggml_tensor * result = ggml_view_tensor(ctx, a);

    result->op = GGML_OP_OPT_STEP_SGD;
    result->src[0] = a;
    result->src[1] = grad;
    result->src[2] = params;

    return result;
}
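
// Illustrative sketch (not part of this file): SGD needs only two F32
// hyper-parameters (learning rate and weight decay, in whatever order the
// backend kernel expects); param and grad are hypothetical tensors.
//
//   struct ggml_tensor * hp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
//   ggml_build_forward_expand(gb, ggml_opt_step_sgd(ctx, param, grad, hp));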
5884
////////////////////////////////////////////////////////////////////////////////

struct ggml_hash_set ggml_hash_set_new(size_t size) {
    size = ggml_hash_size(size);
    struct ggml_hash_set result;
    result.size = size;
    result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
    result.used = GGML_CALLOC(ggml_bitset_size(size), sizeof(ggml_bitset_t));
    return result;
}

void ggml_hash_set_reset(struct ggml_hash_set * hash_set) {
    memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size));
}

void ggml_hash_set_free(struct ggml_hash_set * hash_set) {
    GGML_FREE(hash_set->used);
    GGML_FREE(hash_set->keys);
}
5904
size_t ggml_hash_size(size_t min_sz) {
    // next primes after powers of two
    static const size_t primes[] = {
        2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
        2053, 4099, 8209, 16411, 32771, 65537, 131101,
        262147, 524309, 1048583, 2097169, 4194319, 8388617,
        16777259, 33554467, 67108879, 134217757, 268435459,
        536870923, 1073741827, 2147483659
    };
    static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);

    // binary search for the smallest prime that is greater than or equal to min_sz
    size_t l = 0;
    size_t r = n_primes;
    while (l < r) {
        size_t m = (l + r)/2;
        if (primes[m] < min_sz) {
            l = m + 1;
        } else {
            r = m;
        }
    }
    size_t sz = l < n_primes ? primes[l] : min_sz | 1;
    return sz;
}
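
// Worked example: with the table above, ggml_hash_size(1000) returns 1031 (the
// smallest listed prime >= 1000); a request beyond the last entry falls back to
// min_sz | 1, which at least guarantees an odd table size.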
5930
struct hash_map {
    struct ggml_hash_set set;
    struct ggml_tensor ** vals;
};

static struct hash_map * ggml_new_hash_map(size_t size) {
    struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
    result->set = ggml_hash_set_new(size);
    result->vals = GGML_CALLOC(result->set.size, sizeof(struct ggml_tensor *));
    return result;
}

static void ggml_hash_map_free(struct hash_map * map) {
    ggml_hash_set_free(&map->set);
    GGML_FREE(map->vals);
    GGML_FREE(map);
}
5948
// utility functions to change gradients
// isrc is the index of tensor in cgraph->visited_hash_set.keys
// the corresponding gradient (accumulator) is also at position isrc
// if tensor has a gradient accumulator, modify that accumulator in-place
// else if there is no gradient for tensor yet, set the corresponding value directly
// else, just add/subtract/etc. the gradients
5955
static void ggml_add_or_set(
        struct ggml_context * ctx,
        struct ggml_cgraph  * cgraph,
        size_t                isrc,
        struct ggml_tensor  * tensor) {
    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
    GGML_ASSERT(src);
    if (cgraph->grads[isrc]) {
        cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
    } else {
        cgraph->grads[isrc] = tensor;
    }
    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
}

static void ggml_acc_or_set(
        struct ggml_context * ctx,
        struct ggml_cgraph  * cgraph,
        size_t                isrc,
        struct ggml_tensor  * tensor,
        const size_t          nb1,
        const size_t          nb2,
        const size_t          nb3,
        const size_t          offset) {
    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
    GGML_ASSERT(src);
    if (cgraph->grads[isrc]) {
        cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, /*inplace =*/ cgraph->grad_accs[isrc]);
    } else {
        struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
        cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
    }
    ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
}

static void ggml_add1_or_set(
        struct ggml_context * ctx,
        struct ggml_cgraph  * cgraph,
        size_t                isrc,
        struct ggml_tensor  * tensor) {
    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
    GGML_ASSERT(src);
    if (cgraph->grads[isrc]) {
        cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
    } else {
        cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
    }
    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
}

static void ggml_sub_or_set(
        struct ggml_context * ctx,
        struct ggml_cgraph  * cgraph,
        size_t                isrc,
        struct ggml_tensor  * tensor) {
    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
    GGML_ASSERT(src);
    if (cgraph->grads[isrc]) {
        cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
    } else {
        cgraph->grads[isrc] = ggml_neg(ctx, tensor);
    }
    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
}
6024
6025static void ggml_compute_backward(
6026 struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, const bool * grads_needed) {
6027 struct ggml_tensor * tensor = cgraph->nodes[i];
6028 struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, node: tensor);
6029
6030 if (!grad) {
6031 return;
6032 }
6033
6034 struct ggml_tensor * src0 = tensor->src[0];
6035 struct ggml_tensor * src1 = tensor->src[1];
6036 struct ggml_tensor * src2 = tensor->src[2];
6037 struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
6038 const size_t isrc0 = src0 ? ggml_hash_find(hash_set, key: src0) : (size_t) -1;
6039 const size_t isrc1 = src1 ? ggml_hash_find(hash_set, key: src1) : (size_t) -1;
6040 const size_t isrc2 = src2 ? ggml_hash_find(hash_set, key: src2) : (size_t) -1;
6041 const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(bitset: hash_set->used, i: isrc0) && grads_needed[isrc0];
6042 const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(bitset: hash_set->used, i: isrc1) && grads_needed[isrc1];
6043 const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(bitset: hash_set->used, i: isrc2) && grads_needed[isrc2];
6044
6045 switch (tensor->op) {
6046 case GGML_OP_DUP: {
6047 if (src0_needs_grads) {
6048 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: grad);
6049 }
6050 } break;
6051 case GGML_OP_ADD: {
6052 if (src0_needs_grads) {
6053 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: grad);
6054 }
6055 if (src1_needs_grads) {
6056 struct ggml_tensor * tmp = grad;
6057 if (!ggml_are_same_shape(t0: src0, t1: src1)) {
6058 tmp = ggml_repeat_back(ctx, a: tmp, b: src1);
6059 }
6060 ggml_add_or_set(ctx, cgraph, isrc: isrc1, tensor: tmp);
6061 }
6062 } break;
6063 case GGML_OP_ADD1: {
6064 if (src0_needs_grads) {
6065 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: grad);
6066 }
6067 if (src1_needs_grads) {
6068 ggml_add_or_set(ctx, cgraph, isrc: isrc1, tensor: ggml_mean(ctx, a: grad)); // TODO: should probably be sum instead of mean
6069 }
6070 } break;
6071 case GGML_OP_ACC: {
6072 if (src0_needs_grads) {
6073 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: grad);
6074 }
6075 if (src1_needs_grads) {
6076 const size_t nb1 = ((int32_t *) tensor->op_params)[0];
6077 const size_t nb2 = ((int32_t *) tensor->op_params)[1];
6078 const size_t nb3 = ((int32_t *) tensor->op_params)[2];
6079 const size_t offset = ((int32_t *) tensor->op_params)[3];
6080
6081 struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
6082 a: grad, ne0: src1->ne[0], ne1: src1->ne[1], ne2: src1->ne[2], ne3: src1->ne[3],
6083 nb1, nb2, nb3, offset);
6084
6085 ggml_add_or_set(ctx, cgraph, isrc: isrc1, tensor: ggml_reshape(ctx, a: ggml_cont(ctx, a: tensor_grad_view), b: src1));
6086 }
6087 } break;
6088 case GGML_OP_SUB: {
6089 if (src0_needs_grads) {
6090 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: grad);
6091 }
6092 if (src1_needs_grads) {
6093 ggml_sub_or_set(ctx, cgraph, isrc: isrc1, tensor: grad);
6094 }
6095 } break;
6096 case GGML_OP_MUL: {
6097 if (src0_needs_grads) {
6098 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_mul(ctx, a: grad, b: src1));
6099 }
6100 if (src1_needs_grads) {
6101 struct ggml_tensor * tmp = ggml_mul(ctx, a: src0, b: grad);
6102 if (!ggml_are_same_shape(t0: src0, t1: src1)) {
6103 tmp = ggml_repeat_back(ctx, a: tmp, b: src1);
6104 }
6105 ggml_add_or_set(ctx, cgraph, isrc: isrc1, tensor: tmp);
6106 }
6107 } break;
6108 case GGML_OP_DIV: {
6109 if (src0_needs_grads) {
6110 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_div(ctx, a: grad, b: src1));
6111 }
6112 if (src1_needs_grads) {
6113 ggml_sub_or_set(ctx, cgraph, isrc: isrc1, tensor: ggml_mul(ctx, a: grad, b: ggml_div(ctx, a: tensor, b: src1)));
6114 }
6115 } break;
6116 case GGML_OP_SQR: {
6117 if (src0_needs_grads) {
6118 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_scale(ctx, a: ggml_mul(ctx, a: src0, b: grad), s: 2.0f));
6119 }
6120 } break;
6121 case GGML_OP_SQRT: {
6122 if (src0_needs_grads) {
6123 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_scale(ctx, a: ggml_div(ctx, a: grad, b: tensor), s: 0.5f));
6124 }
6125 } break;
6126 case GGML_OP_LOG: {
6127 if (src0_needs_grads) {
6128 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_div(ctx, a: grad, b: src0));
6129 }
6130 } break;
6131 case GGML_OP_SIN: {
6132 if (src0_needs_grads) {
6133 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_mul(ctx, a: grad, b: ggml_cos(ctx, a: src0)));
6134 }
6135 } break;
6136 case GGML_OP_COS: {
6137 if (src0_needs_grads) {
6138 ggml_sub_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_mul(ctx, a: grad, b: ggml_sin(ctx, a: src0)));
6139 }
6140 } break;
6141 case GGML_OP_SUM: {
6142 if (src0_needs_grads) {
6143 ggml_add1_or_set(ctx, cgraph, isrc: isrc0, tensor: grad);
6144 }
6145 } break;
6146 case GGML_OP_SUM_ROWS: {
6147 if (src0_needs_grads) {
6148 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_repeat(ctx, a: grad, b: src0));
6149 }
6150 } break;
6151 case GGML_OP_MEAN: {
6152 if (src0_needs_grads) {
6153 ggml_add1_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_scale_impl(ctx, a: grad, s: 1.0f/src0->ne[0], b: 0.0, false));
6154 }
6155 } break;
6156 case GGML_OP_REPEAT: {
6157 if (src0_needs_grads) {
6158 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_repeat_back(ctx, a: grad, b: src0));
6159 }
6160 } break;
6161 case GGML_OP_REPEAT_BACK: {
6162 if (src0_needs_grads) {
6163 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_repeat(ctx, a: grad, b: src0));
6164 }
6165 } break;
6166 case GGML_OP_RMS_NORM: {
6167 if (src0_needs_grads) {
6168 float eps;
6169 memcpy(dest: &eps, src: tensor->op_params, n: sizeof(float));
6170 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_rms_norm_back(ctx, a: grad, b: src0, eps));
6171 }
6172 } break;
6173 case GGML_OP_MUL_MAT: {
6174 // https://cs231n.github.io/optimization-2/#staged
6175 // # forward pass
6176 // s0 = np.random.randn(5, 10)
6177 // s1 = np.random.randn(10, 3)
6178 // t = s0.dot(s1)
6179
6180 // # now suppose we had the gradient on t from above in the circuit
6181 // dt = np.random.randn(*t.shape) # same shape as t
6182 // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
6183 // ds1 = t.T.dot(dt)
6184
6185 // tensor.shape [m,p,qq,rr]
6186 // src0.shape [n,m,q1,r1]
6187 // src1.shape [n,p,qq,rr]
6188
6189 if (src0_needs_grads) {
6190 GGML_ASSERT(grad->ne[2] == src1->ne[2]);
6191 GGML_ASSERT(grad->ne[3] == src1->ne[3]);
6192 struct ggml_tensor * tmp =
6193 ggml_out_prod(ctx, // [n,m,qq,rr]
6194 a: src1, // [n,p,qq,rr]
6195 b: grad); // [m,p,qq,rr]
6196 if (!ggml_are_same_shape(t0: tmp, t1: src0)) {
6197 GGML_ASSERT(tmp->ne[0] == src0->ne[0]);
6198 GGML_ASSERT(tmp->ne[1] == src0->ne[1]);
6199 GGML_ASSERT(tmp->ne[3] == 1);
6200
6201 const int64_t nr2 = tmp->ne[2] / src0->ne[2];
6202 const size_t nb2 = tmp->nb[2] * nr2;
6203 const size_t nb3 = tmp->nb[2];
6204
6205 tmp = ggml_view_4d(ctx, a: tmp, ne0: src0->ne[0], ne1: src0->ne[1], ne2: src0->ne[2], ne3: nr2, nb1: tmp->nb[1], nb2, nb3, offset: 0);
6206 tmp = ggml_repeat_back(ctx, a: tmp, b: src0);
6207 }
6208 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: tmp);
6209 }
6210 if (src1_needs_grads) {
6211 ggml_add_or_set(ctx, cgraph, isrc: isrc1,
6212 // ggml_mul_mat(ctx, // [n,p,qq,rr]
6213 // ggml_cont(ctx, // [m,n,q1,r1]
6214 // ggml_transpose(ctx, src0)), // [m,n,q1,r1]
6215 // grad), // [m,p,qq,rr]
6216
6217 // when src0 is bigger than tensor->grad (this is mostly the case in llama),
6218 // avoid transpose of src0, rather transpose smaller tensor->grad
6219 // and then use ggml_out_prod
6220 tensor: ggml_out_prod(ctx, // [n,p,qq,rr]
6221 a: src0, // [n,m,q1,r1]
6222 b: ggml_transpose(ctx, // [p,m,qq,rr]
6223 a: grad))); // [m,p,qq,rr]
6224 }
6225 } break;
6226 case GGML_OP_SCALE: {
6227 if (src0_needs_grads) {
6228 float s;
6229 memcpy(dest: &s, src: tensor->op_params, n: sizeof(float));
6230 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_scale_impl(ctx, a: grad, s, b: 0.0, false));
6231 }
6232 } break;
6233 case GGML_OP_SET: {
6234 const size_t nb1 = ((const int32_t *) tensor->op_params)[0];
6235 const size_t nb2 = ((const int32_t *) tensor->op_params)[1];
6236 const size_t nb3 = ((const int32_t *) tensor->op_params)[2];
6237 const size_t offset = ((const int32_t *) tensor->op_params)[3];
6238
6239 struct ggml_tensor * tensor_grad_view = NULL;
6240
6241 if (src0_needs_grads || src1_needs_grads) {
6242 GGML_ASSERT(src0->type == tensor->type);
6243 GGML_ASSERT(!cgraph->grads[isrc0] || cgraph->grads[isrc0]->type == grad->type);
6244 GGML_ASSERT(!cgraph->grads[isrc1] || !src1_needs_grads || cgraph->grads[isrc1]->type == grad->type);
6245
6246 tensor_grad_view = ggml_view_4d(ctx,
6247 a: grad, ne0: src1->ne[0], ne1: src1->ne[1], ne2: src1->ne[2], ne3: src1->ne[3],
6248 nb1, nb2, nb3, offset);
6249 }
6250
6251 if (src0_needs_grads) {
6252 struct ggml_tensor * tmp = ggml_neg(ctx, a: tensor_grad_view);
6253 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_acc_impl(ctx, a: grad, b: tmp, nb1, nb2, nb3, offset, false));
6254 }
6255
6256 if (src1_needs_grads) {
6257 ggml_add_or_set(ctx, cgraph, isrc: isrc1, tensor: ggml_reshape(ctx, a: ggml_cont(ctx, a: tensor_grad_view), b: src1));
6258 }
6259 } break;
6260 case GGML_OP_CPY: {
6261 // cpy overwrites value of src1 by src0 and returns view(src1)
6262 // the overwriting is mathematically equivalent to:
6263 // tensor = src0 * 1 + src1 * 0
6264 if (src0_needs_grads) {
6265 // dsrc0 = dtensor * 1
6266 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_reshape(ctx, a: grad, b: src0));
6267 }
6268 if (src1_needs_grads) {
6269 // dsrc1 = dtensor * 0 -> noop
6270 }
6271 } break;
6272 case GGML_OP_CONT: {
6273 // same as cpy
6274 if (src0_needs_grads) {
6275 GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
6276 GGML_ASSERT(ggml_is_contiguous(grad));
6277 GGML_ASSERT(ggml_nelements(tensor) == ggml_nelements(src0));
6278 ggml_add_or_set(ctx, cgraph, isrc: isrc0,
6279 tensor: ggml_are_same_shape(t0: tensor, t1: src0) ? grad : ggml_reshape(ctx, a: grad, b: src0));
6280 }
6281 } break;
6282 case GGML_OP_RESHAPE: {
6283 if (src0_needs_grads) {
6284 struct ggml_tensor * grad_cont = ggml_is_contiguous(tensor: grad) ? grad : ggml_cont(ctx, a: grad);
6285 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_reshape(ctx, a: grad_cont, b: src0));
6286 }
6287 } break;
6288 case GGML_OP_VIEW: {
6289 if (src0_needs_grads) {
6290 size_t offset;
6291
6292 memcpy(dest: &offset, src: tensor->op_params, n: sizeof(offset));
6293
6294 size_t nb1 = tensor->nb[1];
6295 size_t nb2 = tensor->nb[2];
6296 size_t nb3 = tensor->nb[3];
6297
6298 if (cgraph->grads[isrc0] && src0->type != cgraph->grads[isrc0]->type) {
6299 // gradient is typically F32, but src0 could be other type
6300 size_t ng = ggml_element_size(tensor: cgraph->grads[isrc0]);
6301 size_t n0 = ggml_element_size(tensor: src0);
6302 GGML_ASSERT(offset % n0 == 0);
6303 GGML_ASSERT(nb1 % n0 == 0);
6304 GGML_ASSERT(nb2 % n0 == 0);
6305 GGML_ASSERT(nb3 % n0 == 0);
6306 offset = (offset / n0) * ng;
6307 nb1 = (nb1 / n0) * ng;
6308 nb2 = (nb2 / n0) * ng;
6309 nb3 = (nb3 / n0) * ng;
6310 }
6311
6312 ggml_acc_or_set(ctx, cgraph, isrc: isrc0, tensor: grad, nb1, nb2, nb3, offset);
6313 }
6314 } break;
6315 case GGML_OP_PERMUTE: {
6316 if (src0_needs_grads) {
6317 const int32_t * axes = (const int32_t *) tensor->op_params;
6318 const int axis0 = axes[0] & 0x3;
6319 const int axis1 = axes[1] & 0x3;
6320 const int axis2 = axes[2] & 0x3;
6321 const int axis3 = axes[3] & 0x3;
6322 int axb[4] = {0,0,0,0}; // axes backward
6323 axb[axis0] = 0;
6324 axb[axis1] = 1;
6325 axb[axis2] = 2;
6326 axb[axis3] = 3;
6327 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_permute(ctx, a: grad, axis0: axb[0], axis1: axb[1], axis2: axb[2], axis3: axb[3]));
6328 }
6329 } break;
6330 case GGML_OP_TRANSPOSE: {
6331 if (src0_needs_grads) {
6332 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_transpose(ctx, a: grad));
6333 }
6334 } break;
6335 case GGML_OP_GET_ROWS: {
6336 if (src0_needs_grads) {
6337 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_get_rows_back(ctx, a: grad, b: src1, c: src0));
6338 }
6339 if (src1_needs_grads) {
6340 // noop
6341 }
6342 } break;
6343 case GGML_OP_DIAG_MASK_INF: {
6344 if (src0_needs_grads) {
6345 /* ggml_diag_mask_inf_impl() shouldn't be here */
6346 /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
6347 const int n_past = ((const int32_t *) tensor->op_params)[0];
6348 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_diag_mask_zero_impl(ctx, a: grad, n_past, false));
6349 }
6350 } break;
6351 case GGML_OP_DIAG_MASK_ZERO: {
6352 if (src0_needs_grads) {
6353 const int n_past = ((const int32_t *) tensor->op_params)[0];
6354 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_diag_mask_zero_impl(ctx, a: grad, n_past, false));
6355 }
6356 } break;
6357 case GGML_OP_SOFT_MAX: {
6358 if (src0_needs_grads) {
6359 float scale = 1.0f;
6360 float max_bias = 0.0f;
6361
6362 memcpy(dest: &scale, src: (const float *) tensor->op_params + 0, n: sizeof(float));
6363 memcpy(dest: &max_bias, src: (const float *) tensor->op_params + 1, n: sizeof(float));
6364
6365 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_soft_max_ext_back(ctx, a: grad, b: tensor, scale, max_bias));
6366 }
6367 GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
6368 } break;
6369 case GGML_OP_ROPE: {
6370 if (src0_needs_grads) {
6371 //const int n_past = ((int32_t *) tensor->op_params)[0];
6372 const int n_dims = ((const int32_t *) tensor->op_params)[1];
6373 const int mode = ((const int32_t *) tensor->op_params)[2];
6374 //const int n_ctx = ((int32_t *) tensor->op_params)[3];
6375 const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
6376 float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
6377 int sections[4] = {0, 0, 0, 0};
6378
6379 memcpy(dest: &freq_base, src: (const float *) tensor->op_params + 5, n: sizeof(float));
6380 memcpy(dest: &freq_scale, src: (const float *) tensor->op_params + 6, n: sizeof(float));
6381 memcpy(dest: &ext_factor, src: (const float *) tensor->op_params + 7, n: sizeof(float));
6382 memcpy(dest: &attn_factor, src: (const float *) tensor->op_params + 8, n: sizeof(float));
6383 memcpy(dest: &beta_fast, src: (const float *) tensor->op_params + 9, n: sizeof(float));
6384 memcpy(dest: &beta_slow, src: (const float *) tensor->op_params + 10, n: sizeof(float));
6385 memcpy(dest: &sections, src: tensor->op_params + 11, n: sizeof(sections));
6386
6387 struct ggml_tensor * rope_back = grad->ne[2] == src1->ne[0] ?
6388 ggml_rope_ext_back(ctx, a: grad, b: src1, c: src2, n_dims,
6389 mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) :
6390 ggml_rope_multi_back(ctx, a: grad, b: src1, c: src2, n_dims, sections,
6391 mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6392 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: rope_back);
6393 }
6394 GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
6395 } break;
6396 case GGML_OP_IM2COL: {
6397 if (src1_needs_grads) {
6398 const int32_t s0 = ggml_get_op_params_i32(tensor, i: 0);
6399 const int32_t s1 = ggml_get_op_params_i32(tensor, i: 1);
6400 const int32_t p0 = ggml_get_op_params_i32(tensor, i: 2);
6401 const int32_t p1 = ggml_get_op_params_i32(tensor, i: 3);
6402 const int32_t d0 = ggml_get_op_params_i32(tensor, i: 4);
6403 const int32_t d1 = ggml_get_op_params_i32(tensor, i: 5);
6404 const bool is_2D = ggml_get_op_params_i32(tensor, i: 6) == 1;
6405
6406 ggml_add_or_set(ctx, cgraph, isrc: isrc1, tensor: ggml_im2col_back(ctx, a: grad, b: src0, ne: src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
6407 }
6408 } break;
6409 case GGML_OP_POOL_2D: {
6410 if (src0_needs_grads) {
6411 const enum ggml_op_pool op = ggml_get_op_params_i32(tensor, i: 0);
6412 const int32_t k0 = ggml_get_op_params_i32(tensor, i: 1);
6413 const int32_t k1 = ggml_get_op_params_i32(tensor, i: 2);
6414 const int32_t s0 = ggml_get_op_params_i32(tensor, i: 3);
6415 const int32_t s1 = ggml_get_op_params_i32(tensor, i: 4);
6416 const int32_t p0 = ggml_get_op_params_i32(tensor, i: 5);
6417 const int32_t p1 = ggml_get_op_params_i32(tensor, i: 6);
6418
6419 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_pool_2d_back(ctx, a: grad, af: src0, op, k0, k1, s0, s1, p0, p1));
6420 }
6421 } break;
6422 case GGML_OP_WIN_PART:
6423 case GGML_OP_WIN_UNPART:
6424 case GGML_OP_UNARY: {
6425 switch (ggml_get_unary_op(tensor)) {
6426 case GGML_UNARY_OP_ABS: {
6427 if (src0_needs_grads) {
6428 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_mul(ctx, a: ggml_sgn(ctx, a: src0), b: grad));
6429 }
6430 } break;
6431 case GGML_UNARY_OP_SGN: {
6432 // noop
6433 } break;
6434 case GGML_UNARY_OP_NEG: {
6435 if (src0_needs_grads) {
6436 ggml_sub_or_set(ctx, cgraph, isrc: isrc0, tensor: grad);
6437 }
6438 } break;
6439 case GGML_UNARY_OP_STEP: {
6440 // noop
6441 } break;
6442 case GGML_UNARY_OP_RELU: {
6443 if (src0_needs_grads) {
6444 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_mul(ctx, a: ggml_step(ctx, a: src0), b: grad));
6445 }
6446 } break;
6447 case GGML_UNARY_OP_SILU: {
6448 if (src0_needs_grads) {
6449 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_silu_back(ctx, a: grad, b: src0));
6450 }
6451 } break;
6452 case GGML_UNARY_OP_EXP: {
6453 if (src0_needs_grads) {
6454 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_mul(ctx, a: tensor, b: grad));
6455 }
6456 } break;
6457 default: {
6458 fprintf(stderr, format: "%s: unsupported unary op for backward pass: %s\n",
6459 __func__, ggml_unary_op_name(op: ggml_get_unary_op(tensor)));
6460 GGML_ABORT("fatal error");
6461 } //break;
6462 }
6463 } break;
6464 case GGML_OP_CROSS_ENTROPY_LOSS: {
6465 if (src0_needs_grads) {
6466 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_cross_entropy_loss_back(ctx, a: grad, b: src0, c: src1));
6467 }
6468 GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
6469 } break;
6470 case GGML_OP_GLU: {
6471 switch (ggml_get_glu_op(tensor)) {
6472 case GGML_GLU_OP_SWIGLU: {
6473 if (src0_needs_grads) {
6474 GGML_ASSERT(src1 && "backward pass only implemented for split swiglu");
6475 ggml_add_or_set(ctx, cgraph, isrc: isrc0, tensor: ggml_silu_back(ctx, a: ggml_mul(ctx, a: grad, b: src1), b: src0));
6476 }
6477 if (src1_needs_grads) {
6478 ggml_add_or_set(ctx, cgraph, isrc: isrc1, tensor: ggml_mul(ctx, a: ggml_silu(ctx, a: src0), b: grad));
6479 }
6480 } break;
6481 default: {
6482 GGML_ABORT("unsupported glu op for backward pass: %s", ggml_glu_op_name(ggml_get_glu_op(tensor)));
6483 } //break;
6484 }
6485 } break;
6486 case GGML_OP_NONE: {
6487 // noop
6488 } break;
6489 case GGML_OP_COUNT:
6490 default: {
6491 GGML_ABORT("%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op));
6492 } //break;
6493 }
6494
6495 GGML_ASSERT(!src0_needs_grads || ggml_are_same_shape(src0, cgraph->grads[isrc0]));
6496 GGML_ASSERT(!src1_needs_grads || ggml_are_same_shape(src1, cgraph->grads[isrc1]));
6497 GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
6498}
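
// Worked example of the rules above: for c = a*b the backward pass accumulates
// dL/da += dL/dc * b and dL/db += a * dL/dc (with a repeat_back when b was
// broadcast), and for c = sqrt(a) it accumulates dL/da += 0.5 * dL/dc / c,
// reusing the forward result c instead of recomputing sqrt(a).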
6499
6500static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
6501 // check if already visited
6502 size_t node_hash_pos = ggml_hash_find(hash_set: &cgraph->visited_hash_set, key: node);
6503 GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL);
6504 if (!ggml_bitset_get(bitset: cgraph->visited_hash_set.used, i: node_hash_pos)) {
6505 // This is the first time we see this node in the current graph.
6506 cgraph->visited_hash_set.keys[node_hash_pos] = node;
6507 ggml_bitset_set(bitset: cgraph->visited_hash_set.used, i: node_hash_pos);
6508 cgraph->use_counts[node_hash_pos] = 0;
6509 } else {
6510 // already visited
6511 return node_hash_pos;
6512 }
6513
6514 for (int i = 0; i < GGML_MAX_SRC; ++i) {
6515 const int k =
6516 (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
6517 (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
6518 /* unknown order, just fall back to using i */ i;
6519
6520 struct ggml_tensor * src = node->src[k];
6521 if (src) {
6522 size_t src_hash_pos = ggml_visit_parents(cgraph, node: src);
6523
6524 // Update the use count for this operand.
6525 cgraph->use_counts[src_hash_pos]++;
6526 }
6527 }
6528
6529 if (node->op == GGML_OP_NONE && !(node->flags & GGML_TENSOR_FLAG_PARAM)) {
6530 // reached a leaf node, not part of the gradient graph (e.g. a constant)
6531 GGML_ASSERT(cgraph->n_leafs < cgraph->size);
6532
6533 if (strlen(s: node->name) == 0) {
6534 ggml_format_name(tensor: node, fmt: "leaf_%d", cgraph->n_leafs);
6535 }
6536
6537 cgraph->leafs[cgraph->n_leafs] = node;
6538 cgraph->n_leafs++;
6539 } else {
6540 GGML_ASSERT(cgraph->n_nodes < cgraph->size);
6541
6542 if (strlen(s: node->name) == 0) {
6543 ggml_format_name(tensor: node, fmt: "node_%d", cgraph->n_nodes);
6544 }
6545
6546 cgraph->nodes[cgraph->n_nodes] = node;
6547 cgraph->n_nodes++;
6548 }
6549
6550 return node_hash_pos;
6551}
6552
6553static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
6554 if (!expand) {
6555 // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
6556 ggml_graph_clear(cgraph);
6557 }
6558
6559 const int n0 = cgraph->n_nodes;
6560
6561 ggml_visit_parents(cgraph, node: tensor);
6562
6563 const int n_new = cgraph->n_nodes - n0;
6564 GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
6565
6566 if (n_new > 0) {
6567 // the last added node should always be starting point
6568 GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);
6569 }
6570}
6571
6572void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
6573 ggml_build_forward_impl(cgraph, tensor, true);
6574}
6575
6576void ggml_build_backward_expand(
6577 struct ggml_context * ctx,
6578 struct ggml_cgraph * cgraph,
6579 struct ggml_tensor ** grad_accs) {
6580 GGML_ASSERT(cgraph->n_nodes > 0);
6581 GGML_ASSERT(cgraph->grads);
6582 GGML_ASSERT(cgraph->grad_accs);
6583
6584 const int n_nodes_f = cgraph->n_nodes;
6585
6586 memset(s: cgraph->grads, c: 0, n: cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
6587 memset(s: cgraph->grad_accs, c: 0, n: cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
6588 bool * grads_needed = calloc(nmemb: cgraph->visited_hash_set.size, size: sizeof(bool));
6589
6590 {
6591 bool any_params = false;
6592 bool any_loss = false;
6593 for (int i = 0; i < n_nodes_f; ++i) {
6594 struct ggml_tensor * node = cgraph->nodes[i];
6595 any_params = any_params || (node->flags & GGML_TENSOR_FLAG_PARAM);
6596 any_loss = any_loss || (node->flags & GGML_TENSOR_FLAG_LOSS);
6597 }
6598 GGML_ASSERT(any_params && "no trainable parameters found, did you forget to call ggml_set_param?");
6599 GGML_ASSERT(any_loss && "no training loss found, did you forget to call ggml_set_loss?");
6600 }
6601
6602 for (int i = 0; i < n_nodes_f; ++i) {
6603 struct ggml_tensor * node = cgraph->nodes[i];
6604
6605 if (node->type == GGML_TYPE_I32) {
6606 continue;
6607 }
6608
6609 bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS);
6610 bool ignore_src[GGML_MAX_SRC] = {false};
6611 switch (node->op) {
6612 // gradients in node->src[0] for one reason or another have no effect on output gradients
6613 case GGML_OP_IM2COL: // only used for its shape
6614 case GGML_OP_IM2COL_BACK: // same as IM2COL
6615 ignore_src[0] = true;
6616 break;
6617 case GGML_OP_UNARY: {
6618 const enum ggml_unary_op uop = ggml_get_unary_op(tensor: node);
6619 // SGN and STEP unary ops are piecewise constant
6620 if (uop == GGML_UNARY_OP_SGN || uop == GGML_UNARY_OP_STEP) {
6621 ignore_src[0] = true;
6622 }
6623 } break;
6624
6625 // gradients in node->src[1] for one reason or another have no effect on output gradients
6626 case GGML_OP_CPY: // gradients in CPY target are irrelevant
6627 case GGML_OP_GET_ROWS: // row indices not differentiable
6628 case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
6629 case GGML_OP_ROPE: // positions not differentiable
6630 ignore_src[1] = true;
6631 break;
6632
6633 default:
6634 break;
6635 }
6636 for (int j = 0; j < GGML_MAX_SRC; ++j) {
6637 if (!node->src[j] || ignore_src[j] || !grads_needed[ggml_hash_find(hash_set: &cgraph->visited_hash_set, key: node->src[j])]) {
6638 continue;
6639 }
6640 GGML_ASSERT(node->src[j]->type == GGML_TYPE_F32 || node->src[j]->type == GGML_TYPE_F16);
6641 node_needs_grad = true;
6642 break;
6643 }
6644 if (!node_needs_grad) {
6645 continue;
6646 }
6647
6648 // inplace operations are currently not supported
6649 GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
6650 node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
6651
6652 const size_t ihash = ggml_hash_find(hash_set: &cgraph->visited_hash_set, key: node);
6653 GGML_ASSERT(ihash != GGML_HASHSET_FULL);
6654 GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, ihash));
6655 if (grad_accs && grad_accs[i]) {
6656 cgraph->grad_accs[ihash] = grad_accs[i];
6657 cgraph->grads[ihash] = cgraph->grad_accs[ihash];
6658 } else if (node->flags & GGML_TENSOR_FLAG_LOSS) {
6659 // loss tensors always need a gradient accumulator
6660 cgraph->grad_accs[ihash] = ggml_new_tensor(ctx, type: GGML_TYPE_F32, GGML_MAX_DIMS, ne: node->ne);
6661 cgraph->grads[ihash] = cgraph->grad_accs[ihash];
6662 }
6663 grads_needed[ihash] = true;
6664 }
6665
6666 for (int i = n_nodes_f - 1; i >= 0; --i) {
6667 // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
6668 // use allocator to automatically make inplace operations
6669 ggml_compute_backward(ctx, cgraph, i, grads_needed);
6670 }
6671
6672 free(ptr: grads_needed);
6673}
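
// Illustrative sketch (not part of this file): minimal wiring of a training
// graph, assuming gb was created with ggml_new_graph_custom(ctx, size, /*grads =*/ true)
// and that weights/inputs/labels are hypothetical F32 tensors. Passing NULL for
// grad_accs lets the loss tensor get its implicit accumulator as above.
//
//   ggml_set_param(weights);                              // mark trainable tensors first
//   struct ggml_tensor * logits = ggml_mul_mat(ctx, weights, inputs);
//   struct ggml_tensor * loss   = ggml_cross_entropy_loss(ctx, logits, labels);
//   ggml_set_loss(loss);
//   ggml_build_forward_expand(gb, loss);
//   ggml_build_backward_expand(ctx, gb, /*grad_accs =*/ NULL);
//   ggml_graph_reset(gb);                                 // seed d(loss) = 1, zero the other accumulators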
6674
6675static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
6676 void * ptr = *p;
6677 ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
6678 *p = (void *) ((char *) ptr + size);
6679 return ptr;
6680}
6681
6682static size_t ggml_graph_nbytes(size_t size, bool grads) {
6683 size_t hash_size = ggml_hash_size(min_sz: size * 2);
6684 void * p = 0;
6685 incr_ptr_aligned(p: &p, size: sizeof(struct ggml_cgraph), align: 1);
6686 incr_ptr_aligned(p: &p, size: size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *)); // nodes
6687 incr_ptr_aligned(p: &p, size: size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *)); // leafs
6688 incr_ptr_aligned(p: &p, size: hash_size * sizeof(int32_t), align: sizeof(int32_t)); // use_counts
6689 incr_ptr_aligned(p: &p, size: hash_size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *)); // hash keys
6690 if (grads) {
6691 incr_ptr_aligned(p: &p, size: hash_size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *)); // grads
6692 incr_ptr_aligned(p: &p, size: hash_size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *)); // grad_accs
6693 }
6694 incr_ptr_aligned(p: &p, size: ggml_bitset_size(n: hash_size) * sizeof(ggml_bitset_t), align: sizeof(ggml_bitset_t));
6695
6696 size_t nbytes = (size_t) p;
6697 return nbytes;
6698}
6699
6700size_t ggml_graph_overhead_custom(size_t size, bool grads) {
6701 return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
6702}
6703
6704size_t ggml_graph_overhead(void) {
6705 return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
6706}
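
// Illustrative sketch (not part of this file): sizing a context that only holds
// graph/tensor metadata (no tensor data), a common pattern when the data lives
// in backend buffers.
//
//   struct ggml_init_params params = {
//       /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
//       /*.mem_buffer =*/ NULL,
//       /*.no_alloc   =*/ true,
//   };
//   struct ggml_context * ctx = ggml_init(params);
//   struct ggml_cgraph  * gf  = ggml_new_graph(ctx);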
6707
6708struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
6709 const size_t obj_size = ggml_graph_nbytes(size, grads);
6710 struct ggml_object * obj = ggml_new_object(ctx, type: GGML_OBJECT_TYPE_GRAPH, size: obj_size);
6711 struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
6712
6713 // the size of the hash table is doubled since it needs to hold both nodes and leafs
6714 size_t hash_size = ggml_hash_size(min_sz: size * 2);
6715
6716 void * p = cgraph + 1;
6717
6718 struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(p: &p, size: size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *));
6719 struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(p: &p, size: size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *));
6720 int32_t * use_counts_ptr = incr_ptr_aligned(p: &p, size: hash_size * sizeof(int32_t), align: sizeof(int32_t));
6721 struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(p: &p, size: hash_size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *));
6722 struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(p: &p, size: hash_size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *)) : NULL;
6723 struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(p: &p, size: hash_size * sizeof(struct ggml_tensor *), align: sizeof(struct ggml_tensor *)) : NULL;
6724
6725 ggml_bitset_t * hash_used = incr_ptr_aligned(p: &p, size: ggml_bitset_size(n: hash_size) * sizeof(ggml_bitset_t), align: sizeof(ggml_bitset_t));
6726
6727 // check that we allocated the correct amount of memory
6728 assert(obj_size == (size_t)((char *)p - (char *)cgraph));
6729
6730 *cgraph = (struct ggml_cgraph) {
6731 /*.size =*/ size,
6732 /*.n_nodes =*/ 0,
6733 /*.n_leafs =*/ 0,
6734 /*.nodes =*/ nodes_ptr,
6735 /*.grads =*/ grads_ptr,
6736 /*.grad_accs =*/ grad_accs_ptr,
6737 /*.leafs =*/ leafs_ptr,
6738 /*.use_counts =*/ use_counts_ptr,
6739 /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
6740 /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
6741 };
6742
6743 ggml_hash_set_reset(hash_set: &cgraph->visited_hash_set);
6744 if (grads) {
6745 memset(s: cgraph->grads, c: 0, n: hash_size*sizeof(struct ggml_tensor *));
6746 memset(s: cgraph->grad_accs, c: 0, n: hash_size*sizeof(struct ggml_tensor *));
6747 }
6748
6749 return cgraph;
6750}
6751
6752struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
6753 return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
6754}
6755
6756struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
6757 struct ggml_cgraph cgraph = {
6758 /*.size =*/ 0,
6759 /*.n_nodes =*/ i1 - i0,
6760 /*.n_leafs =*/ 0,
6761 /*.nodes =*/ cgraph0->nodes + i0,
6762 /*.grads =*/ NULL, // gradients would need visited_hash_set
6763 /*.grad_accs =*/ NULL,
6764 /*.leafs =*/ NULL,
6765 /*.use_counts =*/ cgraph0->use_counts,
6766 /*.visited_hash_set =*/ cgraph0->visited_hash_set,
6767 /*.order =*/ cgraph0->order,
6768 };
6769
6770 return cgraph;
6771}
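
// Note: the returned view is a by-value struct that borrows nodes [i0, i1) as
// well as the use counts and visited hash set of cgraph0; it owns nothing and
// must not be used after cgraph0 is freed or rebuilt.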
6772
6773void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
6774 GGML_ASSERT(dst->size >= src->n_leafs);
6775 GGML_ASSERT(dst->size >= src->n_nodes);
6776 GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size);
6777
6778 dst->n_leafs = src->n_leafs;
6779 dst->n_nodes = src->n_nodes;
6780 dst->order = src->order;
6781
6782 for (int i = 0; i < src->n_leafs; ++i) {
6783 dst->leafs[i] = src->leafs[i];
6784 }
6785
6786 for (int i = 0; i < src->n_nodes; ++i) {
6787 dst->nodes[i] = src->nodes[i];
6788 }
6789
6790 for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
6791 // copy all hashset keys (tensors) that are in use
6792 if (ggml_bitset_get(bitset: src->visited_hash_set.used, i)) {
6793 size_t new_hash_pos = ggml_hash_insert(hash_set: &dst->visited_hash_set, key: src->visited_hash_set.keys[i]);
6794 dst->use_counts[new_hash_pos] = src->use_counts[i];
6795 }
6796 }
6797
6798 if (dst->grads) {
6799 memset(s: dst->grads, c: 0, n: dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
6800 memset(s: dst->grad_accs, c: 0, n: dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
6801 }
6802 if (src->grads) {
6803 GGML_ASSERT(dst->grads != NULL);
6804 GGML_ASSERT(dst->grad_accs != NULL);
6805 for (int i = 0; i < src->n_nodes; ++i) {
6806 const size_t igrad_src = ggml_hash_find(hash_set: &src->visited_hash_set, key: src->nodes[i]);
6807 const size_t igrad_dst = ggml_hash_find(hash_set: &dst->visited_hash_set, key: dst->nodes[i]);
6808
6809 GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
6810 GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
6811 GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
6812 GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
6813
6814 dst->grads[igrad_dst] = src->grads[igrad_src];
6815 dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
6816 }
6817 }
6818}
6819
6820struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads) {
6821 struct ggml_cgraph * result = ggml_new_graph_custom(ctx, size: cgraph->size, grads: cgraph->grads || force_grads);
6822 ggml_graph_cpy(src: cgraph, dst: result);
6823 return result;
6824}
6825
6826struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
6827 if (ggml_is_empty(tensor)) {
6828 return tensor;
6829 }
6830 if (tensor->buffer) {
6831 ggml_backend_tensor_memset(tensor, value: 0, offset: 0, size: ggml_nbytes(tensor));
6832 } else {
6833 GGML_ASSERT(tensor->data);
6834 memset(s: tensor->data, c: 0, n: ggml_nbytes(tensor));
6835 }
6836 return tensor;
6837}
6838
6839void ggml_graph_reset(struct ggml_cgraph * cgraph) {
6840 if (!cgraph) {
6841 return;
6842 }
6843 GGML_ASSERT(cgraph->grads != NULL);
6844
6845 for (int i = 0; i < cgraph->n_nodes; i++) {
6846 struct ggml_tensor * node = cgraph->nodes[i];
6847 struct ggml_tensor * grad_acc = ggml_graph_get_grad_acc(cgraph, node);
6848
6849 if (node->op == GGML_OP_OPT_STEP_ADAMW) {
6850 // clear momenta
6851 ggml_set_zero(tensor: node->src[2]);
6852 ggml_set_zero(tensor: node->src[3]);
6853 }
6854
6855 // initial gradients of loss should be 1, 0 otherwise
6856 if (grad_acc) {
6857 if (node->flags & GGML_TENSOR_FLAG_LOSS) {
6858 GGML_ASSERT(grad_acc->type == GGML_TYPE_F32);
6859 GGML_ASSERT(ggml_is_scalar(grad_acc));
6860
6861 const float onef = 1.0f;
6862 if (grad_acc->buffer) {
6863 ggml_backend_tensor_set(tensor: grad_acc, data: &onef, offset: 0, size: sizeof(float));
6864 } else {
6865 GGML_ASSERT(grad_acc->data);
6866 *((float *) grad_acc->data) = onef;
6867 }
6868 } else {
6869 ggml_set_zero(tensor: grad_acc);
6870 }
6871 }
6872 }
6873}
6874
6875void ggml_graph_clear(struct ggml_cgraph * cgraph) {
6876 cgraph->n_leafs = 0;
6877 cgraph->n_nodes = 0;
6878 ggml_hash_set_reset(hash_set: &cgraph->visited_hash_set);
6879}
6880
6881int ggml_graph_size(struct ggml_cgraph * cgraph) {
6882 return cgraph->size;
6883}
6884
6885struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
6886 if (i < 0) {
6887 GGML_ASSERT(cgraph->n_nodes + i >= 0);
6888 return cgraph->nodes[cgraph->n_nodes + i];
6889 }
6890
6891 GGML_ASSERT(i < cgraph->n_nodes);
6892 return cgraph->nodes[i];
6893}
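
// Example: negative indices address nodes from the end, so
// ggml_graph_node(gf, -1) returns the last node added by
// ggml_build_forward_expand (typically the graph output).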
6894
6895struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
6896 return cgraph->nodes;
6897}
6898
6899int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
6900 return cgraph->n_nodes;
6901}
6902
6903void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
6904 GGML_ASSERT(cgraph->size > cgraph->n_nodes);
6905 cgraph->nodes[cgraph->n_nodes] = tensor;
6906 cgraph->n_nodes++;
6907}
6908
6909struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, const char * name) {
6910 for (int i = 0; i < cgraph->n_leafs; i++) {
6911 struct ggml_tensor * leaf = cgraph->leafs[i];
6912
6913 if (strcmp(s1: leaf->name, s2: name) == 0) {
6914 return leaf;
6915 }
6916 }
6917
6918 for (int i = 0; i < cgraph->n_nodes; i++) {
6919 struct ggml_tensor * node = cgraph->nodes[i];
6920
6921 if (strcmp(s1: node->name, s2: name) == 0) {
6922 return node;
6923 }
6924 }
6925
6926 return NULL;
6927}
6928
6929struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
6930 const size_t igrad = ggml_hash_find(hash_set: &cgraph->visited_hash_set, key: node);
6931 return igrad != GGML_HASHSET_FULL && ggml_bitset_get(bitset: cgraph->visited_hash_set.used, i: igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL;
6932}
6933
6934struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
6935 const size_t igrad = ggml_hash_find(hash_set: &cgraph->visited_hash_set, key: node);
6936 return igrad != GGML_HASHSET_FULL && ggml_bitset_get(bitset: cgraph->visited_hash_set.used, i: igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL;
6937}
6938
6939void ggml_graph_print(const struct ggml_cgraph * cgraph) {
6940 GGML_LOG_INFO("=== GRAPH ===\n");
6941
6942 GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes);
6943 for (int i = 0; i < cgraph->n_nodes; i++) {
6944 struct ggml_tensor * node = cgraph->nodes[i];
6945
6946 GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
6947 i,
6948 node->ne[0], node->ne[1], node->ne[2],
6949 ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" :
6950 ggml_graph_get_grad(cgraph, node) ? "g" : " ");
6951 }
6952
6953 GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs);
6954 for (int i = 0; i < cgraph->n_leafs; i++) {
6955 struct ggml_tensor * node = cgraph->leafs[i];
6956
6957 GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
6958 i,
6959 node->ne[0], node->ne[1],
6960 ggml_op_name(node->op),
6961 ggml_get_name(node));
6962 }
6963
6964 GGML_LOG_INFO("========================================\n");
6965}
6966
6967static int ggml_node_list_find_tensor(const struct ggml_cgraph * cgraph,
6968 const int * idxs,
6969 int count,
6970 const struct ggml_tensor * tensor) {
6971 GGML_ASSERT(cgraph && idxs);
6972 for (int i = 0; i < count; ++i) {
6973 const int node_idx = idxs[i];
6974
6975 if (node_idx >= cgraph->n_nodes) {
6976 return -1;
6977 }
6978 if (cgraph->nodes[node_idx] == tensor) {
6979 return i;
6980 }
6981 }
6982 return -1;
6983}
6984
6985bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
6986 const int * node_idxs,
6987 int count,
6988 const enum ggml_op * ops,
6989 const int * outputs,
6990 int num_outputs) {
6991 GGML_ASSERT(outputs && num_outputs > 0);
6992
6993 for (int i = 0; i < count; ++i) {
6994 if (node_idxs[i] >= cgraph->n_nodes) {
6995 return false;
6996 }
6997
6998 const struct ggml_tensor * node = cgraph->nodes[node_idxs[i]];
6999
7000 if (node->op != ops[i]) {
7001 return false;
7002 }
7003
7004 if (ggml_node_list_find_tensor(cgraph, idxs: outputs, count: num_outputs, tensor: node) != -1) {
7005 continue;
7006 }
7007
7008 if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
7009 return false;
7010 }
7011
7012 int subgraph_uses = 0;
7013 for (int j = i + 1; j < count; ++j) {
7014 const struct ggml_tensor * other_node = cgraph->nodes[node_idxs[j]];
7015 for (int src_idx = 0; src_idx < GGML_MAX_SRC; src_idx++) {
7016 if (other_node->src[src_idx] == node) {
7017 subgraph_uses++;
7018 }
7019 }
7020 }
7021
7022 if (subgraph_uses != ggml_node_get_use_count(cgraph, node_idx: node_idxs[i])) {
7023 return false;
7024 }
7025
        // if node is a view, check that its view_src and all of its parent view_srcs are within the subgraph
7027 struct ggml_tensor * view_src = node->view_src;
7028 while (view_src) {
7029 if (ggml_node_list_find_tensor(cgraph, idxs: node_idxs, count, tensor: view_src) == -1) {
7030 return false;
7031 }
7032 view_src = view_src->view_src;
7033 }
7034 }
7035
7036 return true;
7037}
7038
7039// check if node is part of the graph
7040static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
7041 if (cgraph == NULL) {
7042 return true;
7043 }
7044
7045 for (int i = 0; i < cgraph->n_nodes; i++) {
7046 if (cgraph->nodes[i] == node) {
7047 return true;
7048 }
7049 }
7050
7051 return false;
7052}
7053
7054static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
7055 for (int i = 0; i < cgraph->n_nodes; i++) {
7056 struct ggml_tensor * parent = cgraph->nodes[i];
7057 struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, node: parent);
7058
7059 if (grad == node) {
7060 return parent;
7061 }
7062 }
7063
7064 return NULL;
7065}
7066
7067static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
7068 struct ggml_tensor * gparent = ggml_graph_get_parent(cgraph: gb, node);
7069 struct ggml_tensor * gparent0 = ggml_graph_get_parent(cgraph: gb, node: parent);
7070 fprintf(stream: fp, format: " \"%p\" -> \"%p\" [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
7071 gparent0 ? (void *) gparent0 : (void *) parent,
7072 gparent ? (void *) gparent : (void *) node,
7073 gparent ? "empty" : "vee",
7074 gparent ? "dashed" : "solid",
7075 label);
7076}
7077
7078static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
7079 fprintf(stream: fp, format: " \"%p\" -> \"%p\" [ label = \"%s\"; ]\n",
7080 (void *) parent,
7081 (void *) node,
7082 label);
7083}
7084
7085void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
7086 char color[16];
7087
7088 FILE * fp = ggml_fopen(fname: filename, mode: "w");
7089 GGML_ASSERT(fp);
7090
7091 fprintf(stream: fp, format: "digraph G {\n");
7092 fprintf(stream: fp, format: " newrank = true;\n");
7093 fprintf(stream: fp, format: " rankdir = TB;\n");
7094
7095 for (int i = 0; i < gb->n_nodes; i++) {
7096 struct ggml_tensor * node = gb->nodes[i];
7097 struct ggml_tensor * grad = ggml_graph_get_grad(cgraph: gb, node);
7098
7099 if (ggml_graph_get_parent(cgraph: gb, node) != NULL) {
7100 continue;
7101 }
7102
7103 if (node->flags & GGML_TENSOR_FLAG_PARAM) {
7104 snprintf(s: color, maxlen: sizeof(color), format: "yellow");
7105 } else if (grad) {
7106 if (ggml_graph_find(cgraph: gf, node)) {
7107 snprintf(s: color, maxlen: sizeof(color), format: "green");
7108 } else {
7109 snprintf(s: color, maxlen: sizeof(color), format: "lightblue");
7110 }
7111 } else {
7112 snprintf(s: color, maxlen: sizeof(color), format: "white");
7113 }
7114
7115 fprintf(stream: fp, format: " \"%p\" [ "
7116 "style = filled; fillcolor = %s; shape = record; "
7117 "label=\"",
7118 (void *) node, color);
7119
7120 if (strlen(s: node->name) > 0) {
7121 fprintf(stream: fp, format: "%s (%s)|", node->name, ggml_type_name(type: node->type));
7122 } else {
7123 fprintf(stream: fp, format: "(%s)|", ggml_type_name(type: node->type));
7124 }
7125
7126 if (ggml_is_matrix(tensor: node)) {
7127 fprintf(stream: fp, format: "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(op: node->op));
7128 } else {
7129 fprintf(stream: fp, format: "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(op: node->op));
7130 }
7131
7132 if (grad) {
7133 fprintf(stream: fp, format: " | <g>%s\"; ]\n", ggml_op_symbol(op: grad->op));
7134 } else {
7135 fprintf(stream: fp, format: "\"; ]\n");
7136 }
7137 }
7138
7139 for (int i = 0; i < gb->n_leafs; i++) {
7140 struct ggml_tensor * node = gb->leafs[i];
7141
7142 snprintf(s: color, maxlen: sizeof(color), format: "pink");
7143
7144 fprintf(stream: fp, format: " \"%p\" [ "
7145 "style = filled; fillcolor = %s; shape = record; "
7146 "label=\"<x>",
7147 (void *) node, color);
7148
7149 if (strlen(s: node->name) > 0) {
7150 fprintf(stream: fp, format: "%s (%s)|", node->name, ggml_type_name(type: node->type));
7151 } else {
7152 fprintf(stream: fp, format: "(%s)|", ggml_type_name(type: node->type));
7153 }
7154
7155 fprintf(stream: fp, format: "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
7156 if (ggml_nelements(tensor: node) < 5 && node->data != NULL) {
7157 fprintf(stream: fp, format: " | (");
7158 for (int j = 0; j < ggml_nelements(tensor: node); j++) {
7159 // FIXME: use ggml-backend to obtain the tensor data
7160 //if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
7161 // fprintf(fp, "%d", ggml_get_i32_1d(node, j));
7162 //}
7163 //else if (node->type == GGML_TYPE_F32 ||
7164 // node->type == GGML_TYPE_F16 ||
7165 // node->type == GGML_TYPE_BF16) {
7166 // fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
7167 //}
7168 //else
7169 {
7170 fprintf(stream: fp, format: "#");
7171 }
7172 if (j < ggml_nelements(tensor: node) - 1) {
7173 fprintf(stream: fp, format: ", ");
7174 }
7175 }
7176 fprintf(stream: fp, format: ")");
7177 }
7178 fprintf(stream: fp, format: "\"; ]\n");
7179 }
7180
7181 for (int i = 0; i < gb->n_nodes; i++) {
7182 struct ggml_tensor * node = gb->nodes[i];
7183
7184 for (int j = 0; j < GGML_MAX_SRC; j++) {
7185 if (node->src[j]) {
7186 char label[16];
7187 snprintf(s: label, maxlen: sizeof(label), format: "src %d", j);
7188 ggml_graph_dump_dot_node_edge(fp, gb, node, parent: node->src[j], label);
7189 }
7190 }
7191 }
7192
7193 for (int i = 0; i < gb->n_leafs; i++) {
7194 struct ggml_tensor * node = gb->leafs[i];
7195
7196 for (int j = 0; j < GGML_MAX_SRC; j++) {
7197 if (node->src[j]) {
7198 char label[16];
7199 snprintf(s: label, maxlen: sizeof(label), format: "src %d", j);
7200 ggml_graph_dump_dot_leaf_edge(fp, node, parent: node->src[j], label);
7201 }
7202 }
7203 }
7204
7205 fprintf(stream: fp, format: "}\n");
7206
7207 fclose(stream: fp);
7208
7209 GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
7210}
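
// Illustrative usage (not part of this file): dump a graph and render it with
// graphviz. With gf == NULL every node that has a gradient is colored as if it
// were part of the forward graph (see ggml_graph_find above).
//
//   ggml_graph_dump_dot(gb, NULL, "ggml-graph.dot");
//   // then: dot -Tpng ggml-graph.dot -o ggml-graph.png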
7211
7212////////////////////////////////////////////////////////////////////////////////
7213
7214void ggml_set_input(struct ggml_tensor * tensor) {
7215 tensor->flags |= GGML_TENSOR_FLAG_INPUT;
7216}
7217
7218void ggml_set_output(struct ggml_tensor * tensor) {
7219 tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
7220}
7221
7222void ggml_set_param(struct ggml_tensor * tensor) {
7223 GGML_ASSERT(tensor->op == GGML_OP_NONE);
7224 tensor->flags |= GGML_TENSOR_FLAG_PARAM;
7225}
7226
7227void ggml_set_loss(struct ggml_tensor * tensor) {
7228 GGML_ASSERT(ggml_is_scalar(tensor));
7229 GGML_ASSERT(tensor->type == GGML_TYPE_F32);
7230 tensor->flags |= GGML_TENSOR_FLAG_LOSS;
7231}
7232
7233////////////////////////////////////////////////////////////////////////////////
7234
7235void ggml_quantize_init(enum ggml_type type) {
7236 ggml_critical_section_start();
7237
7238 switch (type) {
7239 case GGML_TYPE_IQ2_XXS:
7240 case GGML_TYPE_IQ2_XS:
7241 case GGML_TYPE_IQ2_S:
7242 case GGML_TYPE_IQ1_S:
7243 case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
7244 case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(grid_size: 256); break;
7245 case GGML_TYPE_IQ3_S: iq3xs_init_impl(grid_size: 512); break;
7246 default: // nothing
7247 break;
7248 }
7249
7250 ggml_critical_section_end();
7251}
7252
7253void ggml_quantize_free(void) {
7254 ggml_critical_section_start();
7255
7256 iq2xs_free_impl(type: GGML_TYPE_IQ2_XXS);
7257 iq2xs_free_impl(type: GGML_TYPE_IQ2_XS);
7258 iq2xs_free_impl(type: GGML_TYPE_IQ1_S);
7259 iq3xs_free_impl(grid_size: 256);
7260
7261 ggml_critical_section_end();
7262}
7263
7264bool ggml_quantize_requires_imatrix(enum ggml_type type) {
7265 return
7266 type == GGML_TYPE_IQ2_XXS ||
7267 type == GGML_TYPE_IQ2_XS ||
7268 type == GGML_TYPE_IQ1_S;// ||
7269 //type == GGML_TYPE_IQ1_M;
7270}
7271
7272size_t ggml_quantize_chunk(
7273 enum ggml_type type,
7274 const float * src,
7275 void * dst,
7276 int64_t start,
7277 int64_t nrows,
7278 int64_t n_per_row,
7279 const float * imatrix) {
7280 const int64_t n = (int64_t) nrows * n_per_row;
7281
7282 if (ggml_quantize_requires_imatrix(type)) {
7283 GGML_ASSERT(imatrix != NULL);
7284 }
7285
7286 GGML_ASSERT(start % type_traits[type].blck_size == 0);
7287 GGML_ASSERT(start % n_per_row == 0);
7288
7289 ggml_quantize_init(type); // this is noop if already initialized
7290
7291 const size_t start_row = start / n_per_row;
7292 const size_t row_size = ggml_row_size(type, ne: n_per_row);
7293
7294 size_t result = 0;
7295
7296 switch (type) {
7297 case GGML_TYPE_Q4_0: result = quantize_q4_0(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7298 case GGML_TYPE_Q4_1: result = quantize_q4_1(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7299 case GGML_TYPE_Q5_0: result = quantize_q5_0(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7300 case GGML_TYPE_Q5_1: result = quantize_q5_1(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7301 case GGML_TYPE_Q8_0: result = quantize_q8_0(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7302 case GGML_TYPE_MXFP4: result = quantize_mxfp4(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7303 case GGML_TYPE_Q2_K: result = quantize_q2_K(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7304 case GGML_TYPE_Q3_K: result = quantize_q3_K(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7305 case GGML_TYPE_Q4_K: result = quantize_q4_K(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7306 case GGML_TYPE_Q5_K: result = quantize_q5_K(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7307 case GGML_TYPE_Q6_K: result = quantize_q6_K(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7308 case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7309 case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7310 case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7311 case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7312 case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7313 case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7314 case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7315 case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7316 case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7317 case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7318 case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src: src + start, dst: (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7319 case GGML_TYPE_F16:
7320 {
7321 size_t elemsize = sizeof(ggml_fp16_t);
7322 ggml_fp32_to_fp16_row(x: src + start, y: (ggml_fp16_t *)dst + start, n);
7323 result = n * elemsize;
7324 } break;
7325 case GGML_TYPE_BF16:
7326 {
7327 size_t elemsize = sizeof(ggml_bf16_t);
7328 ggml_fp32_to_bf16_row_ref(x: src + start, y: (ggml_bf16_t *)dst + start, n);
7329 result = n * elemsize;
7330 } break;
7331 case GGML_TYPE_F32:
7332 {
7333 size_t elemsize = sizeof(float);
7334 result = n * elemsize;
7335 memcpy(dest: (uint8_t *)dst + start * elemsize, src: src + start, n: result);
7336 } break;
7337 default:
7338 assert(false);
7339 }
7340
7341 GGML_ASSERT(result == nrows * row_size);
7342
7343 return result;
7344}
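
// Illustrative sketch (not part of this file): quantizing a contiguous F32
// matrix of nrows x n_per_row values to Q8_0 in one call; the destination size
// follows from ggml_row_size(). src, nrows and n_per_row are hypothetical.
//
//   const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
//   void * dst = malloc(nrows * row_size);
//   const size_t written = ggml_quantize_chunk(GGML_TYPE_Q8_0, src, dst,
//       /*start =*/ 0, nrows, n_per_row, /*imatrix =*/ NULL);
//   assert(written == (size_t) nrows * row_size);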
7345
7346////////////////////////////////////////////////////////////////////////////////
7347
7348void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
7349 g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
7350 g_logger_state.log_callback_user_data = user_data;
7351}
7352
7353void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
7354 p->n_threads = n_threads;
7355 p->prio = 0; // default priority (usually means normal or inherited)
7356 p->poll = 50; // hybrid-polling enabled
7357 p->strict_cpu = false; // no strict placement (all threads share same cpumask)
7358 p->paused = false; // threads are ready to go
7359 memset(s: p->cpumask, c: 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
7360}
7361
7362struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
7363 struct ggml_threadpool_params p;
7364 ggml_threadpool_params_init(p: &p, n_threads);
7365 return p;
7366}
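
// Illustrative sketch (not part of this file): start from the defaults and
// tighten thread placement; the cpumask is one boolean flag per logical CPU, as
// zero-initialized above, and the priority value is interpreted by the backend.
//
//   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
//   tpp.strict_cpu = true;                 // pin threads to the mask below
//   for (int i = 0; i < 8; ++i) {
//       tpp.cpumask[i] = true;             // run on the first 8 logical CPUs
//   }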
7367
7368bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
7369 if (p0->n_threads != p1->n_threads ) return false;
7370 if (p0->prio != p1->prio ) return false;
7371 if (p0->poll != p1->poll ) return false;
7372 if (p0->strict_cpu != p1->strict_cpu ) return false;
7373 return memcmp(s1: p0->cpumask, s2: p1->cpumask, GGML_MAX_N_THREADS) == 0;
7374}
7375