1#pragma once
2
3//
4// GGML Tensor Library
5//
6// This documentation is still a work in progress.
// If you would like specific topics to be covered, feel free to drop a comment:
8//
9// https://github.com/ggerganov/whisper.cpp/issues/40
10//
11// ## Overview
12//
13// This library implements:
14//
15// - a set of tensor operations
16// - automatic differentiation
17// - basic optimization algorithms
18//
// The aim of this library is to provide a minimalistic approach to various machine learning tasks. This includes,
20// but is not limited to, the following:
21//
22// - linear regression
23// - support vector machines
24// - neural networks
25//
26// The library allows the user to define a certain function using the available tensor operations. This function
27// definition is represented internally via a computation graph. Each tensor operation in the function definition
// corresponds to a node in the graph. Once the computation graph is defined, the user can choose to compute the
29// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
30// using one of the available optimization algorithms.
31//
32// For example, here we define the function: f(x) = a*x^2 + b
33//
34// {
35// struct ggml_init_params params = {
36// .mem_size = 16*1024*1024,
37// .mem_buffer = NULL,
38// };
39//
40// // memory allocation happens here
41// struct ggml_context * ctx = ggml_init(params);
42//
43// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
44//
// ggml_set_param(x); // x is an input variable
46//
47// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
48// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
49// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
50// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
51//
52// ...
53// }
54//
55// Notice that the function definition above does not involve any actual computation. The computation is performed only
56// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
57//
58// {
59// ...
60//
61// struct ggml_cgraph * gf = ggml_new_graph(ctx);
62// ggml_build_forward_expand(gf, f);
63//
64// // set the input variable and parameter values
65// ggml_set_f32(x, 2.0f);
66// ggml_set_f32(a, 3.0f);
67// ggml_set_f32(b, 4.0f);
68//
// ggml_graph_compute_with_ctx(ctx, gf, n_threads);
70//
71// printf("f = %f\n", ggml_get_f32_1d(f, 0));
72//
73// ...
74// }
75//
76// The actual computation is performed in the ggml_graph_compute() function.
77//
78// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
79// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
// buffer and, after defining the computation graph, call the ggml_used_mem() function to find out how much memory
// was actually needed.
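//
// For example, a minimal sketch of this pattern (the 128 MB buffer size is an arbitrary upper bound):
//
//   {
//       struct ggml_init_params params = {
//           .mem_size   = 128*1024*1024,
//           .mem_buffer = NULL,
//       };
//
//       struct ggml_context * ctx = ggml_init(params);
//
//       // ... define tensors and build the computation graph ...
//
//       printf("memory actually needed: %zu bytes\n", ggml_used_mem(ctx));
//
//       ggml_free(ctx);
//   }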
83//
84// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
85// differentiation and optimization algorithms.
86//
// The described approach makes it possible to define the function graph once and then compute its forward or backward graphs
88// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
89// the user can avoid the memory allocation overhead at runtime.
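//
// For example, continuing the example above, f can be re-evaluated at a different input without rebuilding the graph:
//
//   {
//       ...
//
//       ggml_set_f32(x, 5.0f);
//
//       ggml_graph_compute_with_ctx(ctx, gf, n_threads);
//
//       printf("f = %f\n", ggml_get_f32_1d(f, 0));
//   }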
90//
91// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
92// citizens, but in theory the library can be extended to support FP8 and integer data types.
93//
// Each tensor operation produces a new tensor. Initially, the library was envisioned to support only unary
// and binary operations. Most of the available operations fall into one of these two categories. Over time, it became
// clear that the library needs to support more complex operations. The best way to support them is not yet settled,
// but a few examples are demonstrated by the following operations:
98//
99// - ggml_permute()
100// - ggml_conv_1d_1s()
101// - ggml_conv_1d_2s()
102//
103// For each tensor operator, the library implements a forward and backward computation function. The forward function
104// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
105// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
106// calculus class, or watch the following video:
107//
108// What is Automatic Differentiation?
109// https://www.youtube.com/watch?v=wG_nF1awSSY
110//
111//
112// ## Tensor data (struct ggml_tensor)
113//
114// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
115// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
116// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
117//
118// {
119// struct ggml_tensor * c = ggml_add(ctx, a, b);
120//
121// assert(c->src[0] == a);
122// assert(c->src[1] == b);
123// }
124//
125// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This makes it
// possible to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
128// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
129// contiguous in memory.
130//
131// The data of the tensor is accessed via the "data" pointer. For example:
132//
133// {
134// const int nx = 2;
135// const int ny = 3;
136//
137// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
138//
139// for (int y = 0; y < ny; y++) {
140// for (int x = 0; x < nx; x++) {
141// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
142// }
143// }
144//
145// ...
146// }
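//
// Continuing the example above, a transposed view shares the same data but swaps the "ne" and "nb" entries of the
// first two dimensions, so it is not contiguous (a small sketch):
//
//   {
//       ...
//
//       struct ggml_tensor * at = ggml_transpose(ctx, a); // no data is copied
//
//       assert(at->ne[0] == ny && at->ne[1] == nx);
//       assert(!ggml_is_contiguous(at));
//   }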
147//
// Alternatively, there are helper functions such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
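//
// For example (a small sketch, reusing the context from the examples above):
//
//   {
//       struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3);
//
//       ggml_set_f32_1d(t, 0, 1.0f);
//       ggml_set_f32_1d(t, 1, 2.0f);
//       ggml_set_f32_1d(t, 2, 3.0f);
//
//       const float v = ggml_get_f32_1d(t, 1); // 2.0f
//   }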
149//
150// ## The matrix multiplication operator (ggml_mul_mat)
151//
152// TODO
153//
154//
155// ## Multi-threading
156//
157// TODO
158//
159//
160// ## Overview of ggml.c
161//
162// TODO
163//
164//
165// ## SIMD optimizations
166//
167// TODO
168//
169//
170// ## Debugging ggml
171//
172// TODO
173//
174//
175
176#ifdef GGML_SHARED
177# if defined(_WIN32) && !defined(__MINGW32__)
178# ifdef GGML_BUILD
179# define GGML_API __declspec(dllexport) extern
180# else
181# define GGML_API __declspec(dllimport) extern
182# endif
183# else
184# define GGML_API __attribute__ ((visibility ("default"))) extern
185# endif
186#else
187# define GGML_API extern
188#endif
189
190// TODO: support for clang
191#ifdef __GNUC__
192# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
193#elif defined(_MSC_VER)
194# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
195#else
196# define GGML_DEPRECATED(func, hint) func
197#endif
198
199#ifndef __GNUC__
200# define GGML_ATTRIBUTE_FORMAT(...)
201#elif defined(__MINGW32__) && !defined(__clang__)
202# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
203#else
204# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
205#endif
206
207#include <stdbool.h>
208#include <stddef.h>
209#include <stdint.h>
210#include <stdio.h>
211
212#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
213#define GGML_FILE_VERSION 2
214
215#define GGML_QNT_VERSION 2 // bump this on quantization format changes
216#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
217
218#define GGML_MAX_DIMS 4
219#define GGML_MAX_PARAMS 2048
220#define GGML_MAX_SRC 10
221#define GGML_MAX_N_THREADS 512
222#define GGML_MAX_OP_PARAMS 64
223
224#ifndef GGML_MAX_NAME
225# define GGML_MAX_NAME 64
226#endif
227
228#define GGML_DEFAULT_N_THREADS 4
229#define GGML_DEFAULT_GRAPH_SIZE 2048
230
231#if UINTPTR_MAX == 0xFFFFFFFF
232 #define GGML_MEM_ALIGN 4
233#else
234 #define GGML_MEM_ALIGN 16
235#endif
236
237#define GGML_EXIT_SUCCESS 0
238#define GGML_EXIT_ABORTED 1
239
240// TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
241#define GGML_ROPE_TYPE_NORMAL 0
242#define GGML_ROPE_TYPE_NEOX 2
243#define GGML_ROPE_TYPE_MROPE 8
244#define GGML_ROPE_TYPE_VISION 24
245#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
246
247#define GGML_MROPE_SECTIONS 4
248
249#define GGML_UNUSED(x) (void)(x)
250#ifdef __CUDACC__
251template<typename... Args>
252__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
253#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
254#else
255#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
256#endif // __CUDACC__
257
258#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
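// e.g. GGML_PAD(13, 8) == 16 (n is assumed to be a power of two)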
259
260#ifndef NDEBUG
261# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
262#elif defined(__GNUC__)
263# define GGML_UNREACHABLE() __builtin_unreachable()
264#elif defined(_MSC_VER)
265# define GGML_UNREACHABLE() __assume(0)
266#else
267# define GGML_UNREACHABLE() ((void) 0)
268#endif
269
270#ifdef __cplusplus
271# define GGML_NORETURN [[noreturn]]
272#elif defined(_MSC_VER)
273# define GGML_NORETURN __declspec(noreturn)
274#else
275# define GGML_NORETURN _Noreturn
276#endif
277
278#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
279#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
280
281// used to copy the number of elements and stride in bytes of tensors into local variables.
282// main purpose is to reduce code duplication and improve readability.
283//
284// example:
285//
286// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
287// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
288//
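// (the first line above defines local variables ne10, ne11, ne12, ne13 holding src1->ne[0..3],
//  the second defines nb10, nb11, nb12, nb13 holding src1->nb[0..3])
//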
289#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
290 const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
291 GGML_UNUSED(prefix##0);
292#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
293 GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
294 const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
295 GGML_UNUSED(prefix##1);
296#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
297 GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
298 const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
299 GGML_UNUSED(prefix##2);
300#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
301 GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
302 const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
303 GGML_UNUSED(prefix##3);
304
305#define GGML_TENSOR_UNARY_OP_LOCALS \
306 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
307 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
308 GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
309 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
310
311#define GGML_TENSOR_BINARY_OP_LOCALS \
312 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
313 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
314 GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
315 GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
316 GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
317 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
318
319#define GGML_TENSOR_TERNARY_OP_LOCALS \
320 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
321 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
322 GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
323 GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
324 GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
325 GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
326 GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
327 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
328
329#define GGML_TENSOR_BINARY_OP_LOCALS01 \
330 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
331 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
332 GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
333 GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
334
335#ifdef __cplusplus
336extern "C" {
337#endif
338
339 // Function type used in fatal error callbacks
340 typedef void (*ggml_abort_callback_t)(const char * error_message);
341
342 // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
343 // Returns the old callback for chaining
344 GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
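
    // For example (a sketch; my_abort_handler is a hypothetical user-defined function):
    //
    //   static void my_abort_handler(const char * error_message) {
    //       fprintf(stderr, "ggml fatal error: %s\n", error_message);
    //       exit(1);
    //   }
    //
    //   ...
    //
    //   ggml_abort_callback_t prev = ggml_set_abort_callback(my_abort_handler);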
345
346 GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
347 GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
348
349 enum ggml_status {
350 GGML_STATUS_ALLOC_FAILED = -2,
351 GGML_STATUS_FAILED = -1,
352 GGML_STATUS_SUCCESS = 0,
353 GGML_STATUS_ABORTED = 1,
354 };
355
356 // get ggml_status name string
357 GGML_API const char * ggml_status_to_string(enum ggml_status status);
358
359 // ieee 754-2008 half-precision float16
360 // todo: make this not an integral type
361 typedef uint16_t ggml_fp16_t;
362 GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
363 GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
364 GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
365 GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
366
367 // google brain half-precision bfloat16
368 typedef struct { uint16_t bits; } ggml_bf16_t;
369 GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
370 GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
371 GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
372 GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
373 GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
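
    // For example (a small sketch):
    //
    //   float       src[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    //   ggml_fp16_t dst[4];
    //
    //   ggml_fp32_to_fp16_row(src, dst, 4);
    //
    //   const float back = ggml_fp16_to_fp32(dst[0]); // ~1.0f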
374
375 struct ggml_object;
376 struct ggml_context;
377 struct ggml_cgraph;
378
379 // NOTE: always add types at the end of the enum to keep backward compatibility
380 enum ggml_type {
381 GGML_TYPE_F32 = 0,
382 GGML_TYPE_F16 = 1,
383 GGML_TYPE_Q4_0 = 2,
384 GGML_TYPE_Q4_1 = 3,
385 // GGML_TYPE_Q4_2 = 4, support has been removed
386 // GGML_TYPE_Q4_3 = 5, support has been removed
387 GGML_TYPE_Q5_0 = 6,
388 GGML_TYPE_Q5_1 = 7,
389 GGML_TYPE_Q8_0 = 8,
390 GGML_TYPE_Q8_1 = 9,
391 GGML_TYPE_Q2_K = 10,
392 GGML_TYPE_Q3_K = 11,
393 GGML_TYPE_Q4_K = 12,
394 GGML_TYPE_Q5_K = 13,
395 GGML_TYPE_Q6_K = 14,
396 GGML_TYPE_Q8_K = 15,
397 GGML_TYPE_IQ2_XXS = 16,
398 GGML_TYPE_IQ2_XS = 17,
399 GGML_TYPE_IQ3_XXS = 18,
400 GGML_TYPE_IQ1_S = 19,
401 GGML_TYPE_IQ4_NL = 20,
402 GGML_TYPE_IQ3_S = 21,
403 GGML_TYPE_IQ2_S = 22,
404 GGML_TYPE_IQ4_XS = 23,
405 GGML_TYPE_I8 = 24,
406 GGML_TYPE_I16 = 25,
407 GGML_TYPE_I32 = 26,
408 GGML_TYPE_I64 = 27,
409 GGML_TYPE_F64 = 28,
410 GGML_TYPE_IQ1_M = 29,
411 GGML_TYPE_BF16 = 30,
412 // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
413 // GGML_TYPE_Q4_0_4_8 = 32,
414 // GGML_TYPE_Q4_0_8_8 = 33,
415 GGML_TYPE_TQ1_0 = 34,
416 GGML_TYPE_TQ2_0 = 35,
417 // GGML_TYPE_IQ4_NL_4_4 = 36,
418 // GGML_TYPE_IQ4_NL_4_8 = 37,
419 // GGML_TYPE_IQ4_NL_8_8 = 38,
420 GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
421 GGML_TYPE_COUNT = 40,
422 };
423
424 // precision
425 enum ggml_prec {
426 GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
427 GGML_PREC_F32 = 10,
428 };
429
430 // model file types
431 enum ggml_ftype {
432 GGML_FTYPE_UNKNOWN = -1,
433 GGML_FTYPE_ALL_F32 = 0,
434 GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
435 GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
436 GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
437 GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
438 GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
439 GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
440 GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
441 GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
442 GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
443 GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
444 GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
445 GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
446 GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
447 GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
448 GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
449 GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
450 GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
451 GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
452 GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
453 GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
454 GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
455 GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
456 GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
457 };
458
459 // available tensor operations:
460 enum ggml_op {
461 GGML_OP_NONE = 0,
462
463 GGML_OP_DUP,
464 GGML_OP_ADD,
465 GGML_OP_ADD_ID,
466 GGML_OP_ADD1,
467 GGML_OP_ACC,
468 GGML_OP_SUB,
469 GGML_OP_MUL,
470 GGML_OP_DIV,
471 GGML_OP_SQR,
472 GGML_OP_SQRT,
473 GGML_OP_LOG,
474 GGML_OP_SIN,
475 GGML_OP_COS,
476 GGML_OP_SUM,
477 GGML_OP_SUM_ROWS,
478 GGML_OP_MEAN,
479 GGML_OP_ARGMAX,
480 GGML_OP_COUNT_EQUAL,
481 GGML_OP_REPEAT,
482 GGML_OP_REPEAT_BACK,
483 GGML_OP_CONCAT,
484 GGML_OP_SILU_BACK,
485 GGML_OP_NORM, // normalize
486 GGML_OP_RMS_NORM,
487 GGML_OP_RMS_NORM_BACK,
488 GGML_OP_GROUP_NORM,
489 GGML_OP_L2_NORM,
490
491 GGML_OP_MUL_MAT,
492 GGML_OP_MUL_MAT_ID,
493 GGML_OP_OUT_PROD,
494
495 GGML_OP_SCALE,
496 GGML_OP_SET,
497 GGML_OP_CPY,
498 GGML_OP_CONT,
499 GGML_OP_RESHAPE,
500 GGML_OP_VIEW,
501 GGML_OP_PERMUTE,
502 GGML_OP_TRANSPOSE,
503 GGML_OP_GET_ROWS,
504 GGML_OP_GET_ROWS_BACK,
505 GGML_OP_SET_ROWS,
506 GGML_OP_DIAG,
507 GGML_OP_DIAG_MASK_INF,
508 GGML_OP_DIAG_MASK_ZERO,
509 GGML_OP_SOFT_MAX,
510 GGML_OP_SOFT_MAX_BACK,
511 GGML_OP_ROPE,
512 GGML_OP_ROPE_BACK,
513 GGML_OP_CLAMP,
514 GGML_OP_CONV_TRANSPOSE_1D,
515 GGML_OP_IM2COL,
516 GGML_OP_IM2COL_BACK,
517 GGML_OP_IM2COL_3D,
518 GGML_OP_CONV_2D,
519 GGML_OP_CONV_3D,
520 GGML_OP_CONV_2D_DW,
521 GGML_OP_CONV_TRANSPOSE_2D,
522 GGML_OP_POOL_1D,
523 GGML_OP_POOL_2D,
524 GGML_OP_POOL_2D_BACK,
525 GGML_OP_UPSCALE,
526 GGML_OP_PAD,
527 GGML_OP_PAD_REFLECT_1D,
528 GGML_OP_ROLL,
529 GGML_OP_ARANGE,
530 GGML_OP_TIMESTEP_EMBEDDING,
531 GGML_OP_ARGSORT,
532 GGML_OP_LEAKY_RELU,
533
534 GGML_OP_FLASH_ATTN_EXT,
535 GGML_OP_FLASH_ATTN_BACK,
536 GGML_OP_SSM_CONV,
537 GGML_OP_SSM_SCAN,
538 GGML_OP_WIN_PART,
539 GGML_OP_WIN_UNPART,
540 GGML_OP_GET_REL_POS,
541 GGML_OP_ADD_REL_POS,
542 GGML_OP_RWKV_WKV6,
543 GGML_OP_GATED_LINEAR_ATTN,
544 GGML_OP_RWKV_WKV7,
545
546 GGML_OP_UNARY,
547
548 GGML_OP_MAP_CUSTOM1,
549 GGML_OP_MAP_CUSTOM2,
550 GGML_OP_MAP_CUSTOM3,
551
552 GGML_OP_CUSTOM,
553
554 GGML_OP_CROSS_ENTROPY_LOSS,
555 GGML_OP_CROSS_ENTROPY_LOSS_BACK,
556 GGML_OP_OPT_STEP_ADAMW,
557 GGML_OP_OPT_STEP_SGD,
558
559 GGML_OP_GLU,
560
561 GGML_OP_COUNT,
562 };
563
564 enum ggml_unary_op {
565 GGML_UNARY_OP_ABS,
566 GGML_UNARY_OP_SGN,
567 GGML_UNARY_OP_NEG,
568 GGML_UNARY_OP_STEP,
569 GGML_UNARY_OP_TANH,
570 GGML_UNARY_OP_ELU,
571 GGML_UNARY_OP_RELU,
572 GGML_UNARY_OP_SIGMOID,
573 GGML_UNARY_OP_GELU,
574 GGML_UNARY_OP_GELU_QUICK,
575 GGML_UNARY_OP_SILU,
576 GGML_UNARY_OP_HARDSWISH,
577 GGML_UNARY_OP_HARDSIGMOID,
578 GGML_UNARY_OP_EXP,
579 GGML_UNARY_OP_GELU_ERF,
580 GGML_UNARY_OP_XIELU,
581 GGML_UNARY_OP_FLOOR,
582 GGML_UNARY_OP_CEIL,
583 GGML_UNARY_OP_ROUND,
584 GGML_UNARY_OP_TRUNC,
585
586 GGML_UNARY_OP_COUNT,
587 };
588
589 enum ggml_glu_op {
590 GGML_GLU_OP_REGLU,
591 GGML_GLU_OP_GEGLU,
592 GGML_GLU_OP_SWIGLU,
593 GGML_GLU_OP_SWIGLU_OAI,
594 GGML_GLU_OP_GEGLU_ERF,
595 GGML_GLU_OP_GEGLU_QUICK,
596
597 GGML_GLU_OP_COUNT,
598 };
599
600 enum ggml_object_type {
601 GGML_OBJECT_TYPE_TENSOR,
602 GGML_OBJECT_TYPE_GRAPH,
603 GGML_OBJECT_TYPE_WORK_BUFFER
604 };
605
606 enum ggml_log_level {
607 GGML_LOG_LEVEL_NONE = 0,
608 GGML_LOG_LEVEL_DEBUG = 1,
609 GGML_LOG_LEVEL_INFO = 2,
610 GGML_LOG_LEVEL_WARN = 3,
611 GGML_LOG_LEVEL_ERROR = 4,
612 GGML_LOG_LEVEL_CONT = 5, // continue previous log
613 };
614
615 // this tensor...
616 enum ggml_tensor_flag {
617 GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
618 GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
619 GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
620 GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
621 };
622
623 struct ggml_init_params {
624 // memory pool
625 size_t mem_size; // bytes
626 void * mem_buffer; // if NULL, memory will be allocated internally
627 bool no_alloc; // don't allocate memory for the tensor data
628 };
629
630 // n-dimensional tensor
631 struct ggml_tensor {
632 enum ggml_type type;
633
634 struct ggml_backend_buffer * buffer;
635
636 int64_t ne[GGML_MAX_DIMS]; // number of elements
637 size_t nb[GGML_MAX_DIMS]; // stride in bytes:
638 // nb[0] = ggml_type_size(type)
639 // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
640 // nb[i] = nb[i-1] * ne[i-1]
641
642 // compute data
643 enum ggml_op op;
644
645 // op params - allocated as int32_t for alignment
646 int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
647
648 int32_t flags;
649
650 struct ggml_tensor * src[GGML_MAX_SRC];
651
652 // source tensor and offset for views
653 struct ggml_tensor * view_src;
654 size_t view_offs;
655
656 void * data;
657
658 char name[GGML_MAX_NAME];
659
660 void * extra; // extra things e.g. for ggml-cuda.cu
661
662 char padding[8];
663 };
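
    // for non-quantized types, the address of element (i0, i1, i2, i3) of a tensor t can be computed as
    // (a sketch, equivalent to the example in the overview above):
    //
    //   (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3]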
664
665 static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
666
667 // Abort callback
668 // If not NULL, called before ggml computation
669 // If it returns true, the computation is aborted
670 typedef bool (*ggml_abort_callback)(void * data);
671
672
673 //
674 // GUID
675 //
676
677 // GUID types
678 typedef uint8_t ggml_guid[16];
679 typedef ggml_guid * ggml_guid_t;
680
681 GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b);
682
683 // misc
684
685 GGML_API const char * ggml_version(void);
686 GGML_API const char * ggml_commit(void);
687
688 GGML_API void ggml_time_init(void); // call this once at the beginning of the program
689 GGML_API int64_t ggml_time_ms(void);
690 GGML_API int64_t ggml_time_us(void);
691 GGML_API int64_t ggml_cycles(void);
692 GGML_API int64_t ggml_cycles_per_ms(void);
693
694 // accepts a UTF-8 path, even on Windows
695 GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
696
697 GGML_API void ggml_print_object (const struct ggml_object * obj);
698 GGML_API void ggml_print_objects(const struct ggml_context * ctx);
699
700 GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
701 GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
702 GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
703 GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
704
705 GGML_API int64_t ggml_blck_size(enum ggml_type type);
706 GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
707 GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
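
    // for example, for a contiguous 2D F16 tensor t with ne = [4096, 8] (a sketch):
    //
    //   ggml_nelements(t) == 4096*8
    //   ggml_nrows(t)     == 8
    //   ggml_row_size(GGML_TYPE_F16, 4096) == 4096*sizeof(ggml_fp16_t)
    //   ggml_nbytes(t)    == 8*ggml_row_size(GGML_TYPE_F16, 4096)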
708
709 GGML_DEPRECATED(
710 GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
711 "use ggml_row_size() instead");
712
713 GGML_API const char * ggml_type_name(enum ggml_type type);
714 GGML_API const char * ggml_op_name (enum ggml_op op);
715 GGML_API const char * ggml_op_symbol(enum ggml_op op);
716
717 GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
718 GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op);
719 GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
720
721 GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
722
723 GGML_API bool ggml_is_quantized(enum ggml_type type);
724
725 // TODO: temporary until model loading of ggml examples is refactored
726 GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
727
728 GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
729 GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
730 GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
731 GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
732 GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
733 GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
734 GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
735 GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
736
737 // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
738 GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
739 GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
740 GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
741 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
742
743 // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
744 GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
745
746 // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
747 GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
748
749 // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
750 GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
751
752 GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
753 GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
754
755 GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
756
757 // use this to compute the memory overhead of a tensor
758 GGML_API size_t ggml_tensor_overhead(void);
759
760 GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
761
762 // main
763
764 GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
765 GGML_API void ggml_reset(struct ggml_context * ctx);
766 GGML_API void ggml_free (struct ggml_context * ctx);
767
768 GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
769
770 GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
771 GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
772
773 GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
774 GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
775 GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
776
777 GGML_API struct ggml_tensor * ggml_new_tensor(
778 struct ggml_context * ctx,
779 enum ggml_type type,
780 int n_dims,
781 const int64_t *ne);
782
783 GGML_API struct ggml_tensor * ggml_new_tensor_1d(
784 struct ggml_context * ctx,
785 enum ggml_type type,
786 int64_t ne0);
787
788 GGML_API struct ggml_tensor * ggml_new_tensor_2d(
789 struct ggml_context * ctx,
790 enum ggml_type type,
791 int64_t ne0,
792 int64_t ne1);
793
794 GGML_API struct ggml_tensor * ggml_new_tensor_3d(
795 struct ggml_context * ctx,
796 enum ggml_type type,
797 int64_t ne0,
798 int64_t ne1,
799 int64_t ne2);
800
801 GGML_API struct ggml_tensor * ggml_new_tensor_4d(
802 struct ggml_context * ctx,
803 enum ggml_type type,
804 int64_t ne0,
805 int64_t ne1,
806 int64_t ne2,
807 int64_t ne3);
808
809 GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
810
811 GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
812 GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
813
814 // Context tensor enumeration and lookup
815 GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
816 GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
817 GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
818
819 // Converts a flat index into coordinates
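    // e.g. for a tensor with ne = [4, 3, 1, 1], the flat index 7 unravels to (i0, i1, i2, i3) = (3, 1, 0, 0)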
820 GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
821
822 GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
823 GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor);
824
825 GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
826 GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
827
828 GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
829 GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
830 GGML_ATTRIBUTE_FORMAT(2, 3)
831 GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
832
833 // Tensor flags
834 GGML_API void ggml_set_input(struct ggml_tensor * tensor);
835 GGML_API void ggml_set_output(struct ggml_tensor * tensor);
836 GGML_API void ggml_set_param(struct ggml_tensor * tensor);
837 GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
838
839 //
840 // operations on tensors with backpropagation
841 //
842
843 GGML_API struct ggml_tensor * ggml_dup(
844 struct ggml_context * ctx,
845 struct ggml_tensor * a);
846
847 // in-place, returns view(a)
848 GGML_API struct ggml_tensor * ggml_dup_inplace(
849 struct ggml_context * ctx,
850 struct ggml_tensor * a);
851
852 GGML_API struct ggml_tensor * ggml_add(
853 struct ggml_context * ctx,
854 struct ggml_tensor * a,
855 struct ggml_tensor * b);
856
857 GGML_API struct ggml_tensor * ggml_add_inplace(
858 struct ggml_context * ctx,
859 struct ggml_tensor * a,
860 struct ggml_tensor * b);
861
862 GGML_API struct ggml_tensor * ggml_add_cast(
863 struct ggml_context * ctx,
864 struct ggml_tensor * a,
865 struct ggml_tensor * b,
866 enum ggml_type type);
867
868 // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
869 GGML_API struct ggml_tensor * ggml_add_id(
870 struct ggml_context * ctx,
871 struct ggml_tensor * a,
872 struct ggml_tensor * b,
873 struct ggml_tensor * ids);
874
875 GGML_API struct ggml_tensor * ggml_add1(
876 struct ggml_context * ctx,
877 struct ggml_tensor * a,
878 struct ggml_tensor * b);
879
880 GGML_API struct ggml_tensor * ggml_add1_inplace(
881 struct ggml_context * ctx,
882 struct ggml_tensor * a,
883 struct ggml_tensor * b);
884
885 // dst = a
886 // view(dst, nb1, nb2, nb3, offset) += b
887 // return dst
888 GGML_API struct ggml_tensor * ggml_acc(
889 struct ggml_context * ctx,
890 struct ggml_tensor * a,
891 struct ggml_tensor * b,
892 size_t nb1,
893 size_t nb2,
894 size_t nb3,
895 size_t offset);
896
897 GGML_API struct ggml_tensor * ggml_acc_inplace(
898 struct ggml_context * ctx,
899 struct ggml_tensor * a,
900 struct ggml_tensor * b,
901 size_t nb1,
902 size_t nb2,
903 size_t nb3,
904 size_t offset);
905
906 GGML_API struct ggml_tensor * ggml_sub(
907 struct ggml_context * ctx,
908 struct ggml_tensor * a,
909 struct ggml_tensor * b);
910
911 GGML_API struct ggml_tensor * ggml_sub_inplace(
912 struct ggml_context * ctx,
913 struct ggml_tensor * a,
914 struct ggml_tensor * b);
915
916 GGML_API struct ggml_tensor * ggml_mul(
917 struct ggml_context * ctx,
918 struct ggml_tensor * a,
919 struct ggml_tensor * b);
920
921 GGML_API struct ggml_tensor * ggml_mul_inplace(
922 struct ggml_context * ctx,
923 struct ggml_tensor * a,
924 struct ggml_tensor * b);
925
926 GGML_API struct ggml_tensor * ggml_div(
927 struct ggml_context * ctx,
928 struct ggml_tensor * a,
929 struct ggml_tensor * b);
930
931 GGML_API struct ggml_tensor * ggml_div_inplace(
932 struct ggml_context * ctx,
933 struct ggml_tensor * a,
934 struct ggml_tensor * b);
935
936 GGML_API struct ggml_tensor * ggml_sqr(
937 struct ggml_context * ctx,
938 struct ggml_tensor * a);
939
940 GGML_API struct ggml_tensor * ggml_sqr_inplace(
941 struct ggml_context * ctx,
942 struct ggml_tensor * a);
943
944 GGML_API struct ggml_tensor * ggml_sqrt(
945 struct ggml_context * ctx,
946 struct ggml_tensor * a);
947
948 GGML_API struct ggml_tensor * ggml_sqrt_inplace(
949 struct ggml_context * ctx,
950 struct ggml_tensor * a);
951
952 GGML_API struct ggml_tensor * ggml_log(
953 struct ggml_context * ctx,
954 struct ggml_tensor * a);
955
956 GGML_API struct ggml_tensor * ggml_log_inplace(
957 struct ggml_context * ctx,
958 struct ggml_tensor * a);
959
960 GGML_API struct ggml_tensor * ggml_sin(
961 struct ggml_context * ctx,
962 struct ggml_tensor * a);
963
964 GGML_API struct ggml_tensor * ggml_sin_inplace(
965 struct ggml_context * ctx,
966 struct ggml_tensor * a);
967
968 GGML_API struct ggml_tensor * ggml_cos(
969 struct ggml_context * ctx,
970 struct ggml_tensor * a);
971
972 GGML_API struct ggml_tensor * ggml_cos_inplace(
973 struct ggml_context * ctx,
974 struct ggml_tensor * a);
975
976 // return scalar
977 GGML_API struct ggml_tensor * ggml_sum(
978 struct ggml_context * ctx,
979 struct ggml_tensor * a);
980
    // sums along rows: for input shape [a,b,c,d], the result has shape [1,b,c,d]
982 GGML_API struct ggml_tensor * ggml_sum_rows(
983 struct ggml_context * ctx,
984 struct ggml_tensor * a);
985
986 // mean along rows
987 GGML_API struct ggml_tensor * ggml_mean(
988 struct ggml_context * ctx,
989 struct ggml_tensor * a);
990
991 // argmax along rows
992 GGML_API struct ggml_tensor * ggml_argmax(
993 struct ggml_context * ctx,
994 struct ggml_tensor * a);
995
996 // count number of equal elements in a and b
997 GGML_API struct ggml_tensor * ggml_count_equal(
998 struct ggml_context * ctx,
999 struct ggml_tensor * a,
1000 struct ggml_tensor * b);
1001
    // if a is the same shape as b, and a is not a parameter, return a
1003 // otherwise, return a new tensor: repeat(a) to fit in b
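    //
    // for example (a sketch): broadcasting a bias vector over a matrix, with placeholder names
    //
    //   // bias: [n_out], cur: [n_out, n_tokens]
    //   cur = ggml_add(ctx, cur, ggml_repeat(ctx, bias, cur)); // result: [n_out, n_tokens]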
1004 GGML_API struct ggml_tensor * ggml_repeat(
1005 struct ggml_context * ctx,
1006 struct ggml_tensor * a,
1007 struct ggml_tensor * b);
1008
1009 // repeat a to the specified shape
1010 GGML_API struct ggml_tensor * ggml_repeat_4d(
1011 struct ggml_context * ctx,
1012 struct ggml_tensor * a,
1013 int64_t ne0,
1014 int64_t ne1,
1015 int64_t ne2,
1016 int64_t ne3);
1017
1018 // sums repetitions in a into shape of b
1019 GGML_API struct ggml_tensor * ggml_repeat_back(
1020 struct ggml_context * ctx,
1021 struct ggml_tensor * a,
1022 struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
1023
1024 // concat a and b along dim
1025 // used in stable-diffusion
1026 GGML_API struct ggml_tensor * ggml_concat(
1027 struct ggml_context * ctx,
1028 struct ggml_tensor * a,
1029 struct ggml_tensor * b,
1030 int dim);
1031
1032 GGML_API struct ggml_tensor * ggml_abs(
1033 struct ggml_context * ctx,
1034 struct ggml_tensor * a);
1035
1036 GGML_API struct ggml_tensor * ggml_abs_inplace(
1037 struct ggml_context * ctx,
1038 struct ggml_tensor * a);
1039
1040 GGML_API struct ggml_tensor * ggml_sgn(
1041 struct ggml_context * ctx,
1042 struct ggml_tensor * a);
1043
1044 GGML_API struct ggml_tensor * ggml_sgn_inplace(
1045 struct ggml_context * ctx,
1046 struct ggml_tensor * a);
1047
1048 GGML_API struct ggml_tensor * ggml_neg(
1049 struct ggml_context * ctx,
1050 struct ggml_tensor * a);
1051
1052 GGML_API struct ggml_tensor * ggml_neg_inplace(
1053 struct ggml_context * ctx,
1054 struct ggml_tensor * a);
1055
1056 GGML_API struct ggml_tensor * ggml_step(
1057 struct ggml_context * ctx,
1058 struct ggml_tensor * a);
1059
1060 GGML_API struct ggml_tensor * ggml_step_inplace(
1061 struct ggml_context * ctx,
1062 struct ggml_tensor * a);
1063
1064 GGML_API struct ggml_tensor * ggml_tanh(
1065 struct ggml_context * ctx,
1066 struct ggml_tensor * a);
1067
1068 GGML_API struct ggml_tensor * ggml_tanh_inplace(
1069 struct ggml_context * ctx,
1070 struct ggml_tensor * a);
1071
1072 GGML_API struct ggml_tensor * ggml_elu(
1073 struct ggml_context * ctx,
1074 struct ggml_tensor * a);
1075
1076 GGML_API struct ggml_tensor * ggml_elu_inplace(
1077 struct ggml_context * ctx,
1078 struct ggml_tensor * a);
1079
1080 GGML_API struct ggml_tensor * ggml_relu(
1081 struct ggml_context * ctx,
1082 struct ggml_tensor * a);
1083
1084 GGML_API struct ggml_tensor * ggml_leaky_relu(
1085 struct ggml_context * ctx,
1086 struct ggml_tensor * a, float negative_slope, bool inplace);
1087
1088 GGML_API struct ggml_tensor * ggml_relu_inplace(
1089 struct ggml_context * ctx,
1090 struct ggml_tensor * a);
1091
1092 GGML_API struct ggml_tensor * ggml_sigmoid(
1093 struct ggml_context * ctx,
1094 struct ggml_tensor * a);
1095
1096 GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
1097 struct ggml_context * ctx,
1098 struct ggml_tensor * a);
1099
1100 GGML_API struct ggml_tensor * ggml_gelu(
1101 struct ggml_context * ctx,
1102 struct ggml_tensor * a);
1103
1104 GGML_API struct ggml_tensor * ggml_gelu_inplace(
1105 struct ggml_context * ctx,
1106 struct ggml_tensor * a);
1107
1108 // GELU using erf (error function) when possible
    // some backends may fall back to an approximation based on the Abramowitz and Stegun formula
1110 GGML_API struct ggml_tensor * ggml_gelu_erf(
1111 struct ggml_context * ctx,
1112 struct ggml_tensor * a);
1113
1114 GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
1115 struct ggml_context * ctx,
1116 struct ggml_tensor * a);
1117
1118 GGML_API struct ggml_tensor * ggml_gelu_quick(
1119 struct ggml_context * ctx,
1120 struct ggml_tensor * a);
1121
1122 GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
1123 struct ggml_context * ctx,
1124 struct ggml_tensor * a);
1125
1126 GGML_API struct ggml_tensor * ggml_silu(
1127 struct ggml_context * ctx,
1128 struct ggml_tensor * a);
1129
1130 GGML_API struct ggml_tensor * ggml_silu_inplace(
1131 struct ggml_context * ctx,
1132 struct ggml_tensor * a);
1133
1134 // a - x
1135 // b - dy
1136 GGML_API struct ggml_tensor * ggml_silu_back(
1137 struct ggml_context * ctx,
1138 struct ggml_tensor * a,
1139 struct ggml_tensor * b);
1140
1141 // hardswish(x) = x * relu6(x + 3) / 6
1142 GGML_API struct ggml_tensor * ggml_hardswish(
1143 struct ggml_context * ctx,
1144 struct ggml_tensor * a);
1145
1146 // hardsigmoid(x) = relu6(x + 3) / 6
1147 GGML_API struct ggml_tensor * ggml_hardsigmoid(
1148 struct ggml_context * ctx,
1149 struct ggml_tensor * a);
1150
1151 GGML_API struct ggml_tensor * ggml_exp(
1152 struct ggml_context * ctx,
1153 struct ggml_tensor * a);
1154
1155 GGML_API struct ggml_tensor * ggml_exp_inplace(
1156 struct ggml_context * ctx,
1157 struct ggml_tensor * a);
1158
1159 GGML_API struct ggml_tensor * ggml_floor(
1160 struct ggml_context * ctx,
1161 struct ggml_tensor * a);
1162
1163 GGML_API struct ggml_tensor * ggml_floor_inplace(
1164 struct ggml_context * ctx,
1165 struct ggml_tensor * a);
1166
1167 GGML_API struct ggml_tensor * ggml_ceil(
1168 struct ggml_context * ctx,
1169 struct ggml_tensor * a);
1170
1171 GGML_API struct ggml_tensor * ggml_ceil_inplace(
1172 struct ggml_context * ctx,
1173 struct ggml_tensor * a);
1174
1175 GGML_API struct ggml_tensor * ggml_round(
1176 struct ggml_context * ctx,
1177 struct ggml_tensor * a);
1178
1179 GGML_API struct ggml_tensor * ggml_round_inplace(
1180 struct ggml_context * ctx,
1181 struct ggml_tensor * a);
1182
    // truncates the fractional part of each element in the tensor (towards zero)
    // e.g. trunc(3.7) = 3.0, trunc(-2.9) = -2.0 (similar to std::trunc in C/C++)
1189 GGML_API struct ggml_tensor * ggml_trunc(
1190 struct ggml_context * ctx,
1191 struct ggml_tensor * a);
1192
1193 GGML_API struct ggml_tensor * ggml_trunc_inplace(
1194 struct ggml_context * ctx,
1195 struct ggml_tensor * a);
1196
1199 // xIELU activation function
1200 // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
1201 // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
1202 // that constrain the positive and negative source alpha values respectively
1203 GGML_API struct ggml_tensor * ggml_xielu(
1204 struct ggml_context * ctx,
1205 struct ggml_tensor * a,
1206 float alpha_n,
1207 float alpha_p,
1208 float beta,
1209 float eps);
1210
1211 // gated linear unit ops
1212 // A: n columns, r rows,
1213 // result is n / 2 columns, r rows,
1214 // expects gate in second half of row, unless swapped is true
1215 GGML_API struct ggml_tensor * ggml_glu(
1216 struct ggml_context * ctx,
1217 struct ggml_tensor * a,
1218 enum ggml_glu_op op,
1219 bool swapped);
1220
1221 GGML_API struct ggml_tensor * ggml_reglu(
1222 struct ggml_context * ctx,
1223 struct ggml_tensor * a);
1224
1225 GGML_API struct ggml_tensor * ggml_reglu_swapped(
1226 struct ggml_context * ctx,
1227 struct ggml_tensor * a);
1228
1229 GGML_API struct ggml_tensor * ggml_geglu(
1230 struct ggml_context * ctx,
1231 struct ggml_tensor * a);
1232
1233 GGML_API struct ggml_tensor * ggml_geglu_swapped(
1234 struct ggml_context * ctx,
1235 struct ggml_tensor * a);
1236
1237 GGML_API struct ggml_tensor * ggml_swiglu(
1238 struct ggml_context * ctx,
1239 struct ggml_tensor * a);
1240
1241 GGML_API struct ggml_tensor * ggml_swiglu_swapped(
1242 struct ggml_context * ctx,
1243 struct ggml_tensor * a);
1244
1245 GGML_API struct ggml_tensor * ggml_geglu_erf(
1246 struct ggml_context * ctx,
1247 struct ggml_tensor * a);
1248
1249 GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
1250 struct ggml_context * ctx,
1251 struct ggml_tensor * a);
1252
1253 GGML_API struct ggml_tensor * ggml_geglu_quick(
1254 struct ggml_context * ctx,
1255 struct ggml_tensor * a);
1256
1257 GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
1258 struct ggml_context * ctx,
1259 struct ggml_tensor * a);
1260
1261 // A: n columns, r rows,
1262 // B: n columns, r rows,
1263 GGML_API struct ggml_tensor * ggml_glu_split(
1264 struct ggml_context * ctx,
1265 struct ggml_tensor * a,
1266 struct ggml_tensor * b,
1267 enum ggml_glu_op op);
1268
1269 GGML_API struct ggml_tensor * ggml_reglu_split(
1270 struct ggml_context * ctx,
1271 struct ggml_tensor * a,
1272 struct ggml_tensor * b);
1273
1274 GGML_API struct ggml_tensor * ggml_geglu_split(
1275 struct ggml_context * ctx,
1276 struct ggml_tensor * a,
1277 struct ggml_tensor * b);
1278
1279 GGML_API struct ggml_tensor * ggml_swiglu_split(
1280 struct ggml_context * ctx,
1281 struct ggml_tensor * a,
1282 struct ggml_tensor * b);
1283
1284 GGML_API struct ggml_tensor * ggml_geglu_erf_split(
1285 struct ggml_context * ctx,
1286 struct ggml_tensor * a,
1287 struct ggml_tensor * b);
1288
1289 GGML_API struct ggml_tensor * ggml_geglu_quick_split(
1290 struct ggml_context * ctx,
1291 struct ggml_tensor * a,
1292 struct ggml_tensor * b);
1293
1294 GGML_API struct ggml_tensor * ggml_swiglu_oai(
1295 struct ggml_context * ctx,
1296 struct ggml_tensor * a,
1297 struct ggml_tensor * b,
1298 float alpha,
1299 float limit);
1300
1301 // normalize along rows
1302 GGML_API struct ggml_tensor * ggml_norm(
1303 struct ggml_context * ctx,
1304 struct ggml_tensor * a,
1305 float eps);
1306
1307 GGML_API struct ggml_tensor * ggml_norm_inplace(
1308 struct ggml_context * ctx,
1309 struct ggml_tensor * a,
1310 float eps);
1311
1312 GGML_API struct ggml_tensor * ggml_rms_norm(
1313 struct ggml_context * ctx,
1314 struct ggml_tensor * a,
1315 float eps);
1316
1317 GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
1318 struct ggml_context * ctx,
1319 struct ggml_tensor * a,
1320 float eps);
1321
1322 // group normalize along ne0*ne1*n_groups
1323 // used in stable-diffusion
1324 GGML_API struct ggml_tensor * ggml_group_norm(
1325 struct ggml_context * ctx,
1326 struct ggml_tensor * a,
1327 int n_groups,
1328 float eps);
1329
1330 GGML_API struct ggml_tensor * ggml_group_norm_inplace(
1331 struct ggml_context * ctx,
1332 struct ggml_tensor * a,
1333 int n_groups,
1334 float eps);
1335
1336 // l2 normalize along rows
1337 // used in rwkv v7
1338 GGML_API struct ggml_tensor * ggml_l2_norm(
1339 struct ggml_context * ctx,
1340 struct ggml_tensor * a,
1341 float eps);
1342
1343 GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
1344 struct ggml_context * ctx,
1345 struct ggml_tensor * a,
1346 float eps);
1347
1348 // a - x
1349 // b - dy
1350 GGML_API struct ggml_tensor * ggml_rms_norm_back(
1351 struct ggml_context * ctx,
1352 struct ggml_tensor * a,
1353 struct ggml_tensor * b,
1354 float eps);
1355
1356 // A: k columns, n rows => [ne03, ne02, n, k]
1357 // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
1358 // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
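    //
    // for example (a sketch): y = W*x, with placeholder sizes n_in and n_out
    //
    //   struct ggml_tensor * W = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out); // W->ne[0] == n_in
    //   struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_in);
    //   struct ggml_tensor * y = ggml_mul_mat(ctx, W, x);                             // y->ne[0] == n_out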
1359 GGML_API struct ggml_tensor * ggml_mul_mat(
1360 struct ggml_context * ctx,
1361 struct ggml_tensor * a,
1362 struct ggml_tensor * b);
1363
1364 // change the precision of a matrix multiplication
1365 // set to GGML_PREC_F32 for higher precision (useful for phi-2)
1366 GGML_API void ggml_mul_mat_set_prec(
1367 struct ggml_tensor * a,
1368 enum ggml_prec prec);
1369
1370 // indirect matrix multiplication
1371 GGML_API struct ggml_tensor * ggml_mul_mat_id(
1372 struct ggml_context * ctx,
1373 struct ggml_tensor * as,
1374 struct ggml_tensor * b,
1375 struct ggml_tensor * ids);
1376
1377 // A: m columns, n rows,
1378 // B: p columns, n rows,
1379 // result is m columns, p rows
1380 GGML_API struct ggml_tensor * ggml_out_prod(
1381 struct ggml_context * ctx,
1382 struct ggml_tensor * a,
1383 struct ggml_tensor * b);
1384
1385 //
1386 // operations on tensors without backpropagation
1387 //
1388
1389 GGML_API struct ggml_tensor * ggml_scale(
1390 struct ggml_context * ctx,
1391 struct ggml_tensor * a,
1392 float s);
1393
1394 // in-place, returns view(a)
1395 GGML_API struct ggml_tensor * ggml_scale_inplace(
1396 struct ggml_context * ctx,
1397 struct ggml_tensor * a,
1398 float s);
1399
1400 // x = s * a + b
1401 GGML_API struct ggml_tensor * ggml_scale_bias(
1402 struct ggml_context * ctx,
1403 struct ggml_tensor * a,
1404 float s,
1405 float b);
1406
1407 GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
1408 struct ggml_context * ctx,
1409 struct ggml_tensor * a,
1410 float s,
1411 float b);
1412
    // b -> view(a, offset, nb1, nb2, nb3), return modified a
1414 GGML_API struct ggml_tensor * ggml_set(
1415 struct ggml_context * ctx,
1416 struct ggml_tensor * a,
1417 struct ggml_tensor * b,
1418 size_t nb1,
1419 size_t nb2,
1420 size_t nb3,
1421 size_t offset); // in bytes
1422
    // b -> view(a, offset, nb1, nb2, nb3), return view(a)
1424 GGML_API struct ggml_tensor * ggml_set_inplace(
1425 struct ggml_context * ctx,
1426 struct ggml_tensor * a,
1427 struct ggml_tensor * b,
1428 size_t nb1,
1429 size_t nb2,
1430 size_t nb3,
1431 size_t offset); // in bytes
1432
1433 GGML_API struct ggml_tensor * ggml_set_1d(
1434 struct ggml_context * ctx,
1435 struct ggml_tensor * a,
1436 struct ggml_tensor * b,
1437 size_t offset); // in bytes
1438
1439 GGML_API struct ggml_tensor * ggml_set_1d_inplace(
1440 struct ggml_context * ctx,
1441 struct ggml_tensor * a,
1442 struct ggml_tensor * b,
1443 size_t offset); // in bytes
1444
    // b -> view(a, offset, nb1, nb2, nb3), return modified a
1446 GGML_API struct ggml_tensor * ggml_set_2d(
1447 struct ggml_context * ctx,
1448 struct ggml_tensor * a,
1449 struct ggml_tensor * b,
1450 size_t nb1,
1451 size_t offset); // in bytes
1452
    // b -> view(a, offset, nb1, nb2, nb3), return view(a)
1454 GGML_API struct ggml_tensor * ggml_set_2d_inplace(
1455 struct ggml_context * ctx,
1456 struct ggml_tensor * a,
1457 struct ggml_tensor * b,
1458 size_t nb1,
1459 size_t offset); // in bytes
1460
1461 // a -> b, return view(b)
1462 GGML_API struct ggml_tensor * ggml_cpy(
1463 struct ggml_context * ctx,
1464 struct ggml_tensor * a,
1465 struct ggml_tensor * b);
1466
1467 // note: casting from f32 to i32 will discard the fractional part
1468 GGML_API struct ggml_tensor * ggml_cast(
1469 struct ggml_context * ctx,
1470 struct ggml_tensor * a,
1471 enum ggml_type type);
1472
1473 // make contiguous
1474 GGML_API struct ggml_tensor * ggml_cont(
1475 struct ggml_context * ctx,
1476 struct ggml_tensor * a);
1477
1478 // make contiguous, with new shape
1479 GGML_API struct ggml_tensor * ggml_cont_1d(
1480 struct ggml_context * ctx,
1481 struct ggml_tensor * a,
1482 int64_t ne0);
1483
1484 GGML_API struct ggml_tensor * ggml_cont_2d(
1485 struct ggml_context * ctx,
1486 struct ggml_tensor * a,
1487 int64_t ne0,
1488 int64_t ne1);
1489
1490 GGML_API struct ggml_tensor * ggml_cont_3d(
1491 struct ggml_context * ctx,
1492 struct ggml_tensor * a,
1493 int64_t ne0,
1494 int64_t ne1,
1495 int64_t ne2);
1496
1497 GGML_API struct ggml_tensor * ggml_cont_4d(
1498 struct ggml_context * ctx,
1499 struct ggml_tensor * a,
1500 int64_t ne0,
1501 int64_t ne1,
1502 int64_t ne2,
1503 int64_t ne3);
1504
1505 // return view(a), b specifies the new shape
1506 // TODO: when we start computing gradient, make a copy instead of view
1507 GGML_API struct ggml_tensor * ggml_reshape(
1508 struct ggml_context * ctx,
1509 struct ggml_tensor * a,
1510 struct ggml_tensor * b);
1511
1512 // return view(a)
1513 // TODO: when we start computing gradient, make a copy instead of view
1514 GGML_API struct ggml_tensor * ggml_reshape_1d(
1515 struct ggml_context * ctx,
1516 struct ggml_tensor * a,
1517 int64_t ne0);
1518
1519 GGML_API struct ggml_tensor * ggml_reshape_2d(
1520 struct ggml_context * ctx,
1521 struct ggml_tensor * a,
1522 int64_t ne0,
1523 int64_t ne1);
1524
1525 // return view(a)
1526 // TODO: when we start computing gradient, make a copy instead of view
1527 GGML_API struct ggml_tensor * ggml_reshape_3d(
1528 struct ggml_context * ctx,
1529 struct ggml_tensor * a,
1530 int64_t ne0,
1531 int64_t ne1,
1532 int64_t ne2);
1533
1534 GGML_API struct ggml_tensor * ggml_reshape_4d(
1535 struct ggml_context * ctx,
1536 struct ggml_tensor * a,
1537 int64_t ne0,
1538 int64_t ne1,
1539 int64_t ne2,
1540 int64_t ne3);
1541
1542 // offset in bytes
1543 GGML_API struct ggml_tensor * ggml_view_1d(
1544 struct ggml_context * ctx,
1545 struct ggml_tensor * a,
1546 int64_t ne0,
1547 size_t offset);
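
    // for example (a sketch): view row i1 of a 2D tensor a as a 1D tensor
    //
    //   struct ggml_tensor * row = ggml_view_1d(ctx, a, a->ne[0], i1*a->nb[1]);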
1548
1549 GGML_API struct ggml_tensor * ggml_view_2d(
1550 struct ggml_context * ctx,
1551 struct ggml_tensor * a,
1552 int64_t ne0,
1553 int64_t ne1,
1554 size_t nb1, // row stride in bytes
1555 size_t offset);
1556
1557 GGML_API struct ggml_tensor * ggml_view_3d(
1558 struct ggml_context * ctx,
1559 struct ggml_tensor * a,
1560 int64_t ne0,
1561 int64_t ne1,
1562 int64_t ne2,
1563 size_t nb1, // row stride in bytes
1564 size_t nb2, // slice stride in bytes
1565 size_t offset);
1566
1567 GGML_API struct ggml_tensor * ggml_view_4d(
1568 struct ggml_context * ctx,
1569 struct ggml_tensor * a,
1570 int64_t ne0,
1571 int64_t ne1,
1572 int64_t ne2,
1573 int64_t ne3,
1574 size_t nb1, // row stride in bytes
1575 size_t nb2, // slice stride in bytes
1576 size_t nb3,
1577 size_t offset);
1578
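    // note (a sketch of the convention, consistent with the ggml_transpose alias below):
    // axisN is the destination position of source dimension N,
    // e.g. ggml_permute(ctx, a, 2, 0, 1, 3) yields result->ne[2] == a->ne[0]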
1579 GGML_API struct ggml_tensor * ggml_permute(
1580 struct ggml_context * ctx,
1581 struct ggml_tensor * a,
1582 int axis0,
1583 int axis1,
1584 int axis2,
1585 int axis3);
1586
1587 // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
1588 GGML_API struct ggml_tensor * ggml_transpose(
1589 struct ggml_context * ctx,
1590 struct ggml_tensor * a);
1591
1592 // supports 4D a:
1593 // a [n_embd, ne1, ne2, ne3]
1594 // b I32 [n_rows, ne2, ne3, 1]
1595 //
1596 // return [n_embd, n_rows, ne2, ne3]
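    //
    // for example (a sketch): look up token embeddings, with placeholder names
    //
    //   // tok_embd: [n_embd, n_vocab], ids: I32 [n_tokens]
    //   struct ggml_tensor * cur = ggml_get_rows(ctx, tok_embd, ids); // cur: [n_embd, n_tokens]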
1597 GGML_API struct ggml_tensor * ggml_get_rows(
1598 struct ggml_context * ctx,
1599 struct ggml_tensor * a, // data
1600 struct ggml_tensor * b); // row indices
1601
1602 GGML_API struct ggml_tensor * ggml_get_rows_back(
1603 struct ggml_context * ctx,
1604 struct ggml_tensor * a, // gradients of ggml_get_rows result
1605 struct ggml_tensor * b, // row indices
1606 struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
1607
1608 // a TD [n_embd, ne1, ne2, ne3]
1609 // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
1610 // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
1611 //
1612 // undefined behavior if destination rows overlap
1613 //
1614 // broadcast:
1615 // ne2 % ne11 == 0
1616 // ne3 % ne12 == 0
1617 //
1618 // return view(a)
1619 GGML_API struct ggml_tensor * ggml_set_rows(
1620 struct ggml_context * ctx,
1621 struct ggml_tensor * a, // destination
1622 struct ggml_tensor * b, // source
1623 struct ggml_tensor * c); // row indices
1624
1625 GGML_API struct ggml_tensor * ggml_diag(
1626 struct ggml_context * ctx,
1627 struct ggml_tensor * a);
1628
1629 // set elements above the diagonal to -INF
1630 GGML_API struct ggml_tensor * ggml_diag_mask_inf(
1631 struct ggml_context * ctx,
1632 struct ggml_tensor * a,
1633 int n_past);
1634
1635 // in-place, returns view(a)
1636 GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
1637 struct ggml_context * ctx,
1638 struct ggml_tensor * a,
1639 int n_past);
1640
1641 // set elements above the diagonal to 0
1642 GGML_API struct ggml_tensor * ggml_diag_mask_zero(
1643 struct ggml_context * ctx,
1644 struct ggml_tensor * a,
1645 int n_past);
1646
1647 // in-place, returns view(a)
1648 GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
1649 struct ggml_context * ctx,
1650 struct ggml_tensor * a,
1651 int n_past);
1652
1653 GGML_API struct ggml_tensor * ggml_soft_max(
1654 struct ggml_context * ctx,
1655 struct ggml_tensor * a);
1656
1657 // in-place, returns view(a)
1658 GGML_API struct ggml_tensor * ggml_soft_max_inplace(
1659 struct ggml_context * ctx,
1660 struct ggml_tensor * a);
1661
1662 // a [ne0, ne01, ne02, ne03]
1663 // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
1664 //
1665 // broadcast:
1666 // ne02 % ne12 == 0
1667 // ne03 % ne13 == 0
1668 //
1669 // fused soft_max(a*scale + mask*(ALiBi slope))
1670 // max_bias = 0.0f for no ALiBi
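    //
    // for example (a sketch): masked, scaled attention scores, with placeholder names
    //
    //   // kq: [n_kv, n_tokens, n_head, 1], kq_mask: F32 [n_kv, n_tokens]
    //   kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head), 0.0f);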
1671 GGML_API struct ggml_tensor * ggml_soft_max_ext(
1672 struct ggml_context * ctx,
1673 struct ggml_tensor * a,
1674 struct ggml_tensor * mask,
1675 float scale,
1676 float max_bias);
1677
1678 GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
1679 struct ggml_context * ctx,
1680 struct ggml_tensor * a,
1681 struct ggml_tensor * mask,
1682 float scale,
1683 float max_bias);
1684
1685 GGML_API void ggml_soft_max_add_sinks(
1686 struct ggml_tensor * a,
1687 struct ggml_tensor * sinks);
1688
1689 GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
1690 struct ggml_context * ctx,
1691 struct ggml_tensor * a,
1692 struct ggml_tensor * b,
1693 float scale,
1694 float max_bias);
1695
1696 // in-place, returns view(a)
1697 GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
1698 struct ggml_context * ctx,
1699 struct ggml_tensor * a,
1700 struct ggml_tensor * b,
1701 float scale,
1702 float max_bias);
1703
1704 // rotary position embedding
1705 // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
1706 // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
1707 //
1708 // b is an int32 vector with size a->ne[2], it contains the positions
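    //
    // for example (a sketch): standard RoPE over the first n_rot dimensions of each head, with placeholder names
    //
    //   // q: [n_rot, n_head, n_tokens], pos: I32 [n_tokens]
    //   q = ggml_rope(ctx, q, pos, n_rot, GGML_ROPE_TYPE_NORMAL);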
1709 GGML_API struct ggml_tensor * ggml_rope(
1710 struct ggml_context * ctx,
1711 struct ggml_tensor * a,
1712 struct ggml_tensor * b,
1713 int n_dims,
1714 int mode);
1715
1716 // in-place, returns view(a)
1717 GGML_API struct ggml_tensor * ggml_rope_inplace(
1718 struct ggml_context * ctx,
1719 struct ggml_tensor * a,
1720 struct ggml_tensor * b,
1721 int n_dims,
1722 int mode);
1723
1724 // custom RoPE
    // c is freq factors (e.g. phi3-128k), optional
1726 GGML_API struct ggml_tensor * ggml_rope_ext(
1727 struct ggml_context * ctx,
1728 struct ggml_tensor * a,
1729 struct ggml_tensor * b,
1730 struct ggml_tensor * c,
1731 int n_dims,
1732 int mode,
1733 int n_ctx_orig,
1734 float freq_base,
1735 float freq_scale,
1736 float ext_factor,
1737 float attn_factor,
1738 float beta_fast,
1739 float beta_slow);
1740
1741 GGML_API struct ggml_tensor * ggml_rope_multi(
1742 struct ggml_context * ctx,
1743 struct ggml_tensor * a,
1744 struct ggml_tensor * b,
1745 struct ggml_tensor * c,
1746 int n_dims,
1747 int sections[GGML_MROPE_SECTIONS],
1748 int mode,
1749 int n_ctx_orig,
1750 float freq_base,
1751 float freq_scale,
1752 float ext_factor,
1753 float attn_factor,
1754 float beta_fast,
1755 float beta_slow);
1756
1757 // in-place, returns view(a)
1758 GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
1759 struct ggml_context * ctx,
1760 struct ggml_tensor * a,
1761 struct ggml_tensor * b,
1762 struct ggml_tensor * c,
1763 int n_dims,
1764 int mode,
1765 int n_ctx_orig,
1766 float freq_base,
1767 float freq_scale,
1768 float ext_factor,
1769 float attn_factor,
1770 float beta_fast,
1771 float beta_slow);
1772
1773 GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
1774 struct ggml_context * ctx,
1775 struct ggml_tensor * a,
1776 struct ggml_tensor * b,
1777 struct ggml_tensor * c,
1778 int n_dims,
1779 int sections[GGML_MROPE_SECTIONS],
1780 int mode,
1781 int n_ctx_orig,
1782 float freq_base,
1783 float freq_scale,
1784 float ext_factor,
1785 float attn_factor,
1786 float beta_fast,
1787 float beta_slow);
1788
1789 GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
1790 struct ggml_context * ctx,
1791 struct ggml_tensor * a,
1792 struct ggml_tensor * b,
1793 int n_dims,
1794 int mode,
1795 int n_ctx_orig,
1796 float freq_base,
1797 float freq_scale,
1798 float ext_factor,
1799 float attn_factor,
1800 float beta_fast,
1801 float beta_slow),
1802 "use ggml_rope_ext instead");
1803
1804 GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1805 struct ggml_context * ctx,
1806 struct ggml_tensor * a,
1807 struct ggml_tensor * b,
1808 int n_dims,
1809 int mode,
1810 int n_ctx_orig,
1811 float freq_base,
1812 float freq_scale,
1813 float ext_factor,
1814 float attn_factor,
1815 float beta_fast,
1816 float beta_slow),
1817 "use ggml_rope_ext_inplace instead");
1818
1819 // compute correction dims for YaRN RoPE scaling
1820 GGML_API void ggml_rope_yarn_corr_dims(
1821 int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1822
    // rotary position embedding backward, i.e. compute dx from dy
1824 // a - dy
1825 GGML_API struct ggml_tensor * ggml_rope_ext_back(
1826 struct ggml_context * ctx,
1827 struct ggml_tensor * a, // gradients of ggml_rope result
1828 struct ggml_tensor * b, // positions
1829 struct ggml_tensor * c, // freq factors
1830 int n_dims,
1831 int mode,
1832 int n_ctx_orig,
1833 float freq_base,
1834 float freq_scale,
1835 float ext_factor,
1836 float attn_factor,
1837 float beta_fast,
1838 float beta_slow);
1839
1840 GGML_API struct ggml_tensor * ggml_rope_multi_back(
1841 struct ggml_context * ctx,
1842 struct ggml_tensor * a,
1843 struct ggml_tensor * b,
1844 struct ggml_tensor * c,
1845 int n_dims,
            int                   sections[GGML_MROPE_SECTIONS],
1847 int mode,
1848 int n_ctx_orig,
1849 float freq_base,
1850 float freq_scale,
1851 float ext_factor,
1852 float attn_factor,
1853 float beta_fast,
1854 float beta_slow);
1855
1856
1857 // clamp
1858 // in-place, returns view(a)
1859 GGML_API struct ggml_tensor * ggml_clamp(
1860 struct ggml_context * ctx,
1861 struct ggml_tensor * a,
1862 float min,
1863 float max);
1864
1865 // im2col
    // rearranges the data into a layout such that the convolution can be computed as a matrix multiplication with the kernel
1867 GGML_API struct ggml_tensor * ggml_im2col(
1868 struct ggml_context * ctx,
1869 struct ggml_tensor * a, // convolution kernel
1870 struct ggml_tensor * b, // data
1871 int s0, // stride dimension 0
1872 int s1, // stride dimension 1
1873 int p0, // padding dimension 0
1874 int p1, // padding dimension 1
1875 int d0, // dilation dimension 0
1876 int d1, // dilation dimension 1
1877 bool is_2D,
1878 enum ggml_type dst_type);
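
    // A minimal sketch (illustrative parameter values); the result can then be multiplied with the
    // reshaped kernel via ggml_mul_mat to obtain the convolution:
    //
    //   struct ggml_tensor * im = ggml_im2col(ctx, a, b,
    //       /*s0 =*/ 1, /*s1 =*/ 1, /*p0 =*/ 1, /*p1 =*/ 1, /*d0 =*/ 1, /*d1 =*/ 1,
    //       /*is_2D =*/ true, GGML_TYPE_F16);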
1879
1880 GGML_API struct ggml_tensor * ggml_im2col_back(
1881 struct ggml_context * ctx,
1882 struct ggml_tensor * a, // convolution kernel
1883 struct ggml_tensor * b, // gradient of im2col output
1884 int64_t * ne, // shape of im2col input
1885 int s0, // stride dimension 0
1886 int s1, // stride dimension 1
1887 int p0, // padding dimension 0
1888 int p1, // padding dimension 1
1889 int d0, // dilation dimension 0
1890 int d1, // dilation dimension 1
1891 bool is_2D);
1892
1893 GGML_API struct ggml_tensor * ggml_conv_1d(
1894 struct ggml_context * ctx,
1895 struct ggml_tensor * a, // convolution kernel
1896 struct ggml_tensor * b, // data
1897 int s0, // stride
1898 int p0, // padding
1899 int d0); // dilation
1900
1901 // conv_1d with padding = half
1902 // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
    GGML_API struct ggml_tensor * ggml_conv_1d_ph(
1904 struct ggml_context * ctx,
1905 struct ggml_tensor * a, // convolution kernel
1906 struct ggml_tensor * b, // data
1907 int s, // stride
1908 int d); // dilation
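
    // For example (illustrative), with a kernel of width a->ne[0] = 3, stride 1 and dilation 1 the
    // implied padding is 3/2 = 1, so the output has the same length as the input:
    //
    //   struct ggml_tensor * y = ggml_conv_1d_ph(ctx, a, b, 1, 1);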
1909
1910 // depthwise
1911 // TODO: this is very likely wrong for some cases! - needs more testing
1912 GGML_API struct ggml_tensor * ggml_conv_1d_dw(
1913 struct ggml_context * ctx,
1914 struct ggml_tensor * a, // convolution kernel
1915 struct ggml_tensor * b, // data
1916 int s0, // stride
1917 int p0, // padding
1918 int d0); // dilation
1919
1920 GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph(
1921 struct ggml_context * ctx,
1922 struct ggml_tensor * a, // convolution kernel
1923 struct ggml_tensor * b, // data
1924 int s0, // stride
1925 int d0); // dilation
1926
1927 GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
1928 struct ggml_context * ctx,
1929 struct ggml_tensor * a, // convolution kernel
1930 struct ggml_tensor * b, // data
1931 int s0, // stride
1932 int p0, // padding
1933 int d0); // dilation
1934
1935 GGML_API struct ggml_tensor * ggml_conv_2d(
1936 struct ggml_context * ctx,
1937 struct ggml_tensor * a, // convolution kernel
1938 struct ggml_tensor * b, // data
1939 int s0, // stride dimension 0
1940 int s1, // stride dimension 1
1941 int p0, // padding dimension 0
1942 int p1, // padding dimension 1
1943 int d0, // dilation dimension 0
1944 int d1); // dilation dimension 1
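
    // The output spatial size follows the standard convolution arithmetic (a reminder, not a
    // ggml-specific formula), with W = b->ne[0], H = b->ne[1], KW = a->ne[0], KH = a->ne[1]:
    //
    //   OW = (W + 2*p0 - d0*(KW - 1) - 1)/s0 + 1
    //   OH = (H + 2*p1 - d1*(KH - 1) - 1)/s1 + 1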
1945
1946 GGML_API struct ggml_tensor * ggml_im2col_3d(
1947 struct ggml_context * ctx,
1948 struct ggml_tensor * a,
1949 struct ggml_tensor * b,
1950 int64_t IC,
1951 int s0, // stride width
1952 int s1, // stride height
1953 int s2, // stride depth
1954 int p0, // padding width
1955 int p1, // padding height
1956 int p2, // padding depth
1957 int d0, // dilation width
1958 int d1, // dilation height
1959 int d2, // dilation depth
1960 enum ggml_type dst_type);
1961
1962 // a: [OC*IC, KD, KH, KW]
1963 // b: [N*IC, ID, IH, IW]
1964 // result: [N*OC, OD, OH, OW]
1965 GGML_API struct ggml_tensor * ggml_conv_3d(
1966 struct ggml_context * ctx,
1967 struct ggml_tensor * a,
1968 struct ggml_tensor * b,
1969 int64_t IC,
1970 int s0, // stride width
1971 int s1, // stride height
1972 int s2, // stride depth
1973 int p0, // padding width
1974 int p1, // padding height
1975 int p2, // padding depth
1976 int d0, // dilation width
1977 int d1, // dilation height
1978 int d2 // dilation depth
1979 );
1980
1981 // kernel size is a->ne[0] x a->ne[1]
1982 // stride is equal to kernel size
1983 // padding is zero
1984 // example:
1985 // a: 16 16 3 768
1986 // b: 1024 1024 3 1
1987 // res: 64 64 768 1
1988 // used in sam
1989 GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
1990 struct ggml_context * ctx,
1991 struct ggml_tensor * a,
1992 struct ggml_tensor * b);
1993
1994 // kernel size is a->ne[0] x a->ne[1]
1995 // stride is 1
1996 // padding is half
1997 // example:
1998 // a: 3 3 256 256
1999 // b: 64 64 256 1
2000 // res: 64 64 256 1
2001 // used in sam
2002 GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
2003 struct ggml_context * ctx,
2004 struct ggml_tensor * a,
2005 struct ggml_tensor * b);
2006
2007 // depthwise (via im2col and mul_mat)
2008 GGML_API struct ggml_tensor * ggml_conv_2d_dw(
2009 struct ggml_context * ctx,
2010 struct ggml_tensor * a, // convolution kernel
2011 struct ggml_tensor * b, // data
2012 int s0, // stride dimension 0
2013 int s1, // stride dimension 1
2014 int p0, // padding dimension 0
2015 int p1, // padding dimension 1
2016 int d0, // dilation dimension 0
2017 int d1); // dilation dimension 1
2018
2019 // Depthwise 2D convolution
2020 // may be faster than ggml_conv_2d_dw, but not available in all backends
2021 // a: KW KH 1 C convolution kernel
2022 // b: W H C N input data
2023 // res: W_out H_out C N
2024 GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
2025 struct ggml_context * ctx,
2026 struct ggml_tensor * a,
2027 struct ggml_tensor * b,
2028 int stride0,
2029 int stride1,
2030 int pad0,
2031 int pad1,
2032 int dilation0,
2033 int dilation1);
2034
2035 GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
2036 struct ggml_context * ctx,
2037 struct ggml_tensor * a,
2038 struct ggml_tensor * b,
2039 int stride);
2040
2041 GGML_API struct ggml_tensor * ggml_conv_2d_direct(
2042 struct ggml_context * ctx,
2043 struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
2044 struct ggml_tensor * b, // input data [W, H, C, N]
2045 int s0, // stride dimension 0
2046 int s1, // stride dimension 1
2047 int p0, // padding dimension 0
2048 int p1, // padding dimension 1
2049 int d0, // dilation dimension 0
2050 int d1); // dilation dimension 1
2051
2052 GGML_API struct ggml_tensor * ggml_conv_3d_direct(
2053 struct ggml_context * ctx,
2054 struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
2055 struct ggml_tensor * b, // input [W, H, D, C * N]
2056 int s0, // stride
2057 int s1,
2058 int s2,
2059 int p0, // padding
2060 int p1,
2061 int p2,
2062 int d0, // dilation
2063 int d1,
2064 int d2,
2065 int n_channels,
2066 int n_batch,
2067 int n_channels_out);
2068
2069 enum ggml_op_pool {
2070 GGML_OP_POOL_MAX,
2071 GGML_OP_POOL_AVG,
2072 GGML_OP_POOL_COUNT,
2073 };
2074
2075 GGML_API struct ggml_tensor * ggml_pool_1d(
2076 struct ggml_context * ctx,
2077 struct ggml_tensor * a,
2078 enum ggml_op_pool op,
2079 int k0, // kernel size
2080 int s0, // stride
2081 int p0); // padding
2082
2083 // the result will have 2*p0 padding for the first dimension
2084 // and 2*p1 padding for the second dimension
2085 GGML_API struct ggml_tensor * ggml_pool_2d(
2086 struct ggml_context * ctx,
2087 struct ggml_tensor * a,
2088 enum ggml_op_pool op,
2089 int k0,
2090 int k1,
2091 int s0,
2092 int s1,
2093 float p0,
2094 float p1);
2095
2096 GGML_API struct ggml_tensor * ggml_pool_2d_back(
2097 struct ggml_context * ctx,
2098 struct ggml_tensor * a,
2099 struct ggml_tensor * af, // "a"/input used in forward pass
2100 enum ggml_op_pool op,
2101 int k0,
2102 int k1,
2103 int s0,
2104 int s1,
2105 float p0,
2106 float p1);
2107
2108 enum ggml_scale_mode {
2109 GGML_SCALE_MODE_NEAREST = 0,
2110 GGML_SCALE_MODE_BILINEAR = 1,
2111 GGML_SCALE_MODE_BICUBIC = 2,
2112
2113 GGML_SCALE_MODE_COUNT
2114 };
2115
2116 enum ggml_scale_flag {
2117 GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
2118 };
2119
2120 // interpolate
    // multiplies ne0 and ne1 by the scale factor
2122 GGML_API struct ggml_tensor * ggml_upscale(
2123 struct ggml_context * ctx,
2124 struct ggml_tensor * a,
2125 int scale_factor,
2126 enum ggml_scale_mode mode);
2127
2128 // interpolate
    // interpolate/scale to the specified dimensions
2130 GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
2131 struct ggml_context * ctx,
2132 struct ggml_tensor * a,
2133 int ne0,
2134 int ne1,
2135 int ne2,
2136 int ne3,
2137 enum ggml_scale_mode mode),
2138 "use ggml_interpolate instead");
2139
2140 // Up- or downsamples the input to the specified size.
    // 2D scale modes (e.g. bilinear) are applied to the first two dimensions.
2142 GGML_API struct ggml_tensor * ggml_interpolate(
2143 struct ggml_context * ctx,
2144 struct ggml_tensor * a,
2145 int64_t ne0,
2146 int64_t ne1,
2147 int64_t ne2,
2148 int64_t ne3,
2149 uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...]
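
    // For example, a sketch that doubles the spatial resolution of a [W, H, C, N] tensor with
    // bilinear filtering and corner alignment:
    //
    //   struct ggml_tensor * up = ggml_interpolate(ctx, a,
    //       2*a->ne[0], 2*a->ne[1], a->ne[2], a->ne[3],
    //       GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS);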
2150
2151 // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
2152 GGML_API struct ggml_tensor * ggml_pad(
2153 struct ggml_context * ctx,
2154 struct ggml_tensor * a,
2155 int p0,
2156 int p1,
2157 int p2,
2158 int p3);
2159
2160 GGML_API struct ggml_tensor * ggml_pad_ext(
2161 struct ggml_context * ctx,
2162 struct ggml_tensor * a,
2163 int lp0,
2164 int rp0,
2165 int lp1,
2166 int rp1,
2167 int lp2,
2168 int rp2,
2169 int lp3,
2170 int rp3
2171 );
2172
2173 // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
2174 GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
2175 struct ggml_context * ctx,
2176 struct ggml_tensor * a,
2177 int p0,
2178 int p1);
2179
2180 // Move tensor elements by an offset given for each dimension. Elements that
2181 // are shifted beyond the last position are wrapped around to the beginning.
2182 GGML_API struct ggml_tensor * ggml_roll(
2183 struct ggml_context * ctx,
2184 struct ggml_tensor * a,
2185 int shift0,
2186 int shift1,
2187 int shift2,
2188 int shift3);
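
    // For example (illustrative), shifting dimension 0 by one position wraps the last element to the front:
    //
    //   // [1, 2, 3, 4] -> [4, 1, 2, 3]
    //   struct ggml_tensor * r = ggml_roll(ctx, a, 1, 0, 0, 0);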
2189
2190
2191 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
2192 // timesteps: [N,]
2193 // return: [N, dim]
2194 GGML_API struct ggml_tensor * ggml_timestep_embedding(
2195 struct ggml_context * ctx,
2196 struct ggml_tensor * timesteps,
2197 int dim,
2198 int max_period);
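
    // Roughly, following the referenced implementation (a sketch, with half = dim/2):
    //
    //   freqs[j]       = exp(-ln(max_period) * j / half),   j = 0..half-1
    //   emb[n, j]      = cos(timesteps[n] * freqs[j])
    //   emb[n, half+j] = sin(timesteps[n] * freqs[j])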
2199
2200 // sort rows
2201 enum ggml_sort_order {
2202 GGML_SORT_ORDER_ASC,
2203 GGML_SORT_ORDER_DESC,
2204 };
2205
2206 GGML_API struct ggml_tensor * ggml_argsort(
2207 struct ggml_context * ctx,
2208 struct ggml_tensor * a,
2209 enum ggml_sort_order order);
2210
2211 GGML_API struct ggml_tensor * ggml_arange(
2212 struct ggml_context * ctx,
2213 float start,
2214 float stop,
2215 float step);
2216
2217 // top k elements per row
2218 GGML_API struct ggml_tensor * ggml_top_k(
2219 struct ggml_context * ctx,
2220 struct ggml_tensor * a,
2221 int k);
2222
2223#define GGML_KQ_MASK_PAD 64
2224
2225 // q: [n_embd_k, n_batch, n_head, ne3 ]
2226 // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
2227 // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
2228 // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
2229 // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
2230 //
2231 // broadcast:
2232 // n_head % n_head_kv == 0
2233 // n_head % ne32 == 0
2234 // ne3 % ne33 == 0
2235 //
2236 GGML_API struct ggml_tensor * ggml_flash_attn_ext(
2237 struct ggml_context * ctx,
2238 struct ggml_tensor * q,
2239 struct ggml_tensor * k,
2240 struct ggml_tensor * v,
2241 struct ggml_tensor * mask,
2242 float scale,
2243 float max_bias,
2244 float logit_softcap);
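
    // A hedged usage sketch (names are illustrative; the scale is typically 1/sqrt(n_embd_head),
    // max_bias = 0.0f disables ALiBi and logit_softcap = 0.0f disables soft-capping):
    //
    //   struct ggml_tensor * kqv = ggml_flash_attn_ext(ctx, q, k, v, kq_mask,
    //       1.0f/sqrtf((float) n_embd_head), 0.0f, 0.0f);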
2245
2246 GGML_API void ggml_flash_attn_ext_set_prec(
2247 struct ggml_tensor * a,
2248 enum ggml_prec prec);
2249
2250 GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
2251 const struct ggml_tensor * a);
2252
2253 GGML_API void ggml_flash_attn_ext_add_sinks(
2254 struct ggml_tensor * a,
2255 struct ggml_tensor * sinks);
2256
2257 // TODO: needs to be adapted to ggml_flash_attn_ext
2258 GGML_API struct ggml_tensor * ggml_flash_attn_back(
2259 struct ggml_context * ctx,
2260 struct ggml_tensor * q,
2261 struct ggml_tensor * k,
2262 struct ggml_tensor * v,
2263 struct ggml_tensor * d,
2264 bool masked);
2265
2266 GGML_API struct ggml_tensor * ggml_ssm_conv(
2267 struct ggml_context * ctx,
2268 struct ggml_tensor * sx,
2269 struct ggml_tensor * c);
2270
2271 GGML_API struct ggml_tensor * ggml_ssm_scan(
2272 struct ggml_context * ctx,
2273 struct ggml_tensor * s,
2274 struct ggml_tensor * x,
2275 struct ggml_tensor * dt,
2276 struct ggml_tensor * A,
2277 struct ggml_tensor * B,
2278 struct ggml_tensor * C,
2279 struct ggml_tensor * ids);
2280
2281 // partition into non-overlapping windows with padding if needed
2282 // example:
2283 // a: 768 64 64 1
2284 // w: 14
2285 // res: 768 14 14 25
2286 // used in sam
2287 GGML_API struct ggml_tensor * ggml_win_part(
2288 struct ggml_context * ctx,
2289 struct ggml_tensor * a,
2290 int w);
2291
2292 // reverse of ggml_win_part
2293 // used in sam
2294 GGML_API struct ggml_tensor * ggml_win_unpart(
2295 struct ggml_context * ctx,
2296 struct ggml_tensor * a,
2297 int w0,
2298 int h0,
2299 int w);
2300
2301 GGML_API struct ggml_tensor * ggml_unary(
2302 struct ggml_context * ctx,
2303 struct ggml_tensor * a,
2304 enum ggml_unary_op op);
2305
2306 GGML_API struct ggml_tensor * ggml_unary_inplace(
2307 struct ggml_context * ctx,
2308 struct ggml_tensor * a,
2309 enum ggml_unary_op op);
2310
2311 // used in sam
2312 GGML_API struct ggml_tensor * ggml_get_rel_pos(
2313 struct ggml_context * ctx,
2314 struct ggml_tensor * a,
2315 int qh,
2316 int kh);
2317
2318 // used in sam
2319 GGML_API struct ggml_tensor * ggml_add_rel_pos(
2320 struct ggml_context * ctx,
2321 struct ggml_tensor * a,
2322 struct ggml_tensor * pw,
2323 struct ggml_tensor * ph);
2324
2325 GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
2326 struct ggml_context * ctx,
2327 struct ggml_tensor * a,
2328 struct ggml_tensor * pw,
2329 struct ggml_tensor * ph);
2330
2331 GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
2332 struct ggml_context * ctx,
2333 struct ggml_tensor * k,
2334 struct ggml_tensor * v,
2335 struct ggml_tensor * r,
2336 struct ggml_tensor * tf,
2337 struct ggml_tensor * td,
2338 struct ggml_tensor * state);
2339
2340 GGML_API struct ggml_tensor * ggml_gated_linear_attn(
2341 struct ggml_context * ctx,
2342 struct ggml_tensor * k,
2343 struct ggml_tensor * v,
2344 struct ggml_tensor * q,
2345 struct ggml_tensor * g,
2346 struct ggml_tensor * state,
2347 float scale);
2348
2349 GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
2350 struct ggml_context * ctx,
2351 struct ggml_tensor * r,
2352 struct ggml_tensor * w,
2353 struct ggml_tensor * k,
2354 struct ggml_tensor * v,
2355 struct ggml_tensor * a,
2356 struct ggml_tensor * b,
2357 struct ggml_tensor * state);
2358
2359 // custom operators
2360
    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst, const struct ggml_tensor * a, int ith, int nth, void * userdata);
    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
2364
2365#define GGML_N_TASKS_MAX (-1)
    // n_tasks == GGML_N_TASKS_MAX means to use the maximum number of tasks
2367
2368 GGML_API struct ggml_tensor * ggml_map_custom1(
2369 struct ggml_context * ctx,
2370 struct ggml_tensor * a,
2371 ggml_custom1_op_t fun,
2372 int n_tasks,
2373 void * userdata);
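
    // A sketch of a custom unary op (the callback and names below are illustrative, assuming
    // contiguous F32 tensors):
    //
    //   static void my_scale_op(struct ggml_tensor * dst, const struct ggml_tensor * a, int ith, int nth, void * userdata) {
    //       const float s = *(const float *) userdata;
    //       const int64_t n = ggml_nelements(dst);
    //       // each of the nth threads processes an interleaved slice of the elements
    //       for (int64_t i = ith; i < n; i += nth) {
    //           ((float *) dst->data)[i] = s * ((const float *) a->data)[i];
    //       }
    //   }
    //
    //   ...
    //
    //   float scale = 2.0f;
    //   struct ggml_tensor * y = ggml_map_custom1(ctx, x, my_scale_op, GGML_N_TASKS_MAX, &scale);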
2374
2375 GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
2376 struct ggml_context * ctx,
2377 struct ggml_tensor * a,
2378 ggml_custom1_op_t fun,
2379 int n_tasks,
2380 void * userdata);
2381
2382 GGML_API struct ggml_tensor * ggml_map_custom2(
2383 struct ggml_context * ctx,
2384 struct ggml_tensor * a,
2385 struct ggml_tensor * b,
2386 ggml_custom2_op_t fun,
2387 int n_tasks,
2388 void * userdata);
2389
2390 GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
2391 struct ggml_context * ctx,
2392 struct ggml_tensor * a,
2393 struct ggml_tensor * b,
2394 ggml_custom2_op_t fun,
2395 int n_tasks,
2396 void * userdata);
2397
2398 GGML_API struct ggml_tensor * ggml_map_custom3(
2399 struct ggml_context * ctx,
2400 struct ggml_tensor * a,
2401 struct ggml_tensor * b,
2402 struct ggml_tensor * c,
2403 ggml_custom3_op_t fun,
2404 int n_tasks,
2405 void * userdata);
2406
2407 GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
2408 struct ggml_context * ctx,
2409 struct ggml_tensor * a,
2410 struct ggml_tensor * b,
2411 struct ggml_tensor * c,
2412 ggml_custom3_op_t fun,
2413 int n_tasks,
2414 void * userdata);
2415
    typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst, int ith, int nth, void * userdata);
2417
2418 GGML_API struct ggml_tensor * ggml_custom_4d(
2419 struct ggml_context * ctx,
2420 enum ggml_type type,
2421 int64_t ne0,
2422 int64_t ne1,
2423 int64_t ne2,
2424 int64_t ne3,
2425 struct ggml_tensor ** args,
2426 int n_args,
2427 ggml_custom_op_t fun,
2428 int n_tasks,
2429 void * userdata);
2430
2431 GGML_API struct ggml_tensor * ggml_custom_inplace(
2432 struct ggml_context * ctx,
2433 struct ggml_tensor * a,
2434 struct ggml_tensor ** args,
2435 int n_args,
2436 ggml_custom_op_t fun,
2437 int n_tasks,
2438 void * userdata);
2439
2440 // loss function
2441
2442 GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
2443 struct ggml_context * ctx,
2444 struct ggml_tensor * a, // logits
2445 struct ggml_tensor * b); // labels
2446
2447 GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
2448 struct ggml_context * ctx,
2449 struct ggml_tensor * a, // logits
2450 struct ggml_tensor * b, // labels
2451 struct ggml_tensor * c); // gradients of cross_entropy_loss result
2452
2453 // AdamW optimizer step
2454 // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
2455 // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
2456 GGML_API struct ggml_tensor * ggml_opt_step_adamw(
2457 struct ggml_context * ctx,
2458 struct ggml_tensor * a,
2459 struct ggml_tensor * grad,
2460 struct ggml_tensor * m,
2461 struct ggml_tensor * v,
2462 struct ggml_tensor * adamw_params); // parameters such as the learning rate
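
    // As a reminder, the standard AdamW update for parameters x with gradient g is (not ggml-specific
    // notation; adamw_params supplies values such as alpha, beta1, beta2, eps and wd):
    //
    //   m  = beta1*m + (1 - beta1)*g
    //   v  = beta2*v + (1 - beta2)*g*g
    //   mh = m/(1 - beta1^t),  vh = v/(1 - beta2^t)
    //   x  = x - alpha*(mh/(sqrt(vh) + eps) + wd*x)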
2463
2464 // stochastic gradient descent step (with weight decay)
2465 GGML_API struct ggml_tensor * ggml_opt_step_sgd(
2466 struct ggml_context * ctx,
2467 struct ggml_tensor * a,
2468 struct ggml_tensor * grad,
2469 struct ggml_tensor * sgd_params); // alpha, weight decay
2470
2471 //
2472 // automatic differentiation
2473 //
2474
2475 GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2476 GGML_API void ggml_build_backward_expand(
2477 struct ggml_context * ctx, // context for gradient computation
2478 struct ggml_cgraph * cgraph,
2479 struct ggml_tensor ** grad_accs);
2480
2481 // graph allocation in a context
2482 GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
2483 GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
2484 GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
2485 GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
2486 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
2487 GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
2488
2489 GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
2490 GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
2491 GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
2492 GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
2493
2494 GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2495
2496 GGML_API size_t ggml_graph_overhead(void);
2497 GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
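
    // For example, a rough context size for holding only the graph metadata of up to N nodes
    // (a sketch; assumes the tensor data itself is allocated elsewhere):
    //
    //   size_t mem_size = ggml_tensor_overhead()*N + ggml_graph_overhead_custom(N, false);
    //
    //   struct ggml_init_params params = {
    //       .mem_size   = mem_size,
    //       .mem_buffer = NULL,
    //       .no_alloc   = true,
    //   };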
2498
2499 GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name);
2500 GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2501 GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2502
2503 // print info and performance information for the graph
2504 GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
2505
2506 // dump the graph into a file using the dot format
2507 GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
2508
    // TODO: these functions were sandwiched in the old optimization interface; is there a better place for them?
2510 typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
2511
2512 // Set callback for all future logging events.
2513 // If this is not called, or NULL is supplied, everything is output on stderr.
2514 GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
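
    // For example, a sketch of a callback that only forwards errors (the filter is illustrative):
    //
    //   static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
    //       (void) user_data;
    //       if (level == GGML_LOG_LEVEL_ERROR) {
    //           fputs(text, stderr);
    //       }
    //   }
    //
    //   ggml_log_set(my_log, NULL);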
2515
2516 GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
2517
2518 //
2519 // quantization
2520 //
2521
2522 // - ggml_quantize_init can be called multiple times with the same type
2523 // it will only initialize the quantization tables for the first call or after ggml_quantize_free
2524 // automatically called by ggml_quantize_chunk for convenience
2525 //
2526 // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
2527 // call this at the end of the program to avoid memory leaks
2528 //
2529 // note: these are thread-safe
2530 //
2531 GGML_API void ggml_quantize_init(enum ggml_type type);
2532 GGML_API void ggml_quantize_free(void);
2533
    // some quantization types cannot be used without an importance matrix
2535 GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2536
2537 // calls ggml_quantize_init internally (i.e. can allocate memory)
2538 GGML_API size_t ggml_quantize_chunk(
2539 enum ggml_type type,
2540 const float * src,
2541 void * dst,
2542 int64_t start,
2543 int64_t nrows,
2544 int64_t n_per_row,
2545 const float * imatrix);
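
    // A minimal sketch that quantizes nrows rows of n_per_row F32 values to Q4_0 (illustrative;
    // Q4_0 does not require an importance matrix, so imatrix may be NULL):
    //
    //   void * dst = malloc(ggml_row_size(GGML_TYPE_Q4_0, n_per_row)*nrows);
    //
    //   const size_t res = ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst, 0, nrows, n_per_row, NULL);
    //
    //   ...
    //
    //   ggml_quantize_free(); // once, at the end of the program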
2546
2547#ifdef __cplusplus
2548 // restrict not standard in C++
2549# if defined(__GNUC__)
2550# define GGML_RESTRICT __restrict__
2551# elif defined(__clang__)
2552# define GGML_RESTRICT __restrict
2553# elif defined(_MSC_VER)
2554# define GGML_RESTRICT __restrict
2555# else
2556# define GGML_RESTRICT
2557# endif
2558#else
2559# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
2560# define GGML_RESTRICT __restrict
2561# else
2562# define GGML_RESTRICT restrict
2563# endif
2564#endif
2565 typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2566 typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2567
2568 struct ggml_type_traits {
2569 const char * type_name;
2570 int64_t blck_size;
2571 int64_t blck_size_interleave; // interleave elements in blocks
2572 size_t type_size;
2573 bool is_quantized;
2574 ggml_to_float_t to_float;
2575 ggml_from_float_t from_float_ref;
2576 };
2577
2578 GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
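
    // For example, a sketch that dequantizes one row of n elements into a float buffer
    // (row_data, row_f32 and n are illustrative names):
    //
    //   const struct ggml_type_traits * tt = ggml_get_type_traits(tensor->type);
    //   if (tt->to_float) {
    //       tt->to_float(row_data, row_f32, n);
    //   }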
2579
2580 // ggml threadpool
2581 // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
    //       the goal should be to create an API that other backends can use, and to move everything to the ggml base
2583
2584 // scheduling priorities
2585 enum ggml_sched_priority {
2586 GGML_SCHED_PRIO_LOW = -1,
2587 GGML_SCHED_PRIO_NORMAL,
2588 GGML_SCHED_PRIO_MEDIUM,
2589 GGML_SCHED_PRIO_HIGH,
2590 GGML_SCHED_PRIO_REALTIME
2591 };
2592
2593 // threadpool params
2594 // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
2595 struct ggml_threadpool_params {
2596 bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
2597 int n_threads; // number of threads
2598 enum ggml_sched_priority prio; // thread priority
2599 uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
2600 bool strict_cpu; // strict cpu placement
2601 bool paused; // start in paused state
2602 };
2603
2604 struct ggml_threadpool; // forward declaration, see ggml.c
2605
2606 typedef struct ggml_threadpool * ggml_threadpool_t;
2607
2608 GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
2609 GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
2610 GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
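
    // For example, a sketch that prepares threadpool parameters (the actual threadpool creation
    // currently lives in the CPU backend):
    //
    //   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
    //
    //   tpp.prio = GGML_SCHED_PRIO_HIGH; // illustrative tweaks
    //   tpp.poll = 50;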
2611
2612#ifdef __cplusplus
2613}
2614#endif
2615