1#pragma once
2
3//
4// GGML Tensor Library
5//
6// This documentation is still a work in progress.
// If you would like specific topics to be covered, feel free to drop a comment:
8//
9// https://github.com/ggerganov/whisper.cpp/issues/40
10//
11// ## Overview
12//
13// This library implements:
14//
15// - a set of tensor operations
16// - automatic differentiation
17// - basic optimization algorithms
18//
// The aim of this library is to provide a minimalistic approach to various machine learning tasks. This includes,
20// but is not limited to, the following:
21//
22// - linear regression
23// - support vector machines
24// - neural networks
25//
26// The library allows the user to define a certain function using the available tensor operations. This function
27// definition is represented internally via a computation graph. Each tensor operation in the function definition
// corresponds to a node in the graph. Once the computation graph is defined, the user can choose to compute the
29// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
30// using one of the available optimization algorithms.
31//
32// For example, here we define the function: f(x) = a*x^2 + b
33//
34// {
35// struct ggml_init_params params = {
36// .mem_size = 16*1024*1024,
37// .mem_buffer = NULL,
38// };
39//
40// // memory allocation happens here
41// struct ggml_context * ctx = ggml_init(params);
42//
43// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
44//
// ggml_set_param(x); // x is an input variable
46//
47// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
48// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
49// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
50// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
51//
52// ...
53// }
54//
55// Notice that the function definition above does not involve any actual computation. The computation is performed only
56// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
57//
58// {
59// ...
60//
61// struct ggml_cgraph * gf = ggml_new_graph(ctx);
62// ggml_build_forward_expand(gf, f);
63//
64// // set the input variable and parameter values
65// ggml_set_f32(x, 2.0f);
66// ggml_set_f32(a, 3.0f);
67// ggml_set_f32(b, 4.0f);
68//
// ggml_graph_compute_with_ctx(ctx, gf, n_threads);
70//
71// printf("f = %f\n", ggml_get_f32_1d(f, 0));
72//
73// ...
74// }
75//
76// The actual computation is performed in the ggml_graph_compute() function.
77//
78// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
79// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
// buffer and, after defining the computation graph, call the ggml_used_mem() function to find out how much memory
// was actually needed.
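//
// For example, a minimal sketch of this pattern (the 128 MB buffer size is an arbitrary upper bound):
//
//   {
//       struct ggml_init_params params = {
//           .mem_size   = 128*1024*1024,
//           .mem_buffer = NULL,
//       };
//
//       struct ggml_context * ctx = ggml_init(params);
//
//       // ... define tensors and build the computation graph ...
//
//       printf("memory actually needed: %zu bytes\n", ggml_used_mem(ctx));
//
//       ggml_free(ctx);
//   }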
83//
84// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
85// differentiation and optimization algorithms.
86//
// The described approach makes it possible to define the function graph once and then compute its forward or backward graphs
88// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
89// the user can avoid the memory allocation overhead at runtime.
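//
// For example, continuing the example above, f can be re-evaluated at a different input without rebuilding the graph:
//
//   {
//       ...
//
//       ggml_set_f32(x, 5.0f);
//
//       ggml_graph_compute_with_ctx(ctx, gf, n_threads);
//
//       printf("f = %f\n", ggml_get_f32_1d(f, 0));
//   }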
90//
91// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
92// citizens, but in theory the library can be extended to support FP8 and integer data types.
93//
// Each tensor operation produces a new tensor. Initially, the library was envisioned to support only unary
// and binary operations. Most of the available operations fall into one of these two categories. Over time, it became
// clear that the library needs to support more complex operations. The best way to support them is not yet settled,
// but a few examples are demonstrated by the following operations:
98//
99// - ggml_permute()
100// - ggml_conv_1d_1s()
101// - ggml_conv_1d_2s()
102//
103// For each tensor operator, the library implements a forward and backward computation function. The forward function
104// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
105// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
106// calculus class, or watch the following video:
107//
108// What is Automatic Differentiation?
109// https://www.youtube.com/watch?v=wG_nF1awSSY
110//
111//
112// ## Tensor data (struct ggml_tensor)
113//
114// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
115// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
116// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
117//
118// {
119// struct ggml_tensor * c = ggml_add(ctx, a, b);
120//
121// assert(c->src[0] == a);
122// assert(c->src[1] == b);
123// }
124//
125// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This makes it
// possible to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
128// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
129// contiguous in memory.
130//
131// The data of the tensor is accessed via the "data" pointer. For example:
132//
133// {
134// const int nx = 2;
135// const int ny = 3;
136//
137// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
138//
139// for (int y = 0; y < ny; y++) {
140// for (int x = 0; x < nx; x++) {
141// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
142// }
143// }
144//
145// ...
146// }
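//
// Continuing the example above, a transposed view shares the same data but swaps the "ne" and "nb" entries of the
// first two dimensions, so it is not contiguous (a small sketch):
//
//   {
//       ...
//
//       struct ggml_tensor * at = ggml_transpose(ctx, a); // no data is copied
//
//       assert(at->ne[0] == ny && at->ne[1] == nx);
//       assert(!ggml_is_contiguous(at));
//   }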
147//
// Alternatively, there are helper functions such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
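//
// For example (a small sketch, reusing the context from the examples above):
//
//   {
//       struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3);
//
//       ggml_set_f32_1d(t, 0, 1.0f);
//       ggml_set_f32_1d(t, 1, 2.0f);
//       ggml_set_f32_1d(t, 2, 3.0f);
//
//       const float v = ggml_get_f32_1d(t, 1); // 2.0f
//   }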
149//
150// ## The matrix multiplication operator (ggml_mul_mat)
151//
152// TODO
153//
154//
155// ## Multi-threading
156//
157// TODO
158//
159//
160// ## Overview of ggml.c
161//
162// TODO
163//
164//
165// ## SIMD optimizations
166//
167// TODO
168//
169//
170// ## Debugging ggml
171//
172// TODO
173//
174//
175
176#ifdef GGML_SHARED
177# if defined(_WIN32) && !defined(__MINGW32__)
178# ifdef GGML_BUILD
179# define GGML_API __declspec(dllexport) extern
180# else
181# define GGML_API __declspec(dllimport) extern
182# endif
183# else
184# define GGML_API __attribute__ ((visibility ("default"))) extern
185# endif
186#else
187# define GGML_API extern
188#endif
189
190// TODO: support for clang
191#ifdef __GNUC__
192# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
193#elif defined(_MSC_VER)
194# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
195#else
196# define GGML_DEPRECATED(func, hint) func
197#endif
198
199#ifndef __GNUC__
200# define GGML_ATTRIBUTE_FORMAT(...)
201#elif defined(__MINGW32__) && !defined(__clang__)
202# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
203#else
204# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
205#endif
206
207#include <stdbool.h>
208#include <stddef.h>
209#include <stdint.h>
210#include <stdio.h>
211
212#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
213#define GGML_FILE_VERSION 2
214
215#define GGML_QNT_VERSION 2 // bump this on quantization format changes
216#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
217
218#define GGML_MAX_DIMS 4
219#define GGML_MAX_PARAMS 2048
220#define GGML_MAX_SRC 10
221#define GGML_MAX_N_THREADS 512
222#define GGML_MAX_OP_PARAMS 64
223
224#ifndef GGML_MAX_NAME
225# define GGML_MAX_NAME 64
226#endif
227
228#define GGML_DEFAULT_N_THREADS 4
229#define GGML_DEFAULT_GRAPH_SIZE 2048
230
231#if UINTPTR_MAX == 0xFFFFFFFF
232 #define GGML_MEM_ALIGN 4
233#else
234 #define GGML_MEM_ALIGN 16
235#endif
236
237#define GGML_EXIT_SUCCESS 0
238#define GGML_EXIT_ABORTED 1
239
240// TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
241#define GGML_ROPE_TYPE_NORMAL 0
242#define GGML_ROPE_TYPE_NEOX 2
243#define GGML_ROPE_TYPE_MROPE 8
244#define GGML_ROPE_TYPE_VISION 24
245#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
246
247#define GGML_MROPE_SECTIONS 4
248
249#define GGML_UNUSED(x) (void)(x)
250#ifdef __CUDACC__
251template<typename... Args>
252__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
253#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
254#else
255#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
256#endif // __CUDACC__
257
258#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
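// e.g. GGML_PAD(13, 8) == 16 (n is assumed to be a power of two)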
259
260#ifndef NDEBUG
261# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
262#elif defined(__GNUC__)
263# define GGML_UNREACHABLE() __builtin_unreachable()
264#elif defined(_MSC_VER)
265# define GGML_UNREACHABLE() __assume(0)
266#else
267# define GGML_UNREACHABLE() ((void) 0)
268#endif
269
270#ifdef __cplusplus
271# define GGML_NORETURN [[noreturn]]
272#elif defined(_MSC_VER)
273# define GGML_NORETURN __declspec(noreturn)
274#else
275# define GGML_NORETURN _Noreturn
276#endif
277
278#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
279#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
280
281// used to copy the number of elements and stride in bytes of tensors into local variables.
282// main purpose is to reduce code duplication and improve readability.
283//
284// example:
285//
286// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
287// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
288//
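// (the first line above defines local variables ne10, ne11, ne12, ne13 holding src1->ne[0..3],
//  the second defines nb10, nb11, nb12, nb13 holding src1->nb[0..3])
//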
289#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
290 const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
291 GGML_UNUSED(prefix##0);
292#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
293 GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
294 const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
295 GGML_UNUSED(prefix##1);
296#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
297 GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
298 const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
299 GGML_UNUSED(prefix##2);
300#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
301 GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
302 const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
303 GGML_UNUSED(prefix##3);
304
305#define GGML_TENSOR_UNARY_OP_LOCALS \
306 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
307 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
308 GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
309 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
310
311#define GGML_TENSOR_BINARY_OP_LOCALS \
312 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
313 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
314 GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
315 GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
316 GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
317 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
318
319#define GGML_TENSOR_TERNARY_OP_LOCALS \
320 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
321 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
322 GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
323 GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
324 GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
325 GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
326 GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
327 GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
328
329#define GGML_TENSOR_BINARY_OP_LOCALS01 \
330 GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
331 GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
332 GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
333 GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
334
335#ifdef __cplusplus
336extern "C" {
337#endif
338
339 // Function type used in fatal error callbacks
340 typedef void (*ggml_abort_callback_t)(const char * error_message);
341
342 // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
343 // Returns the old callback for chaining
344 GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
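
    // For example (a sketch; my_abort_handler is a hypothetical user-defined function):
    //
    //   static void my_abort_handler(const char * error_message) {
    //       fprintf(stderr, "ggml fatal error: %s\n", error_message);
    //       exit(1);
    //   }
    //
    //   ...
    //
    //   ggml_abort_callback_t prev = ggml_set_abort_callback(my_abort_handler);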
345
346 GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
347 GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
348
349 enum ggml_status {
350 GGML_STATUS_ALLOC_FAILED = -2,
351 GGML_STATUS_FAILED = -1,
352 GGML_STATUS_SUCCESS = 0,
353 GGML_STATUS_ABORTED = 1,
354 };
355
356 // get ggml_status name string
357 GGML_API const char * ggml_status_to_string(enum ggml_status status);
358
359 // ieee 754-2008 half-precision float16
360 // todo: make this not an integral type
361 typedef uint16_t ggml_fp16_t;
362 GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
363 GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
364 GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
365 GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
366
367 // google brain half-precision bfloat16
368 typedef struct { uint16_t bits; } ggml_bf16_t;
369 GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
370 GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
371 GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
372 GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
373 GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
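
    // For example (a small sketch):
    //
    //   float       src[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    //   ggml_fp16_t dst[4];
    //
    //   ggml_fp32_to_fp16_row(src, dst, 4);
    //
    //   const float back = ggml_fp16_to_fp32(dst[0]); // ~1.0f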
374
375 struct ggml_object;
376 struct ggml_context;
377 struct ggml_cgraph;
378
379 // NOTE: always add types at the end of the enum to keep backward compatibility
380 enum ggml_type {
381 GGML_TYPE_F32 = 0,
382 GGML_TYPE_F16 = 1,
383 GGML_TYPE_Q4_0 = 2,
384 GGML_TYPE_Q4_1 = 3,
385 // GGML_TYPE_Q4_2 = 4, support has been removed
386 // GGML_TYPE_Q4_3 = 5, support has been removed
387 GGML_TYPE_Q5_0 = 6,
388 GGML_TYPE_Q5_1 = 7,
389 GGML_TYPE_Q8_0 = 8,
390 GGML_TYPE_Q8_1 = 9,
391 GGML_TYPE_Q2_K = 10,
392 GGML_TYPE_Q3_K = 11,
393 GGML_TYPE_Q4_K = 12,
394 GGML_TYPE_Q5_K = 13,
395 GGML_TYPE_Q6_K = 14,
396 GGML_TYPE_Q8_K = 15,
397 GGML_TYPE_IQ2_XXS = 16,
398 GGML_TYPE_IQ2_XS = 17,
399 GGML_TYPE_IQ3_XXS = 18,
400 GGML_TYPE_IQ1_S = 19,
401 GGML_TYPE_IQ4_NL = 20,
402 GGML_TYPE_IQ3_S = 21,
403 GGML_TYPE_IQ2_S = 22,
404 GGML_TYPE_IQ4_XS = 23,
405 GGML_TYPE_I8 = 24,
406 GGML_TYPE_I16 = 25,
407 GGML_TYPE_I32 = 26,
408 GGML_TYPE_I64 = 27,
409 GGML_TYPE_F64 = 28,
410 GGML_TYPE_IQ1_M = 29,
411 GGML_TYPE_BF16 = 30,
412 // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
413 // GGML_TYPE_Q4_0_4_8 = 32,
414 // GGML_TYPE_Q4_0_8_8 = 33,
415 GGML_TYPE_TQ1_0 = 34,
416 GGML_TYPE_TQ2_0 = 35,
417 // GGML_TYPE_IQ4_NL_4_4 = 36,
418 // GGML_TYPE_IQ4_NL_4_8 = 37,
419 // GGML_TYPE_IQ4_NL_8_8 = 38,
420 GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
421 GGML_TYPE_COUNT = 40,
422 };
423
424 // precision
425 enum ggml_prec {
426 GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
427 GGML_PREC_F32 = 10,
428 };
429
430 // model file types
431 enum ggml_ftype {
432 GGML_FTYPE_UNKNOWN = -1,
433 GGML_FTYPE_ALL_F32 = 0,
434 GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
435 GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
436 GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
437 GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
438 GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
439 GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
440 GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
441 GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
442 GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
443 GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
444 GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
445 GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
446 GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
447 GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
448 GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
449 GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
450 GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
451 GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
452 GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
453 GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
454 GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
455 GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
456 GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
457 };
458
459 // available tensor operations:
460 enum ggml_op {
461 GGML_OP_NONE = 0,
462
463 GGML_OP_DUP,
464 GGML_OP_ADD,
465 GGML_OP_ADD_ID,
466 GGML_OP_ADD1,
467 GGML_OP_ACC,
468 GGML_OP_SUB,
469 GGML_OP_MUL,
470 GGML_OP_DIV,
471 GGML_OP_SQR,
472 GGML_OP_SQRT,
473 GGML_OP_LOG,
474 GGML_OP_SIN,
475 GGML_OP_COS,
476 GGML_OP_SUM,
477 GGML_OP_SUM_ROWS,
478 GGML_OP_MEAN,
479 GGML_OP_ARGMAX,
480 GGML_OP_COUNT_EQUAL,
481 GGML_OP_REPEAT,
482 GGML_OP_REPEAT_BACK,
483 GGML_OP_CONCAT,
484 GGML_OP_SILU_BACK,
485 GGML_OP_NORM, // normalize
486 GGML_OP_RMS_NORM,
487 GGML_OP_RMS_NORM_BACK,
488 GGML_OP_GROUP_NORM,
489 GGML_OP_L2_NORM,
490
491 GGML_OP_MUL_MAT,
492 GGML_OP_MUL_MAT_ID,
493 GGML_OP_OUT_PROD,
494
495 GGML_OP_SCALE,
496 GGML_OP_SET,
497 GGML_OP_CPY,
498 GGML_OP_CONT,
499 GGML_OP_RESHAPE,
500 GGML_OP_VIEW,
501 GGML_OP_PERMUTE,
502 GGML_OP_TRANSPOSE,
503 GGML_OP_GET_ROWS,
504 GGML_OP_GET_ROWS_BACK,
505 GGML_OP_SET_ROWS,
506 GGML_OP_DIAG,
507 GGML_OP_DIAG_MASK_INF,
508 GGML_OP_DIAG_MASK_ZERO,
509 GGML_OP_SOFT_MAX,
510 GGML_OP_SOFT_MAX_BACK,
511 GGML_OP_ROPE,
512 GGML_OP_ROPE_BACK,
513 GGML_OP_CLAMP,
514 GGML_OP_CONV_TRANSPOSE_1D,
515 GGML_OP_IM2COL,
516 GGML_OP_IM2COL_BACK,
517 GGML_OP_IM2COL_3D,
518 GGML_OP_CONV_2D,
519 GGML_OP_CONV_3D,
520 GGML_OP_CONV_2D_DW,
521 GGML_OP_CONV_TRANSPOSE_2D,
522 GGML_OP_POOL_1D,
523 GGML_OP_POOL_2D,
524 GGML_OP_POOL_2D_BACK,
525 GGML_OP_UPSCALE,
526 GGML_OP_PAD,
527 GGML_OP_PAD_REFLECT_1D,
528 GGML_OP_ROLL,
529 GGML_OP_ARANGE,
530 GGML_OP_TIMESTEP_EMBEDDING,
531 GGML_OP_ARGSORT,
532 GGML_OP_LEAKY_RELU,
533
534 GGML_OP_FLASH_ATTN_EXT,
535 GGML_OP_FLASH_ATTN_BACK,
536 GGML_OP_SSM_CONV,
537 GGML_OP_SSM_SCAN,
538 GGML_OP_WIN_PART,
539 GGML_OP_WIN_UNPART,
540 GGML_OP_GET_REL_POS,
541 GGML_OP_ADD_REL_POS,
542 GGML_OP_RWKV_WKV6,
543 GGML_OP_GATED_LINEAR_ATTN,
544 GGML_OP_RWKV_WKV7,
545
546 GGML_OP_UNARY,
547
548 GGML_OP_MAP_CUSTOM1,
549 GGML_OP_MAP_CUSTOM2,
550 GGML_OP_MAP_CUSTOM3,
551
552 GGML_OP_CUSTOM,
553
554 GGML_OP_CROSS_ENTROPY_LOSS,
555 GGML_OP_CROSS_ENTROPY_LOSS_BACK,
556 GGML_OP_OPT_STEP_ADAMW,
557 GGML_OP_OPT_STEP_SGD,
558
559 GGML_OP_GLU,
560
561 GGML_OP_COUNT,
562 };
563
564 enum ggml_unary_op {
565 GGML_UNARY_OP_ABS,
566 GGML_UNARY_OP_SGN,
567 GGML_UNARY_OP_NEG,
568 GGML_UNARY_OP_STEP,
569 GGML_UNARY_OP_TANH,
570 GGML_UNARY_OP_ELU,
571 GGML_UNARY_OP_RELU,
572 GGML_UNARY_OP_SIGMOID,
573 GGML_UNARY_OP_GELU,
574 GGML_UNARY_OP_GELU_QUICK,
575 GGML_UNARY_OP_SILU,
576 GGML_UNARY_OP_HARDSWISH,
577 GGML_UNARY_OP_HARDSIGMOID,
578 GGML_UNARY_OP_EXP,
579 GGML_UNARY_OP_GELU_ERF,
580 GGML_UNARY_OP_XIELU,
581 GGML_UNARY_OP_FLOOR,
582 GGML_UNARY_OP_CEIL,
583 GGML_UNARY_OP_ROUND,
584 GGML_UNARY_OP_TRUNC,
585
586 GGML_UNARY_OP_COUNT,
587 };
588
589 enum ggml_glu_op {
590 GGML_GLU_OP_REGLU,
591 GGML_GLU_OP_GEGLU,
592 GGML_GLU_OP_SWIGLU,
593 GGML_GLU_OP_SWIGLU_OAI,
594 GGML_GLU_OP_GEGLU_ERF,
595 GGML_GLU_OP_GEGLU_QUICK,
596
597 GGML_GLU_OP_COUNT,
598 };
599
600 enum ggml_object_type {
601 GGML_OBJECT_TYPE_TENSOR,
602 GGML_OBJECT_TYPE_GRAPH,
603 GGML_OBJECT_TYPE_WORK_BUFFER
604 };
605
606 enum ggml_log_level {
607 GGML_LOG_LEVEL_NONE = 0,
608 GGML_LOG_LEVEL_DEBUG = 1,
609 GGML_LOG_LEVEL_INFO = 2,
610 GGML_LOG_LEVEL_WARN = 3,
611 GGML_LOG_LEVEL_ERROR = 4,
612 GGML_LOG_LEVEL_CONT = 5, // continue previous log
613 };
614
615 // this tensor...
616 enum ggml_tensor_flag {
617 GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
618 GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
619 GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
620 GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
621 };
622
623 struct ggml_init_params {
624 // memory pool
625 size_t mem_size; // bytes
626 void * mem_buffer; // if NULL, memory will be allocated internally
627 bool no_alloc; // don't allocate memory for the tensor data
628 };
629
630 // n-dimensional tensor
631 struct ggml_tensor {
632 enum ggml_type type;
633
634 struct ggml_backend_buffer * buffer;
635
636 int64_t ne[GGML_MAX_DIMS]; // number of elements
637 size_t nb[GGML_MAX_DIMS]; // stride in bytes:
638 // nb[0] = ggml_type_size(type)
639 // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
640 // nb[i] = nb[i-1] * ne[i-1]
641
642 // compute data
643 enum ggml_op op;
644
645 // op params - allocated as int32_t for alignment
646 int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
647
648 int32_t flags;
649
650 struct ggml_tensor * src[GGML_MAX_SRC];
651
652 // source tensor and offset for views
653 struct ggml_tensor * view_src;
654 size_t view_offs;
655
656 void * data;
657
658 char name[GGML_MAX_NAME];
659
660 void * extra; // extra things e.g. for ggml-cuda.cu
661
662 char padding[8];
663 };
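
    // for non-quantized types, the address of element (i0, i1, i2, i3) of a tensor t can be computed as
    // (a sketch, equivalent to the example in the overview above):
    //
    //   (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3]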
664
665 static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
666
667 // Abort callback
668 // If not NULL, called before ggml computation
669 // If it returns true, the computation is aborted
670 typedef bool (*ggml_abort_callback)(void * data);
671
672
673 //
674 // GUID
675 //
676
677 // GUID types
678 typedef uint8_t ggml_guid[16];
679 typedef ggml_guid * ggml_guid_t;
680
681 GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b);
682
683 // misc
684
685 GGML_API const char * ggml_version(void);
686 GGML_API const char * ggml_commit(void);
687
688 GGML_API void ggml_time_init(void); // call this once at the beginning of the program
689 GGML_API int64_t ggml_time_ms(void);
690 GGML_API int64_t ggml_time_us(void);
691 GGML_API int64_t ggml_cycles(void);
692 GGML_API int64_t ggml_cycles_per_ms(void);
693
694 // accepts a UTF-8 path, even on Windows
695 GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
696
697 GGML_API void ggml_print_object (const struct ggml_object * obj);
698 GGML_API void ggml_print_objects(const struct ggml_context * ctx);
699
700 GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
701 GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
702 GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
703 GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
704
705 GGML_API int64_t ggml_blck_size(enum ggml_type type);
706 GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
707 GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
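
    // for example, for a contiguous 2D F16 tensor t with ne = [4096, 8] (a sketch):
    //
    //   ggml_nelements(t) == 4096*8
    //   ggml_nrows(t)     == 8
    //   ggml_row_size(GGML_TYPE_F16, 4096) == 4096*sizeof(ggml_fp16_t)
    //   ggml_nbytes(t)    == 8*ggml_row_size(GGML_TYPE_F16, 4096)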
708
709 GGML_DEPRECATED(
710 GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
711 "use ggml_row_size() instead");
712
713 GGML_API const char * ggml_type_name(enum ggml_type type);
714 GGML_API const char * ggml_op_name (enum ggml_op op);
715 GGML_API const char * ggml_op_symbol(enum ggml_op op);
716
717 GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
718 GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op);
719 GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
720
721 GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
722
723 GGML_API bool ggml_is_quantized(enum ggml_type type);
724
725 // TODO: temporary until model loading of ggml examples is refactored
726 GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
727
728 GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
729 GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
730 GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
731 GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
732 GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
733 GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
734 GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
735 GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
736
737 // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
738 GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
739 GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
740 GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
741 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
742
743 // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
744 GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
745
746 // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
747 GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
748
749 // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
750 GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
751
752 GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
753 GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
754
755 GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
756
757 // use this to compute the memory overhead of a tensor
758 GGML_API size_t ggml_tensor_overhead(void);
759
760 GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
761
762 // main
763
764 GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
765 GGML_API void ggml_reset(struct ggml_context * ctx);
766 GGML_API void ggml_free (struct ggml_context * ctx);
767
768 GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
769
770 GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
771 GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
772
773 GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
774 GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
775 GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
776
777 GGML_API struct ggml_tensor * ggml_new_tensor(
778 struct ggml_context * ctx,
779 enum ggml_type type,
780 int n_dims,
781 const int64_t *ne);
782
783 GGML_API struct ggml_tensor * ggml_new_tensor_1d(
784 struct ggml_context * ctx,
785 enum ggml_type type,
786 int64_t ne0);
787
788 GGML_API struct ggml_tensor * ggml_new_tensor_2d(
789 struct ggml_context * ctx,
790 enum ggml_type type,
791 int64_t ne0,
792 int64_t ne1);
793
794 GGML_API struct ggml_tensor * ggml_new_tensor_3d(
795 struct ggml_context * ctx,
796 enum ggml_type type,
797 int64_t ne0,
798 int64_t ne1,
799 int64_t ne2);
800
801 GGML_API struct ggml_tensor * ggml_new_tensor_4d(
802 struct ggml_context * ctx,
803 enum ggml_type type,
804 int64_t ne0,
805 int64_t ne1,
806 int64_t ne2,
807 int64_t ne3);
808
809 GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
810
811 GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
812 GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
813
814 // Context tensor enumeration and lookup
815 GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
816 GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
817 GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
818
819 // Converts a flat index into coordinates
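    // e.g. for a tensor with ne = [4, 3, 1, 1], the flat index 7 unravels to (i0, i1, i2, i3) = (3, 1, 0, 0)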
820 GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
821
822 GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
823 GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor);
824
825 GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
826 GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
827
828 GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
829 GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
830 GGML_ATTRIBUTE_FORMAT(2, 3)
831 GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
832
833 // Tensor flags
834 GGML_API void ggml_set_input(struct ggml_tensor * tensor);
835 GGML_API void ggml_set_output(struct ggml_tensor * tensor);
836 GGML_API void ggml_set_param(struct ggml_tensor * tensor);
837 GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
838
839 //
840 // operations on tensors with backpropagation
841 //
842
843 GGML_API struct ggml_tensor * ggml_dup(
844 struct ggml_context * ctx,
845 struct ggml_tensor * a);
846
847 // in-place, returns view(a)
848 GGML_API struct ggml_tensor * ggml_dup_inplace(
849 struct ggml_context * ctx,
850 struct ggml_tensor * a);
851
852 GGML_API struct ggml_tensor * ggml_add(
853 struct ggml_context * ctx,
854 struct ggml_tensor * a,
855 struct ggml_tensor * b);
856
857 GGML_API struct ggml_tensor * ggml_add_inplace(
858 struct ggml_context * ctx,
859 struct ggml_tensor * a,
860 struct ggml_tensor * b);
861
862 GGML_API struct ggml_tensor * ggml_add_cast(
863 struct ggml_context * ctx,
864 struct ggml_tensor * a,
865 struct ggml_tensor * b,
866 enum ggml_type type);
867
868 // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
869 GGML_API struct ggml_tensor * ggml_add_id(
870 struct ggml_context * ctx,
871 struct ggml_tensor * a,
872 struct ggml_tensor * b,
873 struct ggml_tensor * ids);
874
875 GGML_API struct ggml_tensor * ggml_add1(
876 struct ggml_context * ctx,
877 struct ggml_tensor * a,
878 struct ggml_tensor * b);
879
880 GGML_API struct ggml_tensor * ggml_add1_inplace(
881 struct ggml_context * ctx,
882 struct ggml_tensor * a,
883 struct ggml_tensor * b);
884
885 // dst = a
886 // view(dst, nb1, nb2, nb3, offset) += b
887 // return dst
888 GGML_API struct ggml_tensor * ggml_acc(
889 struct ggml_context * ctx,
890 struct ggml_tensor * a,
891 struct ggml_tensor * b,
892 size_t nb1,
893 size_t nb2,
894 size_t nb3,
895 size_t offset);
896
897 GGML_API struct ggml_tensor * ggml_acc_inplace(
898 struct ggml_context * ctx,
899 struct ggml_tensor * a,
900 struct ggml_tensor * b,
901 size_t nb1,
902 size_t nb2,
903 size_t nb3,
904 size_t offset);
905
906 GGML_API struct ggml_tensor * ggml_sub(
907 struct ggml_context * ctx,
908 struct ggml_tensor * a,
909 struct ggml_tensor * b);
910
911 GGML_API struct ggml_tensor * ggml_sub_inplace(
912 struct ggml_context * ctx,
913 struct ggml_tensor * a,
914 struct ggml_tensor * b);
915
916 GGML_API struct ggml_tensor * ggml_mul(
917 struct ggml_context * ctx,
918 struct ggml_tensor * a,
919 struct ggml_tensor * b);
920
921 GGML_API struct ggml_tensor * ggml_mul_inplace(
922 struct ggml_context * ctx,
923 struct ggml_tensor * a,
924 struct ggml_tensor * b);
925
926 GGML_API struct ggml_tensor * ggml_div(
927 struct ggml_context * ctx,
928 struct ggml_tensor * a,
929 struct ggml_tensor * b);
930
931 GGML_API struct ggml_tensor * ggml_div_inplace(
932 struct ggml_context * ctx,
933 struct ggml_tensor * a,
934 struct ggml_tensor * b);
935
936 GGML_API struct ggml_tensor * ggml_sqr(
937 struct ggml_context * ctx,
938 struct ggml_tensor * a);
939
940 GGML_API struct ggml_tensor * ggml_sqr_inplace(
941 struct ggml_context * ctx,
942 struct ggml_tensor * a);
943
944 GGML_API struct ggml_tensor * ggml_sqrt(
945 struct ggml_context * ctx,
946 struct ggml_tensor * a);
947
948 GGML_API struct ggml_tensor * ggml_sqrt_inplace(
949 struct ggml_context * ctx,
950 struct ggml_tensor * a);
951
952 GGML_API struct ggml_tensor * ggml_log(
953 struct ggml_context * ctx,
954 struct ggml_tensor * a);
955
956 GGML_API struct ggml_tensor * ggml_log_inplace(
957 struct ggml_context * ctx,
958 struct ggml_tensor * a);
959
960 GGML_API struct ggml_tensor * ggml_sin(
961 struct ggml_context * ctx,
962 struct ggml_tensor * a);
963
964 GGML_API struct ggml_tensor * ggml_sin_inplace(
965 struct ggml_context * ctx,
966 struct ggml_tensor * a);
967
968 GGML_API struct ggml_tensor * ggml_cos(
969 struct ggml_context * ctx,
970 struct ggml_tensor * a);
971
972 GGML_API struct ggml_tensor * ggml_cos_inplace(
973 struct ggml_context * ctx,
974 struct ggml_tensor * a);
975
976 // return scalar
977 GGML_API struct ggml_tensor * ggml_sum(
978 struct ggml_context * ctx,
979 struct ggml_tensor * a);
980
    // sums along rows: for input shape [a,b,c,d], the result has shape [1,b,c,d]
982 GGML_API struct ggml_tensor * ggml_sum_rows(
983 struct ggml_context * ctx,
984 struct ggml_tensor * a);
985
986 // mean along rows
987 GGML_API struct ggml_tensor * ggml_mean(
988 struct ggml_context * ctx,
989 struct ggml_tensor * a);
990
991 // argmax along rows
992 GGML_API struct ggml_tensor * ggml_argmax(
993 struct ggml_context * ctx,
994 struct ggml_tensor * a);
995
996 // count number of equal elements in a and b
997 GGML_API struct ggml_tensor * ggml_count_equal(
998 struct ggml_context * ctx,
999 struct ggml_tensor * a,
1000 struct ggml_tensor * b);
1001
    // if a is the same shape as b, and a is not a parameter, return a
1003 // otherwise, return a new tensor: repeat(a) to fit in b
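    //
    // for example (a sketch): broadcasting a bias vector over a matrix, with placeholder names
    //
    //   // bias: [n_out], cur: [n_out, n_tokens]
    //   cur = ggml_add(ctx, cur, ggml_repeat(ctx, bias, cur)); // result: [n_out, n_tokens]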
1004 GGML_API struct ggml_tensor * ggml_repeat(
1005 struct ggml_context * ctx,
1006 struct ggml_tensor * a,
1007 struct ggml_tensor * b);
1008
1009 // repeat a to the specified shape
1010 GGML_API struct ggml_tensor * ggml_repeat_4d(
1011 struct ggml_context * ctx,
1012 struct ggml_tensor * a,
1013 int64_t ne0,
1014 int64_t ne1,
1015 int64_t ne2,
1016 int64_t ne3);
1017
1018 // sums repetitions in a into shape of b
1019 GGML_API struct ggml_tensor * ggml_repeat_back(
1020 struct ggml_context * ctx,
1021 struct ggml_tensor * a,
1022 struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
1023
1024 // concat a and b along dim
1025 // used in stable-diffusion
1026 GGML_API struct ggml_tensor * ggml_concat(
1027 struct ggml_context * ctx,
1028 struct ggml_tensor * a,
1029 struct ggml_tensor * b,
1030 int dim);
1031
1032 GGML_API struct ggml_tensor * ggml_abs(
1033 struct ggml_context * ctx,
1034 struct ggml_tensor * a);
1035
1036 GGML_API struct ggml_tensor * ggml_abs_inplace(
1037 struct ggml_context * ctx,
1038 struct ggml_tensor * a);
1039
1040 GGML_API struct ggml_tensor * ggml_sgn(
1041 struct ggml_context * ctx,
1042 struct ggml_tensor * a);
1043
1044 GGML_API struct ggml_tensor * ggml_sgn_inplace(
1045 struct ggml_context * ctx,
1046 struct ggml_tensor * a);
1047
1048 GGML_API struct ggml_tensor * ggml_neg(
1049 struct ggml_context * ctx,
1050 struct ggml_tensor * a);
1051
1052 GGML_API struct ggml_tensor * ggml_neg_inplace(
1053 struct ggml_context * ctx,
1054 struct ggml_tensor * a);
1055
1056 GGML_API struct ggml_tensor * ggml_step(
1057 struct ggml_context * ctx,
1058 struct ggml_tensor * a);
1059
1060 GGML_API struct ggml_tensor * ggml_step_inplace(
1061 struct ggml_context * ctx,
1062 struct ggml_tensor * a);
1063
1064 GGML_API struct ggml_tensor * ggml_tanh(
1065 struct ggml_context * ctx,
1066 struct ggml_tensor * a);
1067
1068 GGML_API struct ggml_tensor * ggml_tanh_inplace(
1069 struct ggml_context * ctx,
1070 struct ggml_tensor * a);
1071
1072 GGML_API struct ggml_tensor * ggml_elu(
1073 struct ggml_context * ctx,
1074 struct ggml_tensor * a);
1075
1076 GGML_API struct ggml_tensor * ggml_elu_inplace(
1077 struct ggml_context * ctx,
1078 struct ggml_tensor * a);
1079
1080 GGML_API struct ggml_tensor * ggml_relu(
1081 struct ggml_context * ctx,
1082 struct ggml_tensor * a);
1083
1084 GGML_API struct ggml_tensor * ggml_leaky_relu(
1085 struct ggml_context * ctx,
1086 struct ggml_tensor * a, float negative_slope, bool inplace);
1087
1088 GGML_API struct ggml_tensor * ggml_relu_inplace(
1089 struct ggml_context * ctx,
1090 struct ggml_tensor * a);
1091
1092 GGML_API struct ggml_tensor * ggml_sigmoid(
1093 struct ggml_context * ctx,
1094 struct ggml_tensor * a);
1095
1096 GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
1097 struct ggml_context * ctx,
1098 struct ggml_tensor * a);
1099
1100 GGML_API struct ggml_tensor * ggml_gelu(
1101 struct ggml_context * ctx,
1102 struct ggml_tensor * a);
1103
1104 GGML_API struct ggml_tensor * ggml_gelu_inplace(
1105 struct ggml_context * ctx,
1106 struct ggml_tensor * a);
1107
1108 // GELU using erf (error function) when possible
    // some backends may fall back to an approximation based on the Abramowitz and Stegun formula
1110 GGML_API struct ggml_tensor * ggml_gelu_erf(
1111 struct ggml_context * ctx,
1112 struct ggml_tensor * a);
1113
1114 GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
1115 struct ggml_context * ctx,
1116 struct ggml_tensor * a);
1117
1118 GGML_API struct ggml_tensor * ggml_gelu_quick(
1119 struct ggml_context * ctx,
1120 struct ggml_tensor * a);
1121
1122 GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
1123 struct ggml_context * ctx,
1124 struct ggml_tensor * a);
1125
1126 GGML_API struct ggml_tensor * ggml_silu(
1127 struct ggml_context * ctx,
1128 struct ggml_tensor * a);
1129
1130 GGML_API struct ggml_tensor * ggml_silu_inplace(
1131 struct ggml_context * ctx,
1132 struct ggml_tensor * a);
1133
1134 // a - x
1135 // b - dy
1136 GGML_API struct ggml_tensor * ggml_silu_back(
1137 struct ggml_context * ctx,
1138 struct ggml_tensor * a,
1139 struct ggml_tensor * b);
1140
1141 // hardswish(x) = x * relu6(x + 3) / 6
1142 GGML_API struct ggml_tensor * ggml_hardswish(
1143 struct ggml_context * ctx,
1144 struct ggml_tensor * a);
1145
1146 // hardsigmoid(x) = relu6(x + 3) / 6
1147 GGML_API struct ggml_tensor * ggml_hardsigmoid(
1148 struct ggml_context * ctx,
1149 struct ggml_tensor * a);
1150
1151 GGML_API struct ggml_tensor * ggml_exp(
1152 struct ggml_context * ctx,
1153 struct ggml_tensor * a);
1154
1155 GGML_API struct ggml_tensor * ggml_exp_inplace(
1156 struct ggml_context * ctx,
1157 struct ggml_tensor * a);
1158
1159 GGML_API struct ggml_tensor * ggml_floor(
1160 struct ggml_context * ctx,
1161 struct ggml_tensor * a);
1162
1163 GGML_API struct ggml_tensor * ggml_floor_inplace(
1164 struct ggml_context * ctx,
1165 struct ggml_tensor * a);
1166
1167 GGML_API struct ggml_tensor * ggml_ceil(
1168 struct ggml_context * ctx,
1169 struct ggml_tensor * a);
1170
1171 GGML_API struct ggml_tensor * ggml_ceil_inplace(
1172 struct ggml_context * ctx,
1173 struct ggml_tensor * a);
1174
1175 GGML_API struct ggml_tensor * ggml_round(
1176 struct ggml_context * ctx,
1177 struct ggml_tensor * a);
1178
1179 GGML_API struct ggml_tensor * ggml_round_inplace(
1180 struct ggml_context * ctx,
1181 struct ggml_tensor * a);
1182
    // truncates the fractional part of each element in the tensor (towards zero)
    // e.g. trunc(3.7) = 3.0, trunc(-2.9) = -2.0 (similar to std::trunc in C/C++)
1189 GGML_API struct ggml_tensor * ggml_trunc(
1190 struct ggml_context * ctx,
1191 struct ggml_tensor * a);
1192
1193 GGML_API struct ggml_tensor * ggml_trunc_inplace(
1194 struct ggml_context * ctx,
1195 struct ggml_tensor * a);
1196
1199 // xIELU activation function
1200 // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
1201 // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
1202 // that constrain the positive and negative source alpha values respectively
1203 GGML_API struct ggml_tensor * ggml_xielu(
1204 struct ggml_context * ctx,
1205 struct ggml_tensor * a,
1206 float alpha_n,
1207 float alpha_p,
1208 float beta,
1209 float eps);
1210
1211 // gated linear unit ops
1212 // A: n columns, r rows,
1213 // result is n / 2 columns, r rows,
1214 // expects gate in second half of row, unless swapped is true
1215 GGML_API struct ggml_tensor * ggml_glu(
1216 struct ggml_context * ctx,
1217 struct ggml_tensor * a,
1218 enum ggml_glu_op op,
1219 bool swapped);
1220
1221 GGML_API struct ggml_tensor * ggml_reglu(
1222 struct ggml_context * ctx,
1223 struct ggml_tensor * a);
1224
1225 GGML_API struct ggml_tensor * ggml_reglu_swapped(
1226 struct ggml_context * ctx,
1227 struct ggml_tensor * a);
1228
1229 GGML_API struct ggml_tensor * ggml_geglu(
1230 struct ggml_context * ctx,
1231 struct ggml_tensor * a);
1232
1233 GGML_API struct ggml_tensor * ggml_geglu_swapped(
1234 struct ggml_context * ctx,
1235 struct ggml_tensor * a);
1236
1237 GGML_API struct ggml_tensor * ggml_swiglu(
1238 struct ggml_context * ctx,
1239 struct ggml_tensor * a);
1240
1241 GGML_API struct ggml_tensor * ggml_swiglu_swapped(
1242 struct ggml_context * ctx,
1243 struct ggml_tensor * a);
1244
1245 GGML_API struct ggml_tensor * ggml_geglu_erf(
1246 struct ggml_context * ctx,
1247 struct ggml_tensor * a);
1248
1249 GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
1250 struct ggml_context * ctx,
1251 struct ggml_tensor * a);
1252
1253 GGML_API struct ggml_tensor * ggml_geglu_quick(
1254 struct ggml_context * ctx,
1255 struct ggml_tensor * a);
1256
1257 GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
1258 struct ggml_context * ctx,
1259 struct ggml_tensor * a);
1260
1261 // A: n columns, r rows,
1262 // B: n columns, r rows,
1263 GGML_API struct ggml_tensor * ggml_glu_split(
1264 struct ggml_context * ctx,
1265 struct ggml_tensor * a,
1266 struct ggml_tensor * b,
1267 enum ggml_glu_op op);
1268
1269 GGML_API struct ggml_tensor * ggml_reglu_split(
1270 struct ggml_context * ctx,
1271 struct ggml_tensor * a,
1272 struct ggml_tensor * b);
1273
1274 GGML_API struct ggml_tensor * ggml_geglu_split(
1275 struct ggml_context * ctx,
1276 struct ggml_tensor * a,
1277 struct ggml_tensor * b);
1278
1279 GGML_API struct ggml_tensor * ggml_swiglu_split(
1280 struct ggml_context * ctx,
1281 struct ggml_tensor * a,
1282 struct ggml_tensor * b);
1283
1284 GGML_API struct ggml_tensor * ggml_geglu_erf_split(
1285 struct ggml_context * ctx,
1286 struct ggml_tensor * a,
1287 struct ggml_tensor * b);
1288
1289 GGML_API struct ggml_tensor * ggml_geglu_quick_split(
1290 struct ggml_context * ctx,
1291 struct ggml_tensor * a,
1292 struct ggml_tensor * b);
1293
1294 GGML_API struct ggml_tensor * ggml_swiglu_oai(
1295 struct ggml_context * ctx,
1296 struct ggml_tensor * a,
1297 struct ggml_tensor * b,
1298 float alpha,
1299 float limit);
1300
1301 // normalize along rows
1302 GGML_API struct ggml_tensor * ggml_norm(
1303 struct ggml_context * ctx,
1304 struct ggml_tensor * a,
1305 float eps);
1306
1307 GGML_API struct ggml_tensor * ggml_norm_inplace(
1308 struct ggml_context * ctx,
1309 struct ggml_tensor * a,
1310 float eps);
1311
1312 GGML_API struct ggml_tensor * ggml_rms_norm(
1313 struct ggml_context * ctx,
1314 struct ggml_tensor * a,
1315 float eps);
1316
1317 GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
1318 struct ggml_context * ctx,
1319 struct ggml_tensor * a,
1320 float eps);
1321
1322 // group normalize along ne0*ne1*n_groups
1323 // used in stable-diffusion
1324 GGML_API struct ggml_tensor * ggml_group_norm(
1325 struct ggml_context * ctx,
1326 struct ggml_tensor * a,
1327 int n_groups,
1328 float eps);
1329
1330 GGML_API struct ggml_tensor * ggml_group_norm_inplace(
1331 struct ggml_context * ctx,
1332 struct ggml_tensor * a,
1333 int n_groups,
1334 float eps);
1335
1336 // l2 normalize along rows
1337 // used in rwkv v7
1338 GGML_API struct ggml_tensor * ggml_l2_norm(
1339 struct ggml_context * ctx,
1340 struct ggml_tensor * a,
1341 float eps);
1342
1343 GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
1344 struct ggml_context * ctx,
1345 struct ggml_tensor * a,
1346 float eps);
1347
1348 // a - x
1349 // b - dy
1350 GGML_API struct ggml_tensor * ggml_rms_norm_back(
1351 struct ggml_context * ctx,
1352 struct ggml_tensor * a,
1353 struct ggml_tensor * b,
1354 float eps);
1355
1356 // A: k columns, n rows => [ne03, ne02, n, k]
1357 // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
1358 // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
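    //
    // for example (a sketch): y = W*x, with placeholder sizes n_in and n_out
    //
    //   struct ggml_tensor * W = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out); // W->ne[0] == n_in
    //   struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_in);
    //   struct ggml_tensor * y = ggml_mul_mat(ctx, W, x);                             // y->ne[0] == n_out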
1359 GGML_API struct ggml_tensor * ggml_mul_mat(
1360 struct ggml_context * ctx,
1361 struct ggml_tensor * a,
1362 struct ggml_tensor * b);
1363
1364 // change the precision of a matrix multiplication
1365 // set to GGML_PREC_F32 for higher precision (useful for phi-2)
1366 GGML_API void ggml_mul_mat_set_prec(
1367 struct ggml_tensor * a,
1368 enum ggml_prec prec);
1369
1370 // indirect matrix multiplication
1371 GGML_API struct ggml_tensor * ggml_mul_mat_id(
1372 struct ggml_context * ctx,
1373 struct ggml_tensor * as,
1374 struct ggml_tensor * b,
1375 struct ggml_tensor * ids);
1376
1377 // A: m columns, n rows,
1378 // B: p columns, n rows,
1379 // result is m columns, p rows
1380 GGML_API struct ggml_tensor * ggml_out_prod(
1381 struct ggml_context * ctx,
1382 struct ggml_tensor * a,
1383 struct ggml_tensor * b);
1384
1385 //
1386 // operations on tensors without backpropagation
1387 //
1388
1389 GGML_API struct ggml_tensor * ggml_scale(
1390 struct ggml_context * ctx,
1391 struct ggml_tensor * a,
1392 float s);
1393
1394 // in-place, returns view(a)
1395 GGML_API struct ggml_tensor * ggml_scale_inplace(
1396 struct ggml_context * ctx,
1397 struct ggml_tensor * a,
1398 float s);
1399
1400 // x = s * a + b
1401 GGML_API struct ggml_tensor * ggml_scale_bias(
1402 struct ggml_context * ctx,
1403 struct ggml_tensor * a,
1404 float s,
1405 float b);
1406
1407 GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
1408 struct ggml_context * ctx,
1409 struct ggml_tensor * a,
1410 float s,
1411 float b);
1412
    // b -> view(a, offset, nb1, nb2, nb3), return modified a
1414 GGML_API struct ggml_tensor * ggml_set(
1415 struct ggml_context * ctx,
1416 struct ggml_tensor * a,
1417 struct ggml_tensor * b,
1418 size_t nb1,
1419 size_t nb2,
1420 size_t nb3,
1421 size_t offset); // in bytes
1422
    // b -> view(a, offset, nb1, nb2, nb3), return view(a)
1424 GGML_API struct ggml_tensor * ggml_set_inplace(
1425 struct ggml_context * ctx,
1426 struct ggml_tensor * a,
1427 struct ggml_tensor * b,
1428 size_t nb1,
1429 size_t nb2,
1430 size_t nb3,
1431 size_t offset); // in bytes
1432
1433 GGML_API struct ggml_tensor * ggml_set_1d(
1434 struct ggml_context * ctx,
1435 struct ggml_tensor * a,
1436 struct ggml_tensor * b,
1437 size_t offset); // in bytes
1438
1439 GGML_API struct ggml_tensor * ggml_set_1d_inplace(
1440 struct ggml_context * ctx,
1441 struct ggml_tensor * a,
1442 struct ggml_tensor * b,
1443 size_t offset); // in bytes
1444
    // b -> view(a, offset, nb1, nb2, nb3), return modified a
1446 GGML_API struct ggml_tensor * ggml_set_2d(
1447 struct ggml_context * ctx,
1448 struct ggml_tensor * a,
1449 struct ggml_tensor * b,
1450 size_t nb1,
1451 size_t offset); // in bytes
1452
    // b -> view(a, offset, nb1, nb2, nb3), return view(a)
1454 GGML_API struct ggml_tensor * ggml_set_2d_inplace(
1455 struct ggml_context * ctx,
1456 struct ggml_tensor * a,
1457 struct ggml_tensor * b,
1458 size_t nb1,
1459 size_t offset); // in bytes
1460
1461 // a -> b, return view(b)
1462 GGML_API struct ggml_tensor * ggml_cpy(
1463 struct ggml_context * ctx,
1464 struct ggml_tensor * a,
1465 struct ggml_tensor * b);
1466
1467 // note: casting from f32 to i32 will discard the fractional part
1468 GGML_API struct ggml_tensor * ggml_cast(
1469 struct ggml_context * ctx,
1470 struct ggml_tensor * a,
1471 enum ggml_type type);
1472
1473 // make contiguous
1474 GGML_API struct ggml_tensor * ggml_cont(
1475 struct ggml_context * ctx,
1476 struct ggml_tensor * a);
1477
1478 // make contiguous, with new shape
1479 GGML_API struct ggml_tensor * ggml_cont_1d(
1480 struct ggml_context * ctx,
1481 struct ggml_tensor * a,
1482 int64_t ne0);
1483
1484 GGML_API struct ggml_tensor * ggml_cont_2d(
1485 struct ggml_context * ctx,
1486 struct ggml_tensor * a,
1487 int64_t ne0,
1488 int64_t ne1);
1489
1490 GGML_API struct ggml_tensor * ggml_cont_3d(
1491 struct ggml_context * ctx,
1492 struct ggml_tensor * a,
1493 int64_t ne0,
1494 int64_t ne1,
1495 int64_t ne2);
1496
1497 GGML_API struct ggml_tensor * ggml_cont_4d(
1498 struct ggml_context * ctx,
1499 struct ggml_tensor * a,
1500 int64_t ne0,
1501 int64_t ne1,
1502 int64_t ne2,
1503 int64_t ne3);
1504
1505 // return view(a), b specifies the new shape
1506 // TODO: when we start computing gradient, make a copy instead of view
1507 GGML_API struct ggml_tensor * ggml_reshape(
1508 struct ggml_context * ctx,
1509 struct ggml_tensor * a,
1510 struct ggml_tensor * b);
1511
1512 // return view(a)
1513 // TODO: when we start computing gradient, make a copy instead of view
1514 GGML_API struct ggml_tensor * ggml_reshape_1d(
1515 struct ggml_context * ctx,
1516 struct ggml_tensor * a,
1517 int64_t ne0);
1518
1519 GGML_API struct ggml_tensor * ggml_reshape_2d(
1520 struct ggml_context * ctx,
1521 struct ggml_tensor * a,
1522 int64_t ne0,
1523 int64_t ne1);
1524
1525 // return view(a)
1526 // TODO: when we start computing gradient, make a copy instead of view
1527 GGML_API struct ggml_tensor * ggml_reshape_3d(
1528 struct ggml_context * ctx,
1529 struct ggml_tensor * a,
1530 int64_t ne0,
1531 int64_t ne1,
1532 int64_t ne2);
1533
1534 GGML_API struct ggml_tensor * ggml_reshape_4d(
1535 struct ggml_context * ctx,
1536 struct ggml_tensor * a,
1537 int64_t ne0,
1538 int64_t ne1,
1539 int64_t ne2,
1540 int64_t ne3);
1541
1542 // offset in bytes
1543 GGML_API struct ggml_tensor * ggml_view_1d(
1544 struct ggml_context * ctx,
1545 struct ggml_tensor * a,
1546 int64_t ne0,
1547 size_t offset);
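
    // for example (a sketch): view row i1 of a 2D tensor a as a 1D tensor
    //
    //   struct ggml_tensor * row = ggml_view_1d(ctx, a, a->ne[0], i1*a->nb[1]);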
1548
1549 GGML_API struct ggml_tensor * ggml_view_2d(
1550 struct ggml_context * ctx,
1551 struct ggml_tensor * a,
1552 int64_t ne0,
1553 int64_t ne1,
1554 size_t nb1, // row stride in bytes
1555 size_t offset);
1556
1557 GGML_API struct ggml_tensor * ggml_view_3d(
1558 struct ggml_context * ctx,
1559 struct ggml_tensor * a,
1560 int64_t ne0,
1561 int64_t ne1,
1562 int64_t ne2,
1563 size_t nb1, // row stride in bytes
1564 size_t nb2, // slice stride in bytes
1565 size_t offset);
1566
1567 GGML_API struct ggml_tensor * ggml_view_4d(
1568 struct ggml_context * ctx,
1569 struct ggml_tensor * a,
1570 int64_t ne0,
1571 int64_t ne1,
1572 int64_t ne2,
1573 int64_t ne3,
1574 size_t nb1, // row stride in bytes
1575 size_t nb2, // slice stride in bytes
1576 size_t nb3,
1577 size_t offset);
1578
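    // note (a sketch of the convention, consistent with the ggml_transpose alias below):
    // axisN is the destination position of source dimension N,
    // e.g. ggml_permute(ctx, a, 2, 0, 1, 3) yields result->ne[2] == a->ne[0]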
1579 GGML_API struct ggml_tensor * ggml_permute(
1580 struct ggml_context * ctx,
1581 struct ggml_tensor * a,
1582 int axis0,
1583 int axis1,
1584 int axis2,
1585 int axis3);
1586
1587 // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
1588 GGML_API struct ggml_tensor * ggml_transpose(
1589 struct ggml_context * ctx,
1590 struct ggml_tensor * a);
1591
1592 // supports 4D a:
1593 // a [n_embd, ne1, ne2, ne3]
1594 // b I32 [n_rows, ne2, ne3, 1]
1595 //
1596 // return [n_embd, n_rows, ne2, ne3]
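    //
    // for example (a sketch): look up token embeddings, with placeholder names
    //
    //   // tok_embd: [n_embd, n_vocab], ids: I32 [n_tokens]
    //   struct ggml_tensor * cur = ggml_get_rows(ctx, tok_embd, ids); // cur: [n_embd, n_tokens]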
1597 GGML_API struct ggml_tensor * ggml_get_rows(
1598 struct ggml_context * ctx,
1599 struct ggml_tensor * a, // data
1600 struct ggml_tensor * b); // row indices
1601
1602 GGML_API struct ggml_tensor * ggml_get_rows_back(
1603 struct ggml_context * ctx,
1604 struct ggml_tensor * a, // gradients of ggml_get_rows result
1605 struct ggml_tensor * b, // row indices
1606 struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
1607
1608 // a TD [n_embd, ne1, ne2, ne3]
1609 // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
1610 // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
1611 //
1612 // undefined behavior if destination rows overlap
1613 //
1614 // broadcast:
1615 // ne2 % ne11 == 0
1616 // ne3 % ne12 == 0
1617 //
1618 // return view(a)
1619 GGML_API struct ggml_tensor * ggml_set_rows(
1620 struct ggml_context * ctx,
1621 struct ggml_tensor * a, // destination
1622 struct ggml_tensor * b, // source
1623 struct ggml_tensor * c); // row indices
1624
1625 GGML_API struct ggml_tensor * ggml_diag(
1626 struct ggml_context * ctx,
1627 struct ggml_tensor * a);
1628
1629 // set elements above the diagonal to -INF
1630 GGML_API struct ggml_tensor * ggml_diag_mask_inf(
1631 struct ggml_context * ctx,
1632 struct ggml_tensor * a,
1633 int n_past);
1634
1635 // in-place, returns view(a)
1636 GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
1637 struct ggml_context * ctx,
1638 struct ggml_tensor * a,
1639 int n_past);
1640
1641 // set elements above the diagonal to 0
1642 GGML_API struct ggml_tensor * ggml_diag_mask_zero(
1643 struct ggml_context * ctx,
1644 struct ggml_tensor * a,
1645 int n_past);
1646
1647 // in-place, returns view(a)
1648 GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
1649 struct ggml_context * ctx,
1650 struct ggml_tensor * a,
1651 int n_past);
1652
1653 GGML_API struct ggml_tensor * ggml_soft_max(
1654 struct ggml_context * ctx,
1655 struct ggml_tensor * a);
1656
1657 // in-place, returns view(a)
1658 GGML_API struct ggml_tensor * ggml_soft_max_inplace(
1659 struct ggml_context * ctx,
1660 struct ggml_tensor * a);
1661
1662 // a [ne0, ne01, ne02, ne03]
1663 // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
1664 //
1665 // broadcast:
1666 // ne02 % ne12 == 0
1667 // ne03 % ne13 == 0
1668 //
1669 // fused soft_max(a*scale + mask*(ALiBi slope))
1670 // max_bias = 0.0f for no ALiBi
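    //
    // for example (a sketch): masked, scaled attention scores, with placeholder names
    //
    //   // kq: [n_kv, n_tokens, n_head, 1], kq_mask: F32 [n_kv, n_tokens]
    //   kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head), 0.0f);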
1671 GGML_API struct ggml_tensor * ggml_soft_max_ext(
1672 struct ggml_context * ctx,
1673 struct ggml_tensor * a,
1674 struct ggml_tensor * mask,
1675 float scale,
1676 float max_bias);
1677
1678 GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
1679 struct ggml_context * ctx,
1680 struct ggml_tensor * a,
1681 struct ggml_tensor * mask,
1682 float scale,
1683 float max_bias);
1684
1685 GGML_API void ggml_soft_max_add_sinks(
1686 struct ggml_tensor * a,
1687 struct ggml_tensor * sinks);
1688
1689 GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
1690 struct ggml_context * ctx,
1691 struct ggml_tensor * a,
1692 struct ggml_tensor * b,
1693 float scale,
1694 float max_bias);
1695
1696 // in-place, returns view(a)
1697 GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
1698 struct ggml_context * ctx,
1699 struct ggml_tensor * a,
1700 struct ggml_tensor * b,
1701 float scale,
1702 float max_bias);
1703
1704 // rotary position embedding
1705 // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
1706 // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
1707 //
1708 // b is an int32 vector with size a->ne[2], it contains the positions
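    //
    // for example (a sketch): standard RoPE over the first n_rot dimensions of each head, with placeholder names
    //
    //   // q: [n_rot, n_head, n_tokens], pos: I32 [n_tokens]
    //   q = ggml_rope(ctx, q, pos, n_rot, GGML_ROPE_TYPE_NORMAL);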
1709 GGML_API struct ggml_tensor * ggml_rope(
1710 struct ggml_context * ctx,
1711 struct ggml_tensor * a,
1712 struct ggml_tensor * b,
1713 int n_dims,
1714 int mode);
1715
1716 // in-place, returns view(a)
1717 GGML_API struct ggml_tensor * ggml_rope_inplace(
1718 struct ggml_context * ctx,
1719 struct ggml_tensor * a,
1720 struct ggml_tensor * b,
1721 int n_dims,
1722 int mode);
1723
1724 // custom RoPE
    // c is freq factors (e.g. phi3-128k), optional
1726 GGML_API struct ggml_tensor * ggml_rope_ext(
1727 struct ggml_context * ctx,
1728 struct ggml_tensor * a,
1729 struct ggml_tensor * b,
1730 struct ggml_tensor * c,
1731 int n_dims,
1732 int mode,
1733 int n_ctx_orig,
1734 float freq_base,
1735 float freq_scale,
1736 float ext_factor,
1737 float attn_factor,
1738 float beta_fast,
1739 float beta_slow);
1740
1741 GGML_API struct ggml_tensor * ggml_rope_multi(
1742 struct ggml_context * ctx,
1743 struct ggml_tensor * a,
1744 struct ggml_tensor * b,
1745 struct ggml_tensor * c,
1746 int n_dims,
1747 int sections[GGML_MROPE_SECTIONS],
1748 int mode,
1749 int n_ctx_orig,
1750 float freq_base,
1751 float freq_scale,
1752 float ext_factor,
1753 float attn_factor,
1754 float beta_fast,
1755 float beta_slow);
1756
1757 // in-place, returns view(a)
1758 GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
1759 struct ggml_context * ctx,
1760 struct ggml_tensor * a,
1761 struct ggml_tensor * b,
1762 struct ggml_tensor * c,
1763 int n_dims,
1764 int mode,
1765 int n_ctx_orig,
1766 float freq_base,
1767 float freq_scale,
1768 float ext_factor,
1769 float attn_factor,
1770 float beta_fast,
1771 float beta_slow);
1772
1773 GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
1774 struct ggml_context * ctx,
1775 struct ggml_tensor * a,
1776 struct ggml_tensor * b,
1777 struct ggml_tensor * c,
1778 int n_dims,
1779 int sections[GGML_MROPE_SECTIONS],
1780 int mode,
1781 int n_ctx_orig,
1782 float freq_base,
1783 float freq_scale,
1784 float ext_factor,
1785 float attn_factor,
1786 float beta_fast,
1787 float beta_slow);
1788
1789 GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
1790 struct ggml_context * ctx,
1791 struct ggml_tensor * a,
1792 struct ggml_tensor * b,
1793 int n_dims,
1794 int mode,
1795 int n_ctx_orig,
1796 float freq_base,
1797 float freq_scale,
1798 float ext_factor,
1799 float attn_factor,
1800 float beta_fast,
1801 float beta_slow),
1802 "use ggml_rope_ext instead");
1803
1804 GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1805 struct ggml_context * ctx,
1806 struct ggml_tensor * a,
1807 struct ggml_tensor * b,
1808 int n_dims,
1809 int mode,
1810 int n_ctx_orig,
1811 float freq_base,
1812 float freq_scale,
1813 float ext_factor,
1814 float attn_factor,
1815 float beta_fast,
1816 float beta_slow),
1817 "use ggml_rope_ext_inplace instead");
1818
1819 // compute correction dims for YaRN RoPE scaling
1820 GGML_API void ggml_rope_yarn_corr_dims(
1821 int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1822
    // rotary position embedding backward, i.e. compute dx from dy
1824 // a - dy
1825 GGML_API struct ggml_tensor * ggml_rope_ext_back(
1826 struct ggml_context * ctx,
1827 struct ggml_tensor * a, // gradients of ggml_rope result
1828 struct ggml_tensor * b, // positions
1829 struct ggml_tensor * c, // freq factors
1830 int n_dims,
1831 int mode,
1832 int n_ctx_orig,
1833 float freq_base,
1834 float freq_scale,
1835 float ext_factor,
1836 float attn_factor,
1837 float beta_fast,
1838 float beta_slow);
1839
1840 GGML_API struct ggml_tensor * ggml_rope_multi_back(
1841 struct ggml_context * ctx,
1842 struct ggml_tensor * a,
1843 struct ggml_tensor * b,
1844 struct ggml_tensor * c,
1845 int n_dims,
            int                   sections[GGML_MROPE_SECTIONS],
1847 int mode,
1848 int n_ctx_orig,
1849 float freq_base,
1850 float freq_scale,
1851 float ext_factor,
1852 float attn_factor,
1853 float beta_fast,
1854 float beta_slow);
1855
1856
1857 // clamp
1858 // in-place, returns view(a)
1859 GGML_API struct ggml_tensor * ggml_clamp(
1860 struct ggml_context * ctx,
1861 struct ggml_tensor * a,
1862 float min,
1863 float max);
1864
1865 // im2col
    // rearranges the data into a layout such that the convolution can be computed as a matrix multiplication with the kernel
1867 GGML_API struct ggml_tensor * ggml_im2col(
1868 struct ggml_context * ctx,
1869 struct ggml_tensor * a, // convolution kernel
1870 struct ggml_tensor * b, // data
1871 int s0, // stride dimension 0
1872 int s1, // stride dimension 1
1873 int p0, // padding dimension 0
1874 int p1, // padding dimension 1
1875 int d0, // dilation dimension 0
1876 int d1, // dilation dimension 1
1877 bool is_2D,
1878 enum ggml_type dst_type);
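
    // A minimal sketch (illustrative parameter values); the result can then be multiplied with the
    // reshaped kernel via ggml_mul_mat to obtain the convolution:
    //
    //   struct ggml_tensor * im = ggml_im2col(ctx, a, b,
    //       /*s0 =*/ 1, /*s1 =*/ 1, /*p0 =*/ 1, /*p1 =*/ 1, /*d0 =*/ 1, /*d1 =*/ 1,
    //       /*is_2D =*/ true, GGML_TYPE_F16);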
1879
1880 GGML_API struct ggml_tensor * ggml_im2col_back(
1881 struct ggml_context * ctx,
1882 struct ggml_tensor * a, // convolution kernel
1883 struct ggml_tensor * b, // gradient of im2col output
1884 int64_t * ne, // shape of im2col input
1885 int s0, // stride dimension 0
1886 int s1, // stride dimension 1
1887 int p0, // padding dimension 0
1888 int p1, // padding dimension 1
1889 int d0, // dilation dimension 0
1890 int d1, // dilation dimension 1
1891 bool is_2D);
1892
1893 GGML_API struct ggml_tensor * ggml_conv_1d(
1894 struct ggml_context * ctx,
1895 struct ggml_tensor * a, // convolution kernel
1896 struct ggml_tensor * b, // data
1897 int s0, // stride
1898 int p0, // padding
1899 int d0); // dilation
1900
1901 // conv_1d with padding = half
1902 // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
    GGML_API struct ggml_tensor * ggml_conv_1d_ph(
1904 struct ggml_context * ctx,
1905 struct ggml_tensor * a, // convolution kernel
1906 struct ggml_tensor * b, // data
1907 int s, // stride
1908 int d); // dilation
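
    // For example (illustrative), with a kernel of width a->ne[0] = 3, stride 1 and dilation 1 the
    // implied padding is 3/2 = 1, so the output has the same length as the input:
    //
    //   struct ggml_tensor * y = ggml_conv_1d_ph(ctx, a, b, 1, 1);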
1909
1910 // depthwise
1911 // TODO: this is very likely wrong for some cases! - needs more testing
1912 GGML_API struct ggml_tensor * ggml_conv_1d_dw(
1913 struct ggml_context * ctx,
1914 struct ggml_tensor * a, // convolution kernel
1915 struct ggml_tensor * b, // data
1916 int s0, // stride
1917 int p0, // padding
1918 int d0); // dilation
1919
1920 GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph(
1921 struct ggml_context * ctx,
1922 struct ggml_tensor * a, // convolution kernel
1923 struct ggml_tensor * b, // data
1924 int s0, // stride
1925 int d0); // dilation
1926
1927 GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
1928 struct ggml_context * ctx,
1929 struct ggml_tensor * a, // convolution kernel
1930 struct ggml_tensor * b, // data
1931 int s0, // stride
1932 int p0, // padding
1933 int d0); // dilation
1934
1935 GGML_API struct ggml_tensor * ggml_conv_2d(
1936 struct ggml_context * ctx,
1937 struct ggml_tensor * a, // convolution kernel
1938 struct ggml_tensor * b, // data
1939 int s0, // stride dimension 0
1940 int s1, // stride dimension 1
1941 int p0, // padding dimension 0
1942 int p1, // padding dimension 1
1943 int d0, // dilation dimension 0
1944 int d1); // dilation dimension 1
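
    // The output spatial size follows the standard convolution arithmetic (a reminder, not a
    // ggml-specific formula), with W = b->ne[0], H = b->ne[1], KW = a->ne[0], KH = a->ne[1]:
    //
    //   OW = (W + 2*p0 - d0*(KW - 1) - 1)/s0 + 1
    //   OH = (H + 2*p1 - d1*(KH - 1) - 1)/s1 + 1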
1945
1946 GGML_API struct ggml_tensor * ggml_im2col_3d(
1947 struct ggml_context * ctx,
1948 struct ggml_tensor * a,
1949 struct ggml_tensor * b,
1950 int64_t IC,
1951 int s0, // stride width
1952 int s1, // stride height
1953 int s2, // stride depth
1954 int p0, // padding width
1955 int p1, // padding height
1956 int p2, // padding depth
1957 int d0, // dilation width
1958 int d1, // dilation height
1959 int d2, // dilation depth
1960 enum ggml_type dst_type);
1961
1962 // a: [OC*IC, KD, KH, KW]
1963 // b: [N*IC, ID, IH, IW]
1964 // result: [N*OC, OD, OH, OW]
1965 GGML_API struct ggml_tensor * ggml_conv_3d(
1966 struct ggml_context * ctx,
1967 struct ggml_tensor * a,
1968 struct ggml_tensor * b,
1969 int64_t IC,
1970 int s0, // stride width
1971 int s1, // stride height
1972 int s2, // stride depth
1973 int p0, // padding width
1974 int p1, // padding height
1975 int p2, // padding depth
1976 int d0, // dilation width
1977 int d1, // dilation height
1978 int d2 // dilation depth
1979 );
1980
1981 // kernel size is a->ne[0] x a->ne[1]
1982 // stride is equal to kernel size
1983 // padding is zero
1984 // example:
1985 // a: 16 16 3 768
1986 // b: 1024 1024 3 1
1987 // res: 64 64 768 1
1988 // used in sam
1989 GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
1990 struct ggml_context * ctx,
1991 struct ggml_tensor * a,
1992 struct ggml_tensor * b);
1993
1994 // kernel size is a->ne[0] x a->ne[1]
1995 // stride is 1
1996 // padding is half
1997 // example:
1998 // a: 3 3 256 256
1999 // b: 64 64 256 1
2000 // res: 64 64 256 1
2001 // used in sam
2002 GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
2003 struct ggml_context * ctx,
2004 struct ggml_tensor * a,
2005 struct ggml_tensor * b);
2006
2007 // depthwise (via im2col and mul_mat)
2008 GGML_API struct ggml_tensor * ggml_conv_2d_dw(
2009 struct ggml_context * ctx,
2010 struct ggml_tensor * a, // convolution kernel
2011 struct ggml_tensor * b, // data
2012 int s0, // stride dimension 0
2013 int s1, // stride dimension 1
2014 int p0, // padding dimension 0
2015 int p1, // padding dimension 1
2016 int d0, // dilation dimension 0
2017 int d1); // dilation dimension 1
2018
2019 // Depthwise 2D convolution
2020 // may be faster than ggml_conv_2d_dw, but not available in all backends
2021 // a: KW KH 1 C convolution kernel
2022 // b: W H C N input data
2023 // res: W_out H_out C N
2024 GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
2025 struct ggml_context * ctx,
2026 struct ggml_tensor * a,
2027 struct ggml_tensor * b,
2028 int stride0,
2029 int stride1,
2030 int pad0,
2031 int pad1,
2032 int dilation0,
2033 int dilation1);
2034
2035 GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
2036 struct ggml_context * ctx,
2037 struct ggml_tensor * a,
2038 struct ggml_tensor * b,
2039 int stride);
2040
2041 GGML_API struct ggml_tensor * ggml_conv_2d_direct(
2042 struct ggml_context * ctx,
2043 struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
2044 struct ggml_tensor * b, // input data [W, H, C, N]
2045 int s0, // stride dimension 0
2046 int s1, // stride dimension 1
2047 int p0, // padding dimension 0
2048 int p1, // padding dimension 1
2049 int d0, // dilation dimension 0
2050 int d1); // dilation dimension 1
2051
2052 GGML_API struct ggml_tensor * ggml_conv_3d_direct(
2053 struct ggml_context * ctx,
2054 struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
2055 struct ggml_tensor * b, // input [W, H, D, C * N]
2056 int s0, // stride
2057 int s1,
2058 int s2,
2059 int p0, // padding
2060 int p1,
2061 int p2,
2062 int d0, // dilation
2063 int d1,
2064 int d2,
2065 int n_channels,
2066 int n_batch,
2067 int n_channels_out);
2068
2069 enum ggml_op_pool {
2070 GGML_OP_POOL_MAX,
2071 GGML_OP_POOL_AVG,
2072 GGML_OP_POOL_COUNT,
2073 };
2074
2075 GGML_API struct ggml_tensor * ggml_pool_1d(
2076 struct ggml_context * ctx,
2077 struct ggml_tensor * a,
2078 enum ggml_op_pool op,
2079 int k0, // kernel size
2080 int s0, // stride
2081 int p0); // padding
2082
2083 // the result will have 2*p0 padding for the first dimension
2084 // and 2*p1 padding for the second dimension
2085 GGML_API struct ggml_tensor * ggml_pool_2d(
2086 struct ggml_context * ctx,
2087 struct ggml_tensor * a,
2088 enum ggml_op_pool op,
2089 int k0,
2090 int k1,
2091 int s0,
2092 int s1,
2093 float p0,
2094 float p1);
2095
2096 GGML_API struct ggml_tensor * ggml_pool_2d_back(
2097 struct ggml_context * ctx,
2098 struct ggml_tensor * a,
2099 struct ggml_tensor * af, // "a"/input used in forward pass
2100 enum ggml_op_pool op,
2101 int k0,
2102 int k1,
2103 int s0,
2104 int s1,
2105 float p0,
2106 float p1);
2107
2108 enum ggml_scale_mode {
2109 GGML_SCALE_MODE_NEAREST = 0,
2110 GGML_SCALE_MODE_BILINEAR = 1,
2111 GGML_SCALE_MODE_BICUBIC = 2,
2112
2113 GGML_SCALE_MODE_COUNT
2114 };
2115
2116 enum ggml_scale_flag {
2117 GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
2118 };
2119
2120 // interpolate
    // multiplies ne0 and ne1 by the scale factor
2122 GGML_API struct ggml_tensor * ggml_upscale(
2123 struct ggml_context * ctx,
2124 struct ggml_tensor * a,
2125 int scale_factor,
2126 enum ggml_scale_mode mode);
2127
2128 // interpolate
    // interpolate/scale to the specified dimensions
2130 GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
2131 struct ggml_context * ctx,
2132 struct ggml_tensor * a,
2133 int ne0,
2134 int ne1,
2135 int ne2,
2136 int ne3,
2137 enum ggml_scale_mode mode),
2138 "use ggml_interpolate instead");
2139
2140 // Up- or downsamples the input to the specified size.
    // 2D scale modes (e.g. bilinear) are applied to the first two dimensions.
2142 GGML_API struct ggml_tensor * ggml_interpolate(
2143 struct ggml_context * ctx,
2144 struct ggml_tensor * a,
2145 int64_t ne0,
2146 int64_t ne1,
2147 int64_t ne2,
2148 int64_t ne3,
2149 uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...]
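
    // For example, a sketch that doubles the spatial resolution of a [W, H, C, N] tensor with
    // bilinear filtering and corner alignment:
    //
    //   struct ggml_tensor * up = ggml_interpolate(ctx, a,
    //       2*a->ne[0], 2*a->ne[1], a->ne[2], a->ne[3],
    //       GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS);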
2150
2151 // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
2152 GGML_API struct ggml_tensor * ggml_pad(
2153 struct ggml_context * ctx,
2154 struct ggml_tensor * a,
2155 int p0,
2156 int p1,
2157 int p2,
2158 int p3);
2159
2160 GGML_API struct ggml_tensor * ggml_pad_ext(
2161 struct ggml_context * ctx,
2162 struct ggml_tensor * a,
2163 int lp0,
2164 int rp0,
2165 int lp1,
2166 int rp1,
2167 int lp2,
2168 int rp2,
2169 int lp3,
2170 int rp3
2171 );
2172
2173 // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
2174 GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
2175 struct ggml_context * ctx,
2176 struct ggml_tensor * a,
2177 int p0,
2178 int p1);
2179
2180 // Move tensor elements by an offset given for each dimension. Elements that
2181 // are shifted beyond the last position are wrapped around to the beginning.
2182 GGML_API struct ggml_tensor * ggml_roll(
2183 struct ggml_context * ctx,
2184 struct ggml_tensor * a,
2185 int shift0,
2186 int shift1,
2187 int shift2,
2188 int shift3);
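
    // For example (illustrative), shifting dimension 0 by one position wraps the last element to the front:
    //
    //   // [1, 2, 3, 4] -> [4, 1, 2, 3]
    //   struct ggml_tensor * r = ggml_roll(ctx, a, 1, 0, 0, 0);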
2189
2190
2191 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
2192 // timesteps: [N,]
2193 // return: [N, dim]
2194 GGML_API struct ggml_tensor * ggml_timestep_embedding(
2195 struct ggml_context * ctx,
2196 struct ggml_tensor * timesteps,
2197 int dim,
2198 int max_period);
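
    // Roughly, following the referenced implementation (a sketch, with half = dim/2):
    //
    //   freqs[j]       = exp(-ln(max_period) * j / half),   j = 0..half-1
    //   emb[n, j]      = cos(timesteps[n] * freqs[j])
    //   emb[n, half+j] = sin(timesteps[n] * freqs[j])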
2199
2200 // sort rows
2201 enum ggml_sort_order {
2202 GGML_SORT_ORDER_ASC,
2203 GGML_SORT_ORDER_DESC,
2204 };
2205
2206 GGML_API struct ggml_tensor * ggml_argsort(
2207 struct ggml_context * ctx,
2208 struct ggml_tensor * a,
2209 enum ggml_sort_order order);
2210
2211 GGML_API struct ggml_tensor * ggml_arange(
2212 struct ggml_context * ctx,
2213 float start,
2214 float stop,
2215 float step);
2216
2217 // top k elements per row
2218 GGML_API struct ggml_tensor * ggml_top_k(
2219 struct ggml_context * ctx,
2220 struct ggml_tensor * a,
2221 int k);
2222
2223#define GGML_KQ_MASK_PAD 64
2224
2225 // q: [n_embd_k, n_batch, n_head, ne3 ]
2226 // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
2227 // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
2228 // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
2229 // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
2230 //
2231 // broadcast:
2232 // n_head % n_head_kv == 0
2233 // n_head % ne32 == 0
2234 // ne3 % ne33 == 0
2235 //
2236 GGML_API struct ggml_tensor * ggml_flash_attn_ext(
2237 struct ggml_context * ctx,
2238 struct ggml_tensor * q,
2239 struct ggml_tensor * k,
2240 struct ggml_tensor * v,
2241 struct ggml_tensor * mask,
2242 float scale,
2243 float max_bias,
2244 float logit_softcap);
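
    // A hedged usage sketch (names are illustrative; the scale is typically 1/sqrt(n_embd_head),
    // max_bias = 0.0f disables ALiBi and logit_softcap = 0.0f disables soft-capping):
    //
    //   struct ggml_tensor * kqv = ggml_flash_attn_ext(ctx, q, k, v, kq_mask,
    //       1.0f/sqrtf((float) n_embd_head), 0.0f, 0.0f);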
2245
2246 GGML_API void ggml_flash_attn_ext_set_prec(
2247 struct ggml_tensor * a,
2248 enum ggml_prec prec);
2249
2250 GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
2251 const struct ggml_tensor * a);
2252
2253 GGML_API void ggml_flash_attn_ext_add_sinks(
2254 struct ggml_tensor * a,
2255 struct ggml_tensor * sinks);
2256
2257 // TODO: needs to be adapted to ggml_flash_attn_ext
2258 GGML_API struct ggml_tensor * ggml_flash_attn_back(
2259 struct ggml_context * ctx,
2260 struct ggml_tensor * q,
2261 struct ggml_tensor * k,
2262 struct ggml_tensor * v,
2263 struct ggml_tensor * d,
2264 bool masked);
2265
2266 GGML_API struct ggml_tensor * ggml_ssm_conv(
2267 struct ggml_context * ctx,
2268 struct ggml_tensor * sx,
2269 struct ggml_tensor * c);
2270
2271 GGML_API struct ggml_tensor * ggml_ssm_scan(
2272 struct ggml_context * ctx,
2273 struct ggml_tensor * s,
2274 struct ggml_tensor * x,
2275 struct ggml_tensor * dt,
2276 struct ggml_tensor * A,
2277 struct ggml_tensor * B,
2278 struct ggml_tensor * C,
2279 struct ggml_tensor * ids);
2280
2281 // partition into non-overlapping windows with padding if needed
2282 // example:
2283 // a: 768 64 64 1
2284 // w: 14
2285 // res: 768 14 14 25
2286 // used in sam
2287 GGML_API struct ggml_tensor * ggml_win_part(
2288 struct ggml_context * ctx,
2289 struct ggml_tensor * a,
2290 int w);
2291
2292 // reverse of ggml_win_part
2293 // used in sam
2294 GGML_API struct ggml_tensor * ggml_win_unpart(
2295 struct ggml_context * ctx,
2296 struct ggml_tensor * a,
2297 int w0,
2298 int h0,
2299 int w);
2300
2301 GGML_API struct ggml_tensor * ggml_unary(
2302 struct ggml_context * ctx,
2303 struct ggml_tensor * a,
2304 enum ggml_unary_op op);
2305
2306 GGML_API struct ggml_tensor * ggml_unary_inplace(
2307 struct ggml_context * ctx,
2308 struct ggml_tensor * a,
2309 enum ggml_unary_op op);
2310
2311 // used in sam
2312 GGML_API struct ggml_tensor * ggml_get_rel_pos(
2313 struct ggml_context * ctx,
2314 struct ggml_tensor * a,
2315 int qh,
2316 int kh);
2317
2318 // used in sam
2319 GGML_API struct ggml_tensor * ggml_add_rel_pos(
2320 struct ggml_context * ctx,
2321 struct ggml_tensor * a,
2322 struct ggml_tensor * pw,
2323 struct ggml_tensor * ph);
2324
2325 GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
2326 struct ggml_context * ctx,
2327 struct ggml_tensor * a,
2328 struct ggml_tensor * pw,
2329 struct ggml_tensor * ph);
2330
2331 GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
2332 struct ggml_context * ctx,
2333 struct ggml_tensor * k,
2334 struct ggml_tensor * v,
2335 struct ggml_tensor * r,
2336 struct ggml_tensor * tf,
2337 struct ggml_tensor * td,
2338 struct ggml_tensor * state);
2339
2340 GGML_API struct ggml_tensor * ggml_gated_linear_attn(
2341 struct ggml_context * ctx,
2342 struct ggml_tensor * k,
2343 struct ggml_tensor * v,
2344 struct ggml_tensor * q,
2345 struct ggml_tensor * g,
2346 struct ggml_tensor * state,
2347 float scale);
2348
2349 GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
2350 struct ggml_context * ctx,
2351 struct ggml_tensor * r,
2352 struct ggml_tensor * w,
2353 struct ggml_tensor * k,
2354 struct ggml_tensor * v,
2355 struct ggml_tensor * a,
2356 struct ggml_tensor * b,
2357 struct ggml_tensor * state);
2358
2359 // custom operators
2360
    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst, const struct ggml_tensor * a, int ith, int nth, void * userdata);
    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
2364
2365#define GGML_N_TASKS_MAX (-1)
    // n_tasks == GGML_N_TASKS_MAX means to use the maximum number of tasks
2367
2368 GGML_API struct ggml_tensor * ggml_map_custom1(
2369 struct ggml_context * ctx,
2370 struct ggml_tensor * a,
2371 ggml_custom1_op_t fun,
2372 int n_tasks,
2373 void * userdata);
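
    // A sketch of a custom unary op (the callback and names below are illustrative, assuming
    // contiguous F32 tensors):
    //
    //   static void my_scale_op(struct ggml_tensor * dst, const struct ggml_tensor * a, int ith, int nth, void * userdata) {
    //       const float s = *(const float *) userdata;
    //       const int64_t n = ggml_nelements(dst);
    //       // each of the nth threads processes an interleaved slice of the elements
    //       for (int64_t i = ith; i < n; i += nth) {
    //           ((float *) dst->data)[i] = s * ((const float *) a->data)[i];
    //       }
    //   }
    //
    //   ...
    //
    //   float scale = 2.0f;
    //   struct ggml_tensor * y = ggml_map_custom1(ctx, x, my_scale_op, GGML_N_TASKS_MAX, &scale);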
2374
2375 GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
2376 struct ggml_context * ctx,
2377 struct ggml_tensor * a,
2378 ggml_custom1_op_t fun,
2379 int n_tasks,
2380 void * userdata);
2381
2382 GGML_API struct ggml_tensor * ggml_map_custom2(
2383 struct ggml_context * ctx,
2384 struct ggml_tensor * a,
2385 struct ggml_tensor * b,
2386 ggml_custom2_op_t fun,
2387 int n_tasks,
2388 void * userdata);
2389
2390 GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
2391 struct ggml_context * ctx,
2392 struct ggml_tensor * a,
2393 struct ggml_tensor * b,
2394 ggml_custom2_op_t fun,
2395 int n_tasks,
2396 void * userdata);
2397
2398 GGML_API struct ggml_tensor * ggml_map_custom3(
2399 struct ggml_context * ctx,
2400 struct ggml_tensor * a,
2401 struct ggml_tensor * b,
2402 struct ggml_tensor * c,
2403 ggml_custom3_op_t fun,
2404 int n_tasks,
2405 void * userdata);
2406
2407 GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
2408 struct ggml_context * ctx,
2409 struct ggml_tensor * a,
2410 struct ggml_tensor * b,
2411 struct ggml_tensor * c,
2412 ggml_custom3_op_t fun,
2413 int n_tasks,
2414 void * userdata);
2415
    typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst, int ith, int nth, void * userdata);
2417
2418 GGML_API struct ggml_tensor * ggml_custom_4d(
2419 struct ggml_context * ctx,
2420 enum ggml_type type,
2421 int64_t ne0,
2422 int64_t ne1,
2423 int64_t ne2,
2424 int64_t ne3,
2425 struct ggml_tensor ** args,
2426 int n_args,
2427 ggml_custom_op_t fun,
2428 int n_tasks,
2429 void * userdata);
2430
2431 GGML_API struct ggml_tensor * ggml_custom_inplace(
2432 struct ggml_context * ctx,
2433 struct ggml_tensor * a,
2434 struct ggml_tensor ** args,
2435 int n_args,
2436 ggml_custom_op_t fun,
2437 int n_tasks,
2438 void * userdata);
2439
2440 // loss function
2441
2442 GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
2443 struct ggml_context * ctx,
2444 struct ggml_tensor * a, // logits
2445 struct ggml_tensor * b); // labels
2446
2447 GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
2448 struct ggml_context * ctx,
2449 struct ggml_tensor * a, // logits
2450 struct ggml_tensor * b, // labels
2451 struct ggml_tensor * c); // gradients of cross_entropy_loss result
2452
2453 // AdamW optimizer step
2454 // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
2455 // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
2456 GGML_API struct ggml_tensor * ggml_opt_step_adamw(
2457 struct ggml_context * ctx,
2458 struct ggml_tensor * a,
2459 struct ggml_tensor * grad,
2460 struct ggml_tensor * m,
2461 struct ggml_tensor * v,
2462 struct ggml_tensor * adamw_params); // parameters such as the learning rate
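
    // As a reminder, the standard AdamW update for parameters x with gradient g is (not ggml-specific
    // notation; adamw_params supplies values such as alpha, beta1, beta2, eps and wd):
    //
    //   m  = beta1*m + (1 - beta1)*g
    //   v  = beta2*v + (1 - beta2)*g*g
    //   mh = m/(1 - beta1^t),  vh = v/(1 - beta2^t)
    //   x  = x - alpha*(mh/(sqrt(vh) + eps) + wd*x)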
2463
2464 // stochastic gradient descent step (with weight decay)
2465 GGML_API struct ggml_tensor * ggml_opt_step_sgd(
2466 struct ggml_context * ctx,
2467 struct ggml_tensor * a,
2468 struct ggml_tensor * grad,
2469 struct ggml_tensor * sgd_params); // alpha, weight decay
2470
2471 //
2472 // automatic differentiation
2473 //
2474
2475 GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2476 GGML_API void ggml_build_backward_expand(
2477 struct ggml_context * ctx, // context for gradient computation
2478 struct ggml_cgraph * cgraph,
2479 struct ggml_tensor ** grad_accs);
2480
2481 // graph allocation in a context
2482 GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
2483 GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
2484 GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
2485 GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
2486 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
2487 GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
2488
2489 GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
2490 GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
2491 GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
2492 GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
2493
2494 GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2495
2496 GGML_API size_t ggml_graph_overhead(void);
2497 GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
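
    // For example, a rough context size for holding only the graph metadata of up to N nodes
    // (a sketch; assumes the tensor data itself is allocated elsewhere):
    //
    //   size_t mem_size = ggml_tensor_overhead()*N + ggml_graph_overhead_custom(N, false);
    //
    //   struct ggml_init_params params = {
    //       .mem_size   = mem_size,
    //       .mem_buffer = NULL,
    //       .no_alloc   = true,
    //   };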
2498
2499 GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name);
2500 GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2501 GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2502
2503 // print info and performance information for the graph
2504 GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
2505
2506 // dump the graph into a file using the dot format
2507 GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
2508
    // TODO: these functions were sandwiched in the old optimization interface; is there a better place for them?
2510 typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
2511
2512 // Set callback for all future logging events.
2513 // If this is not called, or NULL is supplied, everything is output on stderr.
2514 GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
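
    // For example, a sketch of a callback that only forwards errors (the filter is illustrative):
    //
    //   static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
    //       (void) user_data;
    //       if (level == GGML_LOG_LEVEL_ERROR) {
    //           fputs(text, stderr);
    //       }
    //   }
    //
    //   ggml_log_set(my_log, NULL);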
2515
2516 GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
2517
2518 //
2519 // quantization
2520 //
2521
2522 // - ggml_quantize_init can be called multiple times with the same type
2523 // it will only initialize the quantization tables for the first call or after ggml_quantize_free
2524 // automatically called by ggml_quantize_chunk for convenience
2525 //
2526 // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
2527 // call this at the end of the program to avoid memory leaks
2528 //
2529 // note: these are thread-safe
2530 //
2531 GGML_API void ggml_quantize_init(enum ggml_type type);
2532 GGML_API void ggml_quantize_free(void);
2533
    // some quantization types cannot be used without an importance matrix
2535 GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2536
2537 // calls ggml_quantize_init internally (i.e. can allocate memory)
2538 GGML_API size_t ggml_quantize_chunk(
2539 enum ggml_type type,
2540 const float * src,
2541 void * dst,
2542 int64_t start,
2543 int64_t nrows,
2544 int64_t n_per_row,
2545 const float * imatrix);
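
    // A minimal sketch that quantizes nrows rows of n_per_row F32 values to Q4_0 (illustrative;
    // Q4_0 does not require an importance matrix, so imatrix may be NULL):
    //
    //   void * dst = malloc(ggml_row_size(GGML_TYPE_Q4_0, n_per_row)*nrows);
    //
    //   const size_t res = ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst, 0, nrows, n_per_row, NULL);
    //
    //   ...
    //
    //   ggml_quantize_free(); // once, at the end of the program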
2546
2547#ifdef __cplusplus
2548 // restrict not standard in C++
2549# if defined(__GNUC__)
2550# define GGML_RESTRICT __restrict__
2551# elif defined(__clang__)
2552# define GGML_RESTRICT __restrict
2553# elif defined(_MSC_VER)
2554# define GGML_RESTRICT __restrict
2555# else
2556# define GGML_RESTRICT
2557# endif
2558#else
2559# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
2560# define GGML_RESTRICT __restrict
2561# else
2562# define GGML_RESTRICT restrict
2563# endif
2564#endif
2565 typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2566 typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2567
2568 struct ggml_type_traits {
2569 const char * type_name;
2570 int64_t blck_size;
2571 int64_t blck_size_interleave; // interleave elements in blocks
2572 size_t type_size;
2573 bool is_quantized;
2574 ggml_to_float_t to_float;
2575 ggml_from_float_t from_float_ref;
2576 };
2577
2578 GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
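
    // For example, a sketch that dequantizes one row of n elements into a float buffer
    // (row_data, row_f32 and n are illustrative names):
    //
    //   const struct ggml_type_traits * tt = ggml_get_type_traits(tensor->type);
    //   if (tt->to_float) {
    //       tt->to_float(row_data, row_f32, n);
    //   }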
2579
2580 // ggml threadpool
2581 // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
    //       the goal should be to create an API that other backends can use, and to move everything to the ggml base
2583
2584 // scheduling priorities
2585 enum ggml_sched_priority {
2586 GGML_SCHED_PRIO_LOW = -1,
2587 GGML_SCHED_PRIO_NORMAL,
2588 GGML_SCHED_PRIO_MEDIUM,
2589 GGML_SCHED_PRIO_HIGH,
2590 GGML_SCHED_PRIO_REALTIME
2591 };
2592
2593 // threadpool params
2594 // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
2595 struct ggml_threadpool_params {
2596 bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
2597 int n_threads; // number of threads
2598 enum ggml_sched_priority prio; // thread priority
2599 uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
2600 bool strict_cpu; // strict cpu placement
2601 bool paused; // start in paused state
2602 };
2603
2604 struct ggml_threadpool; // forward declaration, see ggml.c
2605
2606 typedef struct ggml_threadpool * ggml_threadpool_t;
2607
2608 GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
2609 GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
2610 GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
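
    // For example, a sketch that prepares threadpool parameters (the actual threadpool creation
    // currently lives in the CPU backend):
    //
    //   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
    //
    //   tpp.prio = GGML_SCHED_PRIO_HIGH; // illustrative tweaks
    //   tpp.poll = 50;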
2611
2612#ifdef __cplusplus
2613}
2614#endif
2615