#include "amx.h"
#include "common.h"
#include "mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "traits.h"

#if defined(__linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif

#include <cstdlib>
#include <cstring>
#include <memory>

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

// AMX type_traits
namespace ggml::cpu::amx {
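// Tensor traits for weights stored in an AMX buffer: work_size() reports the
// scratch space the AMX matmul kernel needs, and compute_forward() routes
// GGML_OP_MUL_MAT to the AMX GEMM implementation declared in mmq.h.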
class tensor_traits : public ggml::cpu::tensor_traits {
    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
        size = ggml_backend_amx_desired_wsize(op);
        return true;
    }

    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
        if (op->op == GGML_OP_MUL_MAT) {
            ggml_backend_amx_mul_mat(params, op);
            return true;
        }
        return false;
    }
};

static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
    static tensor_traits traits;
    return &traits;
}
}  // namespace ggml::cpu::amx

// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    // the buffer is allocated with ggml_aligned_malloc() (see alloc_buffer below),
    // so it must be released with the matching ggml_aligned_free()
    ggml_aligned_free(buffer->context, buffer->size);
}

static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *) (buffer->context);
}

static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

    GGML_UNUSED(buffer);
    return GGML_STATUS_SUCCESS;
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                                  uint8_t value, size_t offset, size_t size) {
    memset((char *) tensor->data + offset, value, size);

    GGML_UNUSED(buffer);
}

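// Uploading a weight tensor: types with AMX kernels are repacked into the
// packed layout expected by the AMX GEMM; everything else is copied as-is.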
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                               const void * data, size_t offset, size_t size) {
    if (qtype_has_amx_kernels(tensor->type)) {
        GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
        ggml_backend_amx_convert_weight(tensor, data, offset, size);
    } else {
        memcpy((char *) tensor->data + offset, data, size);
    }

    GGML_UNUSED(buffer);
}

/*
// need to figure out what we need to do with buffer->extra.
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
    memcpy(data, (const char *) tensor->data + offset, size);

    GGML_UNUSED(buffer);
}

static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    if (ggml_backend_buffer_is_host(src->buffer)) {
        if (qtype_has_amx_kernels(src->type)) {
            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
        } else {
            memcpy(dst->data, src->data, ggml_nbytes(src));
        }
        return true;
    }
    return false;

    GGML_UNUSED(buffer);
}
*/

static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

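// get_tensor and cpy_tensor are left as nullptr for now: weights in this buffer
// are stored in the repacked AMX layout and cannot simply be memcpy'd back
// (see the commented-out sketches above).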
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
    /* .free_buffer   = */ ggml_backend_amx_buffer_free_buffer,
    /* .get_base      = */ ggml_backend_amx_buffer_get_base,
    /* .init_tensor   = */ ggml_backend_amx_buffer_init_tensor,
    /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
    /* .set_tensor    = */ ggml_backend_amx_buffer_set_tensor,
    /* .get_tensor    = */ nullptr,
    /* .cpy_tensor    = */ nullptr,
    /* .clear         = */ ggml_backend_amx_buffer_clear,
    /* .reset         = */ nullptr,
};

static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "AMX";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = ggml_aligned_malloc(size);
    if (data == NULL) {
        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}

static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}

namespace ggml::cpu::amx {
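// Extra buffer type hooks used by the CPU backend to decide when the AMX path
// applies: supports_op() accepts only contiguous 2D matmuls whose weights live
// in the AMX buffer type, with dimensions that are multiples of the AMX tile
// sizes and F32 activations in a host buffer.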
class extra_buffer_type : ggml::cpu::extra_buffer_type {
    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
        // handle only 2d gemm for now
        auto is_contiguous_2d = [](const struct ggml_tensor * t) {
            return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
        };

        if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) &&  // src0 must be contiguous
            is_contiguous_2d(op->src[1]) &&                               // src1 must be contiguous
            op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
            op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 &&  // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
            op->ne[0] % (TILE_N * 2) == 0 &&               // out_features must be a multiple of 32 (TILE_N * 2)
            (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
            // src1 must be host buffer
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
            // src1 must be float32
            if (op->src[1]->type == GGML_TYPE_F32) {
                return true;
            }
        }
        return false;
    }

    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
            op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
            return (ggml::cpu::tensor_traits *) op->src[0]->extra;
        }

        return nullptr;
    }
};
}  // namespace ggml::cpu::amx

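// Repacked AMX weights may need a different allocation size than ggml_nbytes()
// would suggest, so the buffer type asks the AMX code for the actual size.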
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    return ggml_backend_amx_get_alloc_size(tensor);

    GGML_UNUSED(buft);
}

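// Linux arch_prctl(2) constants used to request access to the AMX tile state;
// these mirror the values in the kernel's <asm/prctl.h> uapi header.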
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG   17
#define XFEATURE_XTILEDATA  18

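// On Linux the kernel keeps the (large) AMX tile state disabled by default, so
// the process has to request permission for XTILEDATA via arch_prctl() before
// executing any AMX instruction; the Windows build assumes AMX is usable as-is.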
static bool ggml_amx_init() {
#if defined(__linux__)
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
        fprintf(stderr, "AMX is not ready to be used!\n");
        return false;
    }
    return true;
#elif defined(_WIN32)
    return true;
#else
    return false;
#endif
}

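// Returns the singleton AMX buffer type, or nullptr if AMX tile state
// permission could not be acquired.
//
// Minimal usage sketch (not part of this file; assumes a ggml_context `ctx`
// holding the weight tensors and the allocation helper from ggml-alloc.h):
//
//   ggml_backend_buffer_type_t buft = ggml_backend_amx_buffer_type();
//   if (buft != nullptr) {
//       // placing the weights in an AMX buffer makes set_tensor() repack them
//       // into the layout the AMX kernels expect
//       ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
//   }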
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_amx_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_amx_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_amx_buffer_type_get_alignment,
            /* .get_max_size   = */ nullptr,  // defaults to SIZE_MAX
            /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
            /* .is_host        = */ nullptr,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
    };

    if (!ggml_amx_init()) {
        return nullptr;
    }

    return &ggml_backend_buffer_type_amx;
}

#endif  // defined(__AMX_INT8__) && defined(__AVX512VNNI__)