llama-memory-hybrid.cpp source code [llama.cpp/src/llama-memory-hybrid.cpp]

1	#include "llama-memory-hybrid.h"
2
3	#include "llama-impl.h"
4	#include "llama-model.h"
5	#include "llama-context.h"
6
7	//
8	// llama_memory_hybrid
9	//
10
11	llama_memory_hybrid::llama_memory_hybrid(
12	const llama_model & model,
13	/ attn /
14	ggml_type type_k,
15	ggml_type type_v,
16	bool v_trans,
17	uint32_t kv_size,
18	uint32_t n_pad,
19	uint32_t n_swa,
20	llama_swa_type swa_type,
21	/ recurrent /
22	ggml_type type_r,
23	ggml_type type_s,
24	uint32_t rs_size,
25	/ common /
26	uint32_t n_seq_max,
27	bool offload,
28	bool unified,
29	/ layer filters /
30	const layer_filter_cb & filter_attn,
31	const layer_filter_cb & filter_recr) :
32	hparams(model.hparams),
33	mem_attn (new llama_kv_cache (
34	model,
35	type_k,
36	type_v,
37	v_trans,
38	offload,
39	unified,
40	kv_size,
41	n_seq_max,
42	n_pad,
43	n_swa,
44	swa_type,
45	filter_attn == nullptr ?
46	[&](int32_t il) { return !hparams.is_recurrent(il); }
47	: filter_attn,
48	nullptr
49	)),
50	mem_recr (new llama_memory_recurrent (
51	model,
52	type_r,
53	type_s,
54	offload,
55	rs_size,
56	n_seq_max,
57	filter_recr == nullptr ?
58	[&](int32_t il) { return hparams.is_recurrent(il); }
59	: filter_recr
60	)) {}
61
62	llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
63	do {
64	balloc.split_reset();
65
66	// follow the recurrent pattern for creating the ubatch splits
67	std::vector<llama_ubatch> ubatches;
68
69	while (true) {
70	llama_ubatch ubatch;
71
72	if (embd_all) {
73	// if all tokens are output, split by sequence
74	ubatch = balloc.split_seq(n_ubatch);
75	} else {
76	// TODO: non-sequential equal split can be done if using unified KV cache
77	// for simplicity, we always use sequential equal split for now
78	ubatch = balloc.split_equal(n_ubatch, sequential: true);
79	}
80
81	if (ubatch.n_tokens == `0`) {
82	break;
83	}
84
85	ubatches.push_back(x: std::move(ubatch)); // NOLINT
86	}
87
88	if (balloc.get_n_used() < balloc.get_n_tokens()) {
89	// failed to find a suitable split
90	break;
91	}
92
93	// prepare the recurrent batches first
94	if (!mem_recr ->prepare(ubatches)) {
95	// TODO: will the recurrent cache be in an undefined context at this point?
96	LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
97	return std::make_unique<llama_memory_hybrid_context>(args: LLAMA_MEMORY_STATUS_FAILED_PREPARE);
98	}
99
100	// prepare the attention cache
101	auto heads_attn = mem_attn ->prepare(ubatches);
102	if (heads_attn.empty()) {
103	LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
104	return std::make_unique<llama_memory_hybrid_context>(args: LLAMA_MEMORY_STATUS_FAILED_PREPARE);
105	}
106
107	return std::make_unique<llama_memory_hybrid_context>(
108	args: this, args: std::move(heads_attn), args: std::move(ubatches));
109	} while(false);
110
111	return std::make_unique<llama_memory_hybrid_context>(args: LLAMA_MEMORY_STATUS_FAILED_PREPARE);
112	}
113
114	llama_memory_context_ptr llama_memory_hybrid::init_full() {
115	return std::make_unique<llama_memory_hybrid_context>(args: this);
116	}
117
118	llama_memory_context_ptr llama_memory_hybrid::init_update(llama_context * lctx, bool optimize) {
119	return std::make_unique<llama_memory_hybrid_context>(args: this, args&: lctx, args&: optimize);
120	}
121
122	bool llama_memory_hybrid::get_can_shift() const {
123	// Shifting is trivially supported for recurrent
124	return mem_attn ->get_can_shift();
125	}
126
127	void llama_memory_hybrid::clear(bool data) {
128	mem_attn ->clear(data);
129	mem_recr ->clear(data);
130	}
131
132	bool llama_memory_hybrid::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
133	// Try removing from the recurrent cache first since it may fail. If it does
134	// fail, the cache will not have been mutated.
135	if (!mem_recr ->seq_rm(seq_id, p0, p1)) {
136	return false;
137	}
138	return mem_attn ->seq_rm(seq_id, p0, p1);
139	}
140
141	void llama_memory_hybrid::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
142	mem_attn ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
143	mem_recr ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
144	}
145
146	void llama_memory_hybrid::seq_keep(llama_seq_id seq_id) {
147	mem_attn ->seq_keep(seq_id);
148	mem_recr ->seq_keep(seq_id);
149	}
150
151	void llama_memory_hybrid::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
152	mem_attn ->seq_add(seq_id, p0, p1, shift);
153	mem_recr ->seq_add(seq_id, p0, p1, shift);
154	}
155
156	void llama_memory_hybrid::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
157	mem_attn ->seq_div(seq_id, p0, p1, d);
158	mem_recr ->seq_div(seq_id, p0, p1, d);
159	}
160
161	llama_pos llama_memory_hybrid::seq_pos_min(llama_seq_id seq_id) const {
162	// the min of the total cache is the max of the two caches' min values
163	return std::max(a: mem_attn ->seq_pos_min(seq_id), b: mem_recr ->seq_pos_min(seq_id));
164	}
165
166	llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
167	// the max of the total cache is the min of the two caches' max values
168	return std::min(a: mem_attn ->seq_pos_max(seq_id), b: mem_recr ->seq_pos_max(seq_id));
169	}
170
171	std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
172	std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn ->memory_breakdown();
173	for (const auto & buft_size : mem_recr ->memory_breakdown()) {
174	mb [buft_size.first] += buft_size.second;
175	}
176	return mb;
177	}
178
179	void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
180	if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == `0`) {
181	mem_attn ->state_write(io, seq_id, flags);
182	}
183	mem_recr ->state_write(io, seq_id, flags);
184	}
185
186	void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
187	if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == `0`) {
188	mem_attn ->state_read(io, seq_id, flags);
189	}
190	mem_recr ->state_read(io, seq_id, flags);
191	}
192
193	llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
194	return mem_attn.get();
195	}
196
197	llama_memory_recurrent * llama_memory_hybrid::get_mem_recr() const {
198	return mem_recr.get();
199	}
200
201	llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_status status) : status(status) {}
202
203	llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_hybrid * mem) :
204	ctx_attn(mem->get_mem_attn()->init_full()),
205	ctx_recr(mem->get_mem_recr()->init_full()),
206	status(llama_memory_status_combine(s0: ctx_attn ->get_status(), s1: ctx_recr ->get_status())) {
207	}
208
209	llama_memory_hybrid_context::llama_memory_hybrid_context(
210	llama_memory_hybrid * mem,
211	llama_context * lctx,
212	bool optimize) :
213	ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
214	ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
215	status(llama_memory_status_combine(s0: ctx_attn ->get_status(), s1: ctx_recr ->get_status())) {
216	}
217
218	llama_memory_hybrid_context::llama_memory_hybrid_context(
219	llama_memory_hybrid * mem,
220	slot_info_vec_t sinfos_attn,
221	std::vector<llama_ubatch> ubatches) :
222	ubatches (std::move(ubatches)),
223	// note: here we copy the ubatches. not sure if this is ideal
224	ctx_attn (new llama_kv_cache_context (mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
225	ctx_recr (new llama_memory_recurrent_context (mem->get_mem_recr(), this->ubatches)),
226	status(llama_memory_status_combine(s0: ctx_attn ->get_status(), s1: ctx_recr ->get_status())) {
227	}
228
229	bool llama_memory_hybrid_context::next() {
230	assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
231
232	ctx_attn ->next();
233	ctx_recr ->next();
234
235	if (++i_next >= ubatches.size()) {
236	return false;
237	}
238
239	return true;
240	}
241
242	bool llama_memory_hybrid_context::apply() {
243	assert(!llama_memory_status_is_fail(status));
244
245	bool res = true;
246
247	res = res & ctx_attn ->apply();
248	res = res & ctx_recr ->apply();
249
250	return res;
251	}
252
253	llama_memory_status llama_memory_hybrid_context::get_status() const {
254	return status;
255	}
256
257	const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
258	assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
259	return ubatches [i_next];
260	}
261
262	const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const {
263	return static_cast<const llama_kv_cache_context *>(ctx_attn.get());
264	}
265
266	const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {
267	return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
268	}
269

Browse the source code of llama.cpp/src/llama-memory-hybrid.cpp