| 1 | #include "arg.h" |
| 2 | #include "common.h" |
| 3 | #include "sampling.h" |
| 4 | #include "log.h" |
| 5 | #include "llama.h" |
| 6 | |
| 7 | #include <algorithm> |
| 8 | #include <cstdio> |
| 9 | #include <cstring> |
| 10 | #include <random> |
| 11 | #include <set> |
| 12 | #include <string> |
| 13 | #include <vector> |
| 14 | |
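// limits used by the draft/target vocab compatibility check below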
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

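// per-branch draft state: each drafting sequence (tree branch) tracks its drafted tokens,
// the draft model's per-token distributions and its own sampler instance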
struct seq_draft {
    bool active   = false;
    bool drafting = false;
    bool skip     = false;

    int i_batch_dft = 0;
    std::vector<int> i_batch_tgt;

    std::vector<llama_token> tokens;
    std::vector<std::vector<llama_token_data>> dists;

    struct common_sampler * smpl = nullptr;
};

int main(int argc, char ** argv) {
    common_params params;

    // needed to get candidate probs even for temp <= 0.0
    params.sampling.n_probs = 128;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
        return 1;
    }

    if (params.n_predict < -1) {
        LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
        return 1;
    }

    common_init();

    if (params.speculative.model.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
    }

    // max number of parallel drafting sequences (i.e. tree branches)
    const int n_seq_dft = params.n_parallel;

    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
    const float p_draft_split = params.speculative.p_split;

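    // rng used during stochastic verification: picking the sequence to verify, the acceptance test
    // and sampling from the residual distribution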
    std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
    std::uniform_real_distribution<> u_dist;

    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model * model_tgt = NULL;
    llama_model * model_dft = NULL;

    llama_context * ctx_tgt = NULL;
    llama_context * ctx_dft = NULL;

    // load the target model
    common_init_result llama_init_tgt = common_init_from_params(params);

    model_tgt = llama_init_tgt.model.get();
    ctx_tgt   = llama_init_tgt.context.get();

    // load the draft model
    params.devices      = params.speculative.devices;
    params.model        = params.speculative.model;
    params.n_gpu_layers = params.speculative.n_gpu_layers;
    if (params.speculative.cpuparams.n_threads > 0) {
        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
    }

    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
    params.tensor_buft_overrides     = params.speculative.tensor_buft_overrides;

    common_init_result llama_init_dft = common_init_from_params(params);

    model_dft = llama_init_dft.model.get();
    ctx_dft   = llama_init_dft.context.get();

    const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
    const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

    const auto vocab_type_tgt = llama_vocab_type(vocab_tgt);
    LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);

    const auto vocab_type_dft = llama_vocab_type(vocab_dft);
    LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
        LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
        LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
        return 1;
    }

    if (
        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
    ) {
        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
        return 1;
    }

    {
        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
        const int vocab_diff  = n_vocab_tgt > n_vocab_dft
            ? n_vocab_tgt - n_vocab_dft
            : n_vocab_dft - n_vocab_tgt;

        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
            LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                    n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
            return 1;
        }

        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
            const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
            const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
                LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
                LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
                        common_token_to_piece(ctx_tgt, i).c_str(),
                        common_token_to_piece(ctx_dft, i).c_str());
                return 1;
            }
        }
    }

    auto * mem_tgt = llama_get_memory(ctx_tgt);
    auto * mem_dft = llama_get_memory(ctx_dft);

    // Tokenize the prompt
    std::vector<llama_token> inp;
    inp = common_tokenize(ctx_tgt, params.prompt, true, true);

    const int max_context_size     = llama_n_ctx(ctx_tgt);
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
        return 1;
    }

    LOG("\n\n");

    for (auto id : inp) {
        LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
    }

    const int n_input = inp.size();

    const auto t_enc_start = ggml_time_us();

    // eval the prompt with both models
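    // the target evaluates the last prompt token in its own single-token batch so that its logits end
    // up at batch index 0 (which is what drafts[0].i_batch_tgt[0] refers to below); the draft model
    // evaluates the full prompt in one call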
    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1));
    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));

    const auto t_enc_end = ggml_time_us();

    // the 2 models should have the same vocab
    //GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft));

    // how many tokens to draft each time
    int n_draft = params.speculative.n_max;

    int n_predict = 0;
    int n_drafted = 0;
    int n_accept  = 0;

    int n_past_tgt = inp.size();
    int n_past_dft = inp.size();

    // used to determine end of generation
    bool has_eos = false;

    // target model sampling context (reuse the llama_context's sampling instance)
    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);

    // draft sequence data
    std::vector<seq_draft> drafts(n_seq_dft);

    for (int s = 0; s < n_seq_dft; ++s) {
        // allocate llama_sampler for each draft sequence
        drafts[s].smpl = common_sampler_init(model_dft, params.sampling);
    }

    llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, n_seq_dft);

    const auto t_dec_start = ggml_time_us();

    // sample from the last token of the prompt
    drafts[0].i_batch_tgt.resize(1);
    drafts[0].i_batch_tgt[0] = 0;

    while (true) {
        std::set<int> active_seqs = {};

        // print current draft sequences
        for (int s = 0; s < n_seq_dft; ++s) {
            if (!drafts[s].active) {
                continue;
            }

            active_seqs.insert(s);
            const auto & tokens = drafts[s].tokens;

            LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
        }

        int i_dft  = 0;
        int s_keep = 0;

        llama_token token_id;
        std::string token_str;

        // loop until we fail to accept a drafted token or we run out of drafted tokens
        while (true) {

            // check if the target token matches any of the drafts
            // for stochastic sampling, attempt to match the token with the drafted tokens
            {
                bool accept = false;
                if (params.sampling.temp > 0) {
                    // stochastic verification
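                    // speculative-sampling acceptance test: sample the target distribution at this
                    // position, then accept a drafted token t when r ~ U(0,1) satisfies
                    // r <= p_tgt(t) / p_dft(t); on rejection the target distribution is replaced by
                    // the residual max(0, p_tgt - p_dft) (renormalized) before trying the next branch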
                    common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);

                    auto & dist_tgt = *common_sampler_get_candidates(smpl, true);

                    float p_tgt = 0.0f;
                    float p_dft = 0.0f;

                    while (active_seqs.size() > 0) {
                        // randomly select a sequence to verify from active sequences
                        std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
                        int s = *std::next(active_seqs.begin(), u_int_dist(rng));
                        if (i_dft >= (int) drafts[s].tokens.size()) {
                            drafts[s].active = false;
                            active_seqs.erase(s);
                            continue;
                        }
                        if (accept) {
                            // if we already accepted a token, we can skip the rest
                            if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) {
                                drafts[s].active = false;
                                active_seqs.erase(s);
                            }
                            continue;
                        }

                        LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
                        float r = u_dist(rng);
                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data(), drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };

                        //GGML_ASSERT(dist_tgt.size <= dist_dft.size);

                        // acquire the token probabilities assigned by the draft and target models
                        for (size_t i = 0; i < dist_tgt.size; i++) {
                            if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
                                p_tgt = dist_tgt.data[i].p;
                                break;
                            }
                        }
                        for (size_t i = 0; i < dist_dft.size; i++) {
                            if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
                                p_dft = dist_dft.data[i].p;
                                break;
                            }
                        }
                        LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
                        if (r <= p_tgt / p_dft) {
                            s_keep = s;
                            accept = true;
                            token_id = drafts[s].tokens[i_dft];
                            token_str = common_token_to_piece(ctx_tgt, token_id);
                            common_sampler_accept(smpl, token_id, true);

                            LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
                            break;
                        } else {
                            LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
                            drafts[s].active = false;

                            // calculate residual probability
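                            // residual distribution: p'(t) = max(0, p_tgt(t) - p_dft(t)), renormalized;
                            // sorting both arrays by token id below aligns them index-by-index so the
                            // subtraction is per-token (this assumes both arrays cover the same token ids)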
                            GGML_ASSERT(dist_tgt.sorted);
                            GGML_ASSERT(dist_dft.sorted);

                            // sort dist by id
                            std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data & a, const llama_token_data & b) {
                                return a.id < b.id;
                            });
                            std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data & a, const llama_token_data & b) {
                                return a.id < b.id;
                            });

                            float sum_probs = 0.0f;

                            for (size_t i = 0; i < dist_tgt.size; i++) {
                                if (i < dist_dft.size) {
                                    dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
                                } else {
                                    dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p);
                                }

                                sum_probs += dist_tgt.data[i].p;
                            }

                            for (size_t i = 0; i < dist_tgt.size; i++) {
                                dist_tgt.data[i].p /= sum_probs;
                            }

                            // sort dist_tgt by p desc
                            std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data & a, const llama_token_data & b) {
                                return a.p > b.p;
                            });
                        }

                        active_seqs.erase(s);
                        for (int i = 0; i < n_seq_dft; i++) {
                            if (i == s) {
                                continue;
                            }
                            if (drafts[i].active && drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
                                // synchronize active status for sequences with the same drafted token
                                drafts[i].active = drafts[i].active && accept;
                                if (!drafts[i].active) {
                                    active_seqs.erase(i);
                                }
                            }
                        }
                    }

                    if (!accept) {
                        // all drafted tokens were rejected
                        // sample from the target model
                        LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
                        std::vector<float> probs(dist_tgt.size);
                        for (size_t i = 0; i < dist_tgt.size; ++i) {
                            probs[i] = dist_tgt.data[i].p;
                        }

                        std::discrete_distribution<> dist(probs.begin(), probs.end());

                        const int idx = dist(rng);

                        token_id = dist_tgt.data[idx].id;
                        common_sampler_accept(smpl, token_id, true);
                        token_str = common_token_to_piece(ctx_tgt, token_id);
                    }
                } else {
                    // greedy verification
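                    // with temp <= 0 the target's choice is deterministic, so verification reduces to
                    // an exact match between the sampled target token and each branch's drafted token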

                    // sample from the target model
                    LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
                    token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);

                    common_sampler_accept(smpl, token_id, true);

                    token_str = common_token_to_piece(ctx_tgt, token_id);

                    for (int s = 0; s < n_seq_dft; ++s) {
                        if (!drafts[s].active) {
                            continue;
                        }

                        if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
                            LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());

                            s_keep = s;
                            accept = true;
                        } else {
                            drafts[s].active = false;
                        }
                    }
                }

                if (llama_vocab_is_eog(vocab_tgt, token_id)) {
                    has_eos = true;
                }
                ++n_predict;

                if (accept) {
                    ++n_accept;
                    ++n_past_tgt;
                    ++n_past_dft;
                    ++i_dft;
                    if (params.use_color) {
                        // Color token according to its origin sequence
                        LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
                    } else {
                        LOG("%s", token_str.c_str());
                    }
                    continue;
                } else {
                    LOG("%s", token_str.c_str());
                    break;
                }
            }
        }

        {
            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());

            // TODO: simplify
            {
                LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

                llama_memory_seq_keep(mem_dft, s_keep);
                llama_memory_seq_cp  (mem_dft, s_keep, 0, -1, -1);
                llama_memory_seq_keep(mem_dft, 0);

                llama_memory_seq_rm  (mem_tgt, s_keep, n_past_tgt, -1);
                llama_memory_seq_keep(mem_tgt, s_keep);
                llama_memory_seq_cp  (mem_tgt, s_keep, 0, -1, -1);
                llama_memory_seq_keep(mem_tgt, 0);
            }

            for (int s = 0; s < n_seq_dft; ++s) {
                drafts[s].active = false;
                drafts[s].tokens.clear();
                drafts[s].i_batch_tgt.clear();
                drafts[s].dists.clear();
            }
            // note: will be erased after the speculation phase
            drafts[0].tokens.push_back(token_id);
            drafts[0].dists.push_back(std::vector<llama_token_data>());
            drafts[0].i_batch_tgt.push_back(0);

            common_batch_clear(batch_dft);
            common_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);

            llama_memory_seq_rm(mem_dft, 0, n_past_dft, -1);
            // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
            llama_decode(ctx_dft, batch_dft);

            ++n_past_dft;
        }

        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
            break;
        }

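        // re-create the sampler for draft sequence 0 as a clone of the target sampler so that its
        // accepted-token history (and grammar state, if any) matches what the target has accepted so far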
        if (drafts[0].smpl) {
            common_sampler_free(drafts[0].smpl);
        }
        drafts[0].smpl = common_sampler_clone(smpl);

        int n_seq_cur  = 1;
        int n_past_cur = n_past_dft;

        for (int s = 0; s < n_seq_dft; ++s) {
            drafts[s].active   = false;
            drafts[s].drafting = false;
        }
        drafts[0].active      = true;
        drafts[0].drafting    = true;
        drafts[0].i_batch_dft = 0;

        common_batch_clear(batch_tgt);
        common_batch_add  (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);

        // sample n_draft tokens from the draft model using tree-based sampling
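        // each active branch drafts its most likely token; when a runner-up candidate has probability
        // above p_draft_split and a free sequence slot is available, the branch is split so that several
        // continuations can be verified against the target in a single batch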
        for (int i = 0; i < n_draft; ++i) {
            batch_dft.n_tokens = 0;

            for (int s = 0; s < n_seq_dft; ++s) {
                drafts[s].skip = false;
            }

            for (int s = 0; s < n_seq_dft; ++s) {
                if (!drafts[s].drafting || drafts[s].skip) {
                    continue;
                }

                common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);

                const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);

                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
                    LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                }

                std::vector<int> sa(1, s);

                // attempt to split the branch if the probability is high enough
                for (int f = 1; f < 8; ++f) {
                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                        LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);

                        llama_memory_seq_rm(mem_dft, n_seq_cur, -1, -1);
                        llama_memory_seq_cp(mem_dft, s, n_seq_cur, -1, -1);

                        // all previous tokens from this branch are now also part of the new branch
                        for (int t = 0; t < batch_tgt.n_tokens; ++t) {
                            for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) {
                                if (batch_tgt.seq_id[t][p] == s) {
                                    batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur;
                                    batch_tgt.n_seq_id[t]++;
                                    break;
                                }
                            }
                        }

                        // copy the draft state
                        drafts[n_seq_cur].active   = true;
                        drafts[n_seq_cur].drafting = true;
                        drafts[n_seq_cur].skip     = true;

                        drafts[n_seq_cur].tokens      = drafts[s].tokens;
                        drafts[n_seq_cur].dists       = drafts[s].dists;
                        drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
                        drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;

                        if (drafts[n_seq_cur].smpl) {
                            common_sampler_free(drafts[n_seq_cur].smpl);
                        }
                        drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl);

                        sa.push_back(n_seq_cur);

                        n_seq_cur++;
                    } else {
                        break;
                    }
                }

                // add drafted token for each sequence
                for (int is = 0; is < (int) sa.size(); ++is) {
                    const llama_token id = cur_p->data[is].id;

                    const int s = sa[is];

                    common_sampler_accept(drafts[s].smpl, id, true);

                    drafts[s].tokens.push_back(id);
                    // save cur_p.data into drafts[s].dists
                    drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});

                    // add unique drafted tokens to the target batch
                    drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);

                    common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);

                    // add the token to the batch for batched decoding with the draft model
                    drafts[s].i_batch_dft = batch_dft.n_tokens;

                    common_batch_add(batch_dft, id, n_past_cur, { s }, true);

                    if (batch_tgt.n_tokens > n_draft) {
                        drafts[s].drafting = false;
                    }
                }
            }

            // no sequence is drafting anymore
            if (batch_dft.n_tokens == 0) {
                break;
            }

            // evaluate the drafted tokens on the draft model
            llama_decode(ctx_dft, batch_dft);
            ++n_past_cur;
            ++n_drafted;

            if (batch_tgt.n_tokens > n_draft) {
                break;
            }
        }

        // evaluate the target model on the drafted tokens
        {
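            // keep only the verified prefix (sequence 0) in the target cache and copy it into every
            // draft sequence id so each branch continues from the same accepted context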
            llama_memory_seq_keep(mem_tgt, 0);
            for (int s = 1; s < n_seq_dft; ++s) {
                llama_memory_seq_cp(mem_tgt, 0, s, -1, -1);
            }

            // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
            llama_decode(ctx_tgt, batch_tgt);
            ++n_past_tgt;
        }

        // the first token is always proposed by the target model before the speculation loop so we erase it here
        for (int s = 0; s < n_seq_dft; ++s) {
            if (!drafts[s].active) {
                continue;
            }

            drafts[s].tokens.erase(drafts[s].tokens.begin());
            drafts[s].dists.erase(drafts[s].dists.begin());
        }
    }

    auto t_dec_end = ggml_time_us();

    LOG("\n\n");

    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

    LOG_INF("\n");
    LOG_INF("n_draft   = %d\n", n_draft);
    LOG_INF("n_predict = %d\n", n_predict);
    LOG_INF("n_drafted = %d\n", n_drafted);
    LOG_INF("n_accept  = %d\n", n_accept);
    LOG_INF("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_INF("\n");
    LOG_INF("draft:\n\n");
    // TODO: print sampling/grammar timings for all drafts
    llama_perf_context_print(ctx_dft);

    LOG_INF("\n");
    LOG_INF("target:\n\n");
    common_perf_print(ctx_tgt, smpl);

    common_sampler_free(smpl);
    for (int s = 0; s < n_seq_dft; ++s) {
        common_sampler_free(drafts[s].smpl);
    }

    llama_batch_free(batch_dft);
    llama_batch_free(batch_tgt);

    llama_backend_free();

    LOG("\n\n");

    return 0;
}
| 649 | |