| 1 | #pragma once |
| 2 | |
| 3 | #include "llama.h" |
| 4 | |
| 5 | #include <unordered_map> |
| 6 | #include <string> |
| 7 | #include <vector> |
| 8 | |
// N-gram size limits.
// LLAMA_NGRAM_MAX is the capacity of common_ngram::tokens, i.e. the largest n-gram that can be stored.
// LLAMA_NGRAM_MIN and LLAMA_NGRAM_STATIC are not referenced in this header; presumably they are the
// smallest n-gram size and the n-gram size used for the static cache -- confirm against the callers.
#define LLAMA_NGRAM_MIN    1
#define LLAMA_NGRAM_MAX    4
#define LLAMA_NGRAM_STATIC 2
| 12 | |
| 13 | // Data structures to map n-grams to empirical token probabilities: |
| 14 | |
| 15 | struct common_ngram { |
| 16 | llama_token tokens[LLAMA_NGRAM_MAX]; |
| 17 | |
| 18 | common_ngram() { |
| 19 | for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { |
| 20 | tokens[i] = LLAMA_TOKEN_NULL; |
| 21 | } |
| 22 | } |
| 23 | |
| 24 | common_ngram(const llama_token * input, const int ngram_size) { |
| 25 | for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { |
| 26 | tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL; |
| 27 | } |
| 28 | } |
| 29 | |
| 30 | bool operator==(const common_ngram & other) const { |
| 31 | for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { |
| 32 | if (tokens[i] != other.tokens[i]) { |
| 33 | return false; |
| 34 | } |
| 35 | } |
| 36 | return true; |
| 37 | } |
| 38 | }; |
| 39 | |
| 40 | struct common_token_hash_function { |
| 41 | size_t operator()(const llama_token token) const { |
| 42 | // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ |
| 43 | return token * 11400714819323198485llu; |
| 44 | } |
| 45 | }; |
| 46 | |
| 47 | struct common_ngram_hash_function { |
| 48 | size_t operator()(const common_ngram & ngram) const { |
| 49 | size_t hash = common_token_hash_function{}(ngram.tokens[0]); |
| 50 | for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { |
| 51 | hash ^= common_token_hash_function{}(ngram.tokens[i]); |
| 52 | } |
| 53 | return hash; |
| 54 | } |
| 55 | }; |
| 56 | |
| 57 | // token -> number of times token has been seen |
| 58 | typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part; |
| 59 | |
| 60 | // n-gram -> empirical distribution of following tokens |
| 61 | typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache; |
| 62 | |
| 63 | |
| 64 | // Update an ngram cache with tokens. |
| 65 | // ngram_cache: the cache to modify. |
| 66 | // ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data. |
| 67 | // inp_data: the token sequence with which to update ngram_cache. |
| 68 | // nnew: how many new tokens have been appended to inp_data since the last call to this function. |
| 69 | // print_progress: whether to print progress to stderr. |
| 70 | // |
| 71 | // In order to get correct results inp_data can ONLY BE APPENDED TO. |
| 72 | // Changes in the middle need a complete rebuild. |
| 73 | void common_ngram_cache_update( |
| 74 | common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress); |
| 75 | |
| 76 | // Try to draft tokens from ngram caches. |
| 77 | // inp: the tokens generated so far. |
| 78 | // draft: the token sequence to draft. Expected to initially contain the previously sampled token. |
| 79 | // n_draft: maximum number of tokens to add to draft. |
| 80 | // ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic. |
| 81 | // nc_context: ngram cache based on current context. |
| 82 | // nc_dynamic: ngram cache based on previous user generations. |
| 83 | // nc_static: ngram cache generated from a large text corpus, used for validation. |
| 84 | void common_ngram_cache_draft( |
| 85 | std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max, |
| 86 | common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static); |
| 87 | |
| 88 | // Save an ngram cache to a file. |
| 89 | // ngram_cache: the ngram cache to save. |
| 90 | // filename: the path under which to save the ngram cache. |
| 91 | void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename); |
| 92 | |
| 93 | // Load an ngram cache saved with common_ngram_cache_save. |
| 94 | // filename: the path from which to load the ngram cache. |
| 95 | // returns: an ngram cache containing the information saved to filename. |
| 96 | common_ngram_cache common_ngram_cache_load(std::string & filename); |
| 97 | |
| 98 | // Merge two ngram caches. |
| 99 | // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add. |
| 100 | // ngram_cache_add: the ngram cache to add to ngram_cache_target. |
| 101 | void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add); |
| 102 | |