llama-vocab.h source code [llama.cpp/src/llama-vocab.h]

1	#pragma once
2
3	#include "llama.h"
4
5	#include <string>
6	#include <vector>
7	#include <memory>
8
// pre-tokenization types
//
// One enumerator per supported pre-tokenization variant. This is the type
// returned by llama_vocab::get_pre_type().
//
// Every value is spelled out explicitly and the values run consecutively from
// 0 (DEFAULT) to 41 (MINIMAX_M2).
// NOTE(review): the explicit numbering suggests the values are meant to stay
// stable; presumably new entries should be appended with the next free value
// rather than inserted in the middle - confirm against the users of this enum.
enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_DEFAULT         = 0,
    LLAMA_VOCAB_PRE_TYPE_LLAMA3          = 1,
    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM    = 2,
    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER  = 3,
    LLAMA_VOCAB_PRE_TYPE_FALCON          = 4,
    LLAMA_VOCAB_PRE_TYPE_MPT             = 5,
    LLAMA_VOCAB_PRE_TYPE_STARCODER       = 6,
    LLAMA_VOCAB_PRE_TYPE_GPT2            = 7,
    LLAMA_VOCAB_PRE_TYPE_REFACT          = 8,
    LLAMA_VOCAB_PRE_TYPE_COMMAND_R       = 9,
    LLAMA_VOCAB_PRE_TYPE_STABLELM2       = 10,
    LLAMA_VOCAB_PRE_TYPE_QWEN2           = 11,
    LLAMA_VOCAB_PRE_TYPE_OLMO            = 12,
    LLAMA_VOCAB_PRE_TYPE_DBRX            = 13,
    LLAMA_VOCAB_PRE_TYPE_SMAUG           = 14,
    LLAMA_VOCAB_PRE_TYPE_PORO            = 15,
    LLAMA_VOCAB_PRE_TYPE_CHATGLM3        = 16,
    LLAMA_VOCAB_PRE_TYPE_CHATGLM4        = 17,
    LLAMA_VOCAB_PRE_TYPE_VIKING          = 18,
    LLAMA_VOCAB_PRE_TYPE_JAIS            = 19,
    LLAMA_VOCAB_PRE_TYPE_TEKKEN          = 20,
    LLAMA_VOCAB_PRE_TYPE_SMOLLM          = 21,
    LLAMA_VOCAB_PRE_TYPE_CODESHELL       = 22,
    LLAMA_VOCAB_PRE_TYPE_BLOOM           = 23,
    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH    = 24,
    LLAMA_VOCAB_PRE_TYPE_EXAONE          = 25,
    LLAMA_VOCAB_PRE_TYPE_CHAMELEON       = 26,
    LLAMA_VOCAB_PRE_TYPE_MINERVA         = 27,
    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM   = 28,
    LLAMA_VOCAB_PRE_TYPE_GPT4O           = 29,
    LLAMA_VOCAB_PRE_TYPE_SUPERBPE        = 30,
    LLAMA_VOCAB_PRE_TYPE_TRILLION        = 31,
    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE      = 32,
    LLAMA_VOCAB_PRE_TYPE_LLAMA4          = 33,
    LLAMA_VOCAB_PRE_TYPE_PIXTRAL         = 34,
    LLAMA_VOCAB_PRE_TYPE_SEED_CODER      = 35,
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN         = 36,
    LLAMA_VOCAB_PRE_TYPE_KIMI_K2         = 37,
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE   = 38,
    LLAMA_VOCAB_PRE_TYPE_GROK_2          = 39,
    LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
    LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2      = 41,
};
54
55	struct LLM_KV;
56	struct llama_model_loader;
57
58	struct llama_vocab {
59	struct token_data {
60	std::string text;
61	float score;
62	llama_token_attr attr;
63	};
64
65	llama_vocab();
66	~llama_vocab();
67
68	void load(llama_model_loader & ml, const LLM_KV & kv);
69
70	std::string get_tokenizer_model() const;
71	std::string get_tokenizer_pre() const;
72
73	enum llama_vocab_type get_type() const;
74	enum llama_vocab_pre_type get_pre_type() const;
75
76	uint32_t n_tokens() const;
77	uint32_t n_token_types() const;
78
79	std::string type_name() const;
80
81	bool is_normal (llama_token id) const;
82	bool is_unknown (llama_token id) const;
83	bool is_control (llama_token id) const;
84	bool is_byte (llama_token id) const;
85	bool is_user_defined(llama_token id) const;
86	bool is_unused (llama_token id) const;
87	bool is_eog (llama_token id) const;
88
89	uint8_t token_to_byte(llama_token id) const;
90	llama_token byte_to_token(uint8_t ch) const;
91
92	llama_token text_to_token(const std::string & text) const;
93
94	const token_data & get_token_data(llama_token id) const;
95
96	const char * token_get_text (llama_token id) const;
97	float token_get_score(llama_token id) const;
98	llama_token_attr token_get_attr (llama_token id) const;
99
100	llama_token token_bos() const;
101	llama_token token_eos() const;
102	llama_token token_eot() const;
103	llama_token token_eom() const;
104	llama_token token_unk() const;
105	llama_token token_sep() const;
106	llama_token token_nl () const;
107	llama_token token_pad() const;
108	llama_token token_mask() const;
109
110	llama_token token_prefix() const;
111	llama_token token_middle() const;
112	llama_token token_suffix() const;
113
114	llama_token token_fim_pre() const;
115	llama_token token_fim_suf() const;
116	llama_token token_fim_mid() const;
117	llama_token token_fim_pad() const;
118	llama_token token_fim_rep() const;
119	llama_token token_fim_sep() const;
120
121	bool get_add_space_prefix () const;
122	bool get_add_bos () const;
123	bool get_add_eos () const;
124	bool get_add_sep () const;
125	bool get_ignore_merges () const;
126	bool get_clean_spaces () const;
127	bool get_remove_extra_whitespaces () const;
128	bool get_escape_whitespaces () const;
129	bool get_treat_whitespace_as_suffix() const;
130
131	int max_token_len() const;
132
133	int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
134	std::vector<std::string> get_bpe_merges() const;
135
136	std::vector<char> get_precompiled_charsmap() const;
137
138	int32_t tokenize(
139	const char * text,
140	int32_t text_len,
141	llama_token * tokens,
142	int32_t n_tokens_max,
143	bool add_special,
144	bool parse_special) const;
145
146	std::vector<llama_token> tokenize(
147	const std::string & raw_text,
148	bool add_special,
149	bool parse_special = false) const;
150
151	// does not write null-terminator to buf
152	int32_t token_to_piece(
153	llama_token token,
154	char * buf,
155	int32_t length,
156	int32_t lstrip,
157	bool special) const;
158
159	// use cached data
160	const std::string & token_to_piece(llama_token token) const;
161
162	int32_t detokenize(
163	const llama_token * tokens,
164	int32_t n_tokens,
165	char * text,
166	int32_t text_len_max,
167	bool remove_special,
168	bool unparse_special) const;
169
170	std::string detokenize(
171	const std::vector<llama_token> & tokens,
172	bool special) const;
173
174	void print_info() const;
175
176	private:
177	struct impl;
178	std::unique_ptr<impl> pimpl;
179	};
180

Browse the source code of llama.cpp/src/llama-vocab.h