#include "llama.h"
#include "common.h"
#include "console.h"

#include "../src/unicode.h"

#include <cassert>
#include <codecvt>
#include <cstdio>
#include <cstring>
#include <locale>
#include <string>
#include <thread>
#include <vector>
#include <atomic>

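// Round-trip consistency test for a BPE tokenizer: every vocab entry and every
// Unicode codepoint must survive a tokenize/detokenize round trip.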
int main(int argc, char **argv) {
    if (argc < 2 || argc > 3) {
        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];
    bool ignore_merges = false;
    if (argc == 3) {
        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
            return 1;
        }
        ignore_merges = true;
    }

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    if (ignore_merges) {
        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
    }

    llama_model * model;
    llama_context * ctx;

    llama_backend_init();

    // load the vocab
    {
        auto mparams = llama_model_default_params();

        mparams.vocab_only = true;

        model = llama_model_load_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }

        auto cparams = llama_context_default_params();

        ctx = llama_init_from_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_model_free(model);
            return 1;
        }
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    //GGML_ASSERT(llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_BPE);
    if (llama_vocab_type(vocab) != LLAMA_VOCAB_TYPE_BPE) {
        return 99;
    }

#ifdef _WIN32
    // We need this for unicode console support
    console::init(false, false);
    atexit([]() { console::cleanup(); });
#endif

    const int n_vocab = llama_vocab_n_tokens(vocab);

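    // check that the string for every vocab id survives a tokenize/detokenize round trip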
    for (int i = 0; i < n_vocab; ++i) {
        std::string str = common_detokenize(ctx, std::vector<int>(1, i));
        try {
            // throws std::invalid_argument if the token is not valid UTF-8
            auto cps = unicode_cpts_from_utf8(str);
            std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
            if (ignore_merges && tokens.size() > 1) {
                fprintf(stderr,
                        "%s : error: token %d detokenizes to '%s'(%zu) but "
                        "tokenizing it again yields multiple tokens: [",
                        __func__, i, str.c_str(), str.length());
                fprintf(stderr, "%d", tokens[0]);
                for (size_t j = 1; j < tokens.size(); j++) {
                    fprintf(stderr, ", %d", tokens[j]);
                }
                fprintf(stderr, "]\n");
                return 2;
            }
            std::string check = common_detokenize(ctx, tokens);
            if (check != str) {
                fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenizing it again detokenizes to '%s'(%zu)\n",
                        __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
                return 2;
            }
        }
        catch (const std::invalid_argument &) {
            //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
        }
    }

    // unicode: brute-force tokenize/detokenize round trip over all codepoints, split across threads
    {
        const int nthread = std::thread::hardware_concurrency();

        std::vector<std::thread> threads(nthread);

        std::atomic_int errcode = {};

        for (int i = 0; i < nthread; ++i) {
            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
                        (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                        continue;
                    }

                    std::string str = unicode_cpt_to_utf8(cp);
                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
                    std::string check = common_detokenize(ctx, tokens);
                    // skip codepoint 9601 (U+2581 '▁'), which BPE vocabs commonly remap to a space
                    if (cp != 9601 && str != check) {
                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                cp, check.c_str(), check.length(), str.c_str(), str.length());
                        errcode = 3;
                    }
                }
            });
        }

        for (auto & t : threads) {
            t.join();
        }

        if (errcode) {
            return errcode;
        }
    }

    // free the context before the model it was created from
    llama_free(ctx);
    llama_model_free(model);

    llama_backend_free();

    return 0;
}