// round-trip tokenizer test for SPM (SentencePiece) vocabs:
// checks that tokenization and detokenization stay consistent for every vocab token
// and for every unicode codepoint

#include "llama.h"
#include "common.h"
#include "console.h"

#include "../src/unicode.h"

#include <cassert>
#include <codecvt>
#include <cstdio>
#include <cstring>
#include <locale>
#include <string>
#include <thread>
#include <vector>
#include <atomic>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    llama_model * model;
    llama_context * ctx;

    llama_backend_init();

    // load the vocab
    {
        auto mparams = llama_model_default_params();

        mparams.vocab_only = true;

        model = llama_model_load_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }

        auto cparams = llama_context_default_params();

        ctx = llama_init_from_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_model_free(model);
            return 1;
        }
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
    if (llama_vocab_type(vocab) != LLAMA_VOCAB_TYPE_SPM) {
        return 99;
    }

#ifdef _WIN32
    // We need this for unicode console support
    console::init(false, false);
    atexit([]() { console::cleanup(); });
#endif

    const int n_vocab = llama_vocab_n_tokens(vocab);

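    // for every token id: detokenize it (rendering special tokens), tokenize the
    // resulting string back (parsing special tokens), and require the detokenized
    // result to match the original string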
    for (int i = 0; i < n_vocab; ++i) {
        std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
        std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
        std::string check = common_detokenize(ctx, tokens);
        if (check != str) {
            fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
            return 2;
        }
    }

    // unicode
    {
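        // each thread i checks the codepoints with cp % nthread == i, skipping the
        // surrogate and undefined ranges; a mismatch sets the shared errcode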
        const int nthread = std::thread::hardware_concurrency();

        std::vector<std::thread> threads(nthread);

        std::atomic_int errcode = {};

        for (int i = 0; i < nthread; ++i) {
            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
                        (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                        continue;
                    }

                    std::string str = unicode_cpt_to_utf8(cp);
                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
                    std::string check = common_detokenize(ctx, tokens);
                    // cp == 9601 is U+2581 ("▁"), the SPM whitespace marker; it intentionally
                    // detokenizes to a plain space, so it is excluded from the comparison
                    if (cp != 9601 && str != check) {
                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                            cp, check.c_str(), check.length(), str.c_str(), str.length());
                        errcode = 3;
                    }
                }
            });
        }

        for (auto & t : threads) {
            t.join();
        }

        if (errcode) {
            return errcode;
        }
    }

    // free the context before the model it was created from
    llama_free(ctx);
    llama_model_free(model);

    llama_backend_free();

    return 0;
}