test-sampling.cpp source code [llama.cpp/tests/test-sampling.cpp]

1	#include "ggml.h"
2	#include "llama.h"
3
4	#ifdef NDEBUG
5	#undef NDEBUG
6	#endif
7
8	#include <algorithm>
9	#include <cmath>
10	#include <string>
11	#include <vector>
12
13	extern struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers);
14
15	static void dump(const llama_token_data_array * cur_p) {
16	for (size_t i = `0`; i < cur_p->size; i++) {
17	printf(format: "%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
18	}
19	}
20
// Prints the call site (file, line, enclosing function), then every candidate
// through dump(), then a "-" separator line.
#define DUMP(__cur_p)                                             \
    do {                                                          \
        printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__);     \
        dump((__cur_p));                                          \
        printf("-\n");                                            \
    } while(0)
22
23	struct sampler_tester {
24	sampler_tester(size_t n_vocab) {
25	cur.reserve(n: n_vocab);
26	for (llama_token token_id = `0`; token_id < (llama_token)n_vocab; token_id++) {
27	const float logit = logf(x: token_id);
28	cur.emplace_back(args: llama_token_data{.id: token_id, .logit: logit, .p: `0.0f`});
29	}
30
31	cur_p = llama_token_data_array { .data: cur.data(), .size: cur.size(), .selected: -`1`, .sorted: false };
32	}
33
34	sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected (probs_expected) {
35	cur.reserve(n: probs.size());
36	for (llama_token token_id = `0`; token_id < (llama_token)probs.size(); token_id++) {
37	const float logit = logf(x: probs [token_id]);
38	cur.emplace_back(args: llama_token_data{.id: token_id, .logit: logit, .p: probs [token_id]});
39	}
40
41	cur_p = llama_token_data_array { .data: cur.data(), .size: cur.size(), .selected: -`1`, .sorted: false };
42	}
43
44	void apply(llama_sampler * sampler) {
45	llama_sampler_apply(smpl: sampler, cur_p: &cur_p);
46	llama_sampler_free(smpl: sampler);
47	}
48
49	void check() {
50	GGML_ASSERT(cur_p.size == probs_expected.size());
51	for (size_t i = `0`; i < cur_p.size; i++) {
52	GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < `1e-5`);
53	}
54	}
55
56	llama_token_data_array cur_p;
57
58	private:
59	const std::vector<float> probs_expected;
60
61	std::vector<llama_token_data> cur;
62	};
63
64	static void test_temp(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp) {
65	sampler_tester tester(probs, probs_expected);
66
67	DUMP(&tester.cur_p);
68	tester.apply(sampler: llama_sampler_init_temp(t: temp));
69	tester.apply(sampler: llama_sampler_init_dist(seed: `0`));
70	DUMP(&tester.cur_p);
71
72	tester.check();
73	}
74
75	static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
76	sampler_tester tester(probs, probs_expected);
77
78	DUMP(&tester.cur_p);
79	tester.apply(sampler: llama_sampler_init_temp_ext(t: temp, delta, exponent));
80	tester.apply(sampler: llama_sampler_init_dist (seed: `0`));
81	DUMP(&tester.cur_p);
82
83	tester.check();
84	}
85
86	static void test_top_k(const std::vector<float> & probs, const std::vector<float> & probs_expected, int k) {
87	sampler_tester tester(probs, probs_expected);
88
89	DUMP(&tester.cur_p);
90	tester.apply(sampler: llama_sampler_init_top_k(k));
91	tester.apply(sampler: llama_sampler_init_dist (seed: `0`));
92	DUMP(&tester.cur_p);
93
94	tester.check();
95	}
96
97	static void test_top_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
98	sampler_tester tester(probs, probs_expected);
99
100	DUMP(&tester.cur_p);
101	tester.apply(sampler: llama_sampler_init_top_p(p, min_keep: `0`));
102	tester.apply(sampler: llama_sampler_init_dist (seed: `0`));
103	DUMP(&tester.cur_p);
104
105	tester.check();
106	}
107
108	static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
109	sampler_tester tester(probs, probs_expected);
110
111	DUMP(&tester.cur_p);
112	tester.apply(sampler: llama_sampler_init_min_p(p, min_keep: `0`));
113	tester.apply(sampler: llama_sampler_init_dist (seed: `0`));
114	DUMP(&tester.cur_p);
115
116	tester.check();
117	}
118
119	static void test_xtc(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p, float t) {
120	sampler_tester tester(probs, probs_expected);
121
122	DUMP(&tester.cur_p);
123	tester.apply(sampler: llama_sampler_init_xtc(p, t, min_keep: `0`, seed: `0`));
124	DUMP(&tester.cur_p);
125
126	tester.check();
127	}
128
129	static void test_typical(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
130	sampler_tester tester(probs, probs_expected);
131
132	DUMP(&tester.cur_p);
133	tester.apply(sampler: llama_sampler_init_typical(p, min_keep: `0`));
134	DUMP(&tester.cur_p);
135
136	tester.check();
137	}
138
139	static void test_penalties(
140	const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
141	const std::vector<float> & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence
142	) {
143	GGML_ASSERT(probs.size() == probs_expected.size());
144
145	sampler_tester tester(probs, probs_expected);
146
147	auto * sampler = llama_sampler_init_penalties(penalty_last_n: last_tokens.size(), penalty_repeat: repeat_penalty, penalty_freq: alpha_frequency, penalty_present: alpha_presence);
148
149	for (size_t i = `0`; i < last_tokens.size(); i++) {
150	llama_sampler_accept(smpl: sampler, token: last_tokens [i]);
151	}
152
153	DUMP(&tester.cur_p);
154	tester.apply(sampler);
155	tester.apply(sampler: llama_sampler_init_dist(seed: `0`));
156	DUMP(&tester.cur_p);
157
158	tester.check();
159	}
160
161	static void test_dry(
162	const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
163	const std::vector<float> & expected_probs, float dry_multiplier, float dry_base,
164	int dry_allowed_length, int dry_penalty_last_n,
165	const std::vector<std::vector<llama_token>> & seq_breakers
166	) {
167	GGML_ASSERT(probs.size() == expected_probs.size());
168
169	sampler_tester tester(probs, expected_probs);
170
171	auto * sampler = llama_sampler_init_dry_testing(context_size: `1024`, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers);
172
173	for (size_t i = `0`; i < last_tokens.size(); i++) {
174	llama_sampler_accept(smpl: sampler, token: last_tokens [i]);
175	}
176
177	DUMP(&tester.cur_p);
178	tester.apply(sampler);
179	tester.apply(sampler: llama_sampler_init_dist(seed: `0`));
180	DUMP(&tester.cur_p);
181	tester.check();
182	}
183
184	static void test_top_n_sigma(const std::vector<float> & probs, const std::vector<float> & probs_expected, int n) {
185	sampler_tester tester(probs, probs_expected);
186
187	DUMP(&tester.cur_p);
188	tester.apply(sampler: llama_sampler_init_top_n_sigma(n));
189	tester.apply(sampler: llama_sampler_init_dist (seed: `0`));
190	DUMP(&tester.cur_p);
191
192	tester.check();
193	}
194
195	static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
196	) {
197	sampler_tester tester(n_vocab);
198
199	llama_token min_token_id = `0`;
200	const llama_token max_token_id = n_vocab - `1`;
201
202	for (auto s : samplers_sequence) {
203	switch (s) {
204	case `'k'`: tester.apply(sampler: llama_sampler_init_top_k(k: top_k)); break;
205	case `'y'`: GGML_ABORT("typical test not implemented");
206	case `'p'`: tester.apply(sampler: llama_sampler_init_top_p(p: top_p, min_keep: `1`)); break;
207	case `'m'`: tester.apply(sampler: llama_sampler_init_min_p(p: min_p, min_keep: `1`)); break;
208	case `'t'`: GGML_ABORT("temperature test not implemented");
209	default : GGML_ABORT("Unknown sampler");
210	}
211
212	tester.apply(sampler: llama_sampler_init_dist(seed: `0`));
213
214	auto & cur_p = tester.cur_p;
215
216	const int size = cur_p.size;
217
218	if (s == `'k'`) {
219	const int expected_size = std::min(a: size, b: top_k);
220	min_token_id = std::max(a: min_token_id, b: (llama_token)(n_vocab - top_k));
221
222	GGML_ASSERT(size == expected_size);
223	GGML_ASSERT(cur_p.data[`0`].id == max_token_id);
224	GGML_ASSERT(cur_p.data[expected_size-`1`].id == min_token_id);
225	} else if (s == `'p'`) {
226	const int softmax_divisor = n_vocab * (n_vocab-`1`) / `2` - min_token_id * (min_token_id-`1`) / `2`;
227	const int softmax_numerator_target = ceilf(x: top_p * softmax_divisor);
228
229	min_token_id = n_vocab;
230	int expected_size = `0`;
231	int cumsum = `0`;
232	do { // do-while because always at least one token is sampled
233	min_token_id--;
234	expected_size++;
235
236	cumsum += min_token_id;
237	} while (cumsum < softmax_numerator_target);
238
239	// token 0 has p == 0, need special consideration for cumsum because top_p immediately returns
240	if (min_token_id == `1`) {
241	min_token_id--;
242	expected_size += `1`;
243	}
244
245	GGML_ASSERT(size == expected_size);
246	GGML_ASSERT(!cur_p.sorted \|\| cur_p.data[`0`].id == max_token_id);
247	GGML_ASSERT(!cur_p.sorted \|\| cur_p.data[expected_size-`1`].id == min_token_id);
248	} else if (s == `'m'`) {
249	int expected_size = ceilf(x: (`1.0f` - min_p) * n_vocab);
250	expected_size = std::max(a: expected_size, b: `1`);
251	expected_size = std::min(a: expected_size, b: size);
252
253	min_token_id = floorf(x: min_p * n_vocab);
254	min_token_id = std::max(a: min_token_id, b: `1`);
255	min_token_id = std::max(a: min_token_id, b: (llama_token)(n_vocab - size));
256	min_token_id = std::min(a: min_token_id, b: (llama_token)(n_vocab - `1`));
257
258	GGML_ASSERT(size == expected_size);
259	GGML_ASSERT(!cur_p.sorted \|\| cur_p.data[`0`].id == max_token_id);
260	GGML_ASSERT(!cur_p.sorted \|\| cur_p.data[expected_size-`1`].id == min_token_id);
261	} else {
262	GGML_ABORT("fatal error");
263	}
264	}
265
266	printf(format: "Sampler queue %3s OK with n_vocab=%05zu top_k=%5d top_p=%f min_p=%f\n",
267	samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
268	}
269
270	static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
271	std::vector<llama_token_data> cur(data.size());
272	std::copy(first: data.begin(), last: data.end(), result: cur.begin());
273	llama_token_data_array cur_p = { .data: cur.data(), .size: cur.size(), .selected: -`1`, .sorted: false };
274	llama_sampler_apply(smpl: cnstr, cur_p: &cur_p);
275	llama_sampler_reset(smpl: cnstr);
276	const int64_t t_start = ggml_time_us();
277	for (int i = `0`; i < n_iter; i++) {
278	std::copy(first: data.begin(), last: data.end(), result: cur.begin());
279	llama_token_data_array cur_p = { .data: cur.data(), .size: cur.size(), .selected: -`1`, .sorted: false };
280	llama_sampler_apply(smpl: cnstr, cur_p: &cur_p);
281	llama_sampler_reset(smpl: cnstr);
282	}
283	const int64_t t_end = ggml_time_us();
284	llama_sampler_free(smpl: cnstr);
285	printf(format: "%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
286	}
287
// Calls bench() with the sampler-construction expression itself, stringified,
// as the printed label.
#define BENCH(__cnstr, __data, __n_iter) \
    bench((__cnstr), #__cnstr, (__data), (__n_iter))
289
290	static void test_perf() {
291	const int n_vocab = `1` << `17`;
292
293	std::vector<llama_token_data> data;
294
295	data.reserve(n: n_vocab);
296	for (int i = `0`; i < n_vocab; i++) {
297	const float logit = `2.0f`((double*)(rand())/RAND_MAX - `0.5`);
298	data.emplace_back(args: llama_token_data{.id: i, .logit: logit, .p: `0.0f`});
299	}
300
301	BENCH(llama_sampler_init_top_k (`40`), data, `32`);
302	BENCH(llama_sampler_init_top_p (`0.8f`, `1`), data, `32`);
303	BENCH(llama_sampler_init_min_p (`0.2f`, `1`), data, `32`);
304	BENCH(llama_sampler_init_typical(`0.5f`, `1`), data, `32`);
305	BENCH(llama_sampler_init_xtc (`1.0f`, `0.1f`, `1`, `1`), data, `32`);
306	}
307
308	int main(void) {
309	ggml_time_init();
310
311	test_temp(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, temp: `1.0f`);
312	test_temp(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.0f`, `0.0f`, `0.0f`, `1.0f`}, temp: `0.0f`);
313
314	test_temp_ext(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, temp: `1.0f`, delta: `0.0f`, exponent: `1.0f`);
315	test_temp_ext(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.0f`, `0.0f`, `0.0f`, `1.0f`}, temp: `0.0f`, delta: `0.0f`, exponent: `1.0f`);
316
317	test_top_k(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`1.0f`}, k: `1`);
318	test_top_k(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.44444f`, `0.33333f`, `0.22222f`}, k: `3`);
319	test_top_k(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.4f`, `0.3f`, `0.2f`, `0.1f`}, k: `4`);
320	test_top_k(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, k: `0`);
321
322	test_top_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`1.0f`}, p: `0`);
323	test_top_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.571429f`, `0.428571f`}, p: `0.7f`);
324	test_top_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.44444f`, `0.33333f`, `0.22222f`}, p: `0.8f`);
325	test_top_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, p: `1.0f`);
326
327	test_min_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.1f`/`1.0f`, `0.2f`/`1.0f`, `0.3f`/`1.0f`, `0.4f`/`1.0f`}, p: `0.00f`);
328	test_min_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.1f`/`1.0f`, `0.2f`/`1.0f`, `0.3f`/`1.0f`, `0.4f`/`1.0f`}, p: `0.24f`);
329	test_min_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.2f`/`0.9f`, `0.3f`/`0.9f`, `0.4f`/`0.9f`}, p: `0.26f`);
330	test_min_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.2f`/`0.9f`, `0.3f`/`0.9f`, `0.4f`/`0.9f`}, p: `0.49f`);
331	test_min_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.3f`/`0.7f`, `0.4f`/`0.7f`}, p: `0.51f`);
332	test_min_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.3f`/`0.7f`, `0.4f`/`0.7f`}, p: `0.74f`);
333	test_min_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.4f`/`0.4f`}, p: `0.76f`);
334	test_min_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.4f`/`0.4f`}, p: `1.00f`);
335	test_min_p(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.4f`/`0.4f`}, p: `1.05f`);
336
337	printf(format: "XTC should:\n");
338	test_xtc(probs: {`0.4f`, `0.3f`, `0.2f`, `0.1f`}, probs_expected: {`0.1f`}, p: `0.99f`, t: `0.09f`);
339	test_xtc(probs: {`0.4f`, `0.3f`, `0.2f`, `0.1f`}, probs_expected: {`0.2f`, `0.1f`}, p: `0.99f`, t: `0.19f`);
340	test_xtc(probs: {`0.4f`, `0.3f`, `0.2f`, `0.1f`}, probs_expected: {`0.3f`, `0.2f`, `0.1f`}, p: `0.99f`, t: `0.29f`);
341
342	printf(format: "XTC should not:\n");
343	test_xtc(probs: {`0.4f`, `0.3f`, `0.2f`, `0.1f`}, probs_expected: {`0.4f`, `0.3f`, `0.2f`, `0.1f`}, p: `0.99f`, t: `0.39f`);
344
345	test_typical(probs: {`0.97f`, `0.01f`, `0.01f`, `0.01f`}, probs_expected: {`0.97f`}, p: `0.5f`);
346	test_typical(probs: {`0.4f`, `0.2f`, `0.2f`, `0.2f`}, probs_expected: {`0.2f`, `0.2f`, `0.2f`}, p: `0.5f`);
347
348	test_penalties(probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, last_tokens: {`0`}, probs_expected: {`0`, `0.25f`, `0.25f`, `0.25f`, `0.25f`}, repeat_penalty: `50.0f`, alpha_frequency: `0.0f`, alpha_presence: `0.0f`);
349	test_penalties(probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, last_tokens: {`0`, `1`, `2`}, probs_expected: {`0`, `0`, `0`, `0.5f`, `0.5f`}, repeat_penalty: `50.0f`, alpha_frequency: `0.0f`, alpha_presence: `0.0f`);
350	test_penalties(probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, last_tokens: {`0`, `1`, `2`, `0`, `0`}, probs_expected: {`0`, `0`, `0`, `0.5f`, `0.5f`}, repeat_penalty: `50.0f`, alpha_frequency: `0.0f`, alpha_presence: `0.0f`);
351
352	test_penalties(probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, last_tokens: {`0`}, probs_expected: {`0.000011f`, `0.249997f`, `0.249997f`, `0.249997f`, `0.249997f`}, repeat_penalty: `1.0f`, alpha_frequency: `5.0f`, alpha_presence: `5.0f`);
353	test_penalties(probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, last_tokens: {`0`, `1`, `2`}, probs_expected: {`0.000023f`, `0.000023f`, `0.000023f`, `0.499966f`, `0.499966f`}, repeat_penalty: `1.0f`, alpha_frequency: `5.0f`, alpha_presence: `5.0f`);
354	test_penalties(probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, last_tokens: {`0`, `1`, `2`, `0`, `0`}, probs_expected: {`0.000000f`, `0.000023f`, `0.000023f`, `0.499977f`, `0.499977f`}, repeat_penalty: `1.0f`, alpha_frequency: `5.0f`, alpha_presence: `5.0f`);
355
356
357	test_dry(probs: {`0.25f`, `0.25f`, `0.25f`, `0.25f`}, last_tokens: {`0`, `1`}, expected_probs: {`0.25f`, `0.25f`, `0.25f`, `0.25f`}, dry_multiplier: `1.0f`, dry_base: `1.1f`, dry_allowed_length: `2`, dry_penalty_last_n: `4`, seq_breakers: {});
358	test_dry(probs: {`0.25f`, `0.25f`, `0.25f`, `0.25f`}, last_tokens: {`0`, `1`, `2`, `0`, `1`}, expected_probs: {`0.296923f`, `0.296923f`, `0.109232f`, `0.296923f`}, dry_multiplier: `1.0f`, dry_base: `1.1f`, dry_allowed_length: `2`, dry_penalty_last_n: `5`, seq_breakers: {});
359	test_dry(probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, last_tokens: {`0`, `1`, `3`, `4`, `0`, `1`}, expected_probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, dry_multiplier: `1.0f`, dry_base: `1.1f`, dry_allowed_length: `2`, dry_penalty_last_n: `6`, seq_breakers: {{`3`}});
360	test_dry(probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, last_tokens: {`0`, `1`, `2`, `0`, `1`}, expected_probs: {`0.241818f`, `0.241818f`, `0.032727f`, `0.241818f`, `0.241818f`}, dry_multiplier: `2.0f`, dry_base: `1.1f`, dry_allowed_length: `2`, dry_penalty_last_n: `5`, seq_breakers: {});
361	test_dry(probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, last_tokens: {`0`, `1`, `2`, `3`, `4`, `0`, `1`}, expected_probs: {`0.2f`, `0.2f`, `0.2f`, `0.2f`, `0.2f`}, dry_multiplier: `1.0f`, dry_base: `1.1f`, dry_allowed_length: `4`, dry_penalty_last_n: `7`, seq_breakers: {});
362
363	test_top_n_sigma(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.571429f`, `0.428571f`, `0.0f`, `0.0f`}, n: `1.00f`);
364	test_top_n_sigma(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, n: `0.00f`); // top_n_sigma == 0 now represents a no-op rather than greedy decoding as of PR#13345
365	test_top_n_sigma(probs: {`0.1f`, `0.2f`, `0.3f`, `0.4f`}, probs_expected: {`0.4f`, `0.3f`, `0.2f`, `0.1f`}, n: `3.00f`);
366
367	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "k", top_k: `10000`, top_p: `1.0f`, min_p: `1.0f`);
368	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "k", top_k: `1`, top_p: `1.0f`, min_p: `1.0f`);
369	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "p", top_k: `10000`, top_p: `1.0f`, min_p: `1.0f`);
370	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "p", top_k: `10000`, top_p: `0.0f`, min_p: `1.0f`);
371	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "m", top_k: `10000`, top_p: `1.0f`, min_p: `1.0f`);
372	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "m", top_k: `10000`, top_p: `1.0f`, min_p: `1e-12`);
373
374	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "k", top_k: `100`, top_p: `1.0000f`, min_p: `1.0f`);
375	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "p", top_k: `10000`, top_p: `0.0003f`, min_p: `1.0f`);
376	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "p", top_k: `10000`, top_p: `0.8000f`, min_p: `1.0f`);
377	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "m", top_k: `10000`, top_p: `1.0000f`, min_p: `9997.9f`/`9999.0f`);
378	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "m", top_k: `10000`, top_p: `1.0000f`, min_p: `0.1f`);
379
380	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "kp", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
381	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "km", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
382	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "pk", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
383	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "pm", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
384	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "mk", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
385	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "mp", top_k: `100`, top_p: `0.8f`, min_p: `9997.9f`/`9999.0f`);
386	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "mp", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
387
388	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "kpm", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
389	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "kmp", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
390	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "pkm", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
391	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "pmk", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
392	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "mkp", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
393	test_sampler_queue(n_vocab: `10000`, samplers_sequence: "mpk", top_k: `100`, top_p: `0.8f`, min_p: `0.1f`);
394
395	printf(format: "OK\n");
396
397	test_perf();
398
399	return `0`;
400	}
401

Browse the source code of llama.cpp/tests/test-sampling.cpp