json-schema-to-grammar.cpp source code [llama.cpp/common/json-schema-to-grammar.cpp]

1	#include "json-schema-to-grammar.h"
2	#include "common.h"
3
4	#include <nlohmann/json.hpp>
5
6	#include <algorithm>
7	#include <map>
8	#include <regex>
9	#include <sstream>
10	#include <string>
11	#include <unordered_map>
12	#include <unordered_set>
13	#include <vector>
14
15	using json = nlohmann::ordered_json;
16
// Builds a GBNF fragment repeating `item_rule` between `min_items` and
// `max_items` times.
//
// - `max_items == std::numeric_limits<int>::max()` means "no upper bound".
// - When `separator_rule` is non-empty, the result is
//   `item (separator item){min-1,max-1}`, wrapped in `( ... )?` if
//   `min_items == 0`.
// - Returns an empty string when `max_items == 0`.
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
    const bool has_max = max_items != std::numeric_limits<int>::max();

    if (max_items == 0) {
        return "";
    }
    if (min_items == 0 && max_items == 1) {
        return item_rule + "?";
    }

    if (separator_rule.empty()) {
        if (min_items == 1 && !has_max) {
            return item_rule + "+";
        }
        if (min_items == 0 && !has_max) {
            return item_rule + "*";
        }
        return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
    }

    // First item stands alone; every following item is preceded by the separator.
    const std::string tail = build_repetition(
        "(" + separator_rule + " " + item_rule + ")",
        min_items == 0 ? 0 : min_items - 1,
        has_max ? max_items - 1 : max_items);

    // The tail is empty when no further items are allowed (max_items == 1);
    // do not leave a dangling space behind the item in that case.
    std::string result = tail.empty() ? item_rule : item_rule + " " + tail;
    if (min_items == 0) {
        result = "(" + result + ")?";
    }
    return result;
}
43
44	static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = `16`, bool top_level = true) {
45	auto has_min = min_value != std::numeric_limits<int64_t>::min();
46	auto has_max = max_value != std::numeric_limits<int64_t>::max();
47
48	auto digit_range = [&](char from, char to) {
49	out << "[";
50	if (from == to) {
51	out << from;
52	} else {
53	out << from << "-" << to;
54	}
55	out << "]";
56	};
57	auto more_digits = [&](int min_digits, int max_digits) {
58	out << "[0-9]";
59	if (min_digits == max_digits && min_digits == `1`) {
60	return;
61	}
62	out << "{";
63	out << min_digits;
64	if (max_digits != min_digits) {
65	out << ",";
66	if (max_digits != std::numeric_limits<int>::max()) {
67	out << max_digits;
68	}
69	}
70	out << "}";
71	};
72	std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
73	[&](const std::string_view & from, const std::string_view & to) {
74	size_t i = `0`;
75	while (i < from.length() && i < to.length() && from [i] == to [i]) {
76	i++;
77	}
78	if (i > `0`) {
79	out << "\"" << from.substr(pos: `0`, n: i) << "\"";
80	}
81	if (i < from.length() && i < to.length()) {
82	if (i > `0`) {
83	out << " ";
84	}
85	auto sub_len = from.length() - i - `1`;
86	if (sub_len > `0`) {
87	auto from_sub = from.substr(pos: i + `1`);
88	auto to_sub = to.substr(pos: i + `1`);
89	auto sub_zeros = string_repeat(str: "0", n: sub_len);
90	auto sub_nines = string_repeat(str: "9", n: sub_len);
91
92	auto to_reached = false;
93	out << "(";
94	if (from_sub == sub_zeros) {
95	digit_range (from [i], to [i] - `1`);
96	out << " ";
97	more_digits (sub_len, sub_len);
98	} else {
99	out << "[" << from [i] << "] ";
100	out << "(";
101	uniform_range (from_sub, sub_nines);
102	out << ")";
103	if (from [i] < to [i] - `1`) {
104	out << " \| ";
105	if (to_sub == sub_nines) {
106	digit_range (from [i] + `1`, to [i]);
107	to_reached = true;
108	} else {
109	digit_range (from [i] + `1`, to [i] - `1`);
110	}
111	out << " ";
112	more_digits (sub_len, sub_len);
113	}
114	}
115	if (!to_reached) {
116	out << " \| ";
117	digit_range (to [i], to [i]);
118	out << " ";
119	uniform_range (sub_zeros, to_sub);
120	}
121	out << ")";
122	} else {
123	out << "[" << from [i] << "-" << to [i] << "]";
124	}
125	}
126	};
127
128	if (has_min && has_max) {
129	if (min_value < `0` && max_value < `0`) {
130	out << "\"-\" (";
131	_build_min_max_int(min_value: -max_value, max_value: -min_value, out, decimals_left, / top_level= / true);
132	out << ")";
133	return;
134	}
135
136	if (min_value < `0`) {
137	out << "\"-\" (";
138	_build_min_max_int(min_value: `0`, max_value: -min_value, out, decimals_left, / top_level= / true);
139	out << ") \| ";
140	min_value = `0`;
141	}
142
143	auto min_s = std::to_string(val: min_value);
144	auto max_s = std::to_string(val: max_value);
145	auto min_digits = min_s.length();
146	auto max_digits = max_s.length();
147
148	for (auto digits = min_digits; digits < max_digits; digits++) {
149	uniform_range (min_s, string_repeat(str: "9", n: digits));
150	min_s = "1" + string_repeat(str: "0", n: digits);
151	out << " \| ";
152	}
153	uniform_range (min_s, max_s);
154	return;
155	}
156
157	auto less_decimals = std::max(a: decimals_left - `1`, b: `1`);
158
159	if (has_min) {
160	if (min_value < `0`) {
161	out << "\"-\" (";
162	_build_min_max_int(min_value: std::numeric_limits<int64_t>::min(), max_value: -min_value, out, decimals_left, / top_level= / false);
163	out << ") \| [0] \| [1-9] ";
164	more_digits (`0`, decimals_left - `1`);
165	} else if (min_value == `0`) {
166	if (top_level) {
167	out << "[0] \| [1-9] ";
168	more_digits (`0`, less_decimals);
169	} else {
170	more_digits (`1`, decimals_left);
171	}
172	} else if (min_value <= `9`) {
173	char c = `'0'` + min_value;
174	auto range_start = top_level ? `'1'` : `'0'`;
175	if (c > range_start) {
176	digit_range (range_start, c - `1`);
177	out << " ";
178	more_digits (`1`, less_decimals);
179	out << " \| ";
180	}
181	digit_range (c, `'9'`);
182	out << " ";
183	more_digits (`0`, less_decimals);
184	} else {
185	auto min_s = std::to_string(val: min_value);
186	auto len = min_s.length();
187	auto c = min_s [`0`];
188
189	if (c > `'1'`) {
190	digit_range (top_level ? `'1'` : `'0'`, c - `1`);
191	out << " ";
192	more_digits (len, less_decimals);
193	out << " \| ";
194	}
195	digit_range (c, c);
196	out << " (";
197	_build_min_max_int(min_value: std::stoll(str: min_s.substr(pos: `1`)), max_value: std::numeric_limits<int64_t>::max(), out, decimals_left: less_decimals, / top_level= / false);
198	out << ")";
199	if (c < `'9'`) {
200	out << " \| ";
201	digit_range (c + `1`, `'9'`);
202	out << " ";
203	more_digits (len - `1`, less_decimals);
204	}
205	}
206	return;
207	}
208
209	if (has_max) {
210	if (max_value >= `0`) {
211	if (top_level) {
212	out << "\"-\" [1-9] ";
213	more_digits (`0`, less_decimals);
214	out << " \| ";
215	}
216	_build_min_max_int(min_value: `0`, max_value, out, decimals_left, / top_level= / true);
217	} else {
218	out << "\"-\" (";
219	_build_min_max_int(min_value: -max_value, max_value: std::numeric_limits<int64_t>::max(), out, decimals_left, / top_level= / false);
220	out << ")";
221	}
222	return;
223	}
224
225	throw std::runtime_error ("At least one of min_value or max_value must be set");
226	}
227
// Body of the shared `space` rule (optional whitespace between tokens).
const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";

// A predefined grammar rule: its body and the names of the rules it references.
struct BuiltinRule {
    std::string content;
    std::vector<std::string> deps;
};

// Rules for the JSON primitive types, keyed by rule name.
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
    {"boolean",       BuiltinRule{"(\"true\" | \"false\") space", {}}},
    {"decimal-part",  BuiltinRule{"[0-9]{1,16}", {}}},
    {"integral-part", BuiltinRule{"[0] | [1-9] [0-9]{0,15}", {}}},
    {"number",        BuiltinRule{"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
    {"integer",       BuiltinRule{"(\"-\"? integral-part) space", {"integral-part"}}},
    {"value",         BuiltinRule{"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
    {"object",        BuiltinRule{"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
    {"array",         BuiltinRule{"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
    {"uuid",          BuiltinRule{"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
    {"char",          BuiltinRule{"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
    {"string",        BuiltinRule{"\"\\\"\" char* \"\\\"\" space", {"char"}}},
    {"null",          BuiltinRule{"\"null\" space", {}}},
};

// Rules for the supported string `format` values; the `*-string` entries wrap
// the bare format rule in double quotes.
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
    {"date",             BuiltinRule{"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
    {"time",             BuiltinRule{"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
    {"date-time",        BuiltinRule{"date \"T\" time", {"date", "time"}}},
    {"date-string",      BuiltinRule{"\"\\\"\" date \"\\\"\" space", {"date"}}},
    {"time-string",      BuiltinRule{"\"\\\"\" time \"\\\"\" space", {"time"}}},
    {"date-time-string", BuiltinRule{"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}},
};
258
259	static bool is_reserved_name(const std::string & name) {
260	static const std::unordered_set<std::string> RESERVED_NAMES = [] {
261	std::unordered_set<std::string> s;
262	s.insert(x: "root");
263	for (const auto & p : PRIMITIVE_RULES) s.insert(x: p.first);
264	for (const auto & p : STRING_FORMAT_RULES) s.insert(x: p.first);
265	return s;
266	}();
267	return RESERVED_NAMES.find(x: name) != RESERVED_NAMES.end();
268	}
269
// Runs of characters that are replaced by "-" when deriving a rule name.
std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
// Characters that need escaping inside a quoted grammar literal.
std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
// Characters that need escaping inside a grammar character range.
std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
// Escaped spelling for each character matched by the regexes above.
std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
    {'\r', "\\r"},
    {'\n', "\\n"},
    {'"',  "\\\""},
    {'-',  "\\-"},
    {']',  "\\]"},
};

// Regex metacharacters that end a run of literal characters in a pattern.
std::unordered_set<char> NON_LITERAL_SET = {
    '|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?',
};
// Characters whose backslash escape in a regex is dropped when copied into a literal.
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {
    '^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?',
};
279
// Returns a copy of `input` in which every non-overlapping match of `regex`
// is replaced by the string produced by `replacement(match)`.
//
// Text between matches is copied unchanged. A regex that can match the empty
// string is handled by stepping one character past each empty match, so the
// scan always terminates.
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
    std::smatch match;
    std::string result;

    std::string::const_iterator searchStart(input.cbegin());
    const std::string::const_iterator searchEnd(input.cend());

    while (std::regex_search(searchStart, searchEnd, match, regex)) {
        // Unmatched text preceding this match.
        result.append(searchStart, match[0].first);
        result.append(replacement(match));
        searchStart = match.suffix().first;

        if (match.length(0) == 0) {
            // An empty match does not consume input; without this step the
            // same position would match again forever.
            if (searchStart == searchEnd) {
                break;
            }
            result.push_back(*searchStart);
            ++searchStart;
        }
    }

    result.append(searchStart, searchEnd);

    return result;
}
297
298	static std::string format_literal(const std::string & literal) {
299	std::string escaped = replacePattern(input: literal, regex: GRAMMAR_LITERAL_ESCAPE_RE, replacement: [&](const std::smatch & match) {
300	char c = match.str()[`0`];
301	return GRAMMAR_LITERAL_ESCAPES.at(k: c);
302	});
303	return "\"" + escaped + "\"";
304	}
305
306	class SchemaConverter {
307	private:
308	friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
309	std::function<json(const std::string &)> _fetch_json;
310	bool _dotall;
311	std::map<std::string, std::string> _rules;
312	std::unordered_map<std::string, json> _refs;
313	std::unordered_set<std::string> _refs_being_resolved;
314	std::vector<std::string> _errors;
315	std::vector<std::string> _warnings;
316
317	std::string _add_rule(const std::string & name, const std::string & rule) {
318	std::string esc_name = regex_replace(s: name, e: INVALID_RULE_CHARS_RE, fmt: "-");
319	if (_rules.find(x: esc_name) == _rules.end() \|\| _rules [esc_name] == rule) {
320	_rules [esc_name] = rule;
321	return esc_name;
322	} else {
323	int i = `0`;
324	while (_rules.find(x: esc_name + std::to_string(val: i)) != _rules.end() && _rules [esc_name + std::to_string(val: i)] != rule) {
325	i++;
326	}
327	std::string key = esc_name + std::to_string(val: i);
328	_rules [key] = rule;
329	return key;
330	}
331	}
332
333	std::string _generate_union_rule(const std::string & name, const std::vector<json> & alt_schemas) {
334	std::vector<std::string> rules;
335	for (size_t i = `0`; i < alt_schemas.size(); i++) {
336	rules.push_back(x: visit(schema: alt_schemas [i], name: name + (name.empty() ? "alternative-" : "-") + std::to_string(val: i)));
337	}
338	return string_join(values: rules, separator: " \| ");
339	}
340
341	std::string _visit_pattern(const std::string & pattern, const std::string & name) {
342	if (!(pattern.front() == `'^'` && pattern.back() == `'$'`)) {
343	_errors.push_back(x: "Pattern must start with '^' and end with '$'");
344	return "";
345	}
346	std::string sub_pattern = pattern.substr(pos: `1`, n: pattern.length() - `2`);
347	std::unordered_map<std::string, std::string> sub_rule_ids;
348
349	size_t i = `0`;
350	size_t length = sub_pattern.length();
351
352	using literal_or_rule = std::pair<std::string, bool>;
353	auto to_rule = [&](const literal_or_rule & ls) {
354	auto is_literal = ls.second;
355	auto s = ls.first;
356	return is_literal ? "\"" + s + "\"" : s;
357	};
358	std::function<literal_or_rule()> transform = [&]() -> literal_or_rule {
359	size_t start = i;
360	std::vector<literal_or_rule> seq;
361
362	auto get_dot = [&]() {
363	std::string rule;
364	if (_dotall) {
365	rule = "[\\U00000000-\\U0010FFFF]";
366	} else {
367	rule = "[^\\x0A\\x0D]";
368	}
369	return _add_rule(name: "dot", rule);
370	};
371
372	// Joins the sequence, merging consecutive literals together.
373	auto join_seq = [&]() {
374	std::vector<literal_or_rule> ret;
375
376	std::string literal;
377	auto flush_literal = [&]() {
378	if (literal.empty()) {
379	return false;
380	}
381	ret.emplace_back(args&: literal, args: true);
382	literal.clear();
383	return true;
384	};
385
386	for (const auto & item : seq) {
387	auto is_literal = item.second;
388	if (is_literal) {
389	literal += item.first;
390	} else {
391	flush_literal();
392	ret.push_back(x: item);
393	}
394	}
395	flush_literal();
396
397	std::vector<std::string> results;
398	for (const auto & item : ret) {
399	results.push_back(x: to_rule(item));
400	}
401	return std::make_pair(x: string_join(values: results, separator: " "), y: false);
402	};
403
404	while (i < length) {
405	char c = sub_pattern [i];
406	if (c == `'.'`) {
407	seq.emplace_back(args: get_dot(), args: false);
408	i++;
409	} else if (c == `'('`) {
410	i++;
411	if (i < length) {
412	if (sub_pattern [i] == `'?'`) {
413	_warnings.push_back(x: "Unsupported pattern syntax");
414	}
415	}
416	seq.emplace_back(args: "(" + to_rule(transform ()) + ")", args: false);
417	} else if (c == `')'`) {
418	i++;
419	if (start > `0` && sub_pattern [start - `1`] != `'('`) {
420	_errors.push_back(x: "Unbalanced parentheses");
421	}
422	return join_seq();
423	} else if (c == `'['`) {
424	std::string square_brackets = std::string (`1`, c);
425	i++;
426	while (i < length && sub_pattern [i] != `']'`) {
427	if (sub_pattern [i] == `'\\'`) {
428	square_brackets += sub_pattern.substr(pos: i, n: `2`);
429	i += `2`;
430	} else {
431	square_brackets += sub_pattern [i];
432	i++;
433	}
434	}
435	if (i >= length) {
436	_errors.push_back(x: "Unbalanced square brackets");
437	}
438	square_brackets += `']'`;
439	i++;
440	seq.emplace_back(args&: square_brackets, args: false);
441	} else if (c == `'\|'`) {
442	seq.emplace_back(args: "\|", args: false);
443	i++;
444	} else if (c == `'*'` \|\| c == `'+'` \|\| c == `'?'`) {
445	seq.back() = std::make_pair(x: to_rule(seq.back()) + c, y: false);
446	i++;
447	} else if (c == `'{'`) {
448	std::string curly_brackets = std::string (`1`, c);
449	i++;
450	while (i < length && sub_pattern [i] != `'}'`) {
451	curly_brackets += sub_pattern [i];
452	i++;
453	}
454	if (i >= length) {
455	_errors.push_back(x: "Unbalanced curly brackets");
456	}
457	curly_brackets += `'}'`;
458	i++;
459	auto nums = string_split(str: curly_brackets.substr(pos: `1`, n: curly_brackets.length() - `2`), delimiter: ",");
460	int min_times = `0`;
461	int max_times = std::numeric_limits<int>::max();
462	try {
463	if (nums.size() == `1`) {
464	min_times = max_times = std::stoi(str: nums [`0`]);
465	} else if (nums.size() != `2`) {
466	_errors.push_back(x: "Wrong number of values in curly brackets");
467	} else {
468	if (!nums [`0`].empty()) {
469	min_times = std::stoi(str: nums [`0`]);
470	}
471	if (!nums [`1`].empty()) {
472	max_times = std::stoi(str: nums [`1`]);
473	}
474	}
475	} catch (const std::invalid_argument & e) {
476	_errors.push_back(x: "Invalid number in curly brackets");
477	return std::make_pair(x: "", y: false);
478	}
479	auto &last = seq.back();
480	auto &sub = last.first;
481	auto sub_is_literal = last.second;
482
483	if (!sub_is_literal) {
484	std::string & sub_id = sub_rule_ids [sub];
485	if (sub_id.empty()) {
486	sub_id = _add_rule(name: name + "-" + std::to_string(val: sub_rule_ids.size()), rule: sub);
487	}
488	sub = sub_id;
489	}
490	seq.back().first = build_repetition(
491	item_rule: sub_is_literal ? "\"" + sub + "\"" : sub,
492	min_items: min_times,
493	max_items: max_times,
494	separator_rule: ""
495	);
496	seq.back().second = false;
497	} else {
498	std::string literal;
499	auto is_non_literal = [&](char c) {
500	return NON_LITERAL_SET.find(x: c) != NON_LITERAL_SET.end();
501	};
502	while (i < length) {
503	if (sub_pattern [i] == `'\\'` && i < length - `1`) {
504	char next = sub_pattern [i + `1`];
505	if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(x: next) != ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end()) {
506	i++;
507	literal += sub_pattern [i];
508	i++;
509	} else {
510	literal += sub_pattern.substr(pos: i, n: `2`);
511	i += `2`;
512	}
513	} else if (sub_pattern [i] == `'"'`) {
514	literal += "\\\"";
515	i++;
516	} else if (!is_non_literal(sub_pattern [i]) &&
517	(i == length - `1` \|\| literal.empty() \|\| sub_pattern [i + `1`] == `'.'` \|\| !is_non_literal(sub_pattern [i + `1`]))) {
518	literal += sub_pattern [i];
519	i++;
520	} else {
521	break;
522	}
523	}
524	if (!literal.empty()) {
525	seq.emplace_back(args&: literal, args: true);
526	}
527	}
528	}
529	return join_seq();
530	};
531	return _add_rule(name, rule: "\"\\\"\" (" + to_rule(transform ()) + ") \"\\\"\" space");
532	}
533
534	/*
535	Returns a rule that matches a JSON string that is none of the provided strings
536
537	not_strings({"a"})
538	-> ["] ( [a] char+ \| [^"a] char )? ["] space*
539	not_strings({"and", "also"})
540	-> ["] ( [a] ([l] ([s] ([o] char+ \| [^"o] char) \| [^"s] char) \| [n] ([d] char+ \| [^"d] char) \| [^"ln] char) \| [^"a] char )? ["] space*
541	*/
542	std::string _not_strings(const std::vector<std::string> & strings) {
543
544	struct TrieNode {
545	std::map<char, TrieNode> children;
546	bool is_end_of_string;
547
548	TrieNode() : is_end_of_string(false) {}
549
550	void insert(const std::string & string) {
551	auto node = this;
552	for (char c : string) {
553	node = &node->children [c];
554	}
555	node->is_end_of_string = true;
556	}
557	};
558
559	TrieNode trie;
560	for (const auto & s : strings) {
561	trie.insert(string: s);
562	}
563
564	std::string char_rule = _add_primitive(name: "char", rule: PRIMITIVE_RULES.at(k: "char"));
565	std::ostringstream out;
566	out << "[\"] ( ";
567	std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
568	std::ostringstream rejects;
569	auto first = true;
570	for (const auto & kv : node.children) {
571	rejects << kv.first;
572	if (first) {
573	first = false;
574	} else {
575	out << " \| ";
576	}
577	out << "[" << kv.first << "]";
578	if (!kv.second.children.empty()) {
579	out << " (";
580	visit (kv.second);
581	out << ")";
582	} else if (kv.second.is_end_of_string) {
583	out << " " << char_rule << "+";
584	}
585	}
586	if (!node.children.empty()) {
587	if (!first) {
588	out << " \| ";
589	}
590	out << "[^\"" << rejects.str() << "] " << char_rule << "*";
591	}
592	};
593	visit (trie);
594
595	out << " )";
596	if (!trie.is_end_of_string) {
597	out << "?";
598	}
599	out << " [\"] space";
600	return out.str();
601	}
602
603	std::string _resolve_ref(const std::string & ref) {
604	auto it = ref.find(c: `'#'`);
605	std::string ref_fragment = it != std::string::npos ? ref.substr(pos: it + `1`) : ref;
606	static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
607	std::string ref_name = "ref" + std::regex_replace(s: ref_fragment, e: nonalphanumeric_regex, fmt: "-");
608	if (_rules.find(x: ref_name) == _rules.end() && _refs_being_resolved.find(x: ref) == _refs_being_resolved.end()) {
609	_refs_being_resolved.insert(x: ref);
610	json resolved = _refs [ref];
611	ref_name = visit(schema: resolved, name: ref_name);
612	_refs_being_resolved.erase(x: ref);
613	}
614	return ref_name;
615	}
616
617	std::string _build_object_rule(
618	const std::vector<std::pair<std::string, json>> & properties,
619	const std::unordered_set<std::string> & required,
620	const std::string & name,
621	const json & additional_properties)
622	{
623	std::vector<std::string> required_props;
624	std::vector<std::string> optional_props;
625	std::unordered_map<std::string, std::string> prop_kv_rule_names;
626	std::vector<std::string> prop_names;
627	for (const auto & kv : properties) {
628	const auto &prop_name = kv.first;
629	const auto &prop_schema = kv.second;
630
631	std::string prop_rule_name = visit(schema: prop_schema, name: name + (name.empty() ? "" : "-") + prop_name);
632	prop_kv_rule_names [prop_name] = _add_rule(
633	name: name + (name.empty() ? "" : "-") + prop_name + "-kv",
634	rule: format_literal(literal: json (prop_name).dump()) + " space \":\" space " + prop_rule_name
635	);
636	if (required.find(x: prop_name) != required.end()) {
637	required_props.push_back(x: prop_name);
638	} else {
639	optional_props.push_back(x: prop_name);
640	}
641	prop_names.push_back(x: prop_name);
642	}
643	if ((additional_properties.is_boolean() && additional_properties.get<bool>()) \|\| additional_properties.is_object()) {
644	std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
645	std::string value_rule =
646	additional_properties.is_object() ? visit(schema: additional_properties, name: sub_name + "-value")
647	: _add_primitive(name: "value", rule: PRIMITIVE_RULES.at(k: "value"));
648
649	auto key_rule =
650	prop_names.empty() ? _add_primitive(name: "string", rule: PRIMITIVE_RULES.at(k: "string"))
651	: _add_rule(name: sub_name + "-k", rule: _not_strings(strings: prop_names));
652	std::string kv_rule = _add_rule(name: sub_name + "-kv", rule: key_rule + " \":\" space " + value_rule);
653	prop_kv_rule_names ["*"] = kv_rule;
654	optional_props.push_back(x: "*");
655	}
656
657	std::string rule = "\"{\" space ";
658	for (size_t i = `0`; i < required_props.size(); i++) {
659	if (i > `0`) {
660	rule += " \",\" space ";
661	}
662	rule += prop_kv_rule_names [required_props [i]];
663	}
664
665	if (!optional_props.empty()) {
666	rule += " (";
667	if (!required_props.empty()) {
668	rule += " \",\" space ( ";
669	}
670
671	std::function<std::string(const std::vector<std::string> &, bool)> get_recursive_refs = [&](const std::vector<std::string> & ks, bool first_is_optional) {
672	std::string res;
673	if (ks.empty()) {
674	return res;
675	}
676	std::string k = ks [`0`];
677	std::string kv_rule_name = prop_kv_rule_names [k];
678	std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
679	if (first_is_optional) {
680	res = comma_ref + (k == "" ? "" : "?");
681	} else {
682	res = kv_rule_name + (k == "" ? " " + comma_ref + "" : "");
683	}
684	if (ks.size() > `1`) {
685	res += " " + _add_rule(
686	name: name + (name.empty() ? "" : "-") + k + "-rest",
687	rule: get_recursive_refs (std::vector<std::string>(ks.begin() + `1`, ks.end()), true)
688	);
689	}
690	return res;
691	};
692
693	for (size_t i = `0`; i < optional_props.size(); i++) {
694	if (i > `0`) {
695	rule += " \| ";
696	}
697	rule += get_recursive_refs (std::vector<std::string>(optional_props.begin() + i, optional_props.end()), false);
698	}
699	if (!required_props.empty()) {
700	rule += " )";
701	}
702	rule += " )?";
703	}
704
705	rule += " \"}\" space";
706
707	return rule;
708	}
709
710	std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
711	auto n = _add_rule(name, rule: rule.content);
712	for (const auto & dep : rule.deps) {
713	BuiltinRule dep_rule;
714	auto it = PRIMITIVE_RULES.find(x: dep);
715	if (it == PRIMITIVE_RULES.end()) {
716	it = STRING_FORMAT_RULES.find(x: dep);
717	if (it == STRING_FORMAT_RULES.end()) {
718	_errors.push_back(x: "Rule " + dep + " not known");
719	continue;
720	}
721	}
722	if (_rules.find(x: dep) == _rules.end()) {
723	_add_primitive(name: dep, rule: it ->second);
724	}
725	}
726	return n;
727	}
728
729	public:
730	SchemaConverter(
731	const std::function<json(const std::string &)> & fetch_json,
732	bool dotall)
733	: _fetch_json (fetch_json), _dotall(dotall)
734	{
735	_rules ["space"] = SPACE_RULE;
736	}
737
738	void resolve_refs(json & schema, const std::string & url) {
739	/*
740	* Resolves all $ref fields in the given schema, fetching any remote schemas,
741	* replacing each $ref with absolute reference URL and populates _refs with the
742	* respective referenced (sub)schema dictionaries.
743	*/
744	std::function<void(json &)> visit_refs = [&](json & n) {
745	if (n.is_array()) {
746	for (auto & x : n) {
747	visit_refs (x);
748	}
749	} else if (n.is_object()) {
750	if (n.contains(key: "$ref")) {
751	std::string ref = n ["$ref"];
752	if (_refs.find(x: ref) == _refs.end()) {
753	json target;
754	if (ref.find(s: "https://") == `0`) {
755	std::string base_url = ref.substr(pos: `0`, n: ref.find(c: `'#'`));
756	auto it = _refs.find(x: base_url);
757	if (it != _refs.end()) {
758	target = it ->second;
759	} else {
760	// Fetch the referenced schema and resolve its refs
761	auto referenced = _fetch_json (ref);
762	resolve_refs(schema&: referenced, url: base_url);
763	_refs [base_url] = referenced;
764	}
765	if (ref.find(c: `'#'`) == std::string::npos \|\| ref.substr(pos: ref.find(c: `'#'`) + `1`).empty()) {
766	return;
767	}
768	} else if (ref.find(s: "#/") == `0`) {
769	target = schema;
770	n ["$ref"] = url + ref;
771	ref = url + ref;
772	} else {
773	_errors.push_back(x: "Unsupported ref: " + ref);
774	return;
775	}
776	std::string pointer = ref.substr(pos: ref.find(c: `'#'`) + `1`);
777	std::vector<std::string> tokens = string_split(str: pointer, delimiter: "/");
778	for (size_t i = `1`; i < tokens.size(); ++i) {
779	std::string sel = tokens [i];
780	if (target.is_object() && target.contains(key: sel)) {
781	target = target [sel];
782	} else if (target.is_array()) {
783	size_t sel_index;
784	try {
785	sel_index = std::stoul(str: sel);
786	} catch (const std::invalid_argument & e) {
787	sel_index = target.size();
788	}
789	if (sel_index >= target.size()) {
790	_errors.push_back(x: "Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
791	return;
792	}
793	target = target [sel_index];
794	} else {
795	_errors.push_back(x: "Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
796	return;
797	}
798	}
799	_refs [ref] = target;
800	}
801	} else {
802	for (auto & kv : n.items()) {
803	visit_refs (kv.value());
804	}
805	}
806	}
807	};
808
809	visit_refs (schema);
810	}
811
812	std::string _generate_constant_rule(const json & value) {
813	return format_literal(literal: value.dump());
814	}
815
816	std::string visit(const json & schema, const std::string & name) {
817	json schema_type = schema.contains(key: "type") ? schema ["type"] : json ();
818	std::string schema_format = schema.contains(key: "format") ? schema ["format"].get<std::string>() : "";
819	std::string rule_name = is_reserved_name(name) ? name + "-" : name.empty() ? "root" : name;
820
821	if (schema.contains(key: "$ref")) {
822	return _add_rule(name: rule_name, rule: _resolve_ref(ref: schema ["$ref"]));
823	} else if (schema.contains(key: "oneOf") \|\| schema.contains(key: "anyOf")) {
824	std::vector<json> alt_schemas = schema.contains(key: "oneOf") ? schema ["oneOf"].get<std::vector<json>>() : schema ["anyOf"].get<std::vector<json>>();
825	return _add_rule(name: rule_name, rule: _generate_union_rule(name, alt_schemas));
826	} else if (schema_type.is_array()) {
827	std::vector<json> schema_types;
828	for (const auto & t : schema_type) {
829	json schema_copy(schema);
830	schema_copy ["type"] = t;
831	schema_types.push_back(x: schema_copy);
832	}
833	return _add_rule(name: rule_name, rule: _generate_union_rule(name, alt_schemas: schema_types));
834	} else if (schema.contains(key: "const")) {
835	return _add_rule(name: rule_name, rule: _generate_constant_rule(value: schema ["const"]) + " space");
836	} else if (schema.contains(key: "enum")) {
837	std::vector<std::string> enum_values;
838	for (const auto & v : schema ["enum"]) {
839	enum_values.push_back(x: _generate_constant_rule(value: v));
840	}
841	return _add_rule(name: rule_name, rule: "(" + string_join(values: enum_values, separator: " \| ") + ") space");
842	} else if ((schema_type.is_null() \|\| schema_type == "object")
843	&& (schema.contains(key: "properties") \|\|
844	(schema.contains(key: "additionalProperties") && schema ["additionalProperties"] != true))) {
845	std::unordered_set<std::string> required;
846	if (schema.contains(key: "required") && schema ["required"].is_array()) {
847	for (const auto & item : schema ["required"]) {
848	if (item.is_string()) {
849	required.insert(x: item.get<std::string>());
850	}
851	}
852	}
853	std::vector<std::pair<std::string, json>> properties;
854	if (schema.contains(key: "properties")) {
855	for (const auto & prop : schema ["properties"].items()) {
856	properties.emplace_back(args: prop.key(), args: prop.value());
857	}
858	}
859	return _add_rule(name: rule_name,
860	rule: _build_object_rule(
861	properties, required, name,
862	additional_properties: schema.contains(key: "additionalProperties") ? schema ["additionalProperties"] : json ()));
863	} else if ((schema_type.is_null() \|\| schema_type == "object" \|\| schema_type == "string") && schema.contains(key: "allOf")) {
864	std::unordered_set<std::string> required;
865	std::vector<std::pair<std::string, json>> properties;
866	std::map<std::string, size_t> enum_values;
867	std::string hybrid_name = name;
868	std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
869	if (comp_schema.contains(key: "$ref")) {
870	add_component (_refs [comp_schema ["$ref"]], is_required);
871	} else if (comp_schema.contains(key: "properties")) {
872	for (const auto & prop : comp_schema ["properties"].items()) {
873	properties.emplace_back(args: prop.key(), args: prop.value());
874	if (is_required) {
875	required.insert(x: prop.key());
876	}
877	}
878	} else if (comp_schema.contains(key: "enum")) {
879	for (const auto & v : comp_schema ["enum"]) {
880	const auto rule = _generate_constant_rule(value: v);
881	if (enum_values.find(x: rule) == enum_values.end()) {
882	enum_values [rule] = `0`;
883	}
884	enum_values [rule] += `1`;
885	}
886	} else {
887	// todo warning
888	}
889	};
890	for (auto & t : schema ["allOf"]) {
891	if (t.contains(key: "anyOf")) {
892	for (auto & tt : t ["anyOf"]) {
893	add_component (tt, false);
894	}
895	} else {
896	add_component (t, true);
897	}
898	}
899	if (!enum_values.empty()) {
900	std::vector<std::string> enum_intersection;
901	for (const auto & p : enum_values) {
902	if (p.second == schema ["allOf"].size()) {
903	enum_intersection.push_back(x: p.first);
904	}
905	}
906	if (!enum_intersection.empty()) {
907	return _add_rule(name: rule_name, rule: "(" + string_join(values: enum_intersection, separator: " \| ") + ") space");
908	}
909	}
910	return _add_rule(name: rule_name, rule: _build_object_rule(properties, required, name: hybrid_name, additional_properties: json ()));
911	} else if ((schema_type.is_null() \|\| schema_type == "array") && (schema.contains(key: "items") \|\| schema.contains(key: "prefixItems"))) {
912	json items = schema.contains(key: "items") ? schema ["items"] : schema ["prefixItems"];
913	if (items.is_array()) {
914	std::string rule = "\"[\" space ";
915	for (size_t i = `0`; i < items.size(); i++) {
916	if (i > `0`) {
917	rule += " \",\" space ";
918	}
919	rule += visit(schema: items [i], name: name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(val: i));
920	}
921	rule += " \"]\" space";
922	return _add_rule(name: rule_name, rule);
923	} else {
924	std::string item_rule_name = visit(schema: items, name: name + (name.empty() ? "" : "-") + "item");
925	int min_items = schema.contains(key: "minItems") ? schema ["minItems"].get<int>() : `0`;
926	json max_items_json = schema.contains(key: "maxItems") ? schema ["maxItems"] : json ();
927	int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
928
929	return _add_rule(name: rule_name, rule: "\"[\" space " + build_repetition(item_rule: item_rule_name, min_items, max_items, separator_rule: "\",\" space") + " \"]\" space");
930	}
931	} else if ((schema_type.is_null() \|\| schema_type == "string") && schema.contains(key: "pattern")) {
932	return _visit_pattern(pattern: schema ["pattern"], name: rule_name);
933	} else if ((schema_type.is_null() \|\| schema_type == "string") && std::regex_match(s: schema_format, re: std::regex ("^uuid[1-5]?$"))) {
934	return _add_primitive(name: rule_name == "root" ? "root" : schema_format, rule: PRIMITIVE_RULES.at(k: "uuid"));
935	} else if ((schema_type.is_null() \|\| schema_type == "string") && STRING_FORMAT_RULES.find(x: schema_format + "-string") != STRING_FORMAT_RULES.end()) {
936	auto prim_name = schema_format + "-string";
937	return _add_rule(name: rule_name, rule: _add_primitive(name: prim_name, rule: STRING_FORMAT_RULES.at(k: prim_name)));
938	} else if (schema_type == "string" && (schema.contains(key: "minLength") \|\| schema.contains(key: "maxLength"))) {
939	std::string char_rule = _add_primitive(name: "char", rule: PRIMITIVE_RULES.at(k: "char"));
940	int min_len = schema.contains(key: "minLength") ? schema ["minLength"].get<int>() : `0`;
941	int max_len = schema.contains(key: "maxLength") ? schema ["maxLength"].get<int>() : std::numeric_limits<int>::max();
942	return _add_rule(name: rule_name, rule: "\"\\\"\" " + build_repetition(item_rule: char_rule, min_items: min_len, max_items: max_len) + " \"\\\"\" space");
943	} else if (schema_type == "integer" && (schema.contains(key: "minimum") \|\| schema.contains(key: "exclusiveMinimum") \|\| schema.contains(key: "maximum") \|\| schema.contains(key: "exclusiveMaximum"))) {
944	int64_t min_value = std::numeric_limits<int64_t>::min();
945	int64_t max_value = std::numeric_limits<int64_t>::max();
946	if (schema.contains(key: "minimum")) {
947	min_value = schema ["minimum"].get<int64_t>();
948	} else if (schema.contains(key: "exclusiveMinimum")) {
949	min_value = schema ["exclusiveMinimum"].get<int64_t>() + `1`;
950	}
951	if (schema.contains(key: "maximum")) {
952	max_value = schema ["maximum"].get<int64_t>();
953	} else if (schema.contains(key: "exclusiveMaximum")) {
954	max_value = schema ["exclusiveMaximum"].get<int64_t>() - `1`;
955	}
956	std::stringstream out;
957	out << "(";
958	_build_min_max_int(min_value, max_value, out);
959	out << ") space";
960	return _add_rule(name: rule_name, rule: out.str());
961	} else if (schema.empty() \|\| schema_type == "object") {
962	return _add_rule(name: rule_name, rule: _add_primitive(name: "object", rule: PRIMITIVE_RULES.at(k: "object")));
963	} else {
964	if (!schema_type.is_string() \|\| PRIMITIVE_RULES.find(x: schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
965	_errors.push_back(x: "Unrecognized schema: " + schema.dump());
966	return "";
967	}
968	// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
969	return _add_primitive(name: rule_name == "root" ? "root" : schema_type.get<std::string>(), rule: PRIMITIVE_RULES.at(k: schema_type.get<std::string>()));
970	}
971	}
972
973	void check_errors() {
974	if (!_errors.empty()) {
975	throw std::runtime_error ("JSON schema conversion failed:\n" + string_join(values: _errors, separator: "\n"));
976	}
977	if (!_warnings.empty()) {
978	fprintf(stderr, format: "WARNING: JSON schema conversion was incomplete: %s\n", string_join(values: _warnings, separator: "; ").c_str());
979	}
980	}
981
982	std::string format_grammar() {
983	std::stringstream ss;
984	for (const auto & kv : _rules) {
985	ss << kv.first << " ::= " << kv.second << std::endl;
986	}
987	return ss.str();
988	}
989	};
990
991	std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
992	#ifdef LLAMA_USE_LLGUIDANCE
993	if (!force_gbnf) {
994	return "%llguidance {}\nstart: %json " + schema.dump();
995	}
996	#else
997	(void)force_gbnf;
998	#endif // LLAMA_USE_LLGUIDANCE
999	return build_grammar(cb: [&](const common_grammar_builder & callbacks) {
1000	auto copy = schema;
1001	callbacks.resolve_refs (copy);
1002	callbacks.add_schema ("", copy);
1003	});
1004	}
1005
1006	std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
1007	SchemaConverter converter([&](const std::string &) { return json (); }, options.dotall);
1008	common_grammar_builder builder {
1009	/ .add_rule = / [&](const std::string & name, const std::string & rule) {
1010	return converter._add_rule(name, rule);
1011	},
1012	/ .add_schema = / [&](const std::string & name, const nlohmann::ordered_json & schema) {
1013	return converter.visit(schema, name: name == "root" ? "" : name);
1014	},
1015	/ .resolve_refs = / [&](nlohmann::ordered_json & schema) {
1016	converter.resolve_refs(schema, url: "");
1017	}
1018	};
1019	cb (builder);
1020	converter.check_errors();
1021	return converter.format_grammar();
1022	}
1023

Browse the source code of llama.cpp/common/json-schema-to-grammar.cpp