serialization-inl.h source code [Velox/build/_deps/simdjson-src/include/simdjson/dom/serialization-inl.h]

1
2	#ifndef SIMDJSON_SERIALIZATION_INL_H
3	#define SIMDJSON_SERIALIZATION_INL_H
4
5	#include "simdjson/dom/serialization.h"
6
7	#include <cinttypes>
8	#include <type_traits>
9
10	namespace simdjson {
11	namespace dom {
12	inline bool parser::print_json(std::ostream &os) const noexcept {
13	if (!valid) { return false; }
14	simdjson::internal::string_builder<> sb;
15	sb.append(value: doc.root());
16	std::string_view answer = sb.str();
17	os << answer;
18	return true;
19	}
20	}
21	/***
22	* Number utility functions
23	**/
24
25
26	namespace {
/**@private
 * Escape sequence like \b or \u0001
 *
 * `length` is the number of meaningful characters stored in `string`.
 * We expect that most compilers will use 8 bytes for this data structure.
 **/
struct escape_sequence {
  uint8_t length;
  const char string[7]; // technically, we only ever need 6 characters, we pad to 8
};
/**@private
 * This converts a signed integer into a character sequence.
 * The caller is responsible for providing enough memory (at least
 * 20 characters: an optional '-' followed by up to 19 digits).
 * No null terminator is written.
 * Though various runtime libraries provide itoa functions,
 * it is not part of the C++ standard. The C++17 standard
 * adds the to_chars functions which would do as well, but
 * we want to support C++11.
 *
 * @param output destination buffer.
 * @param value  integer to convert.
 * @return pointer one past the last character written to `output`.
 */
char *fast_itoa(char *output, int64_t value) noexcept {
  // This is a standard implementation of itoa.
  char buffer[20];
  uint64_t value_positive;
  // In general, negating a signed integer is unsafe.
  if (value < 0) {
    *output++ = '-';
    // Doing value_positive = -value; while avoiding
    // undefined behavior warnings.
    // It assumes two's complement which is universal at this
    // point in time.
    std::memcpy(&value_positive, &value, sizeof(value));
    value_positive = (~value_positive) + 1; // this is a negation
  } else {
    value_positive = uint64_t(value);
  }
  // We work solely with value_positive. It *might* be easier
  // for an optimizing compiler to deal with an unsigned variable
  // as far as performance goes.
  const char *const end_buffer = buffer + 20;
  char *write_pointer = buffer + 19;
  // Digits are produced least-significant first, so the scratch buffer is
  // filled from its last slot backwards.
  // A faster approach is possible if we expect large integers:
  // unroll the loop (work in 100s, 1000s) and use some kind of
  // memoization.
  while (value_positive >= 10) {
    *write_pointer-- = char('0' + (value_positive % 10));
    value_positive /= 10;
  }
  *write_pointer = char('0' + value_positive);
  size_t len = size_t(end_buffer - write_pointer);
  std::memcpy(output, write_pointer, len);
  return output + len;
}
/**@private
 * This converts an unsigned integer into a character sequence.
 * The caller is responsible for providing enough memory (at least
 * 20 characters: the largest 64-bit value, 18446744073709551615,
 * has 20 digits). No null terminator is written.
 * Though various runtime libraries provide itoa functions,
 * it is not part of the C++ standard. The C++17 standard
 * adds the to_chars functions which would do as well, but
 * we want to support C++11.
 *
 * @param output destination buffer.
 * @param value  integer to convert.
 * @return pointer one past the last character written to `output`.
 */
char *fast_itoa(char *output, uint64_t value) noexcept {
  // This is a standard implementation of itoa.
  char buffer[20];
  const char *const end_buffer = buffer + 20;
  char *write_pointer = buffer + 19;
  // Digits are produced least-significant first, so the scratch buffer is
  // filled from its last slot backwards.
  // A faster approach is possible if we expect large integers:
  // unroll the loop (work in 100s, 1000s) and use some kind of
  // memoization.
  while (value >= 10) {
    *write_pointer-- = char('0' + (value % 10));
    value /= 10;
  }
  *write_pointer = char('0' + value);
  size_t len = size_t(end_buffer - write_pointer);
  std::memcpy(output, write_pointer, len);
  return output + len;
}
103	} // anonymous namespace
104	namespace internal {
105
106	/***
107	* Minifier/formatter code.
108	**/
109
110	simdjson_inline void mini_formatter::number(uint64_t x) {
111	char number_buffer[`24`];
112	char *newp = fast_itoa(output: number_buffer, value: x);
113	buffer.insert(position: buffer.end(), first: number_buffer, last: newp);
114	}
115
116	simdjson_inline void mini_formatter::number(int64_t x) {
117	char number_buffer[`24`];
118	char *newp = fast_itoa(output: number_buffer, value: x);
119	buffer.insert(position: buffer.end(), first: number_buffer, last: newp);
120	}
121
122	simdjson_inline void mini_formatter::number(double x) {
123	char number_buffer[`24`];
124	// Currently, passing the nullptr to the second argument is
125	// safe because our implementation does not check the second
126	// argument.
127	char newp = internal::to_chars(first: number_buffer, last: nullptr*, value: x);
128	buffer.insert(position: buffer.end(), first: number_buffer, last: newp);
129	}
130
131	simdjson_inline void mini_formatter::start_array() { one_char(c: `'['`); }
132	simdjson_inline void mini_formatter::end_array() { one_char(c: `']'`); }
133	simdjson_inline void mini_formatter::start_object() { one_char(c: `'{'`); }
134	simdjson_inline void mini_formatter::end_object() { one_char(c: `'}'`); }
135	simdjson_inline void mini_formatter::comma() { one_char(c: `','`); }
136
137
138	simdjson_inline void mini_formatter::true_atom() {
139	const char * s = "true";
140	buffer.insert(position: buffer.end(), first: s, last: s + `4`);
141	}
142	simdjson_inline void mini_formatter::false_atom() {
143	const char * s = "false";
144	buffer.insert(position: buffer.end(), first: s, last: s + `5`);
145	}
146	simdjson_inline void mini_formatter::null_atom() {
147	const char * s = "null";
148	buffer.insert(position: buffer.end(), first: s, last: s + `4`);
149	}
150	simdjson_inline void mini_formatter::one_char(char c) { buffer.push_back(x: c); }
151	simdjson_inline void mini_formatter::key(std::string_view unescaped) {
152	string(unescaped);
153	one_char(c: `':'`);
154	}
155	simdjson_inline void mini_formatter::string(std::string_view unescaped) {
156	one_char(c: `'\"'`);
157	size_t i = `0`;
158	// Fast path for the case where we have no control character, no ", and no backslash.
159	// This should include most keys.
160	//
161	// We would like to use 'bool' but some compilers take offense to bitwise operation
162	// with bool types.
163	constexpr static char needs_escaping[] = {`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
164	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
165	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
166	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `1`, `0`, `0`, `0`, `0`, `0`, `0`,
167	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
168	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
169	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
170	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
171	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
172	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`};
173	for(;i + `8` <= unescaped.length(); i += `8`) {
174	// Poor's man vectorization. This could get much faster if we used SIMD.
175	//
176	// It is not the case that replacing '\|' with '\|\|' would be neutral performance-wise.
177	if(needs_escaping[uint8_t(unescaped [i])] \| needs_escaping[uint8_t(unescaped [i+`1`])]
178	\| needs_escaping[uint8_t(unescaped [i+`2`])] \| needs_escaping[uint8_t(unescaped [i+`3`])]
179	\| needs_escaping[uint8_t(unescaped [i+`4`])] \| needs_escaping[uint8_t(unescaped [i+`5`])]
180	\| needs_escaping[uint8_t(unescaped [i+`6`])] \| needs_escaping[uint8_t(unescaped [i+`7`])]
181	) { break; }
182	}
183	for(;i < unescaped.length(); i++) {
184	if(needs_escaping[uint8_t(unescaped [i])]) { break; }
185	}
186	// The following is also possible and omits a 256-byte table, but it is slower:
187	// for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F)
188	// && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {}
189
190	// At least for long strings, the following should be fast. We could
191	// do better by integrating the checks and the insertion.
192	buffer.insert(position: buffer.end(), first: unescaped.data(), last: unescaped.data() + i);
193	// We caught a control character if we enter this loop (slow).
194	// Note that we are do not restart from the beginning, but rather we continue
195	// from the point where we encountered something that requires escaping.
196	for (; i < unescaped.length(); i++) {
197	switch (unescaped [i]) {
198	case `'\"'`:
199	{
200	const char * s = "\\\"";
201	buffer.insert(position: buffer.end(), first: s, last: s + `2`);
202	}
203	break;
204	case `'\\'`:
205	{
206	const char * s = "\\\\";
207	buffer.insert(position: buffer.end(), first: s, last: s + `2`);
208	}
209	break;
210	default:
211	if (uint8_t(unescaped [i]) <= `0x1F`) {
212	// If packed, this uses 8 32 bytes.*
213	// Note that we expect most compilers to embed this code in the data
214	// section.
215	constexpr static escape_sequence escaped[`32`] = {
216	{.length: `6`, .string: "\\u0000"}, {.length: `6`, .string: "\\u0001"}, {.length: `6`, .string: "\\u0002"}, {.length: `6`, .string: "\\u0003"},
217	{.length: `6`, .string: "\\u0004"}, {.length: `6`, .string: "\\u0005"}, {.length: `6`, .string: "\\u0006"}, {.length: `6`, .string: "\\u0007"},
218	{.length: `2`, .string: "\\b"}, {.length: `2`, .string: "\\t"}, {.length: `2`, .string: "\\n"}, {.length: `6`, .string: "\\u000b"},
219	{.length: `2`, .string: "\\f"}, {.length: `2`, .string: "\\r"}, {.length: `6`, .string: "\\u000e"}, {.length: `6`, .string: "\\u000f"},
220	{.length: `6`, .string: "\\u0010"}, {.length: `6`, .string: "\\u0011"}, {.length: `6`, .string: "\\u0012"}, {.length: `6`, .string: "\\u0013"},
221	{.length: `6`, .string: "\\u0014"}, {.length: `6`, .string: "\\u0015"}, {.length: `6`, .string: "\\u0016"}, {.length: `6`, .string: "\\u0017"},
222	{.length: `6`, .string: "\\u0018"}, {.length: `6`, .string: "\\u0019"}, {.length: `6`, .string: "\\u001a"}, {.length: `6`, .string: "\\u001b"},
223	{.length: `6`, .string: "\\u001c"}, {.length: `6`, .string: "\\u001d"}, {.length: `6`, .string: "\\u001e"}, {.length: `6`, .string: "\\u001f"}};
224	auto u = escaped[uint8_t(unescaped [i])];
225	buffer.insert(position: buffer.end(), first: u.string, last: u.string + u.length);
226	} else {
227	one_char(c: unescaped [i]);
228	}
229	} // switch
230	} // for
231	one_char(c: `'\"'`);
232	}
233
234	inline void mini_formatter::clear() {
235	buffer.clear();
236	}
237
238	simdjson_inline std::string_view mini_formatter::str() const {
239	return std::string_view (buffer.data(), buffer.size());
240	}
241
242
243	/***
244	* String building code.
245	**/
246
247	template <class serializer>
248	inline void string_builder<serializer>::append(simdjson::dom::element value) {
249	// using tape_type = simdjson::internal::tape_type;
250	size_t depth = `0`;
251	constexpr size_t MAX_DEPTH = `16`;
252	bool is_object[MAX_DEPTH];
253	is_object[`0`] = false;
254	bool after_value = false;
255
256	internal::tape_ref iter(value.tape);
257	do {
258	// print commas after each value
259	if (after_value) {
260	format.comma();
261	}
262	// If we are in an object, print the next key and :, and skip to the next
263	// value.
264	if (is_object[depth]) {
265	format.key(iter.get_string_view());
266	iter.json_index++;
267	}
268	switch (iter.tape_ref_type()) {
269
270	// Arrays
271	case tape_type::START_ARRAY: {
272	// If we're too deep, we need to recurse to go deeper.
273	depth++;
274	if (simdjson_unlikely(depth >= MAX_DEPTH)) {
275	append(simdjson::dom::array (iter));
276	iter.json_index = iter.matching_brace_index() - `1`; // Jump to the ]
277	depth--;
278	break;
279	}
280
281	// Output start [
282	format.start_array();
283	iter.json_index++;
284
285	// Handle empty [] (we don't want to come back around and print commas)
286	if (iter.tape_ref_type() == tape_type::END_ARRAY) {
287	format.end_array();
288	depth--;
289	break;
290	}
291
292	is_object[depth] = false;
293	after_value = false;
294	continue;
295	}
296
297	// Objects
298	case tape_type::START_OBJECT: {
299	// If we're too deep, we need to recurse to go deeper.
300	depth++;
301	if (simdjson_unlikely(depth >= MAX_DEPTH)) {
302	append(simdjson::dom::object (iter));
303	iter.json_index = iter.matching_brace_index() - `1`; // Jump to the }
304	depth--;
305	break;
306	}
307
308	// Output start {
309	format.start_object();
310	iter.json_index++;
311
312	// Handle empty {} (we don't want to come back around and print commas)
313	if (iter.tape_ref_type() == tape_type::END_OBJECT) {
314	format.end_object();
315	depth--;
316	break;
317	}
318
319	is_object[depth] = true;
320	after_value = false;
321	continue;
322	}
323
324	// Scalars
325	case tape_type::STRING:
326	format.string(iter.get_string_view());
327	break;
328	case tape_type::INT64:
329	format.number(iter.next_tape_value<int64_t>());
330	iter.json_index++; // numbers take up 2 spots, so we need to increment
331	// extra
332	break;
333	case tape_type::UINT64:
334	format.number(iter.next_tape_value<uint64_t>());
335	iter.json_index++; // numbers take up 2 spots, so we need to increment
336	// extra
337	break;
338	case tape_type::DOUBLE:
339	format.number(iter.next_tape_value<double>());
340	iter.json_index++; // numbers take up 2 spots, so we need to increment
341	// extra
342	break;
343	case tape_type::TRUE_VALUE:
344	format.true_atom();
345	break;
346	case tape_type::FALSE_VALUE:
347	format.false_atom();
348	break;
349	case tape_type::NULL_VALUE:
350	format.null_atom();
351	break;
352
353	// These are impossible
354	case tape_type::END_ARRAY:
355	case tape_type::END_OBJECT:
356	case tape_type::ROOT:
357	SIMDJSON_UNREACHABLE();
358	}
359	iter.json_index++;
360	after_value = true;
361
362	// Handle multiple ends in a row
363	while (depth != `0` && (iter.tape_ref_type() == tape_type::END_ARRAY \|\|
364	iter.tape_ref_type() == tape_type::END_OBJECT)) {
365	if (iter.tape_ref_type() == tape_type::END_ARRAY) {
366	format.end_array();
367	} else {
368	format.end_object();
369	}
370	depth--;
371	iter.json_index++;
372	}
373
374	// Stop when we're at depth 0
375	} while (depth != `0`);
376	}
377
378	template <class serializer>
379	inline void string_builder<serializer>::append(simdjson::dom::object value) {
380	format.start_object();
381	auto pair = value.begin();
382	auto end = value.end();
383	if (pair != end) {
384	append(*pair);
385	for (++pair; pair != end; ++pair) {
386	format.comma();
387	append(*pair);
388	}
389	}
390	format.end_object();
391	}
392
393	template <class serializer>
394	inline void string_builder<serializer>::append(simdjson::dom::array value) {
395	format.start_array();
396	auto iter = value.begin();
397	auto end = value.end();
398	if (iter != end) {
399	append(*iter);
400	for (++iter; iter != end; ++iter) {
401	format.comma();
402	append(*iter);
403	}
404	}
405	format.end_array();
406	}
407
408	template <class serializer>
409	simdjson_inline void string_builder<serializer>::append(simdjson::dom::key_value_pair kv) {
410	format.key(kv.key);
411	append(kv.value);
412	}
413
414	template <class serializer>
415	simdjson_inline void string_builder<serializer>::clear() {
416	format.clear();
417	}
418
419	template <class serializer>
420	simdjson_inline std::string_view string_builder<serializer>::str() const {
421	return format.str();
422	}
423
424
425	} // namespace internal
426	} // namespace simdjson
427
428	#endif
429

Browse the source code of Velox/build/_deps/simdjson-src/include/simdjson/dom/serialization-inl.h