1
2#ifndef SIMDJSON_SERIALIZATION_INL_H
3#define SIMDJSON_SERIALIZATION_INL_H
4
5#include "simdjson/dom/serialization.h"
6
7#include <cinttypes>
8#include <type_traits>
9
10namespace simdjson {
11namespace dom {
12inline bool parser::print_json(std::ostream &os) const noexcept {
13 if (!valid) { return false; }
14 simdjson::internal::string_builder<> sb;
15 sb.append(value: doc.root());
16 std::string_view answer = sb.str();
17 os << answer;
18 return true;
19}
20}
21/***
22 * Number utility functions
23 **/
24
25
26namespace {
/**@private
 * A pre-rendered JSON escape sequence such as \b or \u0001.
 *
 * `length` is the number of leading characters of `string` that make up the
 * sequence; `string` holds those characters (at most six are ever needed,
 * the seventh slot leaves room for the terminating NUL of a string literal).
 * We expect that most compilers will use 8 bytes for this data structure.
 **/
struct escape_sequence {
  uint8_t length; // number of characters of `string` to emit
  const char string[7]; // technically, we only ever need 6 characters, we pad to 8
};
/**@private
 * Converts a signed 64-bit integer into its decimal character sequence.
 *
 * The caller is responsible for providing enough memory: at least 20
 * characters (a '-' sign plus the 19 digits of INT64_MIN). No terminating
 * NUL is written.
 *
 * Though various runtime libraries provide itoa functions, it is not part
 * of the C++ standard. The C++17 standard adds the to_chars functions which
 * would do as well, but we want to support C++11.
 *
 * @param output destination buffer (at least 20 bytes).
 * @param value  integer to render.
 * @return pointer one past the last character written.
 */
char *fast_itoa(char *output, int64_t value) noexcept {
  // This is a standard implementation of itoa.
  char buffer[20];
  uint64_t value_positive;
  if (value < 0) {
    *output++ = '-';
    // Negating a signed integer directly overflows for INT64_MIN. Converting
    // to uint64_t is defined as reduction modulo 2^64, and unsigned
    // subtraction wraps, so 0 - uint64_t(value) is exactly |value| for every
    // negative input, without relying on the object representation.
    value_positive = uint64_t(0) - static_cast<uint64_t>(value);
  } else {
    value_positive = static_cast<uint64_t>(value);
  }
  // We work solely with value_positive. It *might* be easier
  // for an optimizing compiler to deal with an unsigned variable
  // as far as performance goes.
  const char *const end_buffer = buffer + 20;
  char *write_pointer = buffer + 19;
  // Digits are produced least-significant first, so fill the scratch buffer
  // from its end towards its start.
  // A faster approach is possible if we expect large integers:
  // unroll the loop (work in 100s, 1000s) and use some kind of
  // memoization.
  while (value_positive >= 10) {
    *write_pointer-- = char('0' + (value_positive % 10));
    value_positive /= 10;
  }
  *write_pointer = char('0' + value_positive);
  const size_t len = size_t(end_buffer - write_pointer);
  std::memcpy(output, write_pointer, len);
  return output + len;
}
/**@private
 * Converts an unsigned 64-bit integer into its decimal character sequence.
 *
 * The caller is responsible for providing enough memory: at least 20
 * characters, since UINT64_MAX (18446744073709551615) has 20 digits. No
 * terminating NUL is written.
 *
 * Though various runtime libraries provide itoa functions, it is not part
 * of the C++ standard. The C++17 standard adds the to_chars functions which
 * would do as well, but we want to support C++11.
 *
 * @param output destination buffer (at least 20 bytes).
 * @param value  integer to render.
 * @return pointer one past the last character written.
 */
char *fast_itoa(char *output, uint64_t value) noexcept {
  // This is a standard implementation of itoa.
  char buffer[20];
  const char *const end_buffer = buffer + 20;
  char *write_pointer = buffer + 19;
  // Digits are produced least-significant first, so fill the scratch buffer
  // from its end towards its start.
  // A faster approach is possible if we expect large integers:
  // unroll the loop (work in 100s, 1000s) and use some kind of
  // memoization.
  while (value >= 10) {
    *write_pointer-- = char('0' + (value % 10));
    value /= 10;
  }
  *write_pointer = char('0' + value);
  const size_t len = size_t(end_buffer - write_pointer);
  std::memcpy(output, write_pointer, len);
  return output + len;
}
103} // anonymous namespace
104namespace internal {
105
106/***
107 * Minifier/formatter code.
108 **/
109
110simdjson_inline void mini_formatter::number(uint64_t x) {
111 char number_buffer[24];
112 char *newp = fast_itoa(output: number_buffer, value: x);
113 buffer.insert(position: buffer.end(), first: number_buffer, last: newp);
114}
115
116simdjson_inline void mini_formatter::number(int64_t x) {
117 char number_buffer[24];
118 char *newp = fast_itoa(output: number_buffer, value: x);
119 buffer.insert(position: buffer.end(), first: number_buffer, last: newp);
120}
121
122simdjson_inline void mini_formatter::number(double x) {
123 char number_buffer[24];
124 // Currently, passing the nullptr to the second argument is
125 // safe because our implementation does not check the second
126 // argument.
127 char *newp = internal::to_chars(first: number_buffer, last: nullptr, value: x);
128 buffer.insert(position: buffer.end(), first: number_buffer, last: newp);
129}
130
131simdjson_inline void mini_formatter::start_array() { one_char(c: '['); }
132simdjson_inline void mini_formatter::end_array() { one_char(c: ']'); }
133simdjson_inline void mini_formatter::start_object() { one_char(c: '{'); }
134simdjson_inline void mini_formatter::end_object() { one_char(c: '}'); }
135simdjson_inline void mini_formatter::comma() { one_char(c: ','); }
136
137
138simdjson_inline void mini_formatter::true_atom() {
139 const char * s = "true";
140 buffer.insert(position: buffer.end(), first: s, last: s + 4);
141}
142simdjson_inline void mini_formatter::false_atom() {
143 const char * s = "false";
144 buffer.insert(position: buffer.end(), first: s, last: s + 5);
145}
146simdjson_inline void mini_formatter::null_atom() {
147 const char * s = "null";
148 buffer.insert(position: buffer.end(), first: s, last: s + 4);
149}
150simdjson_inline void mini_formatter::one_char(char c) { buffer.push_back(x: c); }
151simdjson_inline void mini_formatter::key(std::string_view unescaped) {
152 string(unescaped);
153 one_char(c: ':');
154}
155simdjson_inline void mini_formatter::string(std::string_view unescaped) {
156 one_char(c: '\"');
157 size_t i = 0;
158 // Fast path for the case where we have no control character, no ", and no backslash.
159 // This should include most keys.
160 //
161 // We would like to use 'bool' but some compilers take offense to bitwise operation
162 // with bool types.
163 constexpr static char needs_escaping[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
164 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
173 for(;i + 8 <= unescaped.length(); i += 8) {
174 // Poor's man vectorization. This could get much faster if we used SIMD.
175 //
176 // It is not the case that replacing '|' with '||' would be neutral performance-wise.
177 if(needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i+1])]
178 | needs_escaping[uint8_t(unescaped[i+2])] | needs_escaping[uint8_t(unescaped[i+3])]
179 | needs_escaping[uint8_t(unescaped[i+4])] | needs_escaping[uint8_t(unescaped[i+5])]
180 | needs_escaping[uint8_t(unescaped[i+6])] | needs_escaping[uint8_t(unescaped[i+7])]
181 ) { break; }
182 }
183 for(;i < unescaped.length(); i++) {
184 if(needs_escaping[uint8_t(unescaped[i])]) { break; }
185 }
186 // The following is also possible and omits a 256-byte table, but it is slower:
187 // for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F)
188 // && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {}
189
190 // At least for long strings, the following should be fast. We could
191 // do better by integrating the checks and the insertion.
192 buffer.insert(position: buffer.end(), first: unescaped.data(), last: unescaped.data() + i);
193 // We caught a control character if we enter this loop (slow).
194 // Note that we are do not restart from the beginning, but rather we continue
195 // from the point where we encountered something that requires escaping.
196 for (; i < unescaped.length(); i++) {
197 switch (unescaped[i]) {
198 case '\"':
199 {
200 const char * s = "\\\"";
201 buffer.insert(position: buffer.end(), first: s, last: s + 2);
202 }
203 break;
204 case '\\':
205 {
206 const char * s = "\\\\";
207 buffer.insert(position: buffer.end(), first: s, last: s + 2);
208 }
209 break;
210 default:
211 if (uint8_t(unescaped[i]) <= 0x1F) {
212 // If packed, this uses 8 * 32 bytes.
213 // Note that we expect most compilers to embed this code in the data
214 // section.
215 constexpr static escape_sequence escaped[32] = {
216 {.length: 6, .string: "\\u0000"}, {.length: 6, .string: "\\u0001"}, {.length: 6, .string: "\\u0002"}, {.length: 6, .string: "\\u0003"},
217 {.length: 6, .string: "\\u0004"}, {.length: 6, .string: "\\u0005"}, {.length: 6, .string: "\\u0006"}, {.length: 6, .string: "\\u0007"},
218 {.length: 2, .string: "\\b"}, {.length: 2, .string: "\\t"}, {.length: 2, .string: "\\n"}, {.length: 6, .string: "\\u000b"},
219 {.length: 2, .string: "\\f"}, {.length: 2, .string: "\\r"}, {.length: 6, .string: "\\u000e"}, {.length: 6, .string: "\\u000f"},
220 {.length: 6, .string: "\\u0010"}, {.length: 6, .string: "\\u0011"}, {.length: 6, .string: "\\u0012"}, {.length: 6, .string: "\\u0013"},
221 {.length: 6, .string: "\\u0014"}, {.length: 6, .string: "\\u0015"}, {.length: 6, .string: "\\u0016"}, {.length: 6, .string: "\\u0017"},
222 {.length: 6, .string: "\\u0018"}, {.length: 6, .string: "\\u0019"}, {.length: 6, .string: "\\u001a"}, {.length: 6, .string: "\\u001b"},
223 {.length: 6, .string: "\\u001c"}, {.length: 6, .string: "\\u001d"}, {.length: 6, .string: "\\u001e"}, {.length: 6, .string: "\\u001f"}};
224 auto u = escaped[uint8_t(unescaped[i])];
225 buffer.insert(position: buffer.end(), first: u.string, last: u.string + u.length);
226 } else {
227 one_char(c: unescaped[i]);
228 }
229 } // switch
230 } // for
231 one_char(c: '\"');
232}
233
/** Discard everything accumulated in `buffer` so far. */
inline void mini_formatter::clear() {
  buffer.clear();
}
237
238simdjson_inline std::string_view mini_formatter::str() const {
239 return std::string_view(buffer.data(), buffer.size());
240}
241
242
243/***
244 * String building code.
245 **/
246
/**
 * Serialize a DOM element through `format`.
 *
 * The element's tape is walked iteratively. `depth` counts the containers
 * currently open in this call, and `is_object[d]` records whether the
 * container open at level d is an object (so a key must be printed before
 * each value). When nesting would reach MAX_DEPTH, the nested container is
 * handed to the array/object overload of append() instead, and the walk
 * resumes after its closing brace.
 */
template <class serializer>
inline void string_builder<serializer>::append(simdjson::dom::element value) {
  // using tape_type = simdjson::internal::tape_type;
  size_t depth = 0;
  constexpr size_t MAX_DEPTH = 16;
  bool is_object[MAX_DEPTH];
  // Level 0 is the element itself, which is never preceded by a key here.
  is_object[0] = false;
  // True once a value has been emitted in the current container, meaning the
  // next value must be preceded by a comma.
  bool after_value = false;

  internal::tape_ref iter(value.tape);
  do {
    // print commas after each value
    if (after_value) {
      format.comma();
    }
    // If we are in an object, print the next key and :, and skip to the next
    // value.
    if (is_object[depth]) {
      format.key(iter.get_string_view());
      iter.json_index++;
    }
    switch (iter.tape_ref_type()) {

    // Arrays
    case tape_type::START_ARRAY: {
      // If we're too deep, we need to recurse to go deeper.
      depth++;
      if (simdjson_unlikely(depth >= MAX_DEPTH)) {
        append(simdjson::dom::array(iter));
        // The shared `iter.json_index++` after the switch then steps past it.
        iter.json_index = iter.matching_brace_index() - 1; // Jump to the ]
        depth--;
        break;
      }

      // Output start [
      format.start_array();
      iter.json_index++;

      // Handle empty [] (we don't want to come back around and print commas)
      if (iter.tape_ref_type() == tape_type::END_ARRAY) {
        format.end_array();
        depth--;
        break;
      }

      // Non-empty array: loop again to print its first element, skipping the
      // post-switch bookkeeping so no comma is emitted before it.
      is_object[depth] = false;
      after_value = false;
      continue;
    }

    // Objects
    case tape_type::START_OBJECT: {
      // If we're too deep, we need to recurse to go deeper.
      depth++;
      if (simdjson_unlikely(depth >= MAX_DEPTH)) {
        append(simdjson::dom::object(iter));
        // The shared `iter.json_index++` after the switch then steps past it.
        iter.json_index = iter.matching_brace_index() - 1; // Jump to the }
        depth--;
        break;
      }

      // Output start {
      format.start_object();
      iter.json_index++;

      // Handle empty {} (we don't want to come back around and print commas)
      if (iter.tape_ref_type() == tape_type::END_OBJECT) {
        format.end_object();
        depth--;
        break;
      }

      // Non-empty object: loop again to print its first key/value, skipping
      // the post-switch bookkeeping so no comma is emitted before it.
      is_object[depth] = true;
      after_value = false;
      continue;
    }

    // Scalars
    case tape_type::STRING:
      format.string(iter.get_string_view());
      break;
    case tape_type::INT64:
      format.number(iter.next_tape_value<int64_t>());
      iter.json_index++; // numbers take up 2 spots, so we need to increment
                         // extra
      break;
    case tape_type::UINT64:
      format.number(iter.next_tape_value<uint64_t>());
      iter.json_index++; // numbers take up 2 spots, so we need to increment
                         // extra
      break;
    case tape_type::DOUBLE:
      format.number(iter.next_tape_value<double>());
      iter.json_index++; // numbers take up 2 spots, so we need to increment
                         // extra
      break;
    case tape_type::TRUE_VALUE:
      format.true_atom();
      break;
    case tape_type::FALSE_VALUE:
      format.false_atom();
      break;
    case tape_type::NULL_VALUE:
      format.null_atom();
      break;

    // These are impossible
    case tape_type::END_ARRAY:
    case tape_type::END_OBJECT:
    case tape_type::ROOT:
      SIMDJSON_UNREACHABLE();
    }
    // A complete value (scalar, empty container, or recursed container) has
    // just been written: step past it and remember that a comma is due.
    iter.json_index++;
    after_value = true;

    // Handle multiple ends in a row
    while (depth != 0 && (iter.tape_ref_type() == tape_type::END_ARRAY ||
                          iter.tape_ref_type() == tape_type::END_OBJECT)) {
      if (iter.tape_ref_type() == tape_type::END_ARRAY) {
        format.end_array();
      } else {
        format.end_object();
      }
      depth--;
      iter.json_index++;
    }

    // Stop when we're at depth 0
  } while (depth != 0);
}
377
378template <class serializer>
379inline void string_builder<serializer>::append(simdjson::dom::object value) {
380 format.start_object();
381 auto pair = value.begin();
382 auto end = value.end();
383 if (pair != end) {
384 append(*pair);
385 for (++pair; pair != end; ++pair) {
386 format.comma();
387 append(*pair);
388 }
389 }
390 format.end_object();
391}
392
393template <class serializer>
394inline void string_builder<serializer>::append(simdjson::dom::array value) {
395 format.start_array();
396 auto iter = value.begin();
397 auto end = value.end();
398 if (iter != end) {
399 append(*iter);
400 for (++iter; iter != end; ++iter) {
401 format.comma();
402 append(*iter);
403 }
404 }
405 format.end_array();
406}
407
/**
 * Serialize one object member: the key through format.key(), followed by
 * the member's value through append().
 */
template <class serializer>
simdjson_inline void string_builder<serializer>::append(simdjson::dom::key_value_pair kv) {
  format.key(kv.key);
  append(kv.value);
}
413
/** Reset the builder by clearing the underlying serializer (`format`). */
template <class serializer>
simdjson_inline void string_builder<serializer>::clear() {
  format.clear();
}
418
/** Return whatever text the underlying serializer (`format`) reports via str(). */
template <class serializer>
simdjson_inline std::string_view string_builder<serializer>::str() const {
  return format.str();
}
423
424
425} // namespace internal
426} // namespace simdjson
427
428#endif
429