1 | |
2 | #ifndef SIMDJSON_SERIALIZATION_INL_H |
3 | #define SIMDJSON_SERIALIZATION_INL_H |
4 | |
5 | #include "simdjson/dom/serialization.h" |
6 | |
7 | #include <cinttypes> |
8 | #include <type_traits> |
9 | |
10 | namespace simdjson { |
11 | namespace dom { |
12 | inline bool parser::print_json(std::ostream &os) const noexcept { |
13 | if (!valid) { return false; } |
14 | simdjson::internal::string_builder<> sb; |
15 | sb.append(value: doc.root()); |
16 | std::string_view answer = sb.str(); |
17 | os << answer; |
18 | return true; |
19 | } |
20 | } |
21 | /*** |
22 | * Number utility functions |
23 | **/ |
24 | |
25 | |
26 | namespace { |
/**@private
 * A precomputed JSON escape sequence such as \b or \u0001.
 *
 * `length` is the number of meaningful characters in `string` (2 for the
 * short escapes, 6 for the \uXXXX form). One length byte plus seven
 * characters is expected to occupy 8 bytes with most compilers.
 **/
struct escape_sequence {
  uint8_t length;
  // Six characters are the most we ever store; the seventh slot holds the
  // terminating null and rounds the structure up to 8 bytes.
  const char string[7];
};
/**@private
 * Write the decimal representation of a signed 64-bit integer.
 *
 * The caller must supply room for at least 20 characters (a '-' sign plus
 * up to 19 digits). No null terminator is written.
 *
 * Runtime libraries often ship an itoa, but it is not standard C++, and
 * the C++17 to_chars functions are unavailable because we support C++11.
 *
 * @param output destination for the characters.
 * @param value  integer to convert.
 * @return pointer one past the last character written.
 */
char *fast_itoa(char *output, int64_t value) noexcept {
  uint64_t magnitude;
  if (value >= 0) {
    magnitude = uint64_t(value);
  } else {
    *output++ = '-';
    // Negating a signed integer directly is unsafe for the minimum value.
    // Unsigned arithmetic wraps modulo 2^64, so subtracting from zero
    // yields the absolute value for every negative input.
    magnitude = uint64_t(0) - uint64_t(value);
  }
  // Digits are produced least-significant first, so they are written
  // backwards into a scratch area and copied out in one go. Working on an
  // unsigned quantity *might* also be easier on the optimizer.
  char digits[20];
  char *const digits_end = digits + 20;
  char *cursor = digits_end;
  // A faster approach exists for large integers (process two or three
  // digits per step with a lookup table), but this is the classic loop.
  do {
    *--cursor = char('0' + (magnitude % 10));
    magnitude /= 10;
  } while (magnitude != 0);
  const size_t len = size_t(digits_end - cursor);
  std::memcpy(output, cursor, len);
  return output + len;
}
/**@private
 * This converts an unsigned integer into a character sequence.
 * The caller is responsible for providing enough memory (at least
 * 20 characters: the largest 64-bit value, 18446744073709551615,
 * has 20 digits). No null terminator is written.
 * Though various runtime libraries provide itoa functions,
 * it is not part of the C++ standard. The C++17 standard
 * adds the to_chars functions which would do as well, but
 * we want to support C++11.
 *
 * @param output destination for the characters.
 * @param value  integer to convert.
 * @return pointer one past the last character written.
 */
char *fast_itoa(char *output, uint64_t value) noexcept {
  // This is a standard implementation of itoa: digits come out
  // least-significant first, so we fill a scratch buffer from the back.
  char buffer[20];
  const char *const end_buffer = buffer + 20;
  char *write_pointer = buffer + 19;
  // A faster approach is possible if we expect large integers:
  // unroll the loop (work in 100s, 1000s) and use some kind of
  // memoization.
  while (value >= 10) {
    *write_pointer-- = char('0' + (value % 10));
    value /= 10;
  }
  // The remaining value is a single digit (possibly zero).
  *write_pointer = char('0' + value);
  const size_t len = size_t(end_buffer - write_pointer);
  std::memcpy(output, write_pointer, len);
  return output + len;
}
103 | } // anonymous namespace |
104 | namespace internal { |
105 | |
106 | /*** |
107 | * Minifier/formatter code. |
108 | **/ |
109 | |
110 | simdjson_inline void mini_formatter::number(uint64_t x) { |
111 | char number_buffer[24]; |
112 | char *newp = fast_itoa(output: number_buffer, value: x); |
113 | buffer.insert(position: buffer.end(), first: number_buffer, last: newp); |
114 | } |
115 | |
116 | simdjson_inline void mini_formatter::number(int64_t x) { |
117 | char number_buffer[24]; |
118 | char *newp = fast_itoa(output: number_buffer, value: x); |
119 | buffer.insert(position: buffer.end(), first: number_buffer, last: newp); |
120 | } |
121 | |
122 | simdjson_inline void mini_formatter::number(double x) { |
123 | char number_buffer[24]; |
124 | // Currently, passing the nullptr to the second argument is |
125 | // safe because our implementation does not check the second |
126 | // argument. |
127 | char *newp = internal::to_chars(first: number_buffer, last: nullptr, value: x); |
128 | buffer.insert(position: buffer.end(), first: number_buffer, last: newp); |
129 | } |
130 | |
131 | simdjson_inline void mini_formatter::start_array() { one_char(c: '['); } |
132 | simdjson_inline void mini_formatter::end_array() { one_char(c: ']'); } |
133 | simdjson_inline void mini_formatter::start_object() { one_char(c: '{'); } |
134 | simdjson_inline void mini_formatter::end_object() { one_char(c: '}'); } |
135 | simdjson_inline void mini_formatter::comma() { one_char(c: ','); } |
136 | |
137 | |
138 | simdjson_inline void mini_formatter::true_atom() { |
139 | const char * s = "true" ; |
140 | buffer.insert(position: buffer.end(), first: s, last: s + 4); |
141 | } |
142 | simdjson_inline void mini_formatter::false_atom() { |
143 | const char * s = "false" ; |
144 | buffer.insert(position: buffer.end(), first: s, last: s + 5); |
145 | } |
146 | simdjson_inline void mini_formatter::null_atom() { |
147 | const char * s = "null" ; |
148 | buffer.insert(position: buffer.end(), first: s, last: s + 4); |
149 | } |
150 | simdjson_inline void mini_formatter::one_char(char c) { buffer.push_back(x: c); } |
151 | simdjson_inline void mini_formatter::key(std::string_view unescaped) { |
152 | string(unescaped); |
153 | one_char(c: ':'); |
154 | } |
155 | simdjson_inline void mini_formatter::string(std::string_view unescaped) { |
156 | one_char(c: '\"'); |
157 | size_t i = 0; |
158 | // Fast path for the case where we have no control character, no ", and no backslash. |
159 | // This should include most keys. |
160 | // |
161 | // We would like to use 'bool' but some compilers take offense to bitwise operation |
162 | // with bool types. |
163 | constexpr static char needs_escaping[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
164 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, |
165 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
166 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, |
167 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
168 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
169 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
170 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
171 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
172 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
173 | for(;i + 8 <= unescaped.length(); i += 8) { |
174 | // Poor's man vectorization. This could get much faster if we used SIMD. |
175 | // |
176 | // It is not the case that replacing '|' with '||' would be neutral performance-wise. |
177 | if(needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i+1])] |
178 | | needs_escaping[uint8_t(unescaped[i+2])] | needs_escaping[uint8_t(unescaped[i+3])] |
179 | | needs_escaping[uint8_t(unescaped[i+4])] | needs_escaping[uint8_t(unescaped[i+5])] |
180 | | needs_escaping[uint8_t(unescaped[i+6])] | needs_escaping[uint8_t(unescaped[i+7])] |
181 | ) { break; } |
182 | } |
183 | for(;i < unescaped.length(); i++) { |
184 | if(needs_escaping[uint8_t(unescaped[i])]) { break; } |
185 | } |
186 | // The following is also possible and omits a 256-byte table, but it is slower: |
187 | // for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F) |
188 | // && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {} |
189 | |
190 | // At least for long strings, the following should be fast. We could |
191 | // do better by integrating the checks and the insertion. |
192 | buffer.insert(position: buffer.end(), first: unescaped.data(), last: unescaped.data() + i); |
193 | // We caught a control character if we enter this loop (slow). |
194 | // Note that we are do not restart from the beginning, but rather we continue |
195 | // from the point where we encountered something that requires escaping. |
196 | for (; i < unescaped.length(); i++) { |
197 | switch (unescaped[i]) { |
198 | case '\"': |
199 | { |
200 | const char * s = "\\\"" ; |
201 | buffer.insert(position: buffer.end(), first: s, last: s + 2); |
202 | } |
203 | break; |
204 | case '\\': |
205 | { |
206 | const char * s = "\\\\" ; |
207 | buffer.insert(position: buffer.end(), first: s, last: s + 2); |
208 | } |
209 | break; |
210 | default: |
211 | if (uint8_t(unescaped[i]) <= 0x1F) { |
212 | // If packed, this uses 8 * 32 bytes. |
213 | // Note that we expect most compilers to embed this code in the data |
214 | // section. |
215 | constexpr static escape_sequence escaped[32] = { |
216 | {.length: 6, .string: "\\u0000" }, {.length: 6, .string: "\\u0001" }, {.length: 6, .string: "\\u0002" }, {.length: 6, .string: "\\u0003" }, |
217 | {.length: 6, .string: "\\u0004" }, {.length: 6, .string: "\\u0005" }, {.length: 6, .string: "\\u0006" }, {.length: 6, .string: "\\u0007" }, |
218 | {.length: 2, .string: "\\b" }, {.length: 2, .string: "\\t" }, {.length: 2, .string: "\\n" }, {.length: 6, .string: "\\u000b" }, |
219 | {.length: 2, .string: "\\f" }, {.length: 2, .string: "\\r" }, {.length: 6, .string: "\\u000e" }, {.length: 6, .string: "\\u000f" }, |
220 | {.length: 6, .string: "\\u0010" }, {.length: 6, .string: "\\u0011" }, {.length: 6, .string: "\\u0012" }, {.length: 6, .string: "\\u0013" }, |
221 | {.length: 6, .string: "\\u0014" }, {.length: 6, .string: "\\u0015" }, {.length: 6, .string: "\\u0016" }, {.length: 6, .string: "\\u0017" }, |
222 | {.length: 6, .string: "\\u0018" }, {.length: 6, .string: "\\u0019" }, {.length: 6, .string: "\\u001a" }, {.length: 6, .string: "\\u001b" }, |
223 | {.length: 6, .string: "\\u001c" }, {.length: 6, .string: "\\u001d" }, {.length: 6, .string: "\\u001e" }, {.length: 6, .string: "\\u001f" }}; |
224 | auto u = escaped[uint8_t(unescaped[i])]; |
225 | buffer.insert(position: buffer.end(), first: u.string, last: u.string + u.length); |
226 | } else { |
227 | one_char(c: unescaped[i]); |
228 | } |
229 | } // switch |
230 | } // for |
231 | one_char(c: '\"'); |
232 | } |
233 | |
234 | inline void mini_formatter::clear() { |
235 | buffer.clear(); |
236 | } |
237 | |
238 | simdjson_inline std::string_view mini_formatter::str() const { |
239 | return std::string_view(buffer.data(), buffer.size()); |
240 | } |
241 | |
242 | |
243 | /*** |
244 | * String building code. |
245 | **/ |
246 | |
247 | template <class serializer> |
248 | inline void string_builder<serializer>::append(simdjson::dom::element value) { |
249 | // using tape_type = simdjson::internal::tape_type; |
250 | size_t depth = 0; |
251 | constexpr size_t MAX_DEPTH = 16; |
252 | bool is_object[MAX_DEPTH]; |
253 | is_object[0] = false; |
254 | bool after_value = false; |
255 | |
256 | internal::tape_ref iter(value.tape); |
257 | do { |
258 | // print commas after each value |
259 | if (after_value) { |
260 | format.comma(); |
261 | } |
262 | // If we are in an object, print the next key and :, and skip to the next |
263 | // value. |
264 | if (is_object[depth]) { |
265 | format.key(iter.get_string_view()); |
266 | iter.json_index++; |
267 | } |
268 | switch (iter.tape_ref_type()) { |
269 | |
270 | // Arrays |
271 | case tape_type::START_ARRAY: { |
272 | // If we're too deep, we need to recurse to go deeper. |
273 | depth++; |
274 | if (simdjson_unlikely(depth >= MAX_DEPTH)) { |
275 | append(simdjson::dom::array(iter)); |
276 | iter.json_index = iter.matching_brace_index() - 1; // Jump to the ] |
277 | depth--; |
278 | break; |
279 | } |
280 | |
281 | // Output start [ |
282 | format.start_array(); |
283 | iter.json_index++; |
284 | |
285 | // Handle empty [] (we don't want to come back around and print commas) |
286 | if (iter.tape_ref_type() == tape_type::END_ARRAY) { |
287 | format.end_array(); |
288 | depth--; |
289 | break; |
290 | } |
291 | |
292 | is_object[depth] = false; |
293 | after_value = false; |
294 | continue; |
295 | } |
296 | |
297 | // Objects |
298 | case tape_type::START_OBJECT: { |
299 | // If we're too deep, we need to recurse to go deeper. |
300 | depth++; |
301 | if (simdjson_unlikely(depth >= MAX_DEPTH)) { |
302 | append(simdjson::dom::object(iter)); |
303 | iter.json_index = iter.matching_brace_index() - 1; // Jump to the } |
304 | depth--; |
305 | break; |
306 | } |
307 | |
308 | // Output start { |
309 | format.start_object(); |
310 | iter.json_index++; |
311 | |
312 | // Handle empty {} (we don't want to come back around and print commas) |
313 | if (iter.tape_ref_type() == tape_type::END_OBJECT) { |
314 | format.end_object(); |
315 | depth--; |
316 | break; |
317 | } |
318 | |
319 | is_object[depth] = true; |
320 | after_value = false; |
321 | continue; |
322 | } |
323 | |
324 | // Scalars |
325 | case tape_type::STRING: |
326 | format.string(iter.get_string_view()); |
327 | break; |
328 | case tape_type::INT64: |
329 | format.number(iter.next_tape_value<int64_t>()); |
330 | iter.json_index++; // numbers take up 2 spots, so we need to increment |
331 | // extra |
332 | break; |
333 | case tape_type::UINT64: |
334 | format.number(iter.next_tape_value<uint64_t>()); |
335 | iter.json_index++; // numbers take up 2 spots, so we need to increment |
336 | // extra |
337 | break; |
338 | case tape_type::DOUBLE: |
339 | format.number(iter.next_tape_value<double>()); |
340 | iter.json_index++; // numbers take up 2 spots, so we need to increment |
341 | // extra |
342 | break; |
343 | case tape_type::TRUE_VALUE: |
344 | format.true_atom(); |
345 | break; |
346 | case tape_type::FALSE_VALUE: |
347 | format.false_atom(); |
348 | break; |
349 | case tape_type::NULL_VALUE: |
350 | format.null_atom(); |
351 | break; |
352 | |
353 | // These are impossible |
354 | case tape_type::END_ARRAY: |
355 | case tape_type::END_OBJECT: |
356 | case tape_type::ROOT: |
357 | SIMDJSON_UNREACHABLE(); |
358 | } |
359 | iter.json_index++; |
360 | after_value = true; |
361 | |
362 | // Handle multiple ends in a row |
363 | while (depth != 0 && (iter.tape_ref_type() == tape_type::END_ARRAY || |
364 | iter.tape_ref_type() == tape_type::END_OBJECT)) { |
365 | if (iter.tape_ref_type() == tape_type::END_ARRAY) { |
366 | format.end_array(); |
367 | } else { |
368 | format.end_object(); |
369 | } |
370 | depth--; |
371 | iter.json_index++; |
372 | } |
373 | |
374 | // Stop when we're at depth 0 |
375 | } while (depth != 0); |
376 | } |
377 | |
378 | template <class serializer> |
379 | inline void string_builder<serializer>::append(simdjson::dom::object value) { |
380 | format.start_object(); |
381 | auto pair = value.begin(); |
382 | auto end = value.end(); |
383 | if (pair != end) { |
384 | append(*pair); |
385 | for (++pair; pair != end; ++pair) { |
386 | format.comma(); |
387 | append(*pair); |
388 | } |
389 | } |
390 | format.end_object(); |
391 | } |
392 | |
393 | template <class serializer> |
394 | inline void string_builder<serializer>::append(simdjson::dom::array value) { |
395 | format.start_array(); |
396 | auto iter = value.begin(); |
397 | auto end = value.end(); |
398 | if (iter != end) { |
399 | append(*iter); |
400 | for (++iter; iter != end; ++iter) { |
401 | format.comma(); |
402 | append(*iter); |
403 | } |
404 | } |
405 | format.end_array(); |
406 | } |
407 | |
408 | template <class serializer> |
409 | simdjson_inline void string_builder<serializer>::append(simdjson::dom::key_value_pair kv) { |
410 | format.key(kv.key); |
411 | append(kv.value); |
412 | } |
413 | |
414 | template <class serializer> |
415 | simdjson_inline void string_builder<serializer>::clear() { |
416 | format.clear(); |
417 | } |
418 | |
419 | template <class serializer> |
420 | simdjson_inline std::string_view string_builder<serializer>::str() const { |
421 | return format.str(); |
422 | } |
423 | |
424 | |
425 | } // namespace internal |
426 | } // namespace simdjson |
427 | |
428 | #endif |
429 | |