| 1 | |
| 2 | #ifndef SIMDJSON_SERIALIZATION_INL_H |
| 3 | #define SIMDJSON_SERIALIZATION_INL_H |
| 4 | |
| 5 | #include "simdjson/dom/serialization.h" |
| 6 | |
| 7 | #include <cinttypes> |
| 8 | #include <type_traits> |
| 9 | |
| 10 | namespace simdjson { |
| 11 | namespace dom { |
| 12 | inline bool parser::print_json(std::ostream &os) const noexcept { |
| 13 | if (!valid) { return false; } |
| 14 | simdjson::internal::string_builder<> sb; |
| 15 | sb.append(value: doc.root()); |
| 16 | std::string_view answer = sb.str(); |
| 17 | os << answer; |
| 18 | return true; |
| 19 | } |
| 20 | } |
| 21 | /*** |
| 22 | * Number utility functions |
| 23 | **/ |
| 24 | |
| 25 | |
| 26 | namespace { |
| 27 | /**@private |
| 28 | * Escape sequence like \b or \u0001 |
| 29 | * We expect that most compilers will use 8 bytes for this data structure. |
| 30 | **/ |
| 31 | struct escape_sequence { |
| 32 | uint8_t length; |
| 33 | const char string[7]; // technically, we only ever need 6 characters, we pad to 8 |
| 34 | }; |
| 35 | /**@private |
| 36 | * This converts a signed integer into a character sequence. |
| 37 | * The caller is responsible for providing enough memory (at least |
| 38 | * 20 characters.) |
| 39 | * Though various runtime libraries provide itoa functions, |
| 40 | * it is not part of the C++ standard. The C++17 standard |
| 41 | * adds the to_chars functions which would do as well, but |
| 42 | * we want to support C++11. |
| 43 | */ |
| 44 | char *fast_itoa(char *output, int64_t value) noexcept { |
| 45 | // This is a standard implementation of itoa. |
| 46 | char buffer[20]; |
| 47 | uint64_t value_positive; |
| 48 | // In general, negating a signed integer is unsafe. |
| 49 | if(value < 0) { |
| 50 | *output++ = '-'; |
| 51 | // Doing value_positive = -value; while avoiding |
| 52 | // undefined behavior warnings. |
| 53 | // It assumes two complement's which is universal at this |
| 54 | // point in time. |
| 55 | std::memcpy(dest: &value_positive, src: &value, n: sizeof(value)); |
| 56 | value_positive = (~value_positive) + 1; // this is a negation |
| 57 | } else { |
| 58 | value_positive = value; |
| 59 | } |
| 60 | // We work solely with value_positive. It *might* be easier |
| 61 | // for an optimizing compiler to deal with an unsigned variable |
| 62 | // as far as performance goes. |
| 63 | const char *const end_buffer = buffer + 20; |
| 64 | char *write_pointer = buffer + 19; |
| 65 | // A faster approach is possible if we expect large integers: |
| 66 | // unroll the loop (work in 100s, 1000s) and use some kind of |
| 67 | // memoization. |
| 68 | while(value_positive >= 10) { |
| 69 | *write_pointer-- = char('0' + (value_positive % 10)); |
| 70 | value_positive /= 10; |
| 71 | } |
| 72 | *write_pointer = char('0' + value_positive); |
| 73 | size_t len = end_buffer - write_pointer; |
| 74 | std::memcpy(dest: output, src: write_pointer, n: len); |
| 75 | return output + len; |
| 76 | } |
| 77 | /**@private |
| 78 | * This converts an unsigned integer into a character sequence. |
| 79 | * The caller is responsible for providing enough memory (at least |
| 80 | * 19 characters.) |
| 81 | * Though various runtime libraries provide itoa functions, |
| 82 | * it is not part of the C++ standard. The C++17 standard |
| 83 | * adds the to_chars functions which would do as well, but |
| 84 | * we want to support C++11. |
| 85 | */ |
| 86 | char *fast_itoa(char *output, uint64_t value) noexcept { |
| 87 | // This is a standard implementation of itoa. |
| 88 | char buffer[20]; |
| 89 | const char *const end_buffer = buffer + 20; |
| 90 | char *write_pointer = buffer + 19; |
| 91 | // A faster approach is possible if we expect large integers: |
| 92 | // unroll the loop (work in 100s, 1000s) and use some kind of |
| 93 | // memoization. |
| 94 | while(value >= 10) { |
| 95 | *write_pointer-- = char('0' + (value % 10)); |
| 96 | value /= 10; |
| 97 | }; |
| 98 | *write_pointer = char('0' + value); |
| 99 | size_t len = end_buffer - write_pointer; |
| 100 | std::memcpy(dest: output, src: write_pointer, n: len); |
| 101 | return output + len; |
| 102 | } |
| 103 | } // anonymous namespace |
| 104 | namespace internal { |
| 105 | |
| 106 | /*** |
| 107 | * Minifier/formatter code. |
| 108 | **/ |
| 109 | |
| 110 | simdjson_inline void mini_formatter::number(uint64_t x) { |
| 111 | char number_buffer[24]; |
| 112 | char *newp = fast_itoa(output: number_buffer, value: x); |
| 113 | buffer.insert(position: buffer.end(), first: number_buffer, last: newp); |
| 114 | } |
| 115 | |
| 116 | simdjson_inline void mini_formatter::number(int64_t x) { |
| 117 | char number_buffer[24]; |
| 118 | char *newp = fast_itoa(output: number_buffer, value: x); |
| 119 | buffer.insert(position: buffer.end(), first: number_buffer, last: newp); |
| 120 | } |
| 121 | |
| 122 | simdjson_inline void mini_formatter::number(double x) { |
| 123 | char number_buffer[24]; |
| 124 | // Currently, passing the nullptr to the second argument is |
| 125 | // safe because our implementation does not check the second |
| 126 | // argument. |
| 127 | char *newp = internal::to_chars(first: number_buffer, last: nullptr, value: x); |
| 128 | buffer.insert(position: buffer.end(), first: number_buffer, last: newp); |
| 129 | } |
| 130 | |
| 131 | simdjson_inline void mini_formatter::start_array() { one_char(c: '['); } |
| 132 | simdjson_inline void mini_formatter::end_array() { one_char(c: ']'); } |
| 133 | simdjson_inline void mini_formatter::start_object() { one_char(c: '{'); } |
| 134 | simdjson_inline void mini_formatter::end_object() { one_char(c: '}'); } |
| 135 | simdjson_inline void mini_formatter::comma() { one_char(c: ','); } |
| 136 | |
| 137 | |
| 138 | simdjson_inline void mini_formatter::true_atom() { |
| 139 | const char * s = "true" ; |
| 140 | buffer.insert(position: buffer.end(), first: s, last: s + 4); |
| 141 | } |
| 142 | simdjson_inline void mini_formatter::false_atom() { |
| 143 | const char * s = "false" ; |
| 144 | buffer.insert(position: buffer.end(), first: s, last: s + 5); |
| 145 | } |
| 146 | simdjson_inline void mini_formatter::null_atom() { |
| 147 | const char * s = "null" ; |
| 148 | buffer.insert(position: buffer.end(), first: s, last: s + 4); |
| 149 | } |
| 150 | simdjson_inline void mini_formatter::one_char(char c) { buffer.push_back(x: c); } |
| 151 | simdjson_inline void mini_formatter::key(std::string_view unescaped) { |
| 152 | string(unescaped); |
| 153 | one_char(c: ':'); |
| 154 | } |
| 155 | simdjson_inline void mini_formatter::string(std::string_view unescaped) { |
| 156 | one_char(c: '\"'); |
| 157 | size_t i = 0; |
| 158 | // Fast path for the case where we have no control character, no ", and no backslash. |
| 159 | // This should include most keys. |
| 160 | // |
| 161 | // We would like to use 'bool' but some compilers take offense to bitwise operation |
| 162 | // with bool types. |
| 163 | constexpr static char needs_escaping[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 164 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, |
| 165 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 166 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, |
| 167 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 168 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 169 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 170 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 171 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 172 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
| 173 | for(;i + 8 <= unescaped.length(); i += 8) { |
| 174 | // Poor's man vectorization. This could get much faster if we used SIMD. |
| 175 | // |
| 176 | // It is not the case that replacing '|' with '||' would be neutral performance-wise. |
| 177 | if(needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i+1])] |
| 178 | | needs_escaping[uint8_t(unescaped[i+2])] | needs_escaping[uint8_t(unescaped[i+3])] |
| 179 | | needs_escaping[uint8_t(unescaped[i+4])] | needs_escaping[uint8_t(unescaped[i+5])] |
| 180 | | needs_escaping[uint8_t(unescaped[i+6])] | needs_escaping[uint8_t(unescaped[i+7])] |
| 181 | ) { break; } |
| 182 | } |
| 183 | for(;i < unescaped.length(); i++) { |
| 184 | if(needs_escaping[uint8_t(unescaped[i])]) { break; } |
| 185 | } |
| 186 | // The following is also possible and omits a 256-byte table, but it is slower: |
| 187 | // for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F) |
| 188 | // && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {} |
| 189 | |
| 190 | // At least for long strings, the following should be fast. We could |
| 191 | // do better by integrating the checks and the insertion. |
| 192 | buffer.insert(position: buffer.end(), first: unescaped.data(), last: unescaped.data() + i); |
| 193 | // We caught a control character if we enter this loop (slow). |
| 194 | // Note that we are do not restart from the beginning, but rather we continue |
| 195 | // from the point where we encountered something that requires escaping. |
| 196 | for (; i < unescaped.length(); i++) { |
| 197 | switch (unescaped[i]) { |
| 198 | case '\"': |
| 199 | { |
| 200 | const char * s = "\\\"" ; |
| 201 | buffer.insert(position: buffer.end(), first: s, last: s + 2); |
| 202 | } |
| 203 | break; |
| 204 | case '\\': |
| 205 | { |
| 206 | const char * s = "\\\\" ; |
| 207 | buffer.insert(position: buffer.end(), first: s, last: s + 2); |
| 208 | } |
| 209 | break; |
| 210 | default: |
| 211 | if (uint8_t(unescaped[i]) <= 0x1F) { |
| 212 | // If packed, this uses 8 * 32 bytes. |
| 213 | // Note that we expect most compilers to embed this code in the data |
| 214 | // section. |
| 215 | constexpr static escape_sequence escaped[32] = { |
| 216 | {.length: 6, .string: "\\u0000" }, {.length: 6, .string: "\\u0001" }, {.length: 6, .string: "\\u0002" }, {.length: 6, .string: "\\u0003" }, |
| 217 | {.length: 6, .string: "\\u0004" }, {.length: 6, .string: "\\u0005" }, {.length: 6, .string: "\\u0006" }, {.length: 6, .string: "\\u0007" }, |
| 218 | {.length: 2, .string: "\\b" }, {.length: 2, .string: "\\t" }, {.length: 2, .string: "\\n" }, {.length: 6, .string: "\\u000b" }, |
| 219 | {.length: 2, .string: "\\f" }, {.length: 2, .string: "\\r" }, {.length: 6, .string: "\\u000e" }, {.length: 6, .string: "\\u000f" }, |
| 220 | {.length: 6, .string: "\\u0010" }, {.length: 6, .string: "\\u0011" }, {.length: 6, .string: "\\u0012" }, {.length: 6, .string: "\\u0013" }, |
| 221 | {.length: 6, .string: "\\u0014" }, {.length: 6, .string: "\\u0015" }, {.length: 6, .string: "\\u0016" }, {.length: 6, .string: "\\u0017" }, |
| 222 | {.length: 6, .string: "\\u0018" }, {.length: 6, .string: "\\u0019" }, {.length: 6, .string: "\\u001a" }, {.length: 6, .string: "\\u001b" }, |
| 223 | {.length: 6, .string: "\\u001c" }, {.length: 6, .string: "\\u001d" }, {.length: 6, .string: "\\u001e" }, {.length: 6, .string: "\\u001f" }}; |
| 224 | auto u = escaped[uint8_t(unescaped[i])]; |
| 225 | buffer.insert(position: buffer.end(), first: u.string, last: u.string + u.length); |
| 226 | } else { |
| 227 | one_char(c: unescaped[i]); |
| 228 | } |
| 229 | } // switch |
| 230 | } // for |
| 231 | one_char(c: '\"'); |
| 232 | } |
| 233 | |
| 234 | inline void mini_formatter::clear() { |
| 235 | buffer.clear(); |
| 236 | } |
| 237 | |
| 238 | simdjson_inline std::string_view mini_formatter::str() const { |
| 239 | return std::string_view(buffer.data(), buffer.size()); |
| 240 | } |
| 241 | |
| 242 | |
| 243 | /*** |
| 244 | * String building code. |
| 245 | **/ |
| 246 | |
| 247 | template <class serializer> |
| 248 | inline void string_builder<serializer>::append(simdjson::dom::element value) { |
| 249 | // using tape_type = simdjson::internal::tape_type; |
| 250 | size_t depth = 0; |
| 251 | constexpr size_t MAX_DEPTH = 16; |
| 252 | bool is_object[MAX_DEPTH]; |
| 253 | is_object[0] = false; |
| 254 | bool after_value = false; |
| 255 | |
| 256 | internal::tape_ref iter(value.tape); |
| 257 | do { |
| 258 | // print commas after each value |
| 259 | if (after_value) { |
| 260 | format.comma(); |
| 261 | } |
| 262 | // If we are in an object, print the next key and :, and skip to the next |
| 263 | // value. |
| 264 | if (is_object[depth]) { |
| 265 | format.key(iter.get_string_view()); |
| 266 | iter.json_index++; |
| 267 | } |
| 268 | switch (iter.tape_ref_type()) { |
| 269 | |
| 270 | // Arrays |
| 271 | case tape_type::START_ARRAY: { |
| 272 | // If we're too deep, we need to recurse to go deeper. |
| 273 | depth++; |
| 274 | if (simdjson_unlikely(depth >= MAX_DEPTH)) { |
| 275 | append(simdjson::dom::array(iter)); |
| 276 | iter.json_index = iter.matching_brace_index() - 1; // Jump to the ] |
| 277 | depth--; |
| 278 | break; |
| 279 | } |
| 280 | |
| 281 | // Output start [ |
| 282 | format.start_array(); |
| 283 | iter.json_index++; |
| 284 | |
| 285 | // Handle empty [] (we don't want to come back around and print commas) |
| 286 | if (iter.tape_ref_type() == tape_type::END_ARRAY) { |
| 287 | format.end_array(); |
| 288 | depth--; |
| 289 | break; |
| 290 | } |
| 291 | |
| 292 | is_object[depth] = false; |
| 293 | after_value = false; |
| 294 | continue; |
| 295 | } |
| 296 | |
| 297 | // Objects |
| 298 | case tape_type::START_OBJECT: { |
| 299 | // If we're too deep, we need to recurse to go deeper. |
| 300 | depth++; |
| 301 | if (simdjson_unlikely(depth >= MAX_DEPTH)) { |
| 302 | append(simdjson::dom::object(iter)); |
| 303 | iter.json_index = iter.matching_brace_index() - 1; // Jump to the } |
| 304 | depth--; |
| 305 | break; |
| 306 | } |
| 307 | |
| 308 | // Output start { |
| 309 | format.start_object(); |
| 310 | iter.json_index++; |
| 311 | |
| 312 | // Handle empty {} (we don't want to come back around and print commas) |
| 313 | if (iter.tape_ref_type() == tape_type::END_OBJECT) { |
| 314 | format.end_object(); |
| 315 | depth--; |
| 316 | break; |
| 317 | } |
| 318 | |
| 319 | is_object[depth] = true; |
| 320 | after_value = false; |
| 321 | continue; |
| 322 | } |
| 323 | |
| 324 | // Scalars |
| 325 | case tape_type::STRING: |
| 326 | format.string(iter.get_string_view()); |
| 327 | break; |
| 328 | case tape_type::INT64: |
| 329 | format.number(iter.next_tape_value<int64_t>()); |
| 330 | iter.json_index++; // numbers take up 2 spots, so we need to increment |
| 331 | // extra |
| 332 | break; |
| 333 | case tape_type::UINT64: |
| 334 | format.number(iter.next_tape_value<uint64_t>()); |
| 335 | iter.json_index++; // numbers take up 2 spots, so we need to increment |
| 336 | // extra |
| 337 | break; |
| 338 | case tape_type::DOUBLE: |
| 339 | format.number(iter.next_tape_value<double>()); |
| 340 | iter.json_index++; // numbers take up 2 spots, so we need to increment |
| 341 | // extra |
| 342 | break; |
| 343 | case tape_type::TRUE_VALUE: |
| 344 | format.true_atom(); |
| 345 | break; |
| 346 | case tape_type::FALSE_VALUE: |
| 347 | format.false_atom(); |
| 348 | break; |
| 349 | case tape_type::NULL_VALUE: |
| 350 | format.null_atom(); |
| 351 | break; |
| 352 | |
| 353 | // These are impossible |
| 354 | case tape_type::END_ARRAY: |
| 355 | case tape_type::END_OBJECT: |
| 356 | case tape_type::ROOT: |
| 357 | SIMDJSON_UNREACHABLE(); |
| 358 | } |
| 359 | iter.json_index++; |
| 360 | after_value = true; |
| 361 | |
| 362 | // Handle multiple ends in a row |
| 363 | while (depth != 0 && (iter.tape_ref_type() == tape_type::END_ARRAY || |
| 364 | iter.tape_ref_type() == tape_type::END_OBJECT)) { |
| 365 | if (iter.tape_ref_type() == tape_type::END_ARRAY) { |
| 366 | format.end_array(); |
| 367 | } else { |
| 368 | format.end_object(); |
| 369 | } |
| 370 | depth--; |
| 371 | iter.json_index++; |
| 372 | } |
| 373 | |
| 374 | // Stop when we're at depth 0 |
| 375 | } while (depth != 0); |
| 376 | } |
| 377 | |
| 378 | template <class serializer> |
| 379 | inline void string_builder<serializer>::append(simdjson::dom::object value) { |
| 380 | format.start_object(); |
| 381 | auto pair = value.begin(); |
| 382 | auto end = value.end(); |
| 383 | if (pair != end) { |
| 384 | append(*pair); |
| 385 | for (++pair; pair != end; ++pair) { |
| 386 | format.comma(); |
| 387 | append(*pair); |
| 388 | } |
| 389 | } |
| 390 | format.end_object(); |
| 391 | } |
| 392 | |
| 393 | template <class serializer> |
| 394 | inline void string_builder<serializer>::append(simdjson::dom::array value) { |
| 395 | format.start_array(); |
| 396 | auto iter = value.begin(); |
| 397 | auto end = value.end(); |
| 398 | if (iter != end) { |
| 399 | append(*iter); |
| 400 | for (++iter; iter != end; ++iter) { |
| 401 | format.comma(); |
| 402 | append(*iter); |
| 403 | } |
| 404 | } |
| 405 | format.end_array(); |
| 406 | } |
| 407 | |
| 408 | template <class serializer> |
| 409 | simdjson_inline void string_builder<serializer>::append(simdjson::dom::key_value_pair kv) { |
| 410 | format.key(kv.key); |
| 411 | append(kv.value); |
| 412 | } |
| 413 | |
| 414 | template <class serializer> |
| 415 | simdjson_inline void string_builder<serializer>::clear() { |
| 416 | format.clear(); |
| 417 | } |
| 418 | |
| 419 | template <class serializer> |
| 420 | simdjson_inline std::string_view string_builder<serializer>::str() const { |
| 421 | return format.str(); |
| 422 | } |
| 423 | |
| 424 | |
| 425 | } // namespace internal |
| 426 | } // namespace simdjson |
| 427 | |
| 428 | #endif |
| 429 | |