#include "simdjson/parsedjson.h"

#include <vector>

#include "simdjson/jsonformatutils.h"
| 3 | |
| 4 | namespace simdjson { |
| 5 | ParsedJson::ParsedJson() |
| 6 | : structural_indexes(nullptr), tape(nullptr), |
| 7 | containing_scope_offset(nullptr), ret_address(nullptr), |
| 8 | string_buf(nullptr), current_string_buf_loc(nullptr) {} |
| 9 | |
| 10 | ParsedJson::~ParsedJson() { deallocate(); } |
| 11 | |
| 12 | ParsedJson::ParsedJson(ParsedJson &&p) |
| 13 | : byte_capacity(p.byte_capacity), depth_capacity(p.depth_capacity), |
| 14 | tape_capacity(p.tape_capacity), string_capacity(p.string_capacity), |
| 15 | current_loc(p.current_loc), n_structural_indexes(p.n_structural_indexes), |
| 16 | structural_indexes(p.structural_indexes), tape(p.tape), |
| 17 | containing_scope_offset(p.containing_scope_offset), |
| 18 | ret_address(p.ret_address), string_buf(p.string_buf), |
| 19 | current_string_buf_loc(p.current_string_buf_loc), valid(p.valid) { |
| 20 | p.structural_indexes = nullptr; |
| 21 | p.tape = nullptr; |
| 22 | p.containing_scope_offset = nullptr; |
| 23 | p.ret_address = nullptr; |
| 24 | p.string_buf = nullptr; |
| 25 | p.current_string_buf_loc = nullptr; |
| 26 | } |
| 27 | |
| 28 | ParsedJson &ParsedJson::operator=(ParsedJson &&p) { |
| 29 | byte_capacity = p.byte_capacity; |
| 30 | p.byte_capacity = 0; |
| 31 | depth_capacity = p.depth_capacity; |
| 32 | p.depth_capacity = 0; |
| 33 | tape_capacity = p.tape_capacity; |
| 34 | p.tape_capacity = 0; |
| 35 | string_capacity = p.string_capacity; |
| 36 | p.string_capacity = 0; |
| 37 | current_loc = p.current_loc; |
| 38 | p.current_loc = 0; |
| 39 | n_structural_indexes = p.n_structural_indexes; |
| 40 | p.n_structural_indexes = 0; |
| 41 | structural_indexes = p.structural_indexes; |
| 42 | p.structural_indexes = nullptr; |
| 43 | tape = p.tape; |
| 44 | p.tape = nullptr; |
| 45 | containing_scope_offset = p.containing_scope_offset; |
| 46 | p.containing_scope_offset = nullptr; |
| 47 | ret_address = p.ret_address; |
| 48 | p.ret_address = nullptr; |
| 49 | string_buf = p.string_buf; |
| 50 | p.string_buf = nullptr; |
| 51 | current_string_buf_loc = p.current_string_buf_loc; |
| 52 | p.current_string_buf_loc = nullptr; |
| 53 | valid = p.valid; |
| 54 | p.valid = false; |
| 55 | return *this; |
| 56 | } |
| 57 | |
| 58 | WARN_UNUSED |
| 59 | bool ParsedJson::allocate_capacity(size_t len, size_t max_depth) { |
| 60 | if (max_depth <= 0) { |
| 61 | max_depth = 1; // don't let the user allocate nothing |
| 62 | } |
| 63 | if (len <= 0) { |
| 64 | len = 64; // allocating 0 bytes is wasteful. |
| 65 | } |
| 66 | if (len > SIMDJSON_MAXSIZE_BYTES) { |
| 67 | return false; |
| 68 | } |
| 69 | if ((len <= byte_capacity) && (max_depth <= depth_capacity)) { |
| 70 | return true; |
| 71 | } |
| 72 | deallocate(); |
| 73 | valid = false; |
| 74 | byte_capacity = 0; // will only set it to len after allocations are a success |
| 75 | n_structural_indexes = 0; |
| 76 | uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7; |
| 77 | structural_indexes = new (std::nothrow) uint32_t[max_structures]; |
| 78 | // a pathological input like "[[[[..." would generate len tape elements, so |
| 79 | // need a capacity of at least len + 1, but it is also possible to do |
| 80 | // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6" |
| 81 | //where len + 1 tape elements are |
| 82 | // generated, see issue https://github.com/lemire/simdjson/issues/345 |
| 83 | size_t local_tape_capacity = ROUNDUP_N(len + 2, 64); |
| 84 | // a document with only zero-length strings... could have len/3 string |
| 85 | // and we would need len/3 * 5 bytes on the string buffer |
| 86 | size_t local_string_capacity = ROUNDUP_N(5 * len / 3 + 32, 64); |
| 87 | string_buf = new (std::nothrow) uint8_t[local_string_capacity]; |
| 88 | tape = new (std::nothrow) uint64_t[local_tape_capacity]; |
| 89 | containing_scope_offset = new (std::nothrow) uint32_t[max_depth]; |
| 90 | #ifdef SIMDJSON_USE_COMPUTED_GOTO |
| 91 | ret_address = new (std::nothrow) void *[max_depth]; |
| 92 | #else |
| 93 | ret_address = new (std::nothrow) char[max_depth]; |
| 94 | #endif |
| 95 | if ((string_buf == nullptr) || (tape == nullptr) || |
| 96 | (containing_scope_offset == nullptr) || (ret_address == nullptr) || |
| 97 | (structural_indexes == nullptr)) { |
| 98 | std::cerr << "Could not allocate memory" << std::endl; |
| 99 | delete[] ret_address; |
| 100 | delete[] containing_scope_offset; |
| 101 | delete[] tape; |
| 102 | delete[] string_buf; |
| 103 | delete[] structural_indexes; |
| 104 | |
| 105 | return false; |
| 106 | } |
| 107 | /* |
| 108 | // We do not need to initialize this content for parsing, though we could |
| 109 | // need to initialize it for safety. |
| 110 | memset(string_buf, 0 , local_string_capacity); |
| 111 | memset(structural_indexes, 0, max_structures * sizeof(uint32_t)); |
| 112 | memset(tape, 0, local_tape_capacity * sizeof(uint64_t)); |
| 113 | */ |
| 114 | byte_capacity = len; |
| 115 | depth_capacity = max_depth; |
| 116 | tape_capacity = local_tape_capacity; |
| 117 | string_capacity = local_string_capacity; |
| 118 | return true; |
| 119 | } |
| 120 | |
| 121 | bool ParsedJson::is_valid() const { return valid; } |
| 122 | |
| 123 | int ParsedJson::get_error_code() const { return error_code; } |
| 124 | |
| 125 | std::string ParsedJson::get_error_message() const { |
| 126 | return error_message(error_code); |
| 127 | } |
| 128 | |
| 129 | void ParsedJson::deallocate() { |
| 130 | byte_capacity = 0; |
| 131 | depth_capacity = 0; |
| 132 | tape_capacity = 0; |
| 133 | string_capacity = 0; |
| 134 | delete[] ret_address; |
| 135 | delete[] containing_scope_offset; |
| 136 | delete[] tape; |
| 137 | delete[] string_buf; |
| 138 | delete[] structural_indexes; |
| 139 | valid = false; |
| 140 | } |
| 141 | |
| 142 | void ParsedJson::init() { |
| 143 | current_string_buf_loc = string_buf; |
| 144 | current_loc = 0; |
| 145 | valid = false; |
| 146 | } |
| 147 | |
| 148 | WARN_UNUSED |
| 149 | bool ParsedJson::print_json(std::ostream &os) const { |
| 150 | if (!valid) { |
| 151 | return false; |
| 152 | } |
| 153 | uint32_t string_length; |
| 154 | size_t tape_idx = 0; |
| 155 | uint64_t tape_val = tape[tape_idx]; |
| 156 | uint8_t type = (tape_val >> 56); |
| 157 | size_t how_many = 0; |
| 158 | if (type == 'r') { |
| 159 | how_many = tape_val & JSON_VALUE_MASK; |
| 160 | } else { |
| 161 | fprintf(stderr, "Error: no starting root node?" ); |
| 162 | return false; |
| 163 | } |
| 164 | if (how_many > tape_capacity) { |
| 165 | fprintf( |
| 166 | stderr, |
| 167 | "We may be exceeding the tape capacity. Is this a valid document?\n" ); |
| 168 | return false; |
| 169 | } |
| 170 | tape_idx++; |
| 171 | bool *in_object = new bool[depth_capacity]; |
| 172 | auto *in_object_idx = new size_t[depth_capacity]; |
| 173 | int depth = 1; // only root at level 0 |
| 174 | in_object_idx[depth] = 0; |
| 175 | in_object[depth] = false; |
| 176 | for (; tape_idx < how_many; tape_idx++) { |
| 177 | tape_val = tape[tape_idx]; |
| 178 | uint64_t payload = tape_val & JSON_VALUE_MASK; |
| 179 | type = (tape_val >> 56); |
| 180 | if (!in_object[depth]) { |
| 181 | if ((in_object_idx[depth] > 0) && (type != ']')) { |
| 182 | os << "," ; |
| 183 | } |
| 184 | in_object_idx[depth]++; |
| 185 | } else { // if (in_object) { |
| 186 | if ((in_object_idx[depth] > 0) && ((in_object_idx[depth] & 1) == 0) && |
| 187 | (type != '}')) { |
| 188 | os << "," ; |
| 189 | } |
| 190 | if (((in_object_idx[depth] & 1) == 1)) { |
| 191 | os << ":" ; |
| 192 | } |
| 193 | in_object_idx[depth]++; |
| 194 | } |
| 195 | switch (type) { |
| 196 | case '"': // we have a string |
| 197 | os << '"'; |
| 198 | memcpy(&string_length, string_buf + payload, sizeof(uint32_t)); |
| 199 | print_with_escapes( |
| 200 | (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), |
| 201 | os, string_length); |
| 202 | os << '"'; |
| 203 | break; |
| 204 | case 'l': // we have a long int |
| 205 | if (tape_idx + 1 >= how_many) { |
| 206 | delete[] in_object; |
| 207 | delete[] in_object_idx; |
| 208 | return false; |
| 209 | } |
| 210 | os << static_cast<int64_t>(tape[++tape_idx]); |
| 211 | break; |
| 212 | case 'u': |
| 213 | if (tape_idx + 1 >= how_many) { |
| 214 | delete[] in_object; |
| 215 | delete[] in_object_idx; |
| 216 | return false; |
| 217 | } |
| 218 | os << tape[++tape_idx]; |
| 219 | break; |
| 220 | case 'd': // we have a double |
| 221 | if (tape_idx + 1 >= how_many) { |
| 222 | delete[] in_object; |
| 223 | delete[] in_object_idx; |
| 224 | return false; |
| 225 | } |
| 226 | double answer; |
| 227 | memcpy(&answer, &tape[++tape_idx], sizeof(answer)); |
| 228 | os << answer; |
| 229 | break; |
| 230 | case 'n': // we have a null |
| 231 | os << "null" ; |
| 232 | break; |
| 233 | case 't': // we have a true |
| 234 | os << "true" ; |
| 235 | break; |
| 236 | case 'f': // we have a false |
| 237 | os << "false" ; |
| 238 | break; |
| 239 | case '{': // we have an object |
| 240 | os << '{'; |
| 241 | depth++; |
| 242 | in_object[depth] = true; |
| 243 | in_object_idx[depth] = 0; |
| 244 | break; |
| 245 | case '}': // we end an object |
| 246 | depth--; |
| 247 | os << '}'; |
| 248 | break; |
| 249 | case '[': // we start an array |
| 250 | os << '['; |
| 251 | depth++; |
| 252 | in_object[depth] = false; |
| 253 | in_object_idx[depth] = 0; |
| 254 | break; |
| 255 | case ']': // we end an array |
| 256 | depth--; |
| 257 | os << ']'; |
| 258 | break; |
| 259 | case 'r': // we start and end with the root node |
| 260 | fprintf(stderr, "should we be hitting the root node?\n" ); |
| 261 | delete[] in_object; |
| 262 | delete[] in_object_idx; |
| 263 | return false; |
| 264 | default: |
| 265 | fprintf(stderr, "bug %c\n" , type); |
| 266 | delete[] in_object; |
| 267 | delete[] in_object_idx; |
| 268 | return false; |
| 269 | } |
| 270 | } |
| 271 | delete[] in_object; |
| 272 | delete[] in_object_idx; |
| 273 | return true; |
| 274 | } |
| 275 | |
| 276 | WARN_UNUSED |
| 277 | bool ParsedJson::dump_raw_tape(std::ostream &os) const { |
| 278 | if (!valid) { |
| 279 | return false; |
| 280 | } |
| 281 | uint32_t string_length; |
| 282 | size_t tape_idx = 0; |
| 283 | uint64_t tape_val = tape[tape_idx]; |
| 284 | uint8_t type = (tape_val >> 56); |
| 285 | os << tape_idx << " : " << type; |
| 286 | tape_idx++; |
| 287 | size_t how_many = 0; |
| 288 | if (type == 'r') { |
| 289 | how_many = tape_val & JSON_VALUE_MASK; |
| 290 | } else { |
| 291 | fprintf(stderr, "Error: no starting root node?" ); |
| 292 | return false; |
| 293 | } |
| 294 | os << "\t// pointing to " << how_many << " (right after last node)\n" ; |
| 295 | uint64_t payload; |
| 296 | for (; tape_idx < how_many; tape_idx++) { |
| 297 | os << tape_idx << " : " ; |
| 298 | tape_val = tape[tape_idx]; |
| 299 | payload = tape_val & JSON_VALUE_MASK; |
| 300 | type = (tape_val >> 56); |
| 301 | switch (type) { |
| 302 | case '"': // we have a string |
| 303 | os << "string \"" ; |
| 304 | memcpy(&string_length, string_buf + payload, sizeof(uint32_t)); |
| 305 | print_with_escapes( |
| 306 | (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), |
| 307 | string_length); |
| 308 | os << '"'; |
| 309 | os << '\n'; |
| 310 | break; |
| 311 | case 'l': // we have a long int |
| 312 | if (tape_idx + 1 >= how_many) { |
| 313 | return false; |
| 314 | } |
| 315 | os << "integer " << static_cast<int64_t>(tape[++tape_idx]) << "\n" ; |
| 316 | break; |
| 317 | case 'u': // we have a long uint |
| 318 | if (tape_idx + 1 >= how_many) { |
| 319 | return false; |
| 320 | } |
| 321 | os << "unsigned integer " << tape[++tape_idx] << "\n" ; |
| 322 | break; |
| 323 | case 'd': // we have a double |
| 324 | os << "float " ; |
| 325 | if (tape_idx + 1 >= how_many) { |
| 326 | return false; |
| 327 | } |
| 328 | double answer; |
| 329 | memcpy(&answer, &tape[++tape_idx], sizeof(answer)); |
| 330 | os << answer << '\n'; |
| 331 | break; |
| 332 | case 'n': // we have a null |
| 333 | os << "null\n" ; |
| 334 | break; |
| 335 | case 't': // we have a true |
| 336 | os << "true\n" ; |
| 337 | break; |
| 338 | case 'f': // we have a false |
| 339 | os << "false\n" ; |
| 340 | break; |
| 341 | case '{': // we have an object |
| 342 | os << "{\t// pointing to next tape location " << payload |
| 343 | << " (first node after the scope) \n" ; |
| 344 | break; |
| 345 | case '}': // we end an object |
| 346 | os << "}\t// pointing to previous tape location " << payload |
| 347 | << " (start of the scope) \n" ; |
| 348 | break; |
| 349 | case '[': // we start an array |
| 350 | os << "[\t// pointing to next tape location " << payload |
| 351 | << " (first node after the scope) \n" ; |
| 352 | break; |
| 353 | case ']': // we end an array |
| 354 | os << "]\t// pointing to previous tape location " << payload |
| 355 | << " (start of the scope) \n" ; |
| 356 | break; |
| 357 | case 'r': // we start and end with the root node |
| 358 | printf("end of root\n" ); |
| 359 | return false; |
| 360 | default: |
| 361 | return false; |
| 362 | } |
| 363 | } |
| 364 | tape_val = tape[tape_idx]; |
| 365 | payload = tape_val & JSON_VALUE_MASK; |
| 366 | type = (tape_val >> 56); |
| 367 | os << tape_idx << " : " << type << "\t// pointing to " << payload |
| 368 | << " (start root)\n" ; |
| 369 | return true; |
| 370 | } |
| 371 | } // namespace simdjson |
| 372 | |