#include "simdjson/parsedjson.h"
#include "simdjson/jsonformatutils.h"

#include <memory>
3 | |
4 | namespace simdjson { |
5 | ParsedJson::ParsedJson() |
6 | : structural_indexes(nullptr), tape(nullptr), |
7 | containing_scope_offset(nullptr), ret_address(nullptr), |
8 | string_buf(nullptr), current_string_buf_loc(nullptr) {} |
9 | |
10 | ParsedJson::~ParsedJson() { deallocate(); } |
11 | |
12 | ParsedJson::ParsedJson(ParsedJson &&p) |
13 | : byte_capacity(p.byte_capacity), depth_capacity(p.depth_capacity), |
14 | tape_capacity(p.tape_capacity), string_capacity(p.string_capacity), |
15 | current_loc(p.current_loc), n_structural_indexes(p.n_structural_indexes), |
16 | structural_indexes(p.structural_indexes), tape(p.tape), |
17 | containing_scope_offset(p.containing_scope_offset), |
18 | ret_address(p.ret_address), string_buf(p.string_buf), |
19 | current_string_buf_loc(p.current_string_buf_loc), valid(p.valid) { |
20 | p.structural_indexes = nullptr; |
21 | p.tape = nullptr; |
22 | p.containing_scope_offset = nullptr; |
23 | p.ret_address = nullptr; |
24 | p.string_buf = nullptr; |
25 | p.current_string_buf_loc = nullptr; |
26 | } |
27 | |
28 | ParsedJson &ParsedJson::operator=(ParsedJson &&p) { |
29 | byte_capacity = p.byte_capacity; |
30 | p.byte_capacity = 0; |
31 | depth_capacity = p.depth_capacity; |
32 | p.depth_capacity = 0; |
33 | tape_capacity = p.tape_capacity; |
34 | p.tape_capacity = 0; |
35 | string_capacity = p.string_capacity; |
36 | p.string_capacity = 0; |
37 | current_loc = p.current_loc; |
38 | p.current_loc = 0; |
39 | n_structural_indexes = p.n_structural_indexes; |
40 | p.n_structural_indexes = 0; |
41 | structural_indexes = p.structural_indexes; |
42 | p.structural_indexes = nullptr; |
43 | tape = p.tape; |
44 | p.tape = nullptr; |
45 | containing_scope_offset = p.containing_scope_offset; |
46 | p.containing_scope_offset = nullptr; |
47 | ret_address = p.ret_address; |
48 | p.ret_address = nullptr; |
49 | string_buf = p.string_buf; |
50 | p.string_buf = nullptr; |
51 | current_string_buf_loc = p.current_string_buf_loc; |
52 | p.current_string_buf_loc = nullptr; |
53 | valid = p.valid; |
54 | p.valid = false; |
55 | return *this; |
56 | } |
57 | |
58 | WARN_UNUSED |
59 | bool ParsedJson::allocate_capacity(size_t len, size_t max_depth) { |
60 | if (max_depth <= 0) { |
61 | max_depth = 1; // don't let the user allocate nothing |
62 | } |
63 | if (len <= 0) { |
64 | len = 64; // allocating 0 bytes is wasteful. |
65 | } |
66 | if (len > SIMDJSON_MAXSIZE_BYTES) { |
67 | return false; |
68 | } |
69 | if ((len <= byte_capacity) && (max_depth <= depth_capacity)) { |
70 | return true; |
71 | } |
72 | deallocate(); |
73 | valid = false; |
74 | byte_capacity = 0; // will only set it to len after allocations are a success |
75 | n_structural_indexes = 0; |
76 | uint32_t max_structures = ROUNDUP_N(len, 64) + 2 + 7; |
77 | structural_indexes = new (std::nothrow) uint32_t[max_structures]; |
78 | // a pathological input like "[[[[..." would generate len tape elements, so |
79 | // need a capacity of at least len + 1, but it is also possible to do |
80 | // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6" |
81 | //where len + 1 tape elements are |
82 | // generated, see issue https://github.com/lemire/simdjson/issues/345 |
83 | size_t local_tape_capacity = ROUNDUP_N(len + 2, 64); |
84 | // a document with only zero-length strings... could have len/3 string |
85 | // and we would need len/3 * 5 bytes on the string buffer |
86 | size_t local_string_capacity = ROUNDUP_N(5 * len / 3 + 32, 64); |
87 | string_buf = new (std::nothrow) uint8_t[local_string_capacity]; |
88 | tape = new (std::nothrow) uint64_t[local_tape_capacity]; |
89 | containing_scope_offset = new (std::nothrow) uint32_t[max_depth]; |
90 | #ifdef SIMDJSON_USE_COMPUTED_GOTO |
91 | ret_address = new (std::nothrow) void *[max_depth]; |
92 | #else |
93 | ret_address = new (std::nothrow) char[max_depth]; |
94 | #endif |
95 | if ((string_buf == nullptr) || (tape == nullptr) || |
96 | (containing_scope_offset == nullptr) || (ret_address == nullptr) || |
97 | (structural_indexes == nullptr)) { |
98 | std::cerr << "Could not allocate memory" << std::endl; |
99 | delete[] ret_address; |
100 | delete[] containing_scope_offset; |
101 | delete[] tape; |
102 | delete[] string_buf; |
103 | delete[] structural_indexes; |
104 | |
105 | return false; |
106 | } |
107 | /* |
108 | // We do not need to initialize this content for parsing, though we could |
109 | // need to initialize it for safety. |
110 | memset(string_buf, 0 , local_string_capacity); |
111 | memset(structural_indexes, 0, max_structures * sizeof(uint32_t)); |
112 | memset(tape, 0, local_tape_capacity * sizeof(uint64_t)); |
113 | */ |
114 | byte_capacity = len; |
115 | depth_capacity = max_depth; |
116 | tape_capacity = local_tape_capacity; |
117 | string_capacity = local_string_capacity; |
118 | return true; |
119 | } |
120 | |
121 | bool ParsedJson::is_valid() const { return valid; } |
122 | |
123 | int ParsedJson::get_error_code() const { return error_code; } |
124 | |
125 | std::string ParsedJson::get_error_message() const { |
126 | return error_message(error_code); |
127 | } |
128 | |
129 | void ParsedJson::deallocate() { |
130 | byte_capacity = 0; |
131 | depth_capacity = 0; |
132 | tape_capacity = 0; |
133 | string_capacity = 0; |
134 | delete[] ret_address; |
135 | delete[] containing_scope_offset; |
136 | delete[] tape; |
137 | delete[] string_buf; |
138 | delete[] structural_indexes; |
139 | valid = false; |
140 | } |
141 | |
142 | void ParsedJson::init() { |
143 | current_string_buf_loc = string_buf; |
144 | current_loc = 0; |
145 | valid = false; |
146 | } |
147 | |
148 | WARN_UNUSED |
149 | bool ParsedJson::print_json(std::ostream &os) const { |
150 | if (!valid) { |
151 | return false; |
152 | } |
153 | uint32_t string_length; |
154 | size_t tape_idx = 0; |
155 | uint64_t tape_val = tape[tape_idx]; |
156 | uint8_t type = (tape_val >> 56); |
157 | size_t how_many = 0; |
158 | if (type == 'r') { |
159 | how_many = tape_val & JSON_VALUE_MASK; |
160 | } else { |
161 | fprintf(stderr, "Error: no starting root node?" ); |
162 | return false; |
163 | } |
164 | if (how_many > tape_capacity) { |
165 | fprintf( |
166 | stderr, |
167 | "We may be exceeding the tape capacity. Is this a valid document?\n" ); |
168 | return false; |
169 | } |
170 | tape_idx++; |
171 | bool *in_object = new bool[depth_capacity]; |
172 | auto *in_object_idx = new size_t[depth_capacity]; |
173 | int depth = 1; // only root at level 0 |
174 | in_object_idx[depth] = 0; |
175 | in_object[depth] = false; |
176 | for (; tape_idx < how_many; tape_idx++) { |
177 | tape_val = tape[tape_idx]; |
178 | uint64_t payload = tape_val & JSON_VALUE_MASK; |
179 | type = (tape_val >> 56); |
180 | if (!in_object[depth]) { |
181 | if ((in_object_idx[depth] > 0) && (type != ']')) { |
182 | os << "," ; |
183 | } |
184 | in_object_idx[depth]++; |
185 | } else { // if (in_object) { |
186 | if ((in_object_idx[depth] > 0) && ((in_object_idx[depth] & 1) == 0) && |
187 | (type != '}')) { |
188 | os << "," ; |
189 | } |
190 | if (((in_object_idx[depth] & 1) == 1)) { |
191 | os << ":" ; |
192 | } |
193 | in_object_idx[depth]++; |
194 | } |
195 | switch (type) { |
196 | case '"': // we have a string |
197 | os << '"'; |
198 | memcpy(&string_length, string_buf + payload, sizeof(uint32_t)); |
199 | print_with_escapes( |
200 | (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), |
201 | os, string_length); |
202 | os << '"'; |
203 | break; |
204 | case 'l': // we have a long int |
205 | if (tape_idx + 1 >= how_many) { |
206 | delete[] in_object; |
207 | delete[] in_object_idx; |
208 | return false; |
209 | } |
210 | os << static_cast<int64_t>(tape[++tape_idx]); |
211 | break; |
212 | case 'u': |
213 | if (tape_idx + 1 >= how_many) { |
214 | delete[] in_object; |
215 | delete[] in_object_idx; |
216 | return false; |
217 | } |
218 | os << tape[++tape_idx]; |
219 | break; |
220 | case 'd': // we have a double |
221 | if (tape_idx + 1 >= how_many) { |
222 | delete[] in_object; |
223 | delete[] in_object_idx; |
224 | return false; |
225 | } |
226 | double answer; |
227 | memcpy(&answer, &tape[++tape_idx], sizeof(answer)); |
228 | os << answer; |
229 | break; |
230 | case 'n': // we have a null |
231 | os << "null" ; |
232 | break; |
233 | case 't': // we have a true |
234 | os << "true" ; |
235 | break; |
236 | case 'f': // we have a false |
237 | os << "false" ; |
238 | break; |
239 | case '{': // we have an object |
240 | os << '{'; |
241 | depth++; |
242 | in_object[depth] = true; |
243 | in_object_idx[depth] = 0; |
244 | break; |
245 | case '}': // we end an object |
246 | depth--; |
247 | os << '}'; |
248 | break; |
249 | case '[': // we start an array |
250 | os << '['; |
251 | depth++; |
252 | in_object[depth] = false; |
253 | in_object_idx[depth] = 0; |
254 | break; |
255 | case ']': // we end an array |
256 | depth--; |
257 | os << ']'; |
258 | break; |
259 | case 'r': // we start and end with the root node |
260 | fprintf(stderr, "should we be hitting the root node?\n" ); |
261 | delete[] in_object; |
262 | delete[] in_object_idx; |
263 | return false; |
264 | default: |
265 | fprintf(stderr, "bug %c\n" , type); |
266 | delete[] in_object; |
267 | delete[] in_object_idx; |
268 | return false; |
269 | } |
270 | } |
271 | delete[] in_object; |
272 | delete[] in_object_idx; |
273 | return true; |
274 | } |
275 | |
276 | WARN_UNUSED |
277 | bool ParsedJson::dump_raw_tape(std::ostream &os) const { |
278 | if (!valid) { |
279 | return false; |
280 | } |
281 | uint32_t string_length; |
282 | size_t tape_idx = 0; |
283 | uint64_t tape_val = tape[tape_idx]; |
284 | uint8_t type = (tape_val >> 56); |
285 | os << tape_idx << " : " << type; |
286 | tape_idx++; |
287 | size_t how_many = 0; |
288 | if (type == 'r') { |
289 | how_many = tape_val & JSON_VALUE_MASK; |
290 | } else { |
291 | fprintf(stderr, "Error: no starting root node?" ); |
292 | return false; |
293 | } |
294 | os << "\t// pointing to " << how_many << " (right after last node)\n" ; |
295 | uint64_t payload; |
296 | for (; tape_idx < how_many; tape_idx++) { |
297 | os << tape_idx << " : " ; |
298 | tape_val = tape[tape_idx]; |
299 | payload = tape_val & JSON_VALUE_MASK; |
300 | type = (tape_val >> 56); |
301 | switch (type) { |
302 | case '"': // we have a string |
303 | os << "string \"" ; |
304 | memcpy(&string_length, string_buf + payload, sizeof(uint32_t)); |
305 | print_with_escapes( |
306 | (const unsigned char *)(string_buf + payload + sizeof(uint32_t)), |
307 | string_length); |
308 | os << '"'; |
309 | os << '\n'; |
310 | break; |
311 | case 'l': // we have a long int |
312 | if (tape_idx + 1 >= how_many) { |
313 | return false; |
314 | } |
315 | os << "integer " << static_cast<int64_t>(tape[++tape_idx]) << "\n" ; |
316 | break; |
317 | case 'u': // we have a long uint |
318 | if (tape_idx + 1 >= how_many) { |
319 | return false; |
320 | } |
321 | os << "unsigned integer " << tape[++tape_idx] << "\n" ; |
322 | break; |
323 | case 'd': // we have a double |
324 | os << "float " ; |
325 | if (tape_idx + 1 >= how_many) { |
326 | return false; |
327 | } |
328 | double answer; |
329 | memcpy(&answer, &tape[++tape_idx], sizeof(answer)); |
330 | os << answer << '\n'; |
331 | break; |
332 | case 'n': // we have a null |
333 | os << "null\n" ; |
334 | break; |
335 | case 't': // we have a true |
336 | os << "true\n" ; |
337 | break; |
338 | case 'f': // we have a false |
339 | os << "false\n" ; |
340 | break; |
341 | case '{': // we have an object |
342 | os << "{\t// pointing to next tape location " << payload |
343 | << " (first node after the scope) \n" ; |
344 | break; |
345 | case '}': // we end an object |
346 | os << "}\t// pointing to previous tape location " << payload |
347 | << " (start of the scope) \n" ; |
348 | break; |
349 | case '[': // we start an array |
350 | os << "[\t// pointing to next tape location " << payload |
351 | << " (first node after the scope) \n" ; |
352 | break; |
353 | case ']': // we end an array |
354 | os << "]\t// pointing to previous tape location " << payload |
355 | << " (start of the scope) \n" ; |
356 | break; |
357 | case 'r': // we start and end with the root node |
358 | printf("end of root\n" ); |
359 | return false; |
360 | default: |
361 | return false; |
362 | } |
363 | } |
364 | tape_val = tape[tape_idx]; |
365 | payload = tape_val & JSON_VALUE_MASK; |
366 | type = (tape_val >> 56); |
367 | os << tape_idx << " : " << type << "\t// pointing to " << payload |
368 | << " (start root)\n" ; |
369 | return true; |
370 | } |
371 | } // namespace simdjson |
372 | |