1 | #pragma once |
2 | |
3 | #include <cassert> // assert |
4 | #include <cmath> // isfinite |
5 | #include <cstdint> // uint8_t |
6 | #include <functional> // function |
7 | #include <string> // string |
8 | #include <utility> // move |
9 | #include <vector> // vector |
10 | |
11 | #include <nlohmann/detail/exceptions.hpp> |
12 | #include <nlohmann/detail/input/input_adapters.hpp> |
13 | #include <nlohmann/detail/input/json_sax.hpp> |
14 | #include <nlohmann/detail/input/lexer.hpp> |
15 | #include <nlohmann/detail/macro_scope.hpp> |
16 | #include <nlohmann/detail/meta/is_sax.hpp> |
17 | #include <nlohmann/detail/value_t.hpp> |
18 | |
19 | namespace nlohmann |
20 | { |
21 | namespace detail |
22 | { |
23 | //////////// |
24 | // parser // |
25 | //////////// |
26 | |
27 | /*! |
28 | @brief syntax analysis |
29 | |
30 | This class implements a recursive descent parser. |
31 | */ |
32 | template<typename BasicJsonType> |
33 | class parser |
34 | { |
// shorthands for the number and string types of the JSON value class
using number_integer_t = typename BasicJsonType::number_integer_t;
using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
using number_float_t = typename BasicJsonType::number_float_t;
using string_t = typename BasicJsonType::string_t;
// the lexer that delivers the tokens, and the token kind it reports
using lexer_t = lexer<BasicJsonType>;
using token_type = typename lexer_t::token_type;
41 | |
42 | public: |
/*!
@brief kinds of events distinguished while parsing

The underlying type is spelled `std::uint8_t`: `<cstdint>` guarantees the
name only inside namespace `std`; the unqualified global alias is optional.
The enumerators keep their implicit values 0..5 in declaration order.
*/
enum class parse_event_t : std::uint8_t
{
    /// the parser read `{` and started to process a JSON object
    object_start,
    /// the parser read `}` and finished processing a JSON object
    object_end,
    /// the parser read `[` and started to process a JSON array
    array_start,
    /// the parser read `]` and finished processing a JSON array
    array_end,
    /// the parser read a key of a value in an object
    key,
    /// the parser finished reading a JSON value
    value
};
58 | |
/// signature of the user-supplied parser callback: it is handed a depth, the
/// kind of parse event and a reference to a JSON value, and returns a bool
/// NOTE(review): how the returned bool is interpreted is decided by
/// json_sax_dom_callback_parser, which is defined elsewhere - confirm there
using parser_callback_t =
    std::function<bool(int depth, parse_event_t event, BasicJsonType& parsed)>;
61 | |
62 | /// a parser reading from an input adapter |
63 | explicit parser(detail::input_adapter_t&& adapter, |
64 | const parser_callback_t cb = nullptr, |
65 | const bool allow_exceptions_ = true) |
66 | : callback(cb), m_lexer(std::move(adapter)), allow_exceptions(allow_exceptions_) |
67 | { |
68 | // read first token |
69 | get_token(); |
70 | } |
71 | |
72 | /*! |
73 | @brief public parser interface |
74 | |
75 | @param[in] strict whether to expect the last token to be EOF |
76 | @param[in,out] result parsed JSON value |
77 | |
78 | @throw parse_error.101 in case of an unexpected token |
79 | @throw parse_error.102 if to_unicode fails or surrogate error |
80 | @throw parse_error.103 if to_unicode fails |
81 | */ |
82 | void parse(const bool strict, BasicJsonType& result) |
83 | { |
84 | if (callback) |
85 | { |
86 | json_sax_dom_callback_parser<BasicJsonType> sdp(result, callback, allow_exceptions); |
87 | sax_parse_internal(&sdp); |
88 | result.assert_invariant(); |
89 | |
90 | // in strict mode, input must be completely read |
91 | if (strict and (get_token() != token_type::end_of_input)) |
92 | { |
93 | sdp.parse_error(m_lexer.get_position(), |
94 | m_lexer.get_token_string(), |
95 | parse_error::create(101, m_lexer.get_position(), |
96 | exception_message(token_type::end_of_input, "value" ))); |
97 | } |
98 | |
99 | // in case of an error, return discarded value |
100 | if (sdp.is_errored()) |
101 | { |
102 | result = value_t::discarded; |
103 | return; |
104 | } |
105 | |
106 | // set top-level value to null if it was discarded by the callback |
107 | // function |
108 | if (result.is_discarded()) |
109 | { |
110 | result = nullptr; |
111 | } |
112 | } |
113 | else |
114 | { |
115 | json_sax_dom_parser<BasicJsonType> sdp(result, allow_exceptions); |
116 | sax_parse_internal(&sdp); |
117 | result.assert_invariant(); |
118 | |
119 | // in strict mode, input must be completely read |
120 | if (strict and (get_token() != token_type::end_of_input)) |
121 | { |
122 | sdp.parse_error(m_lexer.get_position(), |
123 | m_lexer.get_token_string(), |
124 | parse_error::create(101, m_lexer.get_position(), |
125 | exception_message(token_type::end_of_input, "value" ))); |
126 | } |
127 | |
128 | // in case of an error, return discarded value |
129 | if (sdp.is_errored()) |
130 | { |
131 | result = value_t::discarded; |
132 | return; |
133 | } |
134 | } |
135 | } |
136 | |
137 | /*! |
138 | @brief public accept interface |
139 | |
140 | @param[in] strict whether to expect the last token to be EOF |
141 | @return whether the input is a proper JSON text |
142 | */ |
143 | bool accept(const bool strict = true) |
144 | { |
145 | json_sax_acceptor<BasicJsonType> sax_acceptor; |
146 | return sax_parse(&sax_acceptor, strict); |
147 | } |
148 | |
149 | template <typename SAX> |
150 | JSON_HEDLEY_NON_NULL(2) |
151 | bool sax_parse(SAX* sax, const bool strict = true) |
152 | { |
153 | (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {}; |
154 | const bool result = sax_parse_internal(sax); |
155 | |
156 | // strict mode: next byte must be EOF |
157 | if (result and strict and (get_token() != token_type::end_of_input)) |
158 | { |
159 | return sax->parse_error(m_lexer.get_position(), |
160 | m_lexer.get_token_string(), |
161 | parse_error::create(101, m_lexer.get_position(), |
162 | exception_message(token_type::end_of_input, "value" ))); |
163 | } |
164 | |
165 | return result; |
166 | } |
167 | |
168 | private: |
169 | template <typename SAX> |
170 | JSON_HEDLEY_NON_NULL(2) |
171 | bool sax_parse_internal(SAX* sax) |
172 | { |
173 | // stack to remember the hierarchy of structured values we are parsing |
174 | // true = array; false = object |
175 | std::vector<bool> states; |
176 | // value to avoid a goto (see comment where set to true) |
177 | bool skip_to_state_evaluation = false; |
178 | |
179 | while (true) |
180 | { |
181 | if (not skip_to_state_evaluation) |
182 | { |
183 | // invariant: get_token() was called before each iteration |
184 | switch (last_token) |
185 | { |
186 | case token_type::begin_object: |
187 | { |
188 | if (JSON_HEDLEY_UNLIKELY(not sax->start_object(std::size_t(-1)))) |
189 | { |
190 | return false; |
191 | } |
192 | |
193 | // closing } -> we are done |
194 | if (get_token() == token_type::end_object) |
195 | { |
196 | if (JSON_HEDLEY_UNLIKELY(not sax->end_object())) |
197 | { |
198 | return false; |
199 | } |
200 | break; |
201 | } |
202 | |
203 | // parse key |
204 | if (JSON_HEDLEY_UNLIKELY(last_token != token_type::value_string)) |
205 | { |
206 | return sax->parse_error(m_lexer.get_position(), |
207 | m_lexer.get_token_string(), |
208 | parse_error::create(101, m_lexer.get_position(), |
209 | exception_message(token_type::value_string, "object key" ))); |
210 | } |
211 | if (JSON_HEDLEY_UNLIKELY(not sax->key(m_lexer.get_string()))) |
212 | { |
213 | return false; |
214 | } |
215 | |
216 | // parse separator (:) |
217 | if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) |
218 | { |
219 | return sax->parse_error(m_lexer.get_position(), |
220 | m_lexer.get_token_string(), |
221 | parse_error::create(101, m_lexer.get_position(), |
222 | exception_message(token_type::name_separator, "object separator" ))); |
223 | } |
224 | |
225 | // remember we are now inside an object |
226 | states.push_back(false); |
227 | |
228 | // parse values |
229 | get_token(); |
230 | continue; |
231 | } |
232 | |
233 | case token_type::begin_array: |
234 | { |
235 | if (JSON_HEDLEY_UNLIKELY(not sax->start_array(std::size_t(-1)))) |
236 | { |
237 | return false; |
238 | } |
239 | |
240 | // closing ] -> we are done |
241 | if (get_token() == token_type::end_array) |
242 | { |
243 | if (JSON_HEDLEY_UNLIKELY(not sax->end_array())) |
244 | { |
245 | return false; |
246 | } |
247 | break; |
248 | } |
249 | |
250 | // remember we are now inside an array |
251 | states.push_back(true); |
252 | |
253 | // parse values (no need to call get_token) |
254 | continue; |
255 | } |
256 | |
257 | case token_type::value_float: |
258 | { |
259 | const auto res = m_lexer.get_number_float(); |
260 | |
261 | if (JSON_HEDLEY_UNLIKELY(not std::isfinite(res))) |
262 | { |
263 | return sax->parse_error(m_lexer.get_position(), |
264 | m_lexer.get_token_string(), |
265 | out_of_range::create(406, "number overflow parsing '" + m_lexer.get_token_string() + "'" )); |
266 | } |
267 | |
268 | if (JSON_HEDLEY_UNLIKELY(not sax->number_float(res, m_lexer.get_string()))) |
269 | { |
270 | return false; |
271 | } |
272 | |
273 | break; |
274 | } |
275 | |
276 | case token_type::literal_false: |
277 | { |
278 | if (JSON_HEDLEY_UNLIKELY(not sax->boolean(false))) |
279 | { |
280 | return false; |
281 | } |
282 | break; |
283 | } |
284 | |
285 | case token_type::literal_null: |
286 | { |
287 | if (JSON_HEDLEY_UNLIKELY(not sax->null())) |
288 | { |
289 | return false; |
290 | } |
291 | break; |
292 | } |
293 | |
294 | case token_type::literal_true: |
295 | { |
296 | if (JSON_HEDLEY_UNLIKELY(not sax->boolean(true))) |
297 | { |
298 | return false; |
299 | } |
300 | break; |
301 | } |
302 | |
303 | case token_type::value_integer: |
304 | { |
305 | if (JSON_HEDLEY_UNLIKELY(not sax->number_integer(m_lexer.get_number_integer()))) |
306 | { |
307 | return false; |
308 | } |
309 | break; |
310 | } |
311 | |
312 | case token_type::value_string: |
313 | { |
314 | if (JSON_HEDLEY_UNLIKELY(not sax->string(m_lexer.get_string()))) |
315 | { |
316 | return false; |
317 | } |
318 | break; |
319 | } |
320 | |
321 | case token_type::value_unsigned: |
322 | { |
323 | if (JSON_HEDLEY_UNLIKELY(not sax->number_unsigned(m_lexer.get_number_unsigned()))) |
324 | { |
325 | return false; |
326 | } |
327 | break; |
328 | } |
329 | |
330 | case token_type::parse_error: |
331 | { |
332 | // using "uninitialized" to avoid "expected" message |
333 | return sax->parse_error(m_lexer.get_position(), |
334 | m_lexer.get_token_string(), |
335 | parse_error::create(101, m_lexer.get_position(), |
336 | exception_message(token_type::uninitialized, "value" ))); |
337 | } |
338 | |
339 | default: // the last token was unexpected |
340 | { |
341 | return sax->parse_error(m_lexer.get_position(), |
342 | m_lexer.get_token_string(), |
343 | parse_error::create(101, m_lexer.get_position(), |
344 | exception_message(token_type::literal_or_value, "value" ))); |
345 | } |
346 | } |
347 | } |
348 | else |
349 | { |
350 | skip_to_state_evaluation = false; |
351 | } |
352 | |
353 | // we reached this line after we successfully parsed a value |
354 | if (states.empty()) |
355 | { |
356 | // empty stack: we reached the end of the hierarchy: done |
357 | return true; |
358 | } |
359 | |
360 | if (states.back()) // array |
361 | { |
362 | // comma -> next value |
363 | if (get_token() == token_type::value_separator) |
364 | { |
365 | // parse a new value |
366 | get_token(); |
367 | continue; |
368 | } |
369 | |
370 | // closing ] |
371 | if (JSON_HEDLEY_LIKELY(last_token == token_type::end_array)) |
372 | { |
373 | if (JSON_HEDLEY_UNLIKELY(not sax->end_array())) |
374 | { |
375 | return false; |
376 | } |
377 | |
378 | // We are done with this array. Before we can parse a |
379 | // new value, we need to evaluate the new state first. |
380 | // By setting skip_to_state_evaluation to false, we |
381 | // are effectively jumping to the beginning of this if. |
382 | assert(not states.empty()); |
383 | states.pop_back(); |
384 | skip_to_state_evaluation = true; |
385 | continue; |
386 | } |
387 | |
388 | return sax->parse_error(m_lexer.get_position(), |
389 | m_lexer.get_token_string(), |
390 | parse_error::create(101, m_lexer.get_position(), |
391 | exception_message(token_type::end_array, "array" ))); |
392 | } |
393 | else // object |
394 | { |
395 | // comma -> next value |
396 | if (get_token() == token_type::value_separator) |
397 | { |
398 | // parse key |
399 | if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string)) |
400 | { |
401 | return sax->parse_error(m_lexer.get_position(), |
402 | m_lexer.get_token_string(), |
403 | parse_error::create(101, m_lexer.get_position(), |
404 | exception_message(token_type::value_string, "object key" ))); |
405 | } |
406 | |
407 | if (JSON_HEDLEY_UNLIKELY(not sax->key(m_lexer.get_string()))) |
408 | { |
409 | return false; |
410 | } |
411 | |
412 | // parse separator (:) |
413 | if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) |
414 | { |
415 | return sax->parse_error(m_lexer.get_position(), |
416 | m_lexer.get_token_string(), |
417 | parse_error::create(101, m_lexer.get_position(), |
418 | exception_message(token_type::name_separator, "object separator" ))); |
419 | } |
420 | |
421 | // parse values |
422 | get_token(); |
423 | continue; |
424 | } |
425 | |
426 | // closing } |
427 | if (JSON_HEDLEY_LIKELY(last_token == token_type::end_object)) |
428 | { |
429 | if (JSON_HEDLEY_UNLIKELY(not sax->end_object())) |
430 | { |
431 | return false; |
432 | } |
433 | |
434 | // We are done with this object. Before we can parse a |
435 | // new value, we need to evaluate the new state first. |
436 | // By setting skip_to_state_evaluation to false, we |
437 | // are effectively jumping to the beginning of this if. |
438 | assert(not states.empty()); |
439 | states.pop_back(); |
440 | skip_to_state_evaluation = true; |
441 | continue; |
442 | } |
443 | |
444 | return sax->parse_error(m_lexer.get_position(), |
445 | m_lexer.get_token_string(), |
446 | parse_error::create(101, m_lexer.get_position(), |
447 | exception_message(token_type::end_object, "object" ))); |
448 | } |
449 | } |
450 | } |
451 | |
452 | /// get next token from lexer |
453 | token_type get_token() |
454 | { |
455 | return last_token = m_lexer.scan(); |
456 | } |
457 | |
458 | std::string exception_message(const token_type expected, const std::string& context) |
459 | { |
460 | std::string error_msg = "syntax error " ; |
461 | |
462 | if (not context.empty()) |
463 | { |
464 | error_msg += "while parsing " + context + " " ; |
465 | } |
466 | |
467 | error_msg += "- " ; |
468 | |
469 | if (last_token == token_type::parse_error) |
470 | { |
471 | error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" + |
472 | m_lexer.get_token_string() + "'" ; |
473 | } |
474 | else |
475 | { |
476 | error_msg += "unexpected " + std::string(lexer_t::token_type_name(last_token)); |
477 | } |
478 | |
479 | if (expected != token_type::uninitialized) |
480 | { |
481 | error_msg += "; expected " + std::string(lexer_t::token_type_name(expected)); |
482 | } |
483 | |
484 | return error_msg; |
485 | } |
486 | |
487 | private: |
/// callback function; if it is empty, parse() uses json_sax_dom_parser,
/// otherwise json_sax_dom_callback_parser
const parser_callback_t callback = nullptr;
/// the type of the last read token; updated by every call to get_token()
token_type last_token = token_type::uninitialized;
/// the lexer; constructed from the input adapter passed to the constructor
lexer_t m_lexer;
/// whether to throw exceptions in case of errors; passed on to the SAX
/// handlers created in parse()
const bool allow_exceptions = true;
496 | }; |
497 | } // namespace detail |
498 | } // namespace nlohmann |
499 | |