1#pragma once
2
3#include <array> // array
4#include <clocale> // localeconv
5#include <cstddef> // size_t
6#include <cstdio> // snprintf
7#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
8#include <initializer_list> // initializer_list
9#include <string> // char_traits, string
10#include <utility> // move
11#include <vector> // vector
12
13#include <nlohmann/detail/input/input_adapters.hpp>
14#include <nlohmann/detail/input/position_t.hpp>
15#include <nlohmann/detail/macro_scope.hpp>
16
17namespace nlohmann
18{
19namespace detail
20{
21///////////
22// lexer //
23///////////
24
25/*!
26@brief lexical analysis
27
28This class organizes the lexical analysis during JSON deserialization.
29*/
30template<typename BasicJsonType>
31class lexer
32{
33 using number_integer_t = typename BasicJsonType::number_integer_t;
34 using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
35 using number_float_t = typename BasicJsonType::number_float_t;
36 using string_t = typename BasicJsonType::string_t;
37
38 public:
/// token types for the parser
enum class token_type
{
    uninitialized,    ///< indicating the scanner is uninitialized
    literal_true,     ///< the `true` literal
    literal_false,    ///< the `false` literal
    literal_null,     ///< the `null` literal
    value_string,     ///< a string -- use get_string() for actual value
    value_unsigned,   ///< an unsigned integer -- use get_number_unsigned() for actual value
    value_integer,    ///< a signed integer -- use get_number_integer() for actual value
    value_float,      ///< a floating-point number -- use get_number_float() for actual value
    begin_array,      ///< the character for array begin `[`
    begin_object,     ///< the character for object begin `{`
    end_array,        ///< the character for array end `]`
    end_object,       ///< the character for object end `}`
    name_separator,   ///< the name separator `:`
    value_separator,  ///< the value separator `,`
    parse_error,      ///< indicating a parse error
    end_of_input,     ///< indicating the end of the input buffer
    literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
};
60
61 /// return name of values of type token_type (only used for errors)
62 JSON_HEDLEY_RETURNS_NON_NULL
63 JSON_HEDLEY_CONST
64 static const char* token_type_name(const token_type t) noexcept
65 {
66 switch (t)
67 {
68 case token_type::uninitialized:
69 return "<uninitialized>";
70 case token_type::literal_true:
71 return "true literal";
72 case token_type::literal_false:
73 return "false literal";
74 case token_type::literal_null:
75 return "null literal";
76 case token_type::value_string:
77 return "string literal";
78 case lexer::token_type::value_unsigned:
79 case lexer::token_type::value_integer:
80 case lexer::token_type::value_float:
81 return "number literal";
82 case token_type::begin_array:
83 return "'['";
84 case token_type::begin_object:
85 return "'{'";
86 case token_type::end_array:
87 return "']'";
88 case token_type::end_object:
89 return "'}'";
90 case token_type::name_separator:
91 return "':'";
92 case token_type::value_separator:
93 return "','";
94 case token_type::parse_error:
95 return "<parse error>";
96 case token_type::end_of_input:
97 return "end of input";
98 case token_type::literal_or_value:
99 return "'[', '{', or a literal";
100 // LCOV_EXCL_START
101 default: // catch non-enum values
102 return "unknown token";
103 // LCOV_EXCL_STOP
104 }
105 }
106
107 explicit lexer(detail::input_adapter_t&& adapter)
108 : ia(std::move(adapter)), decimal_point_char(get_decimal_point()) {}
109
110 // delete because of pointer members
111 lexer(const lexer&) = delete;
112 lexer(lexer&&) = delete;
113 lexer& operator=(lexer&) = delete;
114 lexer& operator=(lexer&&) = delete;
115 ~lexer() = default;
116
117 private:
118 /////////////////////
119 // locales
120 /////////////////////
121
122 /// return the locale-dependent decimal point
123 JSON_HEDLEY_PURE
124 static char get_decimal_point() noexcept
125 {
126 const auto loc = localeconv();
127 assert(loc != nullptr);
128 return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
129 }
130
131 /////////////////////
132 // scan functions
133 /////////////////////
134
135 /*!
136 @brief get codepoint from 4 hex characters following `\u`
137
138 For input "\u c1 c2 c3 c4" the codepoint is:
139 (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
140 = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
141
142 Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
143 must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
144 conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
145 between the ASCII value of the character and the desired integer value.
146
147 @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
148 non-hex character)
149 */
150 int get_codepoint()
151 {
152 // this function only makes sense after reading `\u`
153 assert(current == 'u');
154 int codepoint = 0;
155
156 const auto factors = { 12u, 8u, 4u, 0u };
157 for (const auto factor : factors)
158 {
159 get();
160
161 if (current >= '0' and current <= '9')
162 {
163 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
164 }
165 else if (current >= 'A' and current <= 'F')
166 {
167 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
168 }
169 else if (current >= 'a' and current <= 'f')
170 {
171 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
172 }
173 else
174 {
175 return -1;
176 }
177 }
178
179 assert(0x0000 <= codepoint and codepoint <= 0xFFFF);
180 return codepoint;
181 }
182
183 /*!
184 @brief check if the next byte(s) are inside a given range
185
186 Adds the current byte and, for each passed range, reads a new byte and
187 checks if it is inside the range. If a violation was detected, set up an
188 error message and return false. Otherwise, return true.
189
190 @param[in] ranges list of integers; interpreted as list of pairs of
191 inclusive lower and upper bound, respectively
192
193 @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
194 1, 2, or 3 pairs. This precondition is enforced by an assertion.
195
196 @return true if and only if no range violation was detected
197 */
198 bool next_byte_in_range(std::initializer_list<int> ranges)
199 {
200 assert(ranges.size() == 2 or ranges.size() == 4 or ranges.size() == 6);
201 add(current);
202
203 for (auto range = ranges.begin(); range != ranges.end(); ++range)
204 {
205 get();
206 if (JSON_HEDLEY_LIKELY(*range <= current and current <= *(++range)))
207 {
208 add(current);
209 }
210 else
211 {
212 error_message = "invalid string: ill-formed UTF-8 byte";
213 return false;
214 }
215 }
216
217 return true;
218 }
219
220 /*!
221 @brief scan a string literal
222
223 This function scans a string according to Sect. 7 of RFC 7159. While
224 scanning, bytes are escaped and copied into buffer token_buffer. Then the
225 function returns successfully, token_buffer is *not* null-terminated (as it
226 may contain \0 bytes), and token_buffer.size() is the number of bytes in the
227 string.
228
229 @return token_type::value_string if string could be successfully scanned,
230 token_type::parse_error otherwise
231
232 @note In case of errors, variable error_message contains a textual
233 description.
234 */
235 token_type scan_string()
236 {
237 // reset token_buffer (ignore opening quote)
238 reset();
239
240 // we entered the function by reading an open quote
241 assert(current == '\"');
242
243 while (true)
244 {
245 // get next character
246 switch (get())
247 {
248 // end of file while parsing string
249 case std::char_traits<char>::eof():
250 {
251 error_message = "invalid string: missing closing quote";
252 return token_type::parse_error;
253 }
254
255 // closing quote
256 case '\"':
257 {
258 return token_type::value_string;
259 }
260
261 // escapes
262 case '\\':
263 {
264 switch (get())
265 {
266 // quotation mark
267 case '\"':
268 add('\"');
269 break;
270 // reverse solidus
271 case '\\':
272 add('\\');
273 break;
274 // solidus
275 case '/':
276 add('/');
277 break;
278 // backspace
279 case 'b':
280 add('\b');
281 break;
282 // form feed
283 case 'f':
284 add('\f');
285 break;
286 // line feed
287 case 'n':
288 add('\n');
289 break;
290 // carriage return
291 case 'r':
292 add('\r');
293 break;
294 // tab
295 case 't':
296 add('\t');
297 break;
298
299 // unicode escapes
300 case 'u':
301 {
302 const int codepoint1 = get_codepoint();
303 int codepoint = codepoint1; // start with codepoint1
304
305 if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
306 {
307 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
308 return token_type::parse_error;
309 }
310
311 // check if code point is a high surrogate
312 if (0xD800 <= codepoint1 and codepoint1 <= 0xDBFF)
313 {
314 // expect next \uxxxx entry
315 if (JSON_HEDLEY_LIKELY(get() == '\\' and get() == 'u'))
316 {
317 const int codepoint2 = get_codepoint();
318
319 if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
320 {
321 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
322 return token_type::parse_error;
323 }
324
325 // check if codepoint2 is a low surrogate
326 if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 and codepoint2 <= 0xDFFF))
327 {
328 // overwrite codepoint
329 codepoint = static_cast<int>(
330 // high surrogate occupies the most significant 22 bits
331 (static_cast<unsigned int>(codepoint1) << 10u)
332 // low surrogate occupies the least significant 15 bits
333 + static_cast<unsigned int>(codepoint2)
334 // there is still the 0xD800, 0xDC00 and 0x10000 noise
335 // in the result so we have to subtract with:
336 // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
337 - 0x35FDC00u);
338 }
339 else
340 {
341 error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
342 return token_type::parse_error;
343 }
344 }
345 else
346 {
347 error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
348 return token_type::parse_error;
349 }
350 }
351 else
352 {
353 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 and codepoint1 <= 0xDFFF))
354 {
355 error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
356 return token_type::parse_error;
357 }
358 }
359
360 // result of the above calculation yields a proper codepoint
361 assert(0x00 <= codepoint and codepoint <= 0x10FFFF);
362
363 // translate codepoint into bytes
364 if (codepoint < 0x80)
365 {
366 // 1-byte characters: 0xxxxxxx (ASCII)
367 add(codepoint);
368 }
369 else if (codepoint <= 0x7FF)
370 {
371 // 2-byte characters: 110xxxxx 10xxxxxx
372 add(static_cast<int>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
373 add(static_cast<int>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
374 }
375 else if (codepoint <= 0xFFFF)
376 {
377 // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
378 add(static_cast<int>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
379 add(static_cast<int>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
380 add(static_cast<int>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
381 }
382 else
383 {
384 // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
385 add(static_cast<int>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
386 add(static_cast<int>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
387 add(static_cast<int>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
388 add(static_cast<int>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
389 }
390
391 break;
392 }
393
394 // other characters after escape
395 default:
396 error_message = "invalid string: forbidden character after backslash";
397 return token_type::parse_error;
398 }
399
400 break;
401 }
402
403 // invalid control characters
404 case 0x00:
405 {
406 error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
407 return token_type::parse_error;
408 }
409
410 case 0x01:
411 {
412 error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
413 return token_type::parse_error;
414 }
415
416 case 0x02:
417 {
418 error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
419 return token_type::parse_error;
420 }
421
422 case 0x03:
423 {
424 error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
425 return token_type::parse_error;
426 }
427
428 case 0x04:
429 {
430 error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
431 return token_type::parse_error;
432 }
433
434 case 0x05:
435 {
436 error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
437 return token_type::parse_error;
438 }
439
440 case 0x06:
441 {
442 error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
443 return token_type::parse_error;
444 }
445
446 case 0x07:
447 {
448 error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
449 return token_type::parse_error;
450 }
451
452 case 0x08:
453 {
454 error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
455 return token_type::parse_error;
456 }
457
458 case 0x09:
459 {
460 error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
461 return token_type::parse_error;
462 }
463
464 case 0x0A:
465 {
466 error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
467 return token_type::parse_error;
468 }
469
470 case 0x0B:
471 {
472 error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
473 return token_type::parse_error;
474 }
475
476 case 0x0C:
477 {
478 error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
479 return token_type::parse_error;
480 }
481
482 case 0x0D:
483 {
484 error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
485 return token_type::parse_error;
486 }
487
488 case 0x0E:
489 {
490 error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
491 return token_type::parse_error;
492 }
493
494 case 0x0F:
495 {
496 error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
497 return token_type::parse_error;
498 }
499
500 case 0x10:
501 {
502 error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
503 return token_type::parse_error;
504 }
505
506 case 0x11:
507 {
508 error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
509 return token_type::parse_error;
510 }
511
512 case 0x12:
513 {
514 error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
515 return token_type::parse_error;
516 }
517
518 case 0x13:
519 {
520 error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
521 return token_type::parse_error;
522 }
523
524 case 0x14:
525 {
526 error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
527 return token_type::parse_error;
528 }
529
530 case 0x15:
531 {
532 error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
533 return token_type::parse_error;
534 }
535
536 case 0x16:
537 {
538 error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
539 return token_type::parse_error;
540 }
541
542 case 0x17:
543 {
544 error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
545 return token_type::parse_error;
546 }
547
548 case 0x18:
549 {
550 error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
551 return token_type::parse_error;
552 }
553
554 case 0x19:
555 {
556 error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
557 return token_type::parse_error;
558 }
559
560 case 0x1A:
561 {
562 error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
563 return token_type::parse_error;
564 }
565
566 case 0x1B:
567 {
568 error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
569 return token_type::parse_error;
570 }
571
572 case 0x1C:
573 {
574 error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
575 return token_type::parse_error;
576 }
577
578 case 0x1D:
579 {
580 error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
581 return token_type::parse_error;
582 }
583
584 case 0x1E:
585 {
586 error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
587 return token_type::parse_error;
588 }
589
590 case 0x1F:
591 {
592 error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
593 return token_type::parse_error;
594 }
595
596 // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
597 case 0x20:
598 case 0x21:
599 case 0x23:
600 case 0x24:
601 case 0x25:
602 case 0x26:
603 case 0x27:
604 case 0x28:
605 case 0x29:
606 case 0x2A:
607 case 0x2B:
608 case 0x2C:
609 case 0x2D:
610 case 0x2E:
611 case 0x2F:
612 case 0x30:
613 case 0x31:
614 case 0x32:
615 case 0x33:
616 case 0x34:
617 case 0x35:
618 case 0x36:
619 case 0x37:
620 case 0x38:
621 case 0x39:
622 case 0x3A:
623 case 0x3B:
624 case 0x3C:
625 case 0x3D:
626 case 0x3E:
627 case 0x3F:
628 case 0x40:
629 case 0x41:
630 case 0x42:
631 case 0x43:
632 case 0x44:
633 case 0x45:
634 case 0x46:
635 case 0x47:
636 case 0x48:
637 case 0x49:
638 case 0x4A:
639 case 0x4B:
640 case 0x4C:
641 case 0x4D:
642 case 0x4E:
643 case 0x4F:
644 case 0x50:
645 case 0x51:
646 case 0x52:
647 case 0x53:
648 case 0x54:
649 case 0x55:
650 case 0x56:
651 case 0x57:
652 case 0x58:
653 case 0x59:
654 case 0x5A:
655 case 0x5B:
656 case 0x5D:
657 case 0x5E:
658 case 0x5F:
659 case 0x60:
660 case 0x61:
661 case 0x62:
662 case 0x63:
663 case 0x64:
664 case 0x65:
665 case 0x66:
666 case 0x67:
667 case 0x68:
668 case 0x69:
669 case 0x6A:
670 case 0x6B:
671 case 0x6C:
672 case 0x6D:
673 case 0x6E:
674 case 0x6F:
675 case 0x70:
676 case 0x71:
677 case 0x72:
678 case 0x73:
679 case 0x74:
680 case 0x75:
681 case 0x76:
682 case 0x77:
683 case 0x78:
684 case 0x79:
685 case 0x7A:
686 case 0x7B:
687 case 0x7C:
688 case 0x7D:
689 case 0x7E:
690 case 0x7F:
691 {
692 add(current);
693 break;
694 }
695
696 // U+0080..U+07FF: bytes C2..DF 80..BF
697 case 0xC2:
698 case 0xC3:
699 case 0xC4:
700 case 0xC5:
701 case 0xC6:
702 case 0xC7:
703 case 0xC8:
704 case 0xC9:
705 case 0xCA:
706 case 0xCB:
707 case 0xCC:
708 case 0xCD:
709 case 0xCE:
710 case 0xCF:
711 case 0xD0:
712 case 0xD1:
713 case 0xD2:
714 case 0xD3:
715 case 0xD4:
716 case 0xD5:
717 case 0xD6:
718 case 0xD7:
719 case 0xD8:
720 case 0xD9:
721 case 0xDA:
722 case 0xDB:
723 case 0xDC:
724 case 0xDD:
725 case 0xDE:
726 case 0xDF:
727 {
728 if (JSON_HEDLEY_UNLIKELY(not next_byte_in_range({0x80, 0xBF})))
729 {
730 return token_type::parse_error;
731 }
732 break;
733 }
734
735 // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
736 case 0xE0:
737 {
738 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
739 {
740 return token_type::parse_error;
741 }
742 break;
743 }
744
745 // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
746 // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
747 case 0xE1:
748 case 0xE2:
749 case 0xE3:
750 case 0xE4:
751 case 0xE5:
752 case 0xE6:
753 case 0xE7:
754 case 0xE8:
755 case 0xE9:
756 case 0xEA:
757 case 0xEB:
758 case 0xEC:
759 case 0xEE:
760 case 0xEF:
761 {
762 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
763 {
764 return token_type::parse_error;
765 }
766 break;
767 }
768
769 // U+D000..U+D7FF: bytes ED 80..9F 80..BF
770 case 0xED:
771 {
772 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
773 {
774 return token_type::parse_error;
775 }
776 break;
777 }
778
779 // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
780 case 0xF0:
781 {
782 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
783 {
784 return token_type::parse_error;
785 }
786 break;
787 }
788
789 // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
790 case 0xF1:
791 case 0xF2:
792 case 0xF3:
793 {
794 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
795 {
796 return token_type::parse_error;
797 }
798 break;
799 }
800
801 // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
802 case 0xF4:
803 {
804 if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
805 {
806 return token_type::parse_error;
807 }
808 break;
809 }
810
811 // remaining bytes (80..C1 and F5..FF) are ill-formed
812 default:
813 {
814 error_message = "invalid string: ill-formed UTF-8 byte";
815 return token_type::parse_error;
816 }
817 }
818 }
819 }
820
// Overload set that lets scan_number() convert token_buffer into whichever
// floating-point type number_float_t is: the overload is selected by the
// type of the output parameter @a f, and each one forwards to the matching
// C library converter.
// NOTE(review): JSON_HEDLEY_NON_NULL(2) presumably marks the second
// parameter (str) as non-null -- confirm against the macro definition.

/// convert @a str to float via std::strtof; result is stored in @a f,
/// @a endptr receives the position where parsing stopped
JSON_HEDLEY_NON_NULL(2)
static void strtof(float& f, const char* str, char** endptr) noexcept
{
    f = std::strtof(str, endptr);
}

/// convert @a str to double via std::strtod; result is stored in @a f,
/// @a endptr receives the position where parsing stopped
JSON_HEDLEY_NON_NULL(2)
static void strtof(double& f, const char* str, char** endptr) noexcept
{
    f = std::strtod(str, endptr);
}

/// convert @a str to long double via std::strtold; result is stored in @a f,
/// @a endptr receives the position where parsing stopped
JSON_HEDLEY_NON_NULL(2)
static void strtof(long double& f, const char* str, char** endptr) noexcept
{
    f = std::strtold(str, endptr);
}
838
/*!
@brief scan a number literal

This function scans a string according to Sect. 6 of RFC 7159.

The function is realized with a deterministic finite state machine derived
from the grammar described in RFC 7159. Starting in state "init", the
input is read and used to determine the next state. Only state "done"
accepts the number. State "error" is a trap state to model errors. In the
table below, "anything" means any character but the ones listed before.

state    | 0        | 1-9      | e E      | +       | -       | .        | anything
---------|----------|----------|----------|---------|---------|----------|-----------
init     | zero     | any1     | [error]  | [error] | minus   | [error]  | [error]
minus    | zero     | any1     | [error]  | [error] | [error] | [error]  | [error]
zero     | done     | done     | exponent | done    | done    | decimal1 | done
any1     | any1     | any1     | exponent | done    | done    | decimal1 | done
decimal1 | decimal2 | [error]  | [error]  | [error] | [error] | [error]  | [error]
decimal2 | decimal2 | decimal2 | exponent | done    | done    | done     | done
exponent | any2     | any2     | [error]  | sign    | sign    | [error]  | [error]
sign     | any2     | any2     | [error]  | [error] | [error] | [error]  | [error]
any2     | any2     | any2     | done     | done    | done    | done     | done

The state machine is realized with one label per state (prefixed with
"scan_number_") and `goto` statements between them. The state machine
contains cycles, but any cycle can be left when EOF is read. Therefore,
the function is guaranteed to terminate.

During scanning, the read bytes are stored in token_buffer. This string is
then converted to a signed integer, an unsigned integer, or a
floating-point number. The converted value is stored in value_unsigned,
value_integer, or value_float, matching the returned token type. If an
integer conversion reports an error or does not fit the target integer
type, the literal is converted as a floating-point number instead.

@return token_type::value_unsigned, token_type::value_integer, or
        token_type::value_float if number could be successfully scanned,
        token_type::parse_error otherwise

@note The scanner is independent of the current locale. Internally, the
      locale's decimal point is used instead of `.` to work with the
      locale-dependent converters.

@note In case of errors, variable error_message contains a textual
      description.
*/
token_type scan_number()  // lgtm [cpp/use-of-goto]
{
    // reset token_buffer to store the number's bytes
    reset();

    // the type of the parsed number; initially set to unsigned; will be
    // changed if minus sign, decimal point or exponent is read
    token_type number_type = token_type::value_unsigned;

    // state (init): we just found out we need to scan a number
    switch (current)
    {
        case '-':
        {
            add(current);
            goto scan_number_minus;
        }

        case '0':
        {
            add(current);
            goto scan_number_zero;
        }

        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        {
            add(current);
            goto scan_number_any1;
        }

        // all other characters are rejected outside scan_number()
        default:            // LCOV_EXCL_LINE
            assert(false);  // LCOV_EXCL_LINE
    }

scan_number_minus:
    // state: we just parsed a leading minus sign
    number_type = token_type::value_integer;
    switch (get())
    {
        case '0':
        {
            add(current);
            goto scan_number_zero;
        }

        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        {
            add(current);
            goto scan_number_any1;
        }

        default:
        {
            error_message = "invalid number; expected digit after '-'";
            return token_type::parse_error;
        }
    }

scan_number_zero:
    // state: we just parsed a zero (maybe with a leading minus sign)
    switch (get())
    {
        case '.':
        {
            // store the locale's decimal point so that the locale-dependent
            // converters used in state "done" accept the buffer
            add(decimal_point_char);
            goto scan_number_decimal1;
        }

        case 'e':
        case 'E':
        {
            add(current);
            goto scan_number_exponent;
        }

        default:
            goto scan_number_done;
    }

scan_number_any1:
    // state: we just parsed a number 0-9 (maybe with a leading minus sign)
    switch (get())
    {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        {
            add(current);
            goto scan_number_any1;
        }

        case '.':
        {
            // store the locale's decimal point instead of '.' (see above)
            add(decimal_point_char);
            goto scan_number_decimal1;
        }

        case 'e':
        case 'E':
        {
            add(current);
            goto scan_number_exponent;
        }

        default:
            goto scan_number_done;
    }

scan_number_decimal1:
    // state: we just parsed a decimal point
    number_type = token_type::value_float;
    switch (get())
    {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        {
            add(current);
            goto scan_number_decimal2;
        }

        default:
        {
            error_message = "invalid number; expected digit after '.'";
            return token_type::parse_error;
        }
    }

scan_number_decimal2:
    // state: we just parsed at least one digit after a decimal point
    switch (get())
    {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        {
            add(current);
            goto scan_number_decimal2;
        }

        case 'e':
        case 'E':
        {
            add(current);
            goto scan_number_exponent;
        }

        default:
            goto scan_number_done;
    }

scan_number_exponent:
    // state: we just parsed an exponent marker ('e' or 'E')
    number_type = token_type::value_float;
    switch (get())
    {
        case '+':
        case '-':
        {
            add(current);
            goto scan_number_sign;
        }

        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        {
            add(current);
            goto scan_number_any2;
        }

        default:
        {
            error_message =
                "invalid number; expected '+', '-', or digit after exponent";
            return token_type::parse_error;
        }
    }

scan_number_sign:
    // state: we just parsed an exponent sign
    switch (get())
    {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        {
            add(current);
            goto scan_number_any2;
        }

        default:
        {
            error_message = "invalid number; expected digit after exponent sign";
            return token_type::parse_error;
        }
    }

scan_number_any2:
    // state: we just parsed a digit after the exponent or exponent sign
    switch (get())
    {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        {
            add(current);
            goto scan_number_any2;
        }

        default:
            goto scan_number_done;
    }

scan_number_done:
    // unget the character after the number (we only read it to know that
    // we are done scanning a number)
    unget();

    char* endptr = nullptr;
    // clear errno so that an error reported by the integer conversions below
    // can be told apart from a stale value
    errno = 0;

    // try to parse integers first and fall back to floats
    if (number_type == token_type::value_unsigned)
    {
        const auto x = std::strtoull(token_buffer.data(), &endptr, 10);

        // we checked the number format before
        assert(endptr == token_buffer.data() + token_buffer.size());

        if (errno == 0)
        {
            // accept only if the value survives the conversion to
            // number_unsigned_t unchanged
            value_unsigned = static_cast<number_unsigned_t>(x);
            if (value_unsigned == x)
            {
                return token_type::value_unsigned;
            }
        }
    }
    else if (number_type == token_type::value_integer)
    {
        const auto x = std::strtoll(token_buffer.data(), &endptr, 10);

        // we checked the number format before
        assert(endptr == token_buffer.data() + token_buffer.size());

        if (errno == 0)
        {
            // accept only if the value survives the conversion to
            // number_integer_t unchanged
            value_integer = static_cast<number_integer_t>(x);
            if (value_integer == x)
            {
                return token_type::value_integer;
            }
        }
    }

    // this code is reached if we parse a floating-point number or if an
    // integer conversion above failed
    strtof(value_float, token_buffer.data(), &endptr);

    // we checked the number format before
    assert(endptr == token_buffer.data() + token_buffer.size());

    return token_type::value_float;
}
1203
1204 /*!
1205 @param[in] literal_text the literal text to expect
1206 @param[in] length the length of the passed literal text
1207 @param[in] return_type the token type to return on success
1208 */
1209 JSON_HEDLEY_NON_NULL(2)
1210 token_type scan_literal(const char* literal_text, const std::size_t length,
1211 token_type return_type)
1212 {
1213 assert(current == literal_text[0]);
1214 for (std::size_t i = 1; i < length; ++i)
1215 {
1216 if (JSON_HEDLEY_UNLIKELY(get() != literal_text[i]))
1217 {
1218 error_message = "invalid literal";
1219 return token_type::parse_error;
1220 }
1221 }
1222 return return_type;
1223 }
1224
1225 /////////////////////
1226 // input management
1227 /////////////////////
1228
1229 /// reset token_buffer; current character is beginning of token
1230 void reset() noexcept
1231 {
1232 token_buffer.clear();
1233 token_string.clear();
1234 token_string.push_back(std::char_traits<char>::to_char_type(current));
1235 }
1236
1237 /*
1238 @brief get next character from the input
1239
1240 This function provides the interface to the used input adapter. It does
1241 not throw in case the input reached EOF, but returns a
1242 `std::char_traits<char>::eof()` in that case. Stores the scanned characters
1243 for use in error messages.
1244
1245 @return character read from the input
1246 */
1247 std::char_traits<char>::int_type get()
1248 {
1249 ++position.chars_read_total;
1250 ++position.chars_read_current_line;
1251
1252 if (next_unget)
1253 {
1254 // just reset the next_unget variable and work with current
1255 next_unget = false;
1256 }
1257 else
1258 {
1259 current = ia->get_character();
1260 }
1261
1262 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char>::eof()))
1263 {
1264 token_string.push_back(std::char_traits<char>::to_char_type(current));
1265 }
1266
1267 if (current == '\n')
1268 {
1269 ++position.lines_read;
1270 position.chars_read_current_line = 0;
1271 }
1272
1273 return current;
1274 }
1275
1276 /*!
1277 @brief unget current character (read it again on next get)
1278
1279 We implement unget by setting variable next_unget to true. The input is not
1280 changed - we just simulate ungetting by modifying chars_read_total,
1281 chars_read_current_line, and token_string. The next call to get() will
1282 behave as if the unget character is read again.
1283 */
1284 void unget()
1285 {
1286 next_unget = true;
1287
1288 --position.chars_read_total;
1289
1290 // in case we "unget" a newline, we have to also decrement the lines_read
1291 if (position.chars_read_current_line == 0)
1292 {
1293 if (position.lines_read > 0)
1294 {
1295 --position.lines_read;
1296 }
1297 }
1298 else
1299 {
1300 --position.chars_read_current_line;
1301 }
1302
1303 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char>::eof()))
1304 {
1305 assert(not token_string.empty());
1306 token_string.pop_back();
1307 }
1308 }
1309
1310 /// add a character to token_buffer
1311 void add(int c)
1312 {
1313 token_buffer.push_back(std::char_traits<char>::to_char_type(c));
1314 }
1315
1316 public:
1317 /////////////////////
1318 // value getters
1319 /////////////////////
1320
1321 /// return integer value
1322 constexpr number_integer_t get_number_integer() const noexcept
1323 {
1324 return value_integer;
1325 }
1326
1327 /// return unsigned integer value
1328 constexpr number_unsigned_t get_number_unsigned() const noexcept
1329 {
1330 return value_unsigned;
1331 }
1332
1333 /// return floating-point value
1334 constexpr number_float_t get_number_float() const noexcept
1335 {
1336 return value_float;
1337 }
1338
1339 /// return current string value (implicitly resets the token; useful only once)
1340 string_t& get_string()
1341 {
1342 return token_buffer;
1343 }
1344
1345 /////////////////////
1346 // diagnostics
1347 /////////////////////
1348
1349 /// return position of last read token
1350 constexpr position_t get_position() const noexcept
1351 {
1352 return position;
1353 }
1354
1355 /// return the last read token (for errors only). Will never contain EOF
1356 /// (an arbitrary value that is not a valid char value, often -1), because
1357 /// 255 may legitimately occur. May contain NUL, which should be escaped.
1358 std::string get_token_string() const
1359 {
1360 // escape control characters
1361 std::string result;
1362 for (const auto c : token_string)
1363 {
1364 if ('\x00' <= c and c <= '\x1F')
1365 {
1366 // escape control characters
1367 std::array<char, 9> cs{{}};
1368 (std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c));
1369 result += cs.data();
1370 }
1371 else
1372 {
1373 // add character as is
1374 result.push_back(c);
1375 }
1376 }
1377
1378 return result;
1379 }
1380
1381 /// return syntax error message
1382 JSON_HEDLEY_RETURNS_NON_NULL
1383 constexpr const char* get_error_message() const noexcept
1384 {
1385 return error_message;
1386 }
1387
1388 /////////////////////
1389 // actual scanner
1390 /////////////////////
1391
1392 /*!
1393 @brief skip the UTF-8 byte order mark
1394 @return true iff there is no BOM or the correct BOM has been skipped
1395 */
1396 bool skip_bom()
1397 {
1398 if (get() == 0xEF)
1399 {
1400 // check if we completely parse the BOM
1401 return get() == 0xBB and get() == 0xBF;
1402 }
1403
1404 // the first character is not the beginning of the BOM; unget it to
1405 // process is later
1406 unget();
1407 return true;
1408 }
1409
1410 token_type scan()
1411 {
1412 // initially, skip the BOM
1413 if (position.chars_read_total == 0 and not skip_bom())
1414 {
1415 error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1416 return token_type::parse_error;
1417 }
1418
1419 // read next character and ignore whitespace
1420 do
1421 {
1422 get();
1423 }
1424 while (current == ' ' or current == '\t' or current == '\n' or current == '\r');
1425
1426 switch (current)
1427 {
1428 // structural characters
1429 case '[':
1430 return token_type::begin_array;
1431 case ']':
1432 return token_type::end_array;
1433 case '{':
1434 return token_type::begin_object;
1435 case '}':
1436 return token_type::end_object;
1437 case ':':
1438 return token_type::name_separator;
1439 case ',':
1440 return token_type::value_separator;
1441
1442 // literals
1443 case 't':
1444 return scan_literal("true", 4, token_type::literal_true);
1445 case 'f':
1446 return scan_literal("false", 5, token_type::literal_false);
1447 case 'n':
1448 return scan_literal("null", 4, token_type::literal_null);
1449
1450 // string
1451 case '\"':
1452 return scan_string();
1453
1454 // number
1455 case '-':
1456 case '0':
1457 case '1':
1458 case '2':
1459 case '3':
1460 case '4':
1461 case '5':
1462 case '6':
1463 case '7':
1464 case '8':
1465 case '9':
1466 return scan_number();
1467
1468 // end of input (the null byte is needed when parsing from
1469 // string literals)
1470 case '\0':
1471 case std::char_traits<char>::eof():
1472 return token_type::end_of_input;
1473
1474 // error
1475 default:
1476 error_message = "invalid literal";
1477 return token_type::parse_error;
1478 }
1479 }
1480
1481 private:
1482 /// input adapter
1483 detail::input_adapter_t ia = nullptr;
1484
1485 /// the current character
1486 std::char_traits<char>::int_type current = std::char_traits<char>::eof();
1487
1488 /// whether the next get() call should just return current
1489 bool next_unget = false;
1490
1491 /// the start position of the current token
1492 position_t position {};
1493
1494 /// raw input token string (for error messages)
1495 std::vector<char> token_string {};
1496
1497 /// buffer for variable-length tokens (numbers, strings)
1498 string_t token_buffer {};
1499
1500 /// a description of occurred lexer errors
1501 const char* error_message = "";
1502
1503 // number values
1504 number_integer_t value_integer = 0;
1505 number_unsigned_t value_unsigned = 0;
1506 number_float_t value_float = 0;
1507
1508 /// the decimal point
1509 const char decimal_point_char = '.';
1510};
1511} // namespace detail
1512} // namespace nlohmann
1513