lexer.hpp source code [DeepinIDE/3rdparty/cppdap/third_party/json/include/nlohmann/detail/input/lexer.hpp]

1	#pragma once
2
3	#include <array> // array
4	#include <clocale> // localeconv
5	#include <cstddef> // size_t
6	#include <cstdio> // snprintf
7	#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
8	#include <initializer_list> // initializer_list
9	#include <string> // char_traits, string
10	#include <utility> // move
11	#include <vector> // vector
12
13	#include <nlohmann/detail/input/input_adapters.hpp>
14	#include <nlohmann/detail/input/position_t.hpp>
15	#include <nlohmann/detail/macro_scope.hpp>
16
17	namespace nlohmann
18	{
19	namespace detail
20	{
21	///////////
22	// lexer //
23	///////////
24
/*!
@brief lexical analysis

This class organizes the lexical analysis during JSON deserialization.
*/
30	template<typename BasicJsonType>
31	class lexer
32	{
33	using number_integer_t = typename BasicJsonType::number_integer_t;
34	using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
35	using number_float_t = typename BasicJsonType::number_float_t;
36	using string_t = typename BasicJsonType::string_t;
37
38	public:
/// token types for the parser
enum class token_type
{
    uninitialized,    ///< indicating the scanner is uninitialized
    literal_true,     ///< the `true` literal
    literal_false,    ///< the `false` literal
    literal_null,     ///< the `null` literal
    value_string,     ///< a string -- use get_string() for actual value
    value_unsigned,   ///< an unsigned integer -- use get_number_unsigned() for actual value
    value_integer,    ///< a signed integer -- use get_number_integer() for actual value
    value_float,      ///< a floating-point number -- use get_number_float() for actual value
    begin_array,      ///< the character for array begin `[`
    begin_object,     ///< the character for object begin `{`
    end_array,        ///< the character for array end `]`
    end_object,       ///< the character for object end `}`
    name_separator,   ///< the name separator `:`
    value_separator,  ///< the value separator `,`
    parse_error,      ///< indicating a parse error
    end_of_input,     ///< indicating the end of the input buffer
    literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
};
60
61	/// return name of values of type token_type (only used for errors)
62	JSON_HEDLEY_RETURNS_NON_NULL
63	JSON_HEDLEY_CONST
64	static const char* token_type_name(const token_type t) noexcept
65	{
66	switch (t)
67	{
68	case token_type::uninitialized:
69	return "<uninitialized>";
70	case token_type::literal_true:
71	return "true literal";
72	case token_type::literal_false:
73	return "false literal";
74	case token_type::literal_null:
75	return "null literal";
76	case token_type::value_string:
77	return "string literal";
78	case lexer::token_type::value_unsigned:
79	case lexer::token_type::value_integer:
80	case lexer::token_type::value_float:
81	return "number literal";
82	case token_type::begin_array:
83	return "'['";
84	case token_type::begin_object:
85	return "'{'";
86	case token_type::end_array:
87	return "']'";
88	case token_type::end_object:
89	return "'}'";
90	case token_type::name_separator:
91	return "':'";
92	case token_type::value_separator:
93	return "','";
94	case token_type::parse_error:
95	return "<parse error>";
96	case token_type::end_of_input:
97	return "end of input";
98	case token_type::literal_or_value:
99	return "'[', '{', or a literal";
100	// LCOV_EXCL_START
101	default: // catch non-enum values
102	return "unknown token";
103	// LCOV_EXCL_STOP
104	}
105	}
106
107	explicit lexer(detail::input_adapter_t&& adapter)
108	: ia (std::move(adapter)), decimal_point_char(get_decimal_point()) {}
109
110	// delete because of pointer members
111	lexer(const lexer&) = delete;
112	lexer(lexer&&) = delete;
113	lexer& operator=(lexer&) = delete;
114	lexer& operator=(lexer&&) = delete;
115	~lexer() = default;
116
117	private:
118	/////////////////////
119	// locales
120	/////////////////////
121
122	/// return the locale-dependent decimal point
123	JSON_HEDLEY_PURE
124	static char get_decimal_point() noexcept
125	{
126	const auto loc = localeconv();
127	assert(loc != nullptr);
128	return (loc->decimal_point == nullptr) ? `'.'` : *(loc->decimal_point);
129	}
130
131	/////////////////////
132	// scan functions
133	/////////////////////
134
135	/!*
136	@brief get codepoint from 4 hex characters following `\u`
137
138	For input "\u c1 c2 c3 c4" the codepoint is:
139	(c1 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4*
140	= (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
141
142	Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
143	must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
144	conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
145	between the ASCII value of the character and the desired integer value.
146
147	@return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
148	non-hex character)
149	*/
150	int get_codepoint()
151	{
152	// this function only makes sense after reading `\u`
153	assert(current == `'u'`);
154	int codepoint = `0`;
155
156	const auto factors = { `12u`, `8u`, `4u`, `0u` };
157	for (const auto factor : factors)
158	{
159	get();
160
161	if (current >= `'0'` and current <= `'9'`)
162	{
163	codepoint += static_cast<int>((static_cast<unsigned int>(current) - `0x30u`) << factor);
164	}
165	else if (current >= `'A'` and current <= `'F'`)
166	{
167	codepoint += static_cast<int>((static_cast<unsigned int>(current) - `0x37u`) << factor);
168	}
169	else if (current >= `'a'` and current <= `'f'`)
170	{
171	codepoint += static_cast<int>((static_cast<unsigned int>(current) - `0x57u`) << factor);
172	}
173	else
174	{
175	return -`1`;
176	}
177	}
178
179	assert(`0x0000` <= codepoint and codepoint <= `0xFFFF`);
180	return codepoint;
181	}
182
183	/!*
184	@brief check if the next byte(s) are inside a given range
185
186	Adds the current byte and, for each passed range, reads a new byte and
187	checks if it is inside the range. If a violation was detected, set up an
188	error message and return false. Otherwise, return true.
189
190	@param[in] ranges list of integers; interpreted as list of pairs of
191	inclusive lower and upper bound, respectively
192
193	@pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
194	1, 2, or 3 pairs. This precondition is enforced by an assertion.
195
196	@return true if and only if no range violation was detected
197	*/
198	bool next_byte_in_range(std::initializer_list<int> ranges)
199	{
200	assert(ranges.size() == `2` or ranges.size() == `4` or ranges.size() == `6`);
201	add(current);
202
203	for (auto range = ranges.begin(); range != ranges.end(); ++range)
204	{
205	get();
206	if (JSON_HEDLEY_LIKELY(range <= current and* current <= *(++range)))
207	{
208	add(current);
209	}
210	else
211	{
212	error_message = "invalid string: ill-formed UTF-8 byte";
213	return false;
214	}
215	}
216
217	return true;
218	}
219
220	/!*
221	@brief scan a string literal
222
223	This function scans a string according to Sect. 7 of RFC 7159. While
224	scanning, bytes are escaped and copied into buffer token_buffer. Then the
225	function returns successfully, token_buffer is not* null-terminated (as it*
226	may contain \0 bytes), and token_buffer.size() is the number of bytes in the
227	string.
228
229	@return token_type::value_string if string could be successfully scanned,
230	token_type::parse_error otherwise
231
232	@note In case of errors, variable error_message contains a textual
233	description.
234	*/
235	token_type scan_string()
236	{
237	// reset token_buffer (ignore opening quote)
238	reset();
239
240	// we entered the function by reading an open quote
241	assert(current == `'\"'`);
242
243	while (true)
244	{
245	// get next character
246	switch (get())
247	{
248	// end of file while parsing string
249	case std::char_traits<char>::eof():
250	{
251	error_message = "invalid string: missing closing quote";
252	return token_type::parse_error;
253	}
254
255	// closing quote
256	case `'\"'`:
257	{
258	return token_type::value_string;
259	}
260
261	// escapes
262	case `'\\'`:
263	{
264	switch (get())
265	{
266	// quotation mark
267	case `'\"'`:
268	add(`'\"'`);
269	break;
270	// reverse solidus
271	case `'\\'`:
272	add(`'\\'`);
273	break;
274	// solidus
275	case `'/'`:
276	add(`'/'`);
277	break;
278	// backspace
279	case `'b'`:
280	add(`'\b'`);
281	break;
282	// form feed
283	case `'f'`:
284	add(`'\f'`);
285	break;
286	// line feed
287	case `'n'`:
288	add(`'\n'`);
289	break;
290	// carriage return
291	case `'r'`:
292	add(`'\r'`);
293	break;
294	// tab
295	case `'t'`:
296	add(`'\t'`);
297	break;
298
299	// unicode escapes
300	case `'u'`:
301	{
302	const int codepoint1 = get_codepoint();
303	int codepoint = codepoint1; // start with codepoint1
304
305	if (JSON_HEDLEY_UNLIKELY(codepoint1 == -`1`))
306	{
307	error_message = "invalid string: '\\u' must be followed by 4 hex digits";
308	return token_type::parse_error;
309	}
310
311	// check if code point is a high surrogate
312	if (`0xD800` <= codepoint1 and codepoint1 <= `0xDBFF`)
313	{
314	// expect next \uxxxx entry
315	if (JSON_HEDLEY_LIKELY(get() == `'\\'` and get() == `'u'`))
316	{
317	const int codepoint2 = get_codepoint();
318
319	if (JSON_HEDLEY_UNLIKELY(codepoint2 == -`1`))
320	{
321	error_message = "invalid string: '\\u' must be followed by 4 hex digits";
322	return token_type::parse_error;
323	}
324
325	// check if codepoint2 is a low surrogate
326	if (JSON_HEDLEY_LIKELY(`0xDC00` <= codepoint2 and codepoint2 <= `0xDFFF`))
327	{
328	// overwrite codepoint
329	codepoint = static_cast<int>(
330	// high surrogate occupies the most significant 22 bits
331	(static_cast<unsigned int>(codepoint1) << `10u`)
332	// low surrogate occupies the least significant 15 bits
333	+ static_cast<unsigned int>(codepoint2)
334	// there is still the 0xD800, 0xDC00 and 0x10000 noise
335	// in the result so we have to subtract with:
336	// (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
337	- `0x35FDC00u`);
338	}
339	else
340	{
341	error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
342	return token_type::parse_error;
343	}
344	}
345	else
346	{
347	error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
348	return token_type::parse_error;
349	}
350	}
351	else
352	{
353	if (JSON_HEDLEY_UNLIKELY(`0xDC00` <= codepoint1 and codepoint1 <= `0xDFFF`))
354	{
355	error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
356	return token_type::parse_error;
357	}
358	}
359
360	// result of the above calculation yields a proper codepoint
361	assert(`0x00` <= codepoint and codepoint <= `0x10FFFF`);
362
363	// translate codepoint into bytes
364	if (codepoint < `0x80`)
365	{
366	// 1-byte characters: 0xxxxxxx (ASCII)
367	add(codepoint);
368	}
369	else if (codepoint <= `0x7FF`)
370	{
371	// 2-byte characters: 110xxxxx 10xxxxxx
372	add(static_cast<int>(`0xC0u` \| (static_cast<unsigned int>(codepoint) >> `6u`)));
373	add(static_cast<int>(`0x80u` \| (static_cast<unsigned int>(codepoint) & `0x3Fu`)));
374	}
375	else if (codepoint <= `0xFFFF`)
376	{
377	// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
378	add(static_cast<int>(`0xE0u` \| (static_cast<unsigned int>(codepoint) >> `12u`)));
379	add(static_cast<int>(`0x80u` \| ((static_cast<unsigned int>(codepoint) >> `6u`) & `0x3Fu`)));
380	add(static_cast<int>(`0x80u` \| (static_cast<unsigned int>(codepoint) & `0x3Fu`)));
381	}
382	else
383	{
384	// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
385	add(static_cast<int>(`0xF0u` \| (static_cast<unsigned int>(codepoint) >> `18u`)));
386	add(static_cast<int>(`0x80u` \| ((static_cast<unsigned int>(codepoint) >> `12u`) & `0x3Fu`)));
387	add(static_cast<int>(`0x80u` \| ((static_cast<unsigned int>(codepoint) >> `6u`) & `0x3Fu`)));
388	add(static_cast<int>(`0x80u` \| (static_cast<unsigned int>(codepoint) & `0x3Fu`)));
389	}
390
391	break;
392	}
393
394	// other characters after escape
395	default:
396	error_message = "invalid string: forbidden character after backslash";
397	return token_type::parse_error;
398	}
399
400	break;
401	}
402
403	// invalid control characters
404	case `0x00`:
405	{
406	error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
407	return token_type::parse_error;
408	}
409
410	case `0x01`:
411	{
412	error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
413	return token_type::parse_error;
414	}
415
416	case `0x02`:
417	{
418	error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
419	return token_type::parse_error;
420	}
421
422	case `0x03`:
423	{
424	error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
425	return token_type::parse_error;
426	}
427
428	case `0x04`:
429	{
430	error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
431	return token_type::parse_error;
432	}
433
434	case `0x05`:
435	{
436	error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
437	return token_type::parse_error;
438	}
439
440	case `0x06`:
441	{
442	error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
443	return token_type::parse_error;
444	}
445
446	case `0x07`:
447	{
448	error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
449	return token_type::parse_error;
450	}
451
452	case `0x08`:
453	{
454	error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
455	return token_type::parse_error;
456	}
457
458	case `0x09`:
459	{
460	error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
461	return token_type::parse_error;
462	}
463
464	case `0x0A`:
465	{
466	error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
467	return token_type::parse_error;
468	}
469
470	case `0x0B`:
471	{
472	error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
473	return token_type::parse_error;
474	}
475
476	case `0x0C`:
477	{
478	error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
479	return token_type::parse_error;
480	}
481
482	case `0x0D`:
483	{
484	error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
485	return token_type::parse_error;
486	}
487
488	case `0x0E`:
489	{
490	error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
491	return token_type::parse_error;
492	}
493
494	case `0x0F`:
495	{
496	error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
497	return token_type::parse_error;
498	}
499
500	case `0x10`:
501	{
502	error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
503	return token_type::parse_error;
504	}
505
506	case `0x11`:
507	{
508	error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
509	return token_type::parse_error;
510	}
511
512	case `0x12`:
513	{
514	error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
515	return token_type::parse_error;
516	}
517
518	case `0x13`:
519	{
520	error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
521	return token_type::parse_error;
522	}
523
524	case `0x14`:
525	{
526	error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
527	return token_type::parse_error;
528	}
529
530	case `0x15`:
531	{
532	error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
533	return token_type::parse_error;
534	}
535
536	case `0x16`:
537	{
538	error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
539	return token_type::parse_error;
540	}
541
542	case `0x17`:
543	{
544	error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
545	return token_type::parse_error;
546	}
547
548	case `0x18`:
549	{
550	error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
551	return token_type::parse_error;
552	}
553
554	case `0x19`:
555	{
556	error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
557	return token_type::parse_error;
558	}
559
560	case `0x1A`:
561	{
562	error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
563	return token_type::parse_error;
564	}
565
566	case `0x1B`:
567	{
568	error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
569	return token_type::parse_error;
570	}
571
572	case `0x1C`:
573	{
574	error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
575	return token_type::parse_error;
576	}
577
578	case `0x1D`:
579	{
580	error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
581	return token_type::parse_error;
582	}
583
584	case `0x1E`:
585	{
586	error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
587	return token_type::parse_error;
588	}
589
590	case `0x1F`:
591	{
592	error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
593	return token_type::parse_error;
594	}
595
596	// U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
597	case `0x20`:
598	case `0x21`:
599	case `0x23`:
600	case `0x24`:
601	case `0x25`:
602	case `0x26`:
603	case `0x27`:
604	case `0x28`:
605	case `0x29`:
606	case `0x2A`:
607	case `0x2B`:
608	case `0x2C`:
609	case `0x2D`:
610	case `0x2E`:
611	case `0x2F`:
612	case `0x30`:
613	case `0x31`:
614	case `0x32`:
615	case `0x33`:
616	case `0x34`:
617	case `0x35`:
618	case `0x36`:
619	case `0x37`:
620	case `0x38`:
621	case `0x39`:
622	case `0x3A`:
623	case `0x3B`:
624	case `0x3C`:
625	case `0x3D`:
626	case `0x3E`:
627	case `0x3F`:
628	case `0x40`:
629	case `0x41`:
630	case `0x42`:
631	case `0x43`:
632	case `0x44`:
633	case `0x45`:
634	case `0x46`:
635	case `0x47`:
636	case `0x48`:
637	case `0x49`:
638	case `0x4A`:
639	case `0x4B`:
640	case `0x4C`:
641	case `0x4D`:
642	case `0x4E`:
643	case `0x4F`:
644	case `0x50`:
645	case `0x51`:
646	case `0x52`:
647	case `0x53`:
648	case `0x54`:
649	case `0x55`:
650	case `0x56`:
651	case `0x57`:
652	case `0x58`:
653	case `0x59`:
654	case `0x5A`:
655	case `0x5B`:
656	case `0x5D`:
657	case `0x5E`:
658	case `0x5F`:
659	case `0x60`:
660	case `0x61`:
661	case `0x62`:
662	case `0x63`:
663	case `0x64`:
664	case `0x65`:
665	case `0x66`:
666	case `0x67`:
667	case `0x68`:
668	case `0x69`:
669	case `0x6A`:
670	case `0x6B`:
671	case `0x6C`:
672	case `0x6D`:
673	case `0x6E`:
674	case `0x6F`:
675	case `0x70`:
676	case `0x71`:
677	case `0x72`:
678	case `0x73`:
679	case `0x74`:
680	case `0x75`:
681	case `0x76`:
682	case `0x77`:
683	case `0x78`:
684	case `0x79`:
685	case `0x7A`:
686	case `0x7B`:
687	case `0x7C`:
688	case `0x7D`:
689	case `0x7E`:
690	case `0x7F`:
691	{
692	add(current);
693	break;
694	}
695
696	// U+0080..U+07FF: bytes C2..DF 80..BF
697	case `0xC2`:
698	case `0xC3`:
699	case `0xC4`:
700	case `0xC5`:
701	case `0xC6`:
702	case `0xC7`:
703	case `0xC8`:
704	case `0xC9`:
705	case `0xCA`:
706	case `0xCB`:
707	case `0xCC`:
708	case `0xCD`:
709	case `0xCE`:
710	case `0xCF`:
711	case `0xD0`:
712	case `0xD1`:
713	case `0xD2`:
714	case `0xD3`:
715	case `0xD4`:
716	case `0xD5`:
717	case `0xD6`:
718	case `0xD7`:
719	case `0xD8`:
720	case `0xD9`:
721	case `0xDA`:
722	case `0xDB`:
723	case `0xDC`:
724	case `0xDD`:
725	case `0xDE`:
726	case `0xDF`:
727	{
728	if (JSON_HEDLEY_UNLIKELY(not next_byte_in_range({`0x80`, `0xBF`})))
729	{
730	return token_type::parse_error;
731	}
732	break;
733	}
734
735	// U+0800..U+0FFF: bytes E0 A0..BF 80..BF
736	case `0xE0`:
737	{
738	if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({`0xA0`, `0xBF`, `0x80`, `0xBF`}))))
739	{
740	return token_type::parse_error;
741	}
742	break;
743	}
744
745	// U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
746	// U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
747	case `0xE1`:
748	case `0xE2`:
749	case `0xE3`:
750	case `0xE4`:
751	case `0xE5`:
752	case `0xE6`:
753	case `0xE7`:
754	case `0xE8`:
755	case `0xE9`:
756	case `0xEA`:
757	case `0xEB`:
758	case `0xEC`:
759	case `0xEE`:
760	case `0xEF`:
761	{
762	if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({`0x80`, `0xBF`, `0x80`, `0xBF`}))))
763	{
764	return token_type::parse_error;
765	}
766	break;
767	}
768
769	// U+D000..U+D7FF: bytes ED 80..9F 80..BF
770	case `0xED`:
771	{
772	if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({`0x80`, `0x9F`, `0x80`, `0xBF`}))))
773	{
774	return token_type::parse_error;
775	}
776	break;
777	}
778
779	// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
780	case `0xF0`:
781	{
782	if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({`0x90`, `0xBF`, `0x80`, `0xBF`, `0x80`, `0xBF`}))))
783	{
784	return token_type::parse_error;
785	}
786	break;
787	}
788
789	// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
790	case `0xF1`:
791	case `0xF2`:
792	case `0xF3`:
793	{
794	if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({`0x80`, `0xBF`, `0x80`, `0xBF`, `0x80`, `0xBF`}))))
795	{
796	return token_type::parse_error;
797	}
798	break;
799	}
800
801	// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
802	case `0xF4`:
803	{
804	if (JSON_HEDLEY_UNLIKELY(not (next_byte_in_range({`0x80`, `0x8F`, `0x80`, `0xBF`, `0x80`, `0xBF`}))))
805	{
806	return token_type::parse_error;
807	}
808	break;
809	}
810
811	// remaining bytes (80..C1 and F5..FF) are ill-formed
812	default:
813	{
814	error_message = "invalid string: ill-formed UTF-8 byte";
815	return token_type::parse_error;
816	}
817	}
818	}
819	}
820
821	JSON_HEDLEY_NON_NULL(`2`)
822	static void strtof(float& f, const char* str, char endptr) noexcept**
823	{
824	f = std::strtof(str, endptr);
825	}
826
827	JSON_HEDLEY_NON_NULL(`2`)
828	static void strtof(double& f, const char* str, char endptr) noexcept**
829	{
830	f = std::strtod(str, endptr);
831	}
832
833	JSON_HEDLEY_NON_NULL(`2`)
834	static void strtof(long double& f, const char* str, char endptr) noexcept**
835	{
836	f = std::strtold(str, endptr);
837	}
838
839	/!*
840	@brief scan a number literal
841
842	This function scans a string according to Sect. 6 of RFC 7159.
843
844	The function is realized with a deterministic finite state machine derived
845	from the grammar described in RFC 7159. Starting in state "init", the
846	input is read and used to determined the next state. Only state "done"
847	accepts the number. State "error" is a trap state to model errors. In the
848	table below, "anything" means any character but the ones listed before.
849
850	state \| 0 \| 1-9 \| e E \| + \| - \| . \| anything
851	---------\|----------\|----------\|----------\|---------\|---------\|----------\|-----------
852	init \| zero \| any1 \| [error] \| [error] \| minus \| [error] \| [error]
853	minus \| zero \| any1 \| [error] \| [error] \| [error] \| [error] \| [error]
854	zero \| done \| done \| exponent \| done \| done \| decimal1 \| done
855	any1 \| any1 \| any1 \| exponent \| done \| done \| decimal1 \| done
856	decimal1 \| decimal2 \| [error] \| [error] \| [error] \| [error] \| [error] \| [error]
857	decimal2 \| decimal2 \| decimal2 \| exponent \| done \| done \| done \| done
858	exponent \| any2 \| any2 \| [error] \| sign \| sign \| [error] \| [error]
859	sign \| any2 \| any2 \| [error] \| [error] \| [error] \| [error] \| [error]
860	any2 \| any2 \| any2 \| done \| done \| done \| done \| done
861
862	The state machine is realized with one label per state (prefixed with
863	"scan_number_") and `goto` statements between them. The state machine
864	contains cycles, but any cycle can be left when EOF is read. Therefore,
865	the function is guaranteed to terminate.
866
867	During scanning, the read bytes are stored in token_buffer. This string is
868	then converted to a signed integer, an unsigned integer, or a
869	floating-point number.
870
871	@return token_type::value_unsigned, token_type::value_integer, or
872	token_type::value_float if number could be successfully scanned,
873	token_type::parse_error otherwise
874
875	@note The scanner is independent of the current locale. Internally, the
876	locale's decimal point is used instead of `.` to work with the
877	locale-dependent converters.
878	*/
879	token_type scan_number() // lgtm [cpp/use-of-goto]
880	{
881	// reset token_buffer to store the number's bytes
882	reset();
883
884	// the type of the parsed number; initially set to unsigned; will be
885	// changed if minus sign, decimal point or exponent is read
886	token_type number_type = token_type::value_unsigned;
887
888	// state (init): we just found out we need to scan a number
889	switch (current)
890	{
891	case `'-'`:
892	{
893	add(current);
894	goto scan_number_minus;
895	}
896
897	case `'0'`:
898	{
899	add(current);
900	goto scan_number_zero;
901	}
902
903	case `'1'`:
904	case `'2'`:
905	case `'3'`:
906	case `'4'`:
907	case `'5'`:
908	case `'6'`:
909	case `'7'`:
910	case `'8'`:
911	case `'9'`:
912	{
913	add(current);
914	goto scan_number_any1;
915	}
916
917	// all other characters are rejected outside scan_number()
918	default: // LCOV_EXCL_LINE
919	assert(false); // LCOV_EXCL_LINE
920	}
921
922	scan_number_minus:
923	// state: we just parsed a leading minus sign
924	number_type = token_type::value_integer;
925	switch (get())
926	{
927	case `'0'`:
928	{
929	add(current);
930	goto scan_number_zero;
931	}
932
933	case `'1'`:
934	case `'2'`:
935	case `'3'`:
936	case `'4'`:
937	case `'5'`:
938	case `'6'`:
939	case `'7'`:
940	case `'8'`:
941	case `'9'`:
942	{
943	add(current);
944	goto scan_number_any1;
945	}
946
947	default:
948	{
949	error_message = "invalid number; expected digit after '-'";
950	return token_type::parse_error;
951	}
952	}
953
954	scan_number_zero:
955	// state: we just parse a zero (maybe with a leading minus sign)
956	switch (get())
957	{
958	case `'.'`:
959	{
960	add(decimal_point_char);
961	goto scan_number_decimal1;
962	}
963
964	case `'e'`:
965	case `'E'`:
966	{
967	add(current);
968	goto scan_number_exponent;
969	}
970
971	default:
972	goto scan_number_done;
973	}
974
975	scan_number_any1:
976	// state: we just parsed a number 0-9 (maybe with a leading minus sign)
977	switch (get())
978	{
979	case `'0'`:
980	case `'1'`:
981	case `'2'`:
982	case `'3'`:
983	case `'4'`:
984	case `'5'`:
985	case `'6'`:
986	case `'7'`:
987	case `'8'`:
988	case `'9'`:
989	{
990	add(current);
991	goto scan_number_any1;
992	}
993
994	case `'.'`:
995	{
996	add(decimal_point_char);
997	goto scan_number_decimal1;
998	}
999
1000	case `'e'`:
1001	case `'E'`:
1002	{
1003	add(current);
1004	goto scan_number_exponent;
1005	}
1006
1007	default:
1008	goto scan_number_done;
1009	}
1010
1011	scan_number_decimal1:
1012	// state: we just parsed a decimal point
1013	number_type = token_type::value_float;
1014	switch (get())
1015	{
1016	case `'0'`:
1017	case `'1'`:
1018	case `'2'`:
1019	case `'3'`:
1020	case `'4'`:
1021	case `'5'`:
1022	case `'6'`:
1023	case `'7'`:
1024	case `'8'`:
1025	case `'9'`:
1026	{
1027	add(current);
1028	goto scan_number_decimal2;
1029	}
1030
1031	default:
1032	{
1033	error_message = "invalid number; expected digit after '.'";
1034	return token_type::parse_error;
1035	}
1036	}
1037
1038	scan_number_decimal2:
1039	// we just parsed at least one number after a decimal point
1040	switch (get())
1041	{
1042	case `'0'`:
1043	case `'1'`:
1044	case `'2'`:
1045	case `'3'`:
1046	case `'4'`:
1047	case `'5'`:
1048	case `'6'`:
1049	case `'7'`:
1050	case `'8'`:
1051	case `'9'`:
1052	{
1053	add(current);
1054	goto scan_number_decimal2;
1055	}
1056
1057	case `'e'`:
1058	case `'E'`:
1059	{
1060	add(current);
1061	goto scan_number_exponent;
1062	}
1063
1064	default:
1065	goto scan_number_done;
1066	}
1067
1068	scan_number_exponent:
1069	// we just parsed an exponent
1070	number_type = token_type::value_float;
1071	switch (get())
1072	{
1073	case `'+'`:
1074	case `'-'`:
1075	{
1076	add(current);
1077	goto scan_number_sign;
1078	}
1079
1080	case `'0'`:
1081	case `'1'`:
1082	case `'2'`:
1083	case `'3'`:
1084	case `'4'`:
1085	case `'5'`:
1086	case `'6'`:
1087	case `'7'`:
1088	case `'8'`:
1089	case `'9'`:
1090	{
1091	add(current);
1092	goto scan_number_any2;
1093	}
1094
1095	default:
1096	{
1097	error_message =
1098	"invalid number; expected '+', '-', or digit after exponent";
1099	return token_type::parse_error;
1100	}
1101	}
1102
1103	scan_number_sign:
1104	// we just parsed an exponent sign
1105	switch (get())
1106	{
1107	case `'0'`:
1108	case `'1'`:
1109	case `'2'`:
1110	case `'3'`:
1111	case `'4'`:
1112	case `'5'`:
1113	case `'6'`:
1114	case `'7'`:
1115	case `'8'`:
1116	case `'9'`:
1117	{
1118	add(current);
1119	goto scan_number_any2;
1120	}
1121
1122	default:
1123	{
1124	error_message = "invalid number; expected digit after exponent sign";
1125	return token_type::parse_error;
1126	}
1127	}
1128
1129	scan_number_any2:
1130	// we just parsed a number after the exponent or exponent sign
1131	switch (get())
1132	{
1133	case `'0'`:
1134	case `'1'`:
1135	case `'2'`:
1136	case `'3'`:
1137	case `'4'`:
1138	case `'5'`:
1139	case `'6'`:
1140	case `'7'`:
1141	case `'8'`:
1142	case `'9'`:
1143	{
1144	add(current);
1145	goto scan_number_any2;
1146	}
1147
1148	default:
1149	goto scan_number_done;
1150	}
1151
1152	scan_number_done:
1153	// unget the character after the number (we only read it to know that
1154	// we are done scanning a number)
1155	unget();
1156
1157	char* endptr = nullptr;
1158	errno = `0`;
1159
1160	// try to parse integers first and fall back to floats
1161	if (number_type == token_type::value_unsigned)
1162	{
1163	const auto x = std::strtoull(token_buffer.data(), &endptr, `10`);
1164
1165	// we checked the number format before
1166	assert(endptr == token_buffer.data() + token_buffer.size());
1167
1168	if (errno == `0`)
1169	{
1170	value_unsigned = static_cast<number_unsigned_t>(x);
1171	if (value_unsigned == x)
1172	{
1173	return token_type::value_unsigned;
1174	}
1175	}
1176	}
1177	else if (number_type == token_type::value_integer)
1178	{
1179	const auto x = std::strtoll(token_buffer.data(), &endptr, `10`);
1180
1181	// we checked the number format before
1182	assert(endptr == token_buffer.data() + token_buffer.size());
1183
1184	if (errno == `0`)
1185	{
1186	value_integer = static_cast<number_integer_t>(x);
1187	if (value_integer == x)
1188	{
1189	return token_type::value_integer;
1190	}
1191	}
1192	}
1193
1194	// this code is reached if we parse a floating-point number or if an
1195	// integer conversion above failed
1196	strtof(value_float, token_buffer.data(), &endptr);
1197
1198	// we checked the number format before
1199	assert(endptr == token_buffer.data() + token_buffer.size());
1200
1201	return token_type::value_float;
1202	}
1203
1204	/!*
1205	@param[in] literal_text the literal text to expect
1206	@param[in] length the length of the passed literal text
1207	@param[in] return_type the token type to return on success
1208	*/
1209	JSON_HEDLEY_NON_NULL(`2`)
1210	token_type scan_literal(const char* literal_text, const std::size_t length,
1211	token_type return_type)
1212	{
1213	assert(current == literal_text[`0`]);
1214	for (std::size_t i = `1`; i < length; ++i)
1215	{
1216	if (JSON_HEDLEY_UNLIKELY(get() != literal_text[i]))
1217	{
1218	error_message = "invalid literal";
1219	return token_type::parse_error;
1220	}
1221	}
1222	return return_type;
1223	}
1224
1225	/////////////////////
1226	// input management
1227	/////////////////////
1228
1229	/// reset token_buffer; current character is beginning of token
1230	void reset() noexcept
1231	{
1232	token_buffer.clear();
1233	token_string.clear();
1234	token_string.push_back(std::char_traits<char>::to_char_type(current));
1235	}
1236
1237	/*
1238	@brief get next character from the input
1239
1240	This function provides the interface to the used input adapter. It does
1241	not throw in case the input reached EOF, but returns a
1242	`std::char_traits<char>::eof()` in that case. Stores the scanned characters
1243	for use in error messages.
1244
1245	@return character read from the input
1246	*/
1247	std::char_traits<char>::int_type get()
1248	{
1249	++position.chars_read_total;
1250	++position.chars_read_current_line;
1251
1252	if (next_unget)
1253	{
1254	// just reset the next_unget variable and work with current
1255	next_unget = false;
1256	}
1257	else
1258	{
1259	current = ia ->get_character();
1260	}
1261
1262	if (JSON_HEDLEY_LIKELY(current != std::char_traits<char>::eof()))
1263	{
1264	token_string.push_back(std::char_traits<char>::to_char_type(current));
1265	}
1266
1267	if (current == `'\n'`)
1268	{
1269	++position.lines_read;
1270	position.chars_read_current_line = `0`;
1271	}
1272
1273	return current;
1274	}
1275
1276	/!*
1277	@brief unget current character (read it again on next get)
1278
1279	We implement unget by setting variable next_unget to true. The input is not
1280	changed - we just simulate ungetting by modifying chars_read_total,
1281	chars_read_current_line, and token_string. The next call to get() will
1282	behave as if the unget character is read again.
1283	*/
1284	void unget()
1285	{
1286	next_unget = true;
1287
1288	--position.chars_read_total;
1289
1290	// in case we "unget" a newline, we have to also decrement the lines_read
1291	if (position.chars_read_current_line == `0`)
1292	{
1293	if (position.lines_read > `0`)
1294	{
1295	--position.lines_read;
1296	}
1297	}
1298	else
1299	{
1300	--position.chars_read_current_line;
1301	}
1302
1303	if (JSON_HEDLEY_LIKELY(current != std::char_traits<char>::eof()))
1304	{
1305	assert(not token_string.empty());
1306	token_string.pop_back();
1307	}
1308	}
1309
1310	/// add a character to token_buffer
1311	void add(int c)
1312	{
1313	token_buffer.push_back(std::char_traits<char>::to_char_type(c));
1314	}
1315
1316	public:
1317	/////////////////////
1318	// value getters
1319	/////////////////////
1320
1321	/// return integer value
1322	constexpr number_integer_t get_number_integer() const noexcept
1323	{
1324	return value_integer;
1325	}
1326
1327	/// return unsigned integer value
1328	constexpr number_unsigned_t get_number_unsigned() const noexcept
1329	{
1330	return value_unsigned;
1331	}
1332
1333	/// return floating-point value
1334	constexpr number_float_t get_number_float() const noexcept
1335	{
1336	return value_float;
1337	}
1338
1339	/// return current string value (implicitly resets the token; useful only once)
1340	string_t& get_string()
1341	{
1342	return token_buffer;
1343	}
1344
1345	/////////////////////
1346	// diagnostics
1347	/////////////////////
1348
1349	/// return position of last read token
1350	constexpr position_t get_position() const noexcept
1351	{
1352	return position;
1353	}
1354
1355	/// return the last read token (for errors only). Will never contain EOF
1356	/// (an arbitrary value that is not a valid char value, often -1), because
1357	/// 255 may legitimately occur. May contain NUL, which should be escaped.
1358	std::string get_token_string() const
1359	{
1360	// escape control characters
1361	std::string result;
1362	for (const auto c : token_string)
1363	{
1364	if (`'\x00'` <= c and c <= `'\x1F'`)
1365	{
1366	// escape control characters
1367	std::array<char, `9`> cs{{}};
1368	(std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c));
1369	result += cs.data();
1370	}
1371	else
1372	{
1373	// add character as is
1374	result.push_back(c);
1375	}
1376	}
1377
1378	return result;
1379	}
1380
1381	/// return syntax error message
1382	JSON_HEDLEY_RETURNS_NON_NULL
1383	constexpr const char* get_error_message() const noexcept
1384	{
1385	return error_message;
1386	}
1387
1388	/////////////////////
1389	// actual scanner
1390	/////////////////////
1391
1392	/!*
1393	@brief skip the UTF-8 byte order mark
1394	@return true iff there is no BOM or the correct BOM has been skipped
1395	*/
1396	bool skip_bom()
1397	{
1398	if (get() == `0xEF`)
1399	{
1400	// check if we completely parse the BOM
1401	return get() == `0xBB` and get() == `0xBF`;
1402	}
1403
1404	// the first character is not the beginning of the BOM; unget it to
1405	// process is later
1406	unget();
1407	return true;
1408	}
1409
1410	token_type scan()
1411	{
1412	// initially, skip the BOM
1413	if (position.chars_read_total == `0` and not skip_bom())
1414	{
1415	error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1416	return token_type::parse_error;
1417	}
1418
1419	// read next character and ignore whitespace
1420	do
1421	{
1422	get();
1423	}
1424	while (current == `' '` or current == `'\t'` or current == `'\n'` or current == `'\r'`);
1425
1426	switch (current)
1427	{
1428	// structural characters
1429	case `'['`:
1430	return token_type::begin_array;
1431	case `']'`:
1432	return token_type::end_array;
1433	case `'{'`:
1434	return token_type::begin_object;
1435	case `'}'`:
1436	return token_type::end_object;
1437	case `':'`:
1438	return token_type::name_separator;
1439	case `','`:
1440	return token_type::value_separator;
1441
1442	// literals
1443	case `'t'`:
1444	return scan_literal("true", `4`, token_type::literal_true);
1445	case `'f'`:
1446	return scan_literal("false", `5`, token_type::literal_false);
1447	case `'n'`:
1448	return scan_literal("null", `4`, token_type::literal_null);
1449
1450	// string
1451	case `'\"'`:
1452	return scan_string();
1453
1454	// number
1455	case `'-'`:
1456	case `'0'`:
1457	case `'1'`:
1458	case `'2'`:
1459	case `'3'`:
1460	case `'4'`:
1461	case `'5'`:
1462	case `'6'`:
1463	case `'7'`:
1464	case `'8'`:
1465	case `'9'`:
1466	return scan_number();
1467
1468	// end of input (the null byte is needed when parsing from
1469	// string literals)
1470	case `'\0'`:
1471	case std::char_traits<char>::eof():
1472	return token_type::end_of_input;
1473
1474	// error
1475	default:
1476	error_message = "invalid literal";
1477	return token_type::parse_error;
1478	}
1479	}
1480
1481	private:
1482	/// input adapter; get() fetches each new character via ia->get_character()
1483	detail::input_adapter_t ia = nullptr;
1484
1485	/// the most recently read character (EOF until get() has been called)
1486	std::char_traits<char>::int_type current = std::char_traits<char>::eof();
1487
1488	/// whether the next get() call should just return current (set by unget())
1489	bool next_unget = false;
1490
1491	/// read position counters (total chars, chars in current line, lines); maintained by get() and unget()
1492	position_t position {};
1493
1494	/// raw characters of the current token (for error messages, see get_token_string())
1495	std::vector<char> token_string {};
1496
1497	/// buffer for variable-length tokens (numbers, strings); filled via add(), exposed by get_string()
1498	string_t token_buffer {};
1499
1500	/// description of the last lexer error (empty string until one is recorded); see get_error_message()
1501	const char* error_message = "";
1502
1503	// converted number values, exposed through the get_number_*() getters
1504	number_integer_t value_integer = `0`;
1505	number_unsigned_t value_unsigned = `0`;
1506	number_float_t value_float = `0`;
1507
1508	/// the decimal point character; '.' unless a constructor outside this chunk sets it differently (NOTE(review): confirm)
1509	const char decimal_point_char = `'.'`;
1510	};
1511	} // namespace detail
1512	} // namespace nlohmann
1513

Browse the source code of DeepinIDE/3rdparty/cppdap/third_party/json/include/nlohmann/detail/input/lexer.hpp