parser.h source code [Velox/build/_deps/simdjson-src/include/simdjson/generic/ondemand/parser.h]

1	#include "simdjson/error.h"
2
3	namespace simdjson {
4	namespace SIMDJSON_IMPLEMENTATION {
5	namespace ondemand {
6
// Forward declarations of On Demand types; they are only declared here,
// their definitions are not part of this header.
class array;
class object;
class value;
class raw_json_string;
class document_stream;
12
/**
 * The default batch size (in bytes) for document_stream instances for this On Demand kernel.
 * Note that different On Demand kernels may use a different DEFAULT_BATCH_SIZE value
 * in the future.
 */
static constexpr size_t DEFAULT_BATCH_SIZE = 1000000;
/**
 * Some adversary might try to set the batch size to 0 or 1, which might cause problems.
 * We set a minimum of 32B since anything else is highly likely to be an error. In practice,
 * most users will want a much larger batch size.
 *
 * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON
 * document can ever span 0 or 1 byte and that very large values would create memory allocation issues.
 */
static constexpr size_t MINIMAL_BATCH_SIZE = 32;
28
29	/**
30	* A JSON fragment iterator.
31	*
32	* This holds the actual iterator as well as the buffer for writing strings.
33	*/
34	class parser {
35	public:
36	/**
37	* Create a JSON parser.
38	*
39	* The new parser will have zero capacity.
40	*/
41	inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept;
42
43	inline parser(parser &&other) noexcept = default;
44	simdjson_inline parser(const parser &other) = delete;
45	simdjson_inline parser &operator=(const parser &other) = delete;
46	simdjson_inline parser &operator=(parser &&other) noexcept = default;
47
48	/* Deallocate the JSON parser. /
49	inline ~parser() noexcept = default;
50
51	/**
52	* Start iterating an on-demand JSON document.
53	*
54	* ondemand::parser parser;
55	* document doc = parser.iterate(json);
56	*
57	* It is expected that the content is a valid UTF-8 file, containing a valid JSON document.
58	* Otherwise the iterate method may return an error. In particular, the whole input should be
59	* valid: we do not attempt to tolerate incorrect content either before or after a JSON
60	* document.
61	*
62	* ### IMPORTANT: Validate what you use
63	*
64	* Calling iterate on an invalid JSON document may not immediately trigger an error. The call to
65	* iterate does not parse and validate the whole document.
66	*
67	* ### IMPORTANT: Buffer Lifetime
68	*
69	* Because parsing is done while you iterate, you must keep the JSON buffer around at least as
70	* long as the document iteration.
71	*
72	* ### IMPORTANT: Document Lifetime
73	*
74	* Only one iteration at a time can happen per parser, and the parser must be kept alive during
75	* iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before
76	* you call parse() again or destroy the parser.
77	*
78	* ### REQUIRED: Buffer Padding
79	*
80	* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
81	* those bytes are initialized to, as long as they are allocated. These bytes will be read: if you
82	* using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the
83	* SIMDJSON_PADDING bytes to avoid runtime warnings.
84	*
85	* @param json The JSON to parse.
86	* @param len The length of the JSON.
87	* @param capacity The number of bytes allocated in the JSON (must be at least len+SIMDJSON_PADDING).
88	*
89	* @return The document, or an error:
90	* - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes.
91	* - MEMALLOC if realloc_if_needed the parser does not have enough capacity, and memory
92	* allocation fails.
93	* - EMPTY if the document is all whitespace.
94	* - UTF8_ERROR if the document is not valid UTF-8.
95	* - UNESCAPED_CHARS if a string contains control characters that must be escaped
96	* - UNCLOSED_STRING if there is an unclosed string in the document.
97	*/
98	simdjson_warn_unused simdjson_result<document> iterate(padded_string_view json) & noexcept;
99	/* @overload simdjson_result<document> iterate(padded_string_view json) & noexcept /
100	simdjson_warn_unused simdjson_result<document> iterate(const char json, size_t len, size_t capacity) & noexcept*;
101	/* @overload simdjson_result<document> iterate(padded_string_view json) & noexcept /
102	simdjson_warn_unused simdjson_result<document> iterate(const uint8_t json, size_t len, size_t capacity) & noexcept*;
103	/* @overload simdjson_result<document> iterate(padded_string_view json) & noexcept /
104	simdjson_warn_unused simdjson_result<document> iterate(std::string_view json, size_t capacity) & noexcept;
105	/* @overload simdjson_result<document> iterate(padded_string_view json) & noexcept /
106	simdjson_warn_unused simdjson_result<document> iterate(const std::string &json) & noexcept;
107	/* @overload simdjson_result<document> iterate(padded_string_view json) & noexcept /
108	simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string> &json) & noexcept;
109	/* @overload simdjson_result<document> iterate(padded_string_view json) & noexcept /
110	simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string_view> &json) & noexcept;
111	/* @overload simdjson_result<document> iterate(padded_string_view json) & noexcept /
112	simdjson_warn_unused simdjson_result<document> iterate(padded_string &&json) & noexcept = delete;
113
114	/**
115	* @private
116	*
117	* Start iterating an on-demand JSON document.
118	*
119	* ondemand::parser parser;
120	* json_iterator doc = parser.iterate(json);
121	*
122	* ### IMPORTANT: Buffer Lifetime
123	*
124	* Because parsing is done while you iterate, you must keep the JSON buffer around at least as
125	* long as the document iteration.
126	*
127	* ### IMPORTANT: Document Lifetime
128	*
129	* Only one iteration at a time can happen per parser, and the parser must be kept alive during
130	* iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before
131	* you call parse() again or destroy the parser.
132	*
133	* The ondemand::document instance holds the iterator. The document must remain in scope
134	* while you are accessing instances of ondemand::value, ondemand::object, ondemand::array.
135	*
136	* ### REQUIRED: Buffer Padding
137	*
138	* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
139	* those bytes are initialized to, as long as they are allocated. These bytes will be read: if you
140	* using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the
141	* SIMDJSON_PADDING bytes to avoid runtime warnings.
142	*
143	* @param json The JSON to parse.
144	*
145	* @return The iterator, or an error:
146	* - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes.
147	* - MEMALLOC if realloc_if_needed the parser does not have enough capacity, and memory
148	* allocation fails.
149	* - EMPTY if the document is all whitespace.
150	* - UTF8_ERROR if the document is not valid UTF-8.
151	* - UNESCAPED_CHARS if a string contains control characters that must be escaped
152	* - UNCLOSED_STRING if there is an unclosed string in the document.
153	*/
154	simdjson_warn_unused simdjson_result<json_iterator> iterate_raw(padded_string_view json) & noexcept;
155
156
157	/**
158	* Parse a buffer containing many JSON documents.
159	*
160	* auto json = R"({ "foo": 1 } { "foo": 2 } { "foo": 3 } )"_padded;
161	* ondemand::parser parser;
162	* ondemand::document_stream docs = parser.iterate_many(json);
163	* for (auto & doc : docs) {
164	* std::cout << doc["foo"] << std::endl;
165	* }
166	* // Prints 1 2 3
167	*
168	* No copy of the input buffer is made.
169	*
170	* The function is lazy: it may be that no more than one JSON document at a time is parsed.
171	*
172	* The caller is responsabile to ensure that the input string data remains unchanged and is
173	* not deleted during the loop.
174	*
175	* ### Format
176	*
177	* The buffer must contain a series of one or more JSON documents, concatenated into a single
178	* buffer, separated by ASCII whitespace. It effectively parses until it has a fully valid document,
179	* then starts parsing the next document at that point. (It does this with more parallelism and
180	* lookahead than you might think, though.)
181	*
182	* documents that consist of an object or array may omit the whitespace between them, concatenating
183	* with no separator. Documents that consist of a single primitive (i.e. documents that are not
184	* arrays or objects) MUST be separated with ASCII whitespace.
185	*
186	* The characters inside a JSON document, and between JSON documents, must be valid Unicode (UTF-8).
187	*
188	* The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.
189	* Setting batch_size to excessively large or excessively small values may impact negatively the
190	* performance.
191	*
192	* ### REQUIRED: Buffer Padding
193	*
194	* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
195	* those bytes are initialized to, as long as they are allocated. These bytes will be read: if you
196	* using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the
197	* SIMDJSON_PADDING bytes to avoid runtime warnings.
198	*
199	* ### Threads
200	*
201	* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
202	* hood to do some lookahead.
203	*
204	* ### Parser Capacity
205	*
206	* If the parser's current capacity is less than batch_size, it will allocate enough capacity
207	* to handle it (up to max_capacity).
208	*
209	* @param buf The concatenated JSON to parse.
210	* @param len The length of the concatenated JSON.
211	* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
212	* spot is cache-related: small enough to fit in cache, yet big enough to
213	* parse as many documents as possible in one tight loop.
214	* Defaults to 10MB, which has been a reasonable sweet spot in our tests.
215	* @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:
216	* - MEMALLOC if the parser does not have enough capacity and memory allocation fails
217	* - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity.
218	* - other json errors if parsing fails. You should not rely on these errors to always the same for the
219	* same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
220	*/
221	inline simdjson_result<document_stream> iterate_many(const uint8_t buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept*;
222	/* @overload parse_many(const uint8_t buf, size_t len, size_t batch_size) /*
223	inline simdjson_result<document_stream> iterate_many(const char buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept*;
224	/* @overload parse_many(const uint8_t buf, size_t len, size_t batch_size) /*
225	inline simdjson_result<document_stream> iterate_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
226	inline simdjson_result<document_stream> iterate_many(const std::string &&s, size_t batch_size) = delete;// unsafe
227	/* @overload parse_many(const uint8_t buf, size_t len, size_t batch_size) /*
228	inline simdjson_result<document_stream> iterate_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
229	inline simdjson_result<document_stream> iterate_many(const padded_string &&s, size_t batch_size) = delete;// unsafe
230
231	/* @private We do not want to allow implicit conversion from C string to std::string. /
232	simdjson_result<document_stream> iterate_many(const char buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept* = delete;
233
234	/* The capacity of this parser (the largest document it can process). /
235	simdjson_inline size_t capacity() const noexcept;
236	/* The maximum capacity of this parser (the largest document it is allowed to process). /
237	simdjson_inline size_t max_capacity() const noexcept;
238	simdjson_inline void set_max_capacity(size_t max_capacity) noexcept;
239	/**
240	* The maximum depth of this parser (the most deeply nested objects and arrays it can process).
241	* This parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true.
242	* The document's instance current_depth() method should be used to monitor the parsing
243	* depth and limit it if desired.
244	*/
245	simdjson_inline size_t max_depth() const noexcept;
246
247	/**
248	* Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
249	* and `max_depth` depth.
250	*
251	* The max_depth parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true.
252	* The document's instance current_depth() method should be used to monitor the parsing
253	* depth and limit it if desired.
254	*
255	* @param capacity The new capacity.
256	* @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.
257	* @return The error, if there is one.
258	*/
259	simdjson_warn_unused error_code allocate(size_t capacity, size_t max_depth=DEFAULT_MAX_DEPTH) noexcept;
260
261	#ifdef SIMDJSON_THREADS_ENABLED
262	/**
263	* The parser instance can use threads when they are available to speed up some
264	* operations. It is enabled by default. Changing this attribute will change the
265	* behavior of the parser for future operations.
266	*/
267	bool threaded{true};
268	#endif
269
270	/**
271	* Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer.
272	* The result must be valid UTF-8.
273	* The provided pointer is advanced to the end of the string by reference, and a string_view instance
274	* is returned. You can ensure that your buffer is large enough by allocating a block of memory at least
275	* as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer.
276	*
277	* This unescape function is a low-level function. If you want a more user-friendly approach, you should
278	* avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string()
279	* instead of get_raw_json_string()).
280	*
281	* ## IMPORTANT: string_view lifetime
282	*
283	* The string_view is only valid as long as the bytes in dst.
284	*
285	* @param raw_json_string input
286	* @param dst A pointer to a buffer at least large enough to write this string as well as
287	* an additional SIMDJSON_PADDING bytes.
288	* @param allow_replacement Whether we allow a replacement if the input string contains unmatched surrogate pairs.
289	* @return A string_view pointing at the unescaped string in dst
290	* @error STRING_ERROR if escapes are incorrect.
291	*/
292	simdjson_inline simdjson_result<std::string_view> unescape(raw_json_string in, uint8_t &dst, bool* allow_replacement = false) const noexcept;
293
294	/**
295	* Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer.
296	* The result may not be valid UTF-8. See https://simonsapin.github.io/wtf-8/
297	* The provided pointer is advanced to the end of the string by reference, and a string_view instance
298	* is returned. You can ensure that your buffer is large enough by allocating a block of memory at least
299	* as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer.
300	*
301	* This unescape function is a low-level function. If you want a more user-friendly approach, you should
302	* avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string()
303	* instead of get_raw_json_string()).
304	*
305	* ## IMPORTANT: string_view lifetime
306	*
307	* The string_view is only valid as long as the bytes in dst.
308	*
309	* @param raw_json_string input
310	* @param dst A pointer to a buffer at least large enough to write this string as well as
311	* an additional SIMDJSON_PADDING bytes.
312	* @return A string_view pointing at the unescaped string in dst
313	* @error STRING_ERROR if escapes are incorrect.
314	*/
315	simdjson_inline simdjson_result<std::string_view> unescape_wobbly(raw_json_string in, uint8_t &dst) const* noexcept;
316
317	private:
318	/* @private [for benchmarking access] The implementation to use /
319	std::unique_ptr<internal::dom_parser_implementation> implementation{};
320	size_t _capacity{`0`};
321	size_t _max_capacity;
322	size_t _max_depth{DEFAULT_MAX_DEPTH};
323	std::unique_ptr<uint8_t[]> string_buf{};
324	#if SIMDJSON_DEVELOPMENT_CHECKS
325	std::unique_ptr<token_position[]> start_positions{};
326	#endif
327
328	friend class json_iterator;
329	friend class document_stream;
330	};
331
332	} // namespace ondemand
333	} // namespace SIMDJSON_IMPLEMENTATION
334	} // namespace simdjson
335
336	namespace simdjson {
337
338	template<>
339	struct simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::parser> : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base<SIMDJSON_IMPLEMENTATION::ondemand::parser> {
340	public:
341	simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::parser &&value) noexcept; ///< @private
342	simdjson_inline simdjson_result(error_code error) noexcept; ///< @private
343	simdjson_inline simdjson_result() noexcept = default;
344	};
345
346	} // namespace simdjson
347

Browse the source code of Velox/build/_deps/simdjson-src/include/simdjson/generic/ondemand/parser.h