1 | #include "simdjson/error.h" |
2 | |
3 | namespace simdjson { |
4 | namespace SIMDJSON_IMPLEMENTATION { |
5 | namespace ondemand { |
6 | /* Forward declarations of On Demand types; of these, this header refers by name to raw_json_string and document_stream. */ |
7 | class array; |
8 | class object; |
9 | class value; |
10 | class raw_json_string; |
11 | class document_stream; |
12 | |
13 | /** |
14 | * The default batch size (1000000 bytes, i.e. about 1MB) for document_stream instances for this On Demand kernel. |
15 | * Note that different On Demand kernels may use a different DEFAULT_BATCH_SIZE value |
16 | * in the future. |
17 | */ |
18 | static constexpr size_t DEFAULT_BATCH_SIZE = 1000000; |
19 | /** |
20 | * Some adversary might try to set the batch size to 0 or 1, which might cause problems. |
21 | * We set a minimum of 32 bytes since anything smaller is highly likely to be an error. In practice, |
22 | * most users will want a much larger batch size. |
23 | * NOTE(review): where this minimum is enforced is not visible in this header - confirm against document_stream. |
24 | * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON |
25 | * document can ever span 0 or 1 byte and that very large values would create memory allocation issues. |
26 | */ |
27 | static constexpr size_t MINIMAL_BATCH_SIZE = 32; |
28 | |
29 | /** |
30 | * A JSON parser for the On Demand API. |
31 | * |
32 | * It owns the parser implementation as well as the buffer for writing strings. |
33 | */ |
34 | class parser { |
35 | public: |
36 | /** |
37 | * Create a JSON parser. |
38 | * |
39 | * The new parser will have zero capacity. The max_capacity argument defaults to SIMDJSON_MAXSIZE_BYTES. |
40 | */ |
41 | inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; |
42 | |
43 | inline parser(parser &&other) noexcept = default; |
44 | simdjson_inline parser(const parser &other) = delete; |
45 | simdjson_inline parser &operator=(const parser &other) = delete; |
46 | simdjson_inline parser &operator=(parser &&other) noexcept = default; |
47 | |
48 | /** Deallocate the JSON parser. */ |
49 | inline ~parser() noexcept = default; |
50 | |
51 | /** |
52 | * Start iterating an on-demand JSON document. |
53 | * |
54 | * ondemand::parser parser; |
55 | * document doc = parser.iterate(json); |
56 | * |
57 | * It is expected that the content is a valid UTF-8 file, containing a valid JSON document. |
58 | * Otherwise the iterate method may return an error. In particular, the whole input should be |
59 | * valid: we do not attempt to tolerate incorrect content either before or after a JSON |
60 | * document. |
61 | * |
62 | * ### IMPORTANT: Validate what you use |
63 | * |
64 | * Calling iterate on an invalid JSON document may not immediately trigger an error. The call to |
65 | * iterate does not parse and validate the whole document. |
66 | * |
67 | * ### IMPORTANT: Buffer Lifetime |
68 | * |
69 | * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as |
70 | * long as the document iteration. |
71 | * |
72 | * ### IMPORTANT: Document Lifetime |
73 | * |
74 | * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during |
75 | * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before |
76 | * you call iterate() again or destroy the parser. |
77 | * |
78 | * ### REQUIRED: Buffer Padding |
79 | * |
80 | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
81 | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
82 | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
83 | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
84 | * |
85 | * @param json The JSON to parse. |
86 | * @param len The length of the JSON (overloads taking a pointer). |
87 | * @param capacity The number of bytes allocated in the JSON (must be at least len+SIMDJSON_PADDING). |
88 | * |
89 | * @return The document, or an error: |
90 | * - INSUFFICIENT_PADDING if the input has fewer than SIMDJSON_PADDING extra bytes. |
91 | * - MEMALLOC if the parser does not have enough capacity, and memory |
92 | * allocation fails. |
93 | * - EMPTY if the document is all whitespace. |
94 | * - UTF8_ERROR if the document is not valid UTF-8. |
95 | * - UNESCAPED_CHARS if a string contains control characters that must be escaped |
96 | * - UNCLOSED_STRING if there is an unclosed string in the document. |
97 | */ |
98 | simdjson_warn_unused simdjson_result<document> iterate(padded_string_view json) & noexcept; |
99 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
100 | simdjson_warn_unused simdjson_result<document> iterate(const char *json, size_t len, size_t capacity) & noexcept; |
101 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
102 | simdjson_warn_unused simdjson_result<document> iterate(const uint8_t *json, size_t len, size_t capacity) & noexcept; |
103 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
104 | simdjson_warn_unused simdjson_result<document> iterate(std::string_view json, size_t capacity) & noexcept; |
105 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
106 | simdjson_warn_unused simdjson_result<document> iterate(const std::string &json) & noexcept; |
107 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
108 | simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string> &json) & noexcept; |
109 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
110 | simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string_view> &json) & noexcept; |
111 | /** Deleted so that a temporary padded_string, which could not satisfy the buffer-lifetime requirement of iterate(), is rejected at compile time. */ |
112 | simdjson_warn_unused simdjson_result<document> iterate(padded_string &&json) & noexcept = delete; |
113 | |
114 | /** |
115 | * @private |
116 | * |
117 | * Start iterating an on-demand JSON document. |
118 | * |
119 | * ondemand::parser parser; |
120 | * json_iterator doc = parser.iterate_raw(json); |
121 | * |
122 | * ### IMPORTANT: Buffer Lifetime |
123 | * |
124 | * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as |
125 | * long as the document iteration. |
126 | * |
127 | * ### IMPORTANT: Document Lifetime |
128 | * |
129 | * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during |
130 | * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before |
131 | * you call iterate() or iterate_raw() again or destroy the parser. |
132 | * |
133 | * The ondemand::document instance holds the iterator. The document must remain in scope |
134 | * while you are accessing instances of ondemand::value, ondemand::object, ondemand::array. |
135 | * |
136 | * ### REQUIRED: Buffer Padding |
137 | * |
138 | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
139 | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
140 | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
141 | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
142 | * |
143 | * @param json The JSON to parse. |
144 | * |
145 | * @return The iterator, or an error: |
146 | * - INSUFFICIENT_PADDING if the input has fewer than SIMDJSON_PADDING extra bytes. |
147 | * - MEMALLOC if the parser does not have enough capacity, and memory |
148 | * allocation fails. |
149 | * - EMPTY if the document is all whitespace. |
150 | * - UTF8_ERROR if the document is not valid UTF-8. |
151 | * - UNESCAPED_CHARS if a string contains control characters that must be escaped |
152 | * - UNCLOSED_STRING if there is an unclosed string in the document. |
153 | */ |
154 | simdjson_warn_unused simdjson_result<json_iterator> iterate_raw(padded_string_view json) & noexcept; |
155 | |
156 | |
157 | /** |
158 | * Parse a buffer containing many JSON documents. |
159 | * |
160 | * auto json = R"({ "foo": 1 } { "foo": 2 } { "foo": 3 } )"_padded; |
161 | * ondemand::parser parser; |
162 | * ondemand::document_stream docs = parser.iterate_many(json); |
163 | * for (auto & doc : docs) { |
164 | * std::cout << doc["foo"] << std::endl; |
165 | * } |
166 | * // Prints 1 2 3 |
167 | * |
168 | * No copy of the input buffer is made. |
169 | * |
170 | * The function is lazy: it may be that no more than one JSON document at a time is parsed. |
171 | * |
172 | * The caller is responsible for ensuring that the input string data remains unchanged and is |
173 | * not deleted during the loop. |
174 | * |
175 | * ### Format |
176 | * |
177 | * The buffer must contain a series of one or more JSON documents, concatenated into a single |
178 | * buffer, separated by ASCII whitespace. It effectively parses until it has a fully valid document, |
179 | * then starts parsing the next document at that point. (It does this with more parallelism and |
180 | * lookahead than you might think, though.) |
181 | * |
182 | * Documents that consist of an object or array may omit the whitespace between them, concatenating |
183 | * with no separator. Documents that consist of a single primitive (i.e. documents that are not |
184 | * arrays or objects) MUST be separated with ASCII whitespace. |
185 | * |
186 | * The characters inside a JSON document, and between JSON documents, must be valid Unicode (UTF-8). |
187 | * |
188 | * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. |
189 | * Setting batch_size to excessively large or excessively small values may negatively impact |
190 | * performance. |
191 | * |
192 | * ### REQUIRED: Buffer Padding |
193 | * |
194 | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
195 | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
196 | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
197 | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
198 | * |
199 | * ### Threads |
200 | * |
201 | * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the |
202 | * hood to do some lookahead. |
203 | * |
204 | * ### Parser Capacity |
205 | * |
206 | * If the parser's current capacity is less than batch_size, it will allocate enough capacity |
207 | * to handle it (up to max_capacity). |
208 | * |
209 | * @param buf The concatenated JSON to parse. |
210 | * @param len The length of the concatenated JSON. |
211 | * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet |
212 | * spot is cache-related: small enough to fit in cache, yet big enough to |
213 | * parse as many documents as possible in one tight loop. |
214 | * Defaults to DEFAULT_BATCH_SIZE (1000000 bytes, i.e. about 1MB). |
215 | * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: |
216 | * - MEMALLOC if the parser does not have enough capacity and memory allocation fails |
217 | * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. |
218 | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
219 | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
220 | */ |
221 | inline simdjson_result<document_stream> iterate_many(const uint8_t *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; |
222 | /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
223 | inline simdjson_result<document_stream> iterate_many(const char *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; |
224 | /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
225 | inline simdjson_result<document_stream> iterate_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; |
226 | inline simdjson_result<document_stream> iterate_many(const std::string &&s, size_t batch_size) = delete;// unsafe: no copy of the input is made, so a temporary string would not outlive the stream. NOTE(review): batch_size has no default here, so a call that passes only a temporary string still appears to select the const-reference overload - confirm whether that is intended. |
227 | /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
228 | inline simdjson_result<document_stream> iterate_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; |
229 | inline simdjson_result<document_stream> iterate_many(const padded_string &&s, size_t batch_size) = delete;// unsafe: no copy of the input is made, so a temporary padded_string would not outlive the stream. NOTE(review): batch_size has no default here, so a call that passes only a temporary padded_string still appears to select the const-reference overload - confirm whether that is intended. |
230 | |
231 | /** @private We do not want to allow implicit conversion from C string to std::string. */ |
232 | simdjson_result<document_stream> iterate_many(const char *buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept = delete; |
233 | |
234 | /** The capacity of this parser (the largest document it can process). */ |
235 | simdjson_inline size_t capacity() const noexcept; |
236 | /** The maximum capacity of this parser (the largest document it is allowed to process). It can be changed with set_max_capacity(). */ |
237 | simdjson_inline size_t max_capacity() const noexcept; |
238 | simdjson_inline void set_max_capacity(size_t max_capacity) noexcept; |
239 | /** |
240 | * The maximum depth of this parser (the most deeply nested objects and arrays it can process). |
241 | * This parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true. |
242 | * The document's instance current_depth() method should be used to monitor the parsing |
243 | * depth and limit it if desired. |
244 | */ |
245 | simdjson_inline size_t max_depth() const noexcept; |
246 | |
247 | /** |
248 | * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length |
249 | * and `max_depth` depth. |
250 | * |
251 | * The max_depth parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true. |
252 | * The document's instance current_depth() method should be used to monitor the parsing |
253 | * depth and limit it if desired. |
254 | * |
255 | * @param capacity The new capacity. |
256 | * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. |
257 | * @return The error, if there is one. |
258 | */ |
259 | simdjson_warn_unused error_code allocate(size_t capacity, size_t max_depth=DEFAULT_MAX_DEPTH) noexcept; |
260 | |
261 | #ifdef SIMDJSON_THREADS_ENABLED |
262 | /** |
263 | * The parser instance can use threads when they are available to speed up some |
264 | * operations. It is enabled by default. Changing this attribute will change the |
265 | * behavior of the parser for future operations. |
266 | */ |
267 | bool threaded{true}; |
268 | #endif |
269 | |
270 | /** |
271 | * Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer. |
272 | * The result must be valid UTF-8. |
273 | * The provided pointer is advanced to the end of the string by reference, and a string_view instance |
274 | * is returned. You can ensure that your buffer is large enough by allocating a block of memory at least |
275 | * as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer. |
276 | * |
277 | * This unescape function is a low-level function. If you want a more user-friendly approach, you should |
278 | * avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string() |
279 | * instead of get_raw_json_string()). |
280 | * |
281 | * ## IMPORTANT: string_view lifetime |
282 | * |
283 | * The string_view is only valid as long as the bytes in dst. |
284 | * |
285 | * @param in The raw JSON string to unescape. |
286 | * @param dst A pointer to a buffer at least large enough to write this string as well as |
287 | * an additional SIMDJSON_PADDING bytes. |
288 | * @param allow_replacement Whether we allow a replacement if the input string contains unmatched surrogate pairs. |
289 | * @return A string_view pointing at the unescaped string in dst |
290 | * @error STRING_ERROR if escapes are incorrect. |
291 | */ |
292 | simdjson_inline simdjson_result<std::string_view> unescape(raw_json_string in, uint8_t *&dst, bool allow_replacement = false) const noexcept; |
293 | |
294 | /** |
295 | * Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer. |
296 | * The result may not be valid UTF-8. See https://simonsapin.github.io/wtf-8/ |
297 | * The provided pointer is advanced to the end of the string by reference, and a string_view instance |
298 | * is returned. You can ensure that your buffer is large enough by allocating a block of memory at least |
299 | * as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer. |
300 | * |
301 | * This unescape function is a low-level function. If you want a more user-friendly approach, you should |
302 | * avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string() |
303 | * instead of get_raw_json_string()). |
304 | * |
305 | * ## IMPORTANT: string_view lifetime |
306 | * |
307 | * The string_view is only valid as long as the bytes in dst. |
308 | * |
309 | * @param in The raw JSON string to unescape. |
310 | * @param dst A pointer to a buffer at least large enough to write this string as well as |
311 | * an additional SIMDJSON_PADDING bytes. |
312 | * @return A string_view pointing at the unescaped string in dst |
313 | * @error STRING_ERROR if escapes are incorrect. |
314 | */ |
315 | simdjson_inline simdjson_result<std::string_view> unescape_wobbly(raw_json_string in, uint8_t *&dst) const noexcept; |
316 | |
317 | private: |
318 | /** @private [for benchmarking access] The implementation to use */ |
319 | std::unique_ptr<internal::dom_parser_implementation> implementation{}; |
320 | size_t _capacity{0}; |
321 | size_t _max_capacity; |
322 | size_t _max_depth{DEFAULT_MAX_DEPTH}; |
323 | std::unique_ptr<uint8_t[]> string_buf{}; |
324 | #if SIMDJSON_DEVELOPMENT_CHECKS |
325 | std::unique_ptr<token_position[]> start_positions{}; |
326 | #endif |
327 | |
328 | friend class json_iterator; |
329 | friend class document_stream; |
330 | }; |
331 | |
332 | } // namespace ondemand |
333 | } // namespace SIMDJSON_IMPLEMENTATION |
334 | } // namespace simdjson |
335 | |
336 | namespace simdjson { |
337 | /** Specialization of simdjson_result for ondemand::parser: constructible from a parser (moved in) or from an error_code, and default-constructible. */ |
338 | template<> |
339 | struct simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::parser> : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base<SIMDJSON_IMPLEMENTATION::ondemand::parser> { |
340 | public: |
341 | simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::parser &&value) noexcept; ///< @private |
342 | simdjson_inline simdjson_result(error_code error) noexcept; ///< @private |
343 | simdjson_inline simdjson_result() noexcept = default; |
344 | }; |
345 | |
346 | } // namespace simdjson |
347 | |