1#include "simdjson/error.h"
2
3namespace simdjson {
4namespace SIMDJSON_IMPLEMENTATION {
5namespace ondemand {
6
// Forward declarations of On Demand types that are referenced below by name only.
class array;
class object;
class value;
class raw_json_string;
class document_stream;
12
/**
 * The default batch size, in bytes, for document_stream instances for this On Demand kernel.
 * Note that different On Demand kernels may use a different DEFAULT_BATCH_SIZE value
 * in the future.
 */
static constexpr size_t DEFAULT_BATCH_SIZE = 1000000;
/**
 * Some adversary might try to set the batch size to 0 or 1, which might cause problems.
 * We set a minimum of 32 bytes since anything smaller is highly likely to be an error. In
 * practice, most users will want a much larger batch size.
 *
 * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON
 * document can ever span 0 or 1 byte and that very large values would create memory allocation
 * issues.
 */
static constexpr size_t MINIMAL_BATCH_SIZE = 32;
28
29/**
30 * A JSON fragment iterator.
31 *
32 * This holds the actual iterator as well as the buffer for writing strings.
33 */
34class parser {
35public:
36 /**
37 * Create a JSON parser.
38 *
39 * The new parser will have zero capacity.
40 */
41 inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept;
42
43 inline parser(parser &&other) noexcept = default;
44 simdjson_inline parser(const parser &other) = delete;
45 simdjson_inline parser &operator=(const parser &other) = delete;
46 simdjson_inline parser &operator=(parser &&other) noexcept = default;
47
48 /** Deallocate the JSON parser. */
49 inline ~parser() noexcept = default;
50
51 /**
52 * Start iterating an on-demand JSON document.
53 *
54 * ondemand::parser parser;
55 * document doc = parser.iterate(json);
56 *
57 * It is expected that the content is a valid UTF-8 file, containing a valid JSON document.
58 * Otherwise the iterate method may return an error. In particular, the whole input should be
59 * valid: we do not attempt to tolerate incorrect content either before or after a JSON
60 * document.
61 *
62 * ### IMPORTANT: Validate what you use
63 *
64 * Calling iterate on an invalid JSON document may not immediately trigger an error. The call to
65 * iterate does not parse and validate the whole document.
66 *
67 * ### IMPORTANT: Buffer Lifetime
68 *
69 * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as
70 * long as the document iteration.
71 *
72 * ### IMPORTANT: Document Lifetime
73 *
74 * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during
75 * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before
76 * you call parse() again or destroy the parser.
77 *
78 * ### REQUIRED: Buffer Padding
79 *
80 * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
81 * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you
82 * using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the
83 * SIMDJSON_PADDING bytes to avoid runtime warnings.
84 *
85 * @param json The JSON to parse.
86 * @param len The length of the JSON.
87 * @param capacity The number of bytes allocated in the JSON (must be at least len+SIMDJSON_PADDING).
88 *
89 * @return The document, or an error:
90 * - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes.
91 * - MEMALLOC if realloc_if_needed the parser does not have enough capacity, and memory
92 * allocation fails.
93 * - EMPTY if the document is all whitespace.
94 * - UTF8_ERROR if the document is not valid UTF-8.
95 * - UNESCAPED_CHARS if a string contains control characters that must be escaped
96 * - UNCLOSED_STRING if there is an unclosed string in the document.
97 */
98 simdjson_warn_unused simdjson_result<document> iterate(padded_string_view json) & noexcept;
99 /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
100 simdjson_warn_unused simdjson_result<document> iterate(const char *json, size_t len, size_t capacity) & noexcept;
101 /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
102 simdjson_warn_unused simdjson_result<document> iterate(const uint8_t *json, size_t len, size_t capacity) & noexcept;
103 /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
104 simdjson_warn_unused simdjson_result<document> iterate(std::string_view json, size_t capacity) & noexcept;
105 /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
106 simdjson_warn_unused simdjson_result<document> iterate(const std::string &json) & noexcept;
107 /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
108 simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string> &json) & noexcept;
109 /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
110 simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string_view> &json) & noexcept;
111 /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
112 simdjson_warn_unused simdjson_result<document> iterate(padded_string &&json) & noexcept = delete;
113
114 /**
115 * @private
116 *
117 * Start iterating an on-demand JSON document.
118 *
119 * ondemand::parser parser;
120 * json_iterator doc = parser.iterate(json);
121 *
122 * ### IMPORTANT: Buffer Lifetime
123 *
124 * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as
125 * long as the document iteration.
126 *
127 * ### IMPORTANT: Document Lifetime
128 *
129 * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during
130 * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before
131 * you call parse() again or destroy the parser.
132 *
133 * The ondemand::document instance holds the iterator. The document must remain in scope
134 * while you are accessing instances of ondemand::value, ondemand::object, ondemand::array.
135 *
136 * ### REQUIRED: Buffer Padding
137 *
138 * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
139 * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you
140 * using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the
141 * SIMDJSON_PADDING bytes to avoid runtime warnings.
142 *
143 * @param json The JSON to parse.
144 *
145 * @return The iterator, or an error:
146 * - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes.
147 * - MEMALLOC if realloc_if_needed the parser does not have enough capacity, and memory
148 * allocation fails.
149 * - EMPTY if the document is all whitespace.
150 * - UTF8_ERROR if the document is not valid UTF-8.
151 * - UNESCAPED_CHARS if a string contains control characters that must be escaped
152 * - UNCLOSED_STRING if there is an unclosed string in the document.
153 */
154 simdjson_warn_unused simdjson_result<json_iterator> iterate_raw(padded_string_view json) & noexcept;
155
156
157 /**
158 * Parse a buffer containing many JSON documents.
159 *
160 * auto json = R"({ "foo": 1 } { "foo": 2 } { "foo": 3 } )"_padded;
161 * ondemand::parser parser;
162 * ondemand::document_stream docs = parser.iterate_many(json);
163 * for (auto & doc : docs) {
164 * std::cout << doc["foo"] << std::endl;
165 * }
166 * // Prints 1 2 3
167 *
168 * No copy of the input buffer is made.
169 *
170 * The function is lazy: it may be that no more than one JSON document at a time is parsed.
171 *
172 * The caller is responsabile to ensure that the input string data remains unchanged and is
173 * not deleted during the loop.
174 *
175 * ### Format
176 *
177 * The buffer must contain a series of one or more JSON documents, concatenated into a single
178 * buffer, separated by ASCII whitespace. It effectively parses until it has a fully valid document,
179 * then starts parsing the next document at that point. (It does this with more parallelism and
180 * lookahead than you might think, though.)
181 *
182 * documents that consist of an object or array may omit the whitespace between them, concatenating
183 * with no separator. Documents that consist of a single primitive (i.e. documents that are not
184 * arrays or objects) MUST be separated with ASCII whitespace.
185 *
186 * The characters inside a JSON document, and between JSON documents, must be valid Unicode (UTF-8).
187 *
188 * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.
189 * Setting batch_size to excessively large or excessively small values may impact negatively the
190 * performance.
191 *
192 * ### REQUIRED: Buffer Padding
193 *
194 * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
195 * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you
196 * using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the
197 * SIMDJSON_PADDING bytes to avoid runtime warnings.
198 *
199 * ### Threads
200 *
201 * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
202 * hood to do some lookahead.
203 *
204 * ### Parser Capacity
205 *
206 * If the parser's current capacity is less than batch_size, it will allocate enough capacity
207 * to handle it (up to max_capacity).
208 *
209 * @param buf The concatenated JSON to parse.
210 * @param len The length of the concatenated JSON.
211 * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
212 * spot is cache-related: small enough to fit in cache, yet big enough to
213 * parse as many documents as possible in one tight loop.
214 * Defaults to 10MB, which has been a reasonable sweet spot in our tests.
215 * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:
216 * - MEMALLOC if the parser does not have enough capacity and memory allocation fails
217 * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity.
218 * - other json errors if parsing fails. You should not rely on these errors to always the same for the
219 * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
220 */
221 inline simdjson_result<document_stream> iterate_many(const uint8_t *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
222 /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */
223 inline simdjson_result<document_stream> iterate_many(const char *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
224 /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */
225 inline simdjson_result<document_stream> iterate_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
226 inline simdjson_result<document_stream> iterate_many(const std::string &&s, size_t batch_size) = delete;// unsafe
227 /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */
228 inline simdjson_result<document_stream> iterate_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
229 inline simdjson_result<document_stream> iterate_many(const padded_string &&s, size_t batch_size) = delete;// unsafe
230
231 /** @private We do not want to allow implicit conversion from C string to std::string. */
232 simdjson_result<document_stream> iterate_many(const char *buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept = delete;
233
234 /** The capacity of this parser (the largest document it can process). */
235 simdjson_inline size_t capacity() const noexcept;
236 /** The maximum capacity of this parser (the largest document it is allowed to process). */
237 simdjson_inline size_t max_capacity() const noexcept;
238 simdjson_inline void set_max_capacity(size_t max_capacity) noexcept;
239 /**
240 * The maximum depth of this parser (the most deeply nested objects and arrays it can process).
241 * This parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true.
242 * The document's instance current_depth() method should be used to monitor the parsing
243 * depth and limit it if desired.
244 */
245 simdjson_inline size_t max_depth() const noexcept;
246
247 /**
248 * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
249 * and `max_depth` depth.
250 *
251 * The max_depth parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true.
252 * The document's instance current_depth() method should be used to monitor the parsing
253 * depth and limit it if desired.
254 *
255 * @param capacity The new capacity.
256 * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.
257 * @return The error, if there is one.
258 */
259 simdjson_warn_unused error_code allocate(size_t capacity, size_t max_depth=DEFAULT_MAX_DEPTH) noexcept;
260
261 #ifdef SIMDJSON_THREADS_ENABLED
262 /**
263 * The parser instance can use threads when they are available to speed up some
264 * operations. It is enabled by default. Changing this attribute will change the
265 * behavior of the parser for future operations.
266 */
267 bool threaded{true};
268 #endif
269
270 /**
271 * Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer.
272 * The result must be valid UTF-8.
273 * The provided pointer is advanced to the end of the string by reference, and a string_view instance
274 * is returned. You can ensure that your buffer is large enough by allocating a block of memory at least
275 * as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer.
276 *
277 * This unescape function is a low-level function. If you want a more user-friendly approach, you should
278 * avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string()
279 * instead of get_raw_json_string()).
280 *
281 * ## IMPORTANT: string_view lifetime
282 *
283 * The string_view is only valid as long as the bytes in dst.
284 *
285 * @param raw_json_string input
286 * @param dst A pointer to a buffer at least large enough to write this string as well as
287 * an additional SIMDJSON_PADDING bytes.
288 * @param allow_replacement Whether we allow a replacement if the input string contains unmatched surrogate pairs.
289 * @return A string_view pointing at the unescaped string in dst
290 * @error STRING_ERROR if escapes are incorrect.
291 */
292 simdjson_inline simdjson_result<std::string_view> unescape(raw_json_string in, uint8_t *&dst, bool allow_replacement = false) const noexcept;
293
294 /**
295 * Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer.
296 * The result may not be valid UTF-8. See https://simonsapin.github.io/wtf-8/
297 * The provided pointer is advanced to the end of the string by reference, and a string_view instance
298 * is returned. You can ensure that your buffer is large enough by allocating a block of memory at least
299 * as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer.
300 *
301 * This unescape function is a low-level function. If you want a more user-friendly approach, you should
302 * avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string()
303 * instead of get_raw_json_string()).
304 *
305 * ## IMPORTANT: string_view lifetime
306 *
307 * The string_view is only valid as long as the bytes in dst.
308 *
309 * @param raw_json_string input
310 * @param dst A pointer to a buffer at least large enough to write this string as well as
311 * an additional SIMDJSON_PADDING bytes.
312 * @return A string_view pointing at the unescaped string in dst
313 * @error STRING_ERROR if escapes are incorrect.
314 */
315 simdjson_inline simdjson_result<std::string_view> unescape_wobbly(raw_json_string in, uint8_t *&dst) const noexcept;
316
317private:
318 /** @private [for benchmarking access] The implementation to use */
319 std::unique_ptr<internal::dom_parser_implementation> implementation{};
320 size_t _capacity{0};
321 size_t _max_capacity;
322 size_t _max_depth{DEFAULT_MAX_DEPTH};
323 std::unique_ptr<uint8_t[]> string_buf{};
324#if SIMDJSON_DEVELOPMENT_CHECKS
325 std::unique_ptr<token_position[]> start_positions{};
326#endif
327
328 friend class json_iterator;
329 friend class document_stream;
330};
331
332} // namespace ondemand
333} // namespace SIMDJSON_IMPLEMENTATION
334} // namespace simdjson
335
336namespace simdjson {
337
338template<>
339struct simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::parser> : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base<SIMDJSON_IMPLEMENTATION::ondemand::parser> {
340public:
341 simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::parser &&value) noexcept; ///< @private
342 simdjson_inline simdjson_result(error_code error) noexcept; ///< @private
343 simdjson_inline simdjson_result() noexcept = default;
344};
345
346} // namespace simdjson
347