| 1 | #ifndef SIMDJSON_DOM_PARSER_H |
| 2 | #define SIMDJSON_DOM_PARSER_H |
| 3 | |
| 4 | #include "simdjson/common_defs.h" |
| 5 | #include "simdjson/dom/document.h" |
| 6 | #include "simdjson/error.h" |
| 7 | #include "simdjson/internal/dom_parser_implementation.h" |
| 8 | #include "simdjson/internal/tape_ref.h" |
| 9 | #include "simdjson/padded_string.h" |
| 10 | #include "simdjson/portability.h" |
| 11 | #include <memory> |
| 12 | #include <ostream> |
| 13 | #include <string> |
| 14 | |
| 15 | namespace simdjson { |
| 16 | |
| 17 | namespace dom { |
| 18 | |
| 19 | class document_stream; /* forward declaration: result type of parser::load_many() and parser::parse_many() */ |
| 20 | class element; /* forward declaration: result type of parser::load(), parser::parse() and parser::parse_into_document() */ |
| 21 | |
| 22 | /** The default batch size, in bytes (1,000,000 bytes, i.e. 1MB), for parser.parse_many() and parser.load_many(). */ |
| 23 | static constexpr size_t DEFAULT_BATCH_SIZE = 1000000; |
| 24 | /** |
| 25 | * Some adversary might try to set the batch size to 0 or 1, which might cause problems. |
| 26 | * We set a minimum of 32 bytes since anything smaller is highly likely to be an error. In practice, |
| 27 | * most users will want a much larger batch size. As documented on parser::load_many(), a requested batch size below this minimum is replaced by it. |
| 28 | * |
| 29 | * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON |
| 30 | * document can ever span 0 or 1 byte and that very large values would create memory allocation issues. |
| 31 | */ |
| 32 | static constexpr size_t MINIMAL_BATCH_SIZE = 32; |
| 33 | |
| 34 | /** |
| 35 | * It is wasteful to allocate memory for tiny documents (e.g., 4 bytes). This value, in bytes, is the floor that parser::set_max_capacity() is documented to apply to the maximum capacity. |
| 36 | */ |
| 37 | static constexpr size_t MINIMAL_DOCUMENT_CAPACITY = 32; |
| 38 | |
| 39 | /** |
| 40 | * A persistent document parser. |
| 41 | * |
| 42 | * The parser is designed to be reused, holding the internal buffers necessary to do parsing, |
| 43 | * as well as memory for a single document. The parsed document is overwritten on each parse. |
| 44 | * |
| 45 | * This class cannot be copied, only moved, to avoid unintended allocations. |
| 46 | * |
| 47 | * @note Moving a parser instance may invalidate "dom::element" instances. If you need to |
| 48 | * preserve both the "dom::element" instances and the parser, consider wrapping the parser |
| 49 | * instance in a std::unique_ptr instance: |
| 50 | * |
| 51 | * std::unique_ptr<dom::parser> parser(new dom::parser{}); |
| 52 | * auto error = parser->load(f).get(root); |
| 53 | * |
| 54 | * You can then move std::unique_ptr safely. |
| 55 | * |
| 56 | * @note This is not thread safe: one parser cannot produce two documents at the same time! |
| 57 | */ |
| 58 | class parser { |
| 59 | public: |
| 60 | /** |
| 61 | * Create a JSON parser. |
| 62 | * |
| 63 | * The new parser will have zero capacity. |
| 64 | * |
| 65 | * @param max_capacity The maximum document length the parser can automatically handle. The parser |
| 66 | * will allocate more capacity on an as needed basis (when it sees documents too big to handle) |
| 67 | * up to this amount. The parser still starts with zero capacity no matter what this number is: |
| 68 | * to allocate an initial capacity, call allocate() after constructing the parser. |
| 69 | * Defaults to SIMDJSON_MAXSIZE_BYTES (the largest single document simdjson can process). |
| 70 | */ |
| 71 | simdjson_inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; |
| 72 | /** |
| 73 | * Take another parser's buffers and state. |
| 74 | * |
| 75 | * @param other The parser to take. Its capacity is zeroed. |
| 76 | */ |
| 77 | simdjson_inline parser(parser &&other) noexcept; |
| 78 | parser(const parser &) = delete; ///< @private Disallow copying |
| 79 | /** |
| 80 | * Take another parser's buffers and state. |
| 81 | * |
| 82 | * @param other The parser to take. Its capacity is zeroed. |
| 83 | */ |
| 84 | simdjson_inline parser &operator=(parser &&other) noexcept; |
| 85 | parser &operator=(const parser &) = delete; ///< @private Disallow copying |
| 86 | |
| 87 | /** Deallocate the JSON parser. */ |
| 88 | ~parser()=default; |
| 89 | |
| 90 | /** |
| 91 | * Load a JSON document from a file and return a reference to it. |
| 92 | * |
| 93 | * dom::parser parser; |
| 94 | * const element doc = parser.load("jsonexamples/twitter.json"); |
| 95 | * |
| 96 | * The function is eager: the file's content is loaded in memory inside the parser instance |
| 97 | * and immediately parsed. The file can be deleted after the `parser.load` call. |
| 98 | * |
| 99 | * ### IMPORTANT: Document Lifetime |
| 100 | * |
| 101 | * The JSON document still lives in the parser: this is the most efficient way to parse JSON |
| 102 | * documents because it reuses the same buffers, but you *must* use the document before you |
| 103 | * destroy the parser or call parse() again. |
| 104 | * |
| 105 | * Moving the parser instance is safe, but it invalidates the element instances. You may store |
| 106 | * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like |
| 107 | * so: `std::unique_ptr<dom::parser> parser(new dom::parser{});`. |
| 108 | * |
| 109 | * ### Parser Capacity |
| 110 | * |
| 111 | * If the parser's current capacity is less than the file length, it will allocate enough capacity |
| 112 | * to handle it (up to max_capacity). |
| 113 | * |
| 114 | * @param path The path to load. |
| 115 | * @return The document, or an error: |
| 116 | * - IO_ERROR if there was an error opening or reading the file. |
| 117 | * Be mindful that on some 32-bit systems, |
| 118 | * the file size might be limited to 2 GB. |
| 119 | * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. |
| 120 | * - CAPACITY if the parser does not have enough capacity and len > max_capacity. |
| 121 | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
| 122 | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
| 123 | */ |
| 124 | inline simdjson_result<element> load(const std::string &path) & noexcept; |
| 125 | inline simdjson_result<element> load(const std::string &path) && = delete ; |
| 126 | /** |
| 127 | * Parse a JSON document and return a temporary reference to it. |
| 128 | * |
| 129 | * dom::parser parser; |
| 130 | * element doc_root = parser.parse(buf, len); |
| 131 | * |
| 132 | * The function eagerly parses the input: the input can be modified and discarded after |
| 133 | * the `parser.parse(buf, len)` call has completed. |
| 134 | * |
| 135 | * ### IMPORTANT: Document Lifetime |
| 136 | * |
| 137 | * The JSON document still lives in the parser: this is the most efficient way to parse JSON |
| 138 | * documents because it reuses the same buffers, but you *must* use the document before you |
| 139 | * destroy the parser or call parse() again. |
| 140 | * |
| 141 | * Moving the parser instance is safe, but it invalidates the element instances. You may store |
| 142 | * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like |
| 143 | * so: `std::unique_ptr<dom::parser> parser(new dom::parser{});`. |
| 144 | * |
| 145 | * ### REQUIRED: Buffer Padding |
| 146 | * |
| 147 | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
| 148 | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
| 149 | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
| 150 | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
| 151 | * |
| 152 | * If realloc_if_needed is true (the default), it is assumed that the buffer does *not* have enough padding, |
| 153 | * and it is copied into an enlarged temporary buffer before parsing. Thus the following is safe: |
| 154 | * |
| 155 | * const char *json = R"({"key":"value"})"; |
| 156 | * const size_t json_len = std::strlen(json); |
| 157 | * simdjson::dom::parser parser; |
| 158 | * simdjson::dom::element element = parser.parse(json, json_len); |
| 159 | * |
| 160 | * If you set realloc_if_needed to false (e.g., parser.parse(json, json_len, false)), |
| 161 | * you must provide a buffer with at least SIMDJSON_PADDING extra bytes at the end. |
| 162 | * The benefit of setting realloc_if_needed to false is that you avoid a temporary |
| 163 | * memory allocation and a copy. |
| 164 | * |
| 165 | * The padded bytes may be read. It is not important how you initialize |
| 166 | * these bytes though we recommend a sensible default like null character values or spaces. |
| 167 | * For example, the following low-level code is safe: |
| 168 | * |
| 169 | * const char *json = R"({"key":"value"})"; |
| 170 | * const size_t json_len = std::strlen(json); |
| 171 | * std::unique_ptr<char[]> padded_json_copy{new char[json_len + SIMDJSON_PADDING]}; |
| 172 | * std::memcpy(padded_json_copy.get(), json, json_len); |
| 173 | * std::memset(padded_json_copy.get() + json_len, '\0', SIMDJSON_PADDING); |
| 174 | * simdjson::dom::parser parser; |
| 175 | * simdjson::dom::element element = parser.parse(padded_json_copy.get(), json_len, false); |
| 176 | * |
| 177 | * ### Parser Capacity |
| 178 | * |
| 179 | * If the parser's current capacity is less than len, it will allocate enough capacity |
| 180 | * to handle it (up to max_capacity). |
| 181 | * |
| 182 | * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless |
| 183 | * realloc_if_needed is true. |
| 184 | * @param len The length of the JSON. |
| 185 | * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding. |
| 186 | * @return An element pointing at the root of the document, or an error: |
| 187 | * - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity, |
| 188 | * and memory allocation fails. |
| 189 | * - CAPACITY if the parser does not have enough capacity and len > max_capacity. |
| 190 | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
| 191 | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
| 192 | */ |
| 193 | inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept; |
| 194 | inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete; |
| 195 | /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
| 196 | simdjson_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) & noexcept; |
| 197 | simdjson_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) && =delete; |
| 198 | /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
| 199 | simdjson_inline simdjson_result<element> parse(const std::string &s) & noexcept; |
| 200 | simdjson_inline simdjson_result<element> parse(const std::string &s) && =delete; |
| 201 | /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
| 202 | simdjson_inline simdjson_result<element> parse(const padded_string &s) & noexcept; |
| 203 | simdjson_inline simdjson_result<element> parse(const padded_string &s) && =delete; |
| 204 | /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
| 205 | simdjson_inline simdjson_result<element> parse(const padded_string_view &v) & noexcept; |
| 206 | simdjson_inline simdjson_result<element> parse(const padded_string_view &v) && =delete; |
| 207 | |
| 208 | /** @private We do not want to allow implicit conversion from C string to std::string. */ |
| 209 | simdjson_inline simdjson_result<element> parse(const char *buf) noexcept = delete; |
| 210 | |
| 211 | /** |
| 212 | * Parse a JSON document into a provided document instance and return a temporary reference to it. |
| 213 | * It is similar to the function `parse` except that instead of parsing into the internal |
| 214 | * `document` instance associated with the parser, it allows the user to provide a document |
| 215 | * instance. |
| 216 | * |
| 217 | * dom::parser parser; |
| 218 | * dom::document doc; |
| 219 | * element doc_root = parser.parse_into_document(doc, buf, len); |
| 220 | * |
| 221 | * The function eagerly parses the input: the input can be modified and discarded after |
| 222 | * the `parser.parse_into_document(doc, buf, len)` call has completed. |
| 223 | * |
| 224 | * ### IMPORTANT: Document Lifetime |
| 225 | * |
| 226 | * After the call to parse_into_document, the parser is no longer needed. |
| 227 | * |
| 228 | * The JSON document lives in the document instance: you must keep the document |
| 229 | * instance alive while you navigate through it (i.e., use the returned value from |
| 230 | * parse_into_document). You are encouraged to reuse the document instance |
| 231 | * many times with new data to avoid reallocations: |
| 232 | * |
| 233 | * dom::document doc; |
| 234 | * element doc_root1 = parser.parse_into_document(doc, buf1, len); |
| 235 | * //... doc_root1 is a pointer inside doc |
| 236 | * element doc_root2 = parser.parse_into_document(doc, buf2, len); |
| 237 | * //... doc_root2 is a pointer inside doc |
| 238 | * // at this point doc_root1 is no longer safe |
| 239 | * |
| 240 | * Moving the document instance is safe, but it invalidates the element instances. After |
| 241 | * moving a document, you can recover safe access to the document root with its `root()` method. |
| 242 | * |
| 243 | * @param doc The document instance where the parsed data will be stored (on success). |
| 244 | * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless |
| 245 | * realloc_if_needed is true. |
| 246 | * @param len The length of the JSON. |
| 247 | * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding. |
| 248 | * @return An element pointing at the root of the document, or an error: |
| 249 | * - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity, |
| 250 | * and memory allocation fails. |
| 251 | * - CAPACITY if the parser does not have enough capacity and len > max_capacity. |
| 252 | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
| 253 | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
| 254 | */ |
| 255 | inline simdjson_result<element> parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept; |
| 256 | inline simdjson_result<element> parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete; |
| 257 | /** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
| 258 | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) & noexcept; |
| 259 | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) && =delete; |
| 260 | /** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
| 261 | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const std::string &s) & noexcept; |
| 262 | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const std::string &s) && =delete; |
| 263 | /** @overload parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed) */ |
| 264 | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const padded_string &s) & noexcept; |
| 265 | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const padded_string &s) && =delete; |
| 266 | |
| 267 | /** @private We do not want to allow implicit conversion from C string to std::string. */ |
| 268 | simdjson_inline simdjson_result<element> parse_into_document(document& doc, const char *buf) noexcept = delete; |
| 269 | |
| 270 | /** |
| 271 | * Load a file containing many JSON documents. |
| 272 | * |
| 273 | * dom::parser parser; |
| 274 | * for (const element doc : parser.load_many(path)) { |
| 275 | * cout << std::string(doc["title"]) << endl; |
| 276 | * } |
| 277 | * |
| 278 | * The file is loaded in memory and can be safely deleted after the `parser.load_many(path)` |
| 279 | * function has returned. The memory is held by the `parser` instance. |
| 280 | * |
| 281 | * The function is lazy: it may be that no more than one JSON document at a time is parsed. |
| 282 | * And, possibly, no document may have been parsed when the `parser.load_many(path)` function |
| 283 | * returned. |
| 284 | * |
| 285 | * ### Format |
| 286 | * |
| 287 | * The file must contain a series of one or more JSON documents, concatenated into a single |
| 288 | * buffer, separated by whitespace. It effectively parses until it has a fully valid document, |
| 289 | * then starts parsing the next document at that point. (It does this with more parallelism and |
| 290 | * lookahead than you might think, though.) |
| 291 | * |
| 292 | * Documents that consist of an object or array may omit the whitespace between them, concatenating |
| 293 | * with no separator. Documents that consist of a single primitive (i.e. documents that are not |
| 294 | * arrays or objects) MUST be separated with whitespace. |
| 295 | * |
| 296 | * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. |
| 297 | * Setting batch_size to excessively large or excessively small values may negatively impact |
| 298 | * performance. |
| 299 | * |
| 300 | * ### Error Handling |
| 301 | * |
| 302 | * All errors are returned during iteration: if there is a global error such as memory allocation, |
| 303 | * it will be yielded as the first result. Iteration always stops after the first error. |
| 304 | * |
| 305 | * As with all other simdjson methods, non-exception error handling is readily available through |
| 306 | * the same interface, requiring you to check the error before using the document: |
| 307 | * |
| 308 | * dom::parser parser; |
| 309 | * dom::document_stream docs; |
| 310 | * auto error = parser.load_many(path).get(docs); |
| 311 | * if (error) { cerr << error << endl; exit(1); } |
| 312 | * for (auto doc : docs) { |
| 313 | * std::string_view title; |
| 314 | * if ((error = doc["title"].get(title))) { cerr << error << endl; exit(1); } |
| 315 | * cout << title << endl; |
| 316 | * } |
| 317 | * |
| 318 | * ### Threads |
| 319 | * |
| 320 | * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the |
| 321 | * hood to do some lookahead. |
| 322 | * |
| 323 | * ### Parser Capacity |
| 324 | * |
| 325 | * If the parser's current capacity is less than batch_size, it will allocate enough capacity |
| 326 | * to handle it (up to max_capacity). |
| 327 | * |
| 328 | * @param path File name pointing at the concatenated JSON to parse. |
| 329 | * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet |
| 330 | * spot is cache-related: small enough to fit in cache, yet big enough to |
| 331 | * parse as many documents as possible in one tight loop. |
| 332 | * Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet |
| 333 | * spot in our tests. |
| 334 | * If you set the batch_size to a value smaller than simdjson::dom::MINIMAL_BATCH_SIZE |
| 335 | * (currently 32B), it will be replaced by simdjson::dom::MINIMAL_BATCH_SIZE. |
| 336 | * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: |
| 337 | * - IO_ERROR if there was an error opening or reading the file. |
| 338 | * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. |
| 339 | * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. |
| 340 | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
| 341 | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
| 342 | */ |
| 343 | inline simdjson_result<document_stream> load_many(const std::string &path, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
| 344 | |
| 345 | /** |
| 346 | * Parse a buffer containing many JSON documents. |
| 347 | * |
| 348 | * dom::parser parser; |
| 349 | * for (element doc : parser.parse_many(buf, len)) { |
| 350 | * cout << std::string(doc["title"]) << endl; |
| 351 | * } |
| 352 | * |
| 353 | * No copy of the input buffer is made. |
| 354 | * |
| 355 | * The function is lazy: it may be that no more than one JSON document at a time is parsed. |
| 356 | * And, possibly, no document may have been parsed when the `parser.parse_many(buf, len)` function |
| 357 | * returned. |
| 358 | * |
| 359 | * The caller is responsible for ensuring that the input string data remains unchanged and is |
| 360 | * not deleted during the loop. In particular, the following is unsafe and is meant to be rejected at compile time (NOTE(review): the deleted rvalue overloads below have no default batch_size, so confirm this one-argument call really fails to compile): |
| 361 | * |
| 362 | * auto docs = parser.parse_many("[\"temporary data\"]"_padded); |
| 363 | * // here the string "[\"temporary data\"]" may no longer exist in memory |
| 364 | * // the parser instance may not have even accessed the input yet |
| 365 | * for (element doc : docs) { |
| 366 | * cout << std::string(doc["title"]) << endl; |
| 367 | * } |
| 368 | * |
| 369 | * The following is safe: |
| 370 | * |
| 371 | * auto json = "[\"temporary data\"]"_padded; |
| 372 | * auto docs = parser.parse_many(json); |
| 373 | * for (element doc : docs) { |
| 374 | * cout << std::string(doc["title"]) << endl; |
| 375 | * } |
| 376 | * |
| 377 | * ### Format |
| 378 | * |
| 379 | * The buffer must contain a series of one or more JSON documents, concatenated into a single |
| 380 | * buffer, separated by whitespace. It effectively parses until it has a fully valid document, |
| 381 | * then starts parsing the next document at that point. (It does this with more parallelism and |
| 382 | * lookahead than you might think, though.) |
| 383 | * |
| 384 | * Documents that consist of an object or array may omit the whitespace between them, concatenating |
| 385 | * with no separator. Documents that consist of a single primitive (i.e. documents that are not |
| 386 | * arrays or objects) MUST be separated with whitespace. |
| 387 | * |
| 388 | * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. |
| 389 | * Setting batch_size to excessively large or excessively small values may negatively impact |
| 390 | * performance. |
| 391 | * |
| 392 | * ### Error Handling |
| 393 | * |
| 394 | * All errors are returned during iteration: if there is a global error such as memory allocation, |
| 395 | * it will be yielded as the first result. Iteration always stops after the first error. |
| 396 | * |
| 397 | * As with all other simdjson methods, non-exception error handling is readily available through |
| 398 | * the same interface, requiring you to check the error before using the document: |
| 399 | * |
| 400 | * dom::parser parser; |
| 401 | * dom::document_stream docs; |
| 402 | * auto error = parser.parse_many(buf, len).get(docs); |
| 403 | * if (error) { cerr << error << endl; exit(1); } |
| 404 | * for (auto doc : docs) { |
| 405 | * std::string_view title; |
| 406 | * if ((error = doc["title"].get(title))) { cerr << error << endl; exit(1); } |
| 407 | * cout << title << endl; |
| 408 | * } |
| 409 | * |
| 410 | * ### REQUIRED: Buffer Padding |
| 411 | * |
| 412 | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
| 413 | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
| 414 | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
| 415 | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
| 416 | * |
| 417 | * ### Threads |
| 418 | * |
| 419 | * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the |
| 420 | * hood to do some lookahead. |
| 421 | * |
| 422 | * ### Parser Capacity |
| 423 | * |
| 424 | * If the parser's current capacity is less than batch_size, it will allocate enough capacity |
| 425 | * to handle it (up to max_capacity). |
| 426 | * |
| 427 | * @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes. |
| 428 | * @param len The length of the concatenated JSON. |
| 429 | * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet |
| 430 | * spot is cache-related: small enough to fit in cache, yet big enough to |
| 431 | * parse as many documents as possible in one tight loop. |
| 432 | * Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet spot in our tests. |
| 433 | * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: |
| 434 | * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. |
| 435 | * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. |
| 436 | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
| 437 | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
| 438 | */ |
| 439 | inline simdjson_result<document_stream> parse_many(const uint8_t *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
| 440 | /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
| 441 | inline simdjson_result<document_stream> parse_many(const char *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
| 442 | /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
| 443 | inline simdjson_result<document_stream> parse_many(const std::string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
| 444 | inline simdjson_result<document_stream> parse_many(const std::string &&s, size_t batch_size) = delete;// unsafe: a temporary input may no longer exist when the stream is iterated (NOTE(review): batch_size has no default here, so a one-argument call with a temporary is not caught by this overload -- confirm intent) |
| 445 | /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
| 446 | inline simdjson_result<document_stream> parse_many(const padded_string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; |
| 447 | inline simdjson_result<document_stream> parse_many(const padded_string &&s, size_t batch_size) = delete;// unsafe: a temporary input may no longer exist when the stream is iterated (NOTE(review): batch_size has no default here, so a one-argument call with a temporary is not caught by this overload -- confirm intent) |
| 448 | |
| 449 | /** @private We do not want to allow implicit conversion from C string to std::string. */ |
| 450 | simdjson_result<document_stream> parse_many(const char *buf, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept = delete; |
| 451 | |
| 452 | /** |
| 453 | * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length |
| 454 | * and `max_depth` depth. |
| 455 | * |
| 456 | * @param capacity The new capacity. |
| 457 | * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. |
| 458 | * @return The error, if there is one. |
| 459 | */ |
| 460 | simdjson_warn_unused inline error_code allocate(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept; |
| 461 | |
| 462 | #ifndef SIMDJSON_DISABLE_DEPRECATED_API |
| 463 | /** |
| 464 | * @private deprecated because it returns bool instead of error_code, which is our standard for |
| 465 | * failures. Use allocate() instead. |
| 466 | * |
| 467 | * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length |
| 468 | * and `max_depth` depth. |
| 469 | * |
| 470 | * @param capacity The new capacity. |
| 471 | * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. |
| 472 | * @return true if successful, false if allocation failed. |
| 473 | */ |
| 474 | [[deprecated("Use allocate() instead." )]] |
| 475 | simdjson_warn_unused inline bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept; |
| 476 | #endif // SIMDJSON_DISABLE_DEPRECATED_API |
| 477 | /** |
| 478 | * The largest document this parser can support without reallocating. |
| 479 | * |
| 480 | * @return Current capacity, in bytes. |
| 481 | */ |
| 482 | simdjson_inline size_t capacity() const noexcept; |
| 483 | |
| 484 | /** |
| 485 | * The largest document this parser can automatically support. |
| 486 | * |
| 487 | * The parser may reallocate internal buffers as needed up to this amount. |
| 488 | * |
| 489 | * @return Maximum capacity, in bytes. |
| 490 | */ |
| 491 | simdjson_inline size_t max_capacity() const noexcept; |
| 492 | |
| 493 | /** |
| 494 | * The maximum level of nested object and arrays supported by this parser. |
| 495 | * |
| 496 | * @return Maximum depth, in nesting levels. |
| 497 | */ |
| 498 | simdjson_inline size_t max_depth() const noexcept; |
| 499 | |
| 500 | /** |
| 501 | * Set max_capacity. This is the largest document this parser can automatically support. |
| 502 | * |
| 503 | * The parser may reallocate internal buffers as needed up to this amount as documents are passed |
| 504 | * to it. |
| 505 | * |
| 506 | * Note: To avoid limiting the memory to an absurd value, such as zero or two bytes, |
| 507 | * if you try to set max_capacity to a value lower than MINIMAL_DOCUMENT_CAPACITY, |
| 508 | * then the maximal capacity is set to MINIMAL_DOCUMENT_CAPACITY. |
| 509 | * |
| 510 | * This call will not allocate or deallocate, even if capacity is currently above max_capacity. |
| 511 | * |
| 512 | * @param max_capacity The new maximum capacity, in bytes. |
| 513 | */ |
| 514 | simdjson_inline void set_max_capacity(size_t max_capacity) noexcept; |
| 515 | |
| 516 | #ifdef SIMDJSON_THREADS_ENABLED |
| 517 | /** |
| 518 | * The parser instance can use threads when they are available to speed up some |
| 519 | * operations. It is enabled by default. Changing this attribute will change the |
| 520 | * behavior of the parser for future operations. |
| 521 | */ |
| 522 | bool threaded{true}; |
| 523 | #endif |
| 524 | /** @private Use the new DOM API instead */ |
| 525 | class Iterator; |
| 526 | /** @private Use simdjson_error instead */ |
| 527 | using InvalidJSON [[deprecated("Use simdjson_error instead" )]] = simdjson_error; |
| 528 | |
| 529 | /** @private [for benchmarking access] The implementation to use */ |
| 530 | std::unique_ptr<internal::dom_parser_implementation> implementation{}; |
| 531 | |
| 532 | /** @private Use `if (parser.parse(...).error())` instead */ |
| 533 | bool valid{false}; |
| 534 | /** @private Use `parser.parse(...).error()` instead */ |
| 535 | error_code error{UNINITIALIZED}; |
| 536 | |
| 537 | /** @private Use `parser.parse(...).value()` instead */ |
| 538 | document doc{}; |
| 539 | |
| 540 | /** @private returns true if the document parsed was valid */ |
| 541 | [[deprecated("Use the result of parser.parse() instead" )]] |
| 542 | inline bool is_valid() const noexcept; |
| 543 | |
| 544 | /** |
| 545 | * @private return an error code corresponding to the last parsing attempt (see |
| 546 | * simdjson.h); will return UNINITIALIZED if no parsing was attempted |
| 547 | */ |
| 548 | [[deprecated("Use the result of parser.parse() instead" )]] |
| 549 | inline int get_error_code() const noexcept; |
| 550 | |
| 551 | /** @private return the string equivalent of "get_error_code" */ |
| 552 | [[deprecated("Use error_message() on the result of parser.parse() instead, or cout << error" )]] |
| 553 | inline std::string get_error_message() const noexcept; |
| 554 | |
| 555 | /** @private */ |
| 556 | [[deprecated("Use cout << on the result of parser.parse() instead" )]] |
| 557 | inline bool print_json(std::ostream &os) const noexcept; |
| 558 | |
| 559 | /** @private Private and deprecated: use `parser.parse(...).doc.dump_raw_tape()` instead */ |
| 560 | inline bool dump_raw_tape(std::ostream &os) const noexcept; |
| 561 | |
| 562 | |
| 563 | private: |
| 564 | /** |
| 565 | * The maximum document length this parser will automatically support. |
| 566 | * |
| 567 | * The parser will not be automatically allocated above this amount. |
| 568 | */ |
| 569 | size_t _max_capacity; |
| 570 | |
| 571 | /** |
| 572 | * The loaded buffer (reused each time load() is called) |
| 573 | */ |
| 574 | std::unique_ptr<char[]> loaded_bytes; |
| 575 | |
| 576 | /** Capacity of loaded_bytes buffer. */ |
| 577 | size_t _loaded_bytes_capacity{0}; |
| 578 | |
| 579 | // all nodes are stored on the doc.tape using a 64-bit word. |
| 580 | // |
| 581 | // strings, double and ints are stored as |
| 582 | // a 64-bit word with a pointer to the actual value |
| 583 | // |
| 584 | // |
| 585 | // |
| 586 | // for objects or arrays, store [ or { at the beginning and } and ] at the |
| 587 | // end. For the openings ([ or {), we annotate them with a reference to the |
| 588 | // location on the doc.tape of the end, and for the closings (} and ]), we |
| 589 | // annotate them with a reference to the location of the opening |
| 590 | // |
| 591 | // |
| 592 | |
| 593 | /** |
| 594 | * Ensure we have enough capacity to handle at least desired_capacity bytes, |
| 595 | * and auto-allocate if not. This also allocates memory if needed in the |
| 596 | * internal document. |
| 597 | */ |
| 598 | inline error_code ensure_capacity(size_t desired_capacity) noexcept; |
| 599 | /** |
| 600 | * Ensure we have enough capacity to handle at least desired_capacity bytes, |
| 601 | * and auto-allocate if not. This also allocates memory if needed in the |
| 602 | * provided document. |
| 603 | */ |
| 604 | inline error_code ensure_capacity(document& doc, size_t desired_capacity) noexcept; |
| 605 | |
| 606 | /** Read the file into loaded_bytes */ |
| 607 | inline simdjson_result<size_t> read_file(const std::string &path) noexcept; |
| 608 | |
| 609 | friend class parser::Iterator; |
| 610 | friend class document_stream; |
| 611 | |
| 612 | |
| 613 | }; // class parser |
| 614 | |
| 615 | } // namespace dom |
| 616 | } // namespace simdjson |
| 617 | |
| 618 | #endif // SIMDJSON_DOM_PARSER_H |
| 619 | |